### Imports

In [1]:
import os
import re
import numpy as np
import sys
from pathlib import Path
import pickle
import csv
from pathlib import Path
sys.path.append("..")
from project_paths import ProjectPaths
ProjectPaths.set_path_to_repository("..")

#### Available project.paths

In [2]:
# List every path registered on ProjectPaths (name : location).
ProjectPaths.print_paths()

data_dir                                : ../../data
dr_detektor_automatic_fact_checking_dir : ../../data/DRDetektorAutomaticFactChecking
annotated_subtitles                     : ../../data/DRDetektorAutomaticFactChecking/annotatorProgram/annotatedSubtitles
deep_fact_dir                           : ../../data/DeepFactData
annotated                               : ../../data/DeepFactData/annotated
data_matrix_path                        : ../../data/DeepFactData/annotated/data_matrix_sample_programs.csv
nlp_data_dir                            : ../../data/DeepFactData/nlp_data
embeddings_file                         : ../../data/DeepFactData/nlp_data/embeddings.csv
pos_tags_file                           : ../../data/DeepFactData/nlp_data/pos_tags.csv
speller_dir                             : ../../data/DeepFactData/spelling_model
speller_char_vocab_file                 : ../../data/DeepFactData/spelling_model/char_embedding.json
speller_encoder_checkpoint_file         : ../../data/Dee

#### Set paths

In [3]:
# Directory the cleaned data matrix, readme and pickles are written to / read from.
save_loc = ProjectPaths.annotated
# Directory holding the annotated subtitle files (*.txt) to be cleaned.
loc_ann_data = ProjectPaths.annotated_subtitles

#### Data Cleaner Class

In [4]:
class DebattenAnnotatedDatacleaner:
    """
    Cleans annotated subtitle files and extracts the highlighted claims.

    The annotated programs are read from the ``*.txt`` files in ``loc_ann``.
    The parsing code expects HTML-like markup in which every paragraph starts
    with ``<p id="N">`` and every annotated claim is wrapped in
    ``<span id="highlight..." class="highlight...">...</span>``.
    """

    # Separator placed between the claims of a paragraph with several claims.
    _CLAIM_SEPARATOR = ' [new claim]: '

    def __init__(self, loc_ann=None, loc_out=None):
        """Store the input (annotated files) and output locations."""
        self.loc_ann_subtitles = loc_ann
        self.loc_out_subtitles = loc_out

    def setAnnotatedFilesLocation(self, new_loc):
        """Change the directory the annotated files are read from."""
        self.loc_ann_subtitles = new_loc

    def setOutputFilesLocation(self, new_loc):
        """Change the output directory."""
        self.loc_out_subtitles = new_loc

    def getFileLocation(self, disp=True):
        """Return ``(annotated location, output location)``; print them if ``disp``."""
        if disp:
            # '{}' instead of '{:s}': the locations may be pathlib.Path objects,
            # which reject the 's' format specification with a TypeError.
            if self.loc_ann_subtitles is None:
                print('Annotated subtitles are not specified!')
            else:
                print('Annotated subtitles are loaded from "{}"'.format(self.loc_ann_subtitles))

            if self.loc_out_subtitles is None:
                print('Save location is not specified!')
            else:
                print('Save location is "{}"'.format(self.loc_out_subtitles))

        return self.loc_ann_subtitles, self.loc_out_subtitles

    def getFilePaths(self):
        """Return the paths of all ``*.txt`` files in the annotated location.

        The order is whatever the file system yields; it is deliberately not
        sorted so that existing positional lookups keep their meaning.
        """
        # Path(...) lets the location be given either as a str or as a Path.
        return list(Path(self.loc_ann_subtitles).glob("*.txt"))

    def getProgramAndSentences(self, f_path):
        """Gets the program id, sentence ids and raw sentences from a document.

        Returns ``(program_id, sentences_id, sentences)`` where ``program_id`` is
        a string, ``sentences_id`` the integer ids found in the ``<p id="N">``
        tags and ``sentences`` the text following each of those tags (still
        containing markup).

        Raises ValueError if no program id can be found in the document.
        """
        with Path(f_path).open('r', encoding="utf-8") as f:
            doc = f.read()

        # The id is the run of digits/spaces between 'ram' and the closing quote,
        # e.g. 'program 10"' -> '10'.
        m = re.search(r'ram[ \d]+"', doc)
        if m is None:
            raise ValueError('No program id found in "{}"'.format(f_path))
        program_id = m.group()[3:-1].strip()

        m_sentence_id = re.compile(r'id="[\d]+">')

        # Keep only the chunks that carry a sentence id, and strip everything up
        # to (and including) the first '>' from them. Ids and sentences are
        # collected in lockstep, so both lists always have the same length.
        sentences_id = []
        sentences = []
        for chunk in doc.split('<p '):
            match = m_sentence_id.search(chunk)
            if not match:
                continue
            sentences_id.append(int(match.group()[4:-2]))
            sentences.append(chunk[chunk.find('>') + 1:])

        return program_id, sentences_id, sentences

    def findHighlights(self,s):
        """Return all highlighted spans in ``s`` including the surrounding tags."""
        m_highlight = re.compile(
            r'<span id="highlight["\w\d ]+class="highlight[\w"]+>[\w\d. ,!?%]+</span>')
        return m_highlight.findall(s)

    def extractHighlights(self, s_matches):
        """Return only the (whitespace-stripped) text of each highlighted span."""
        m_high_text = re.compile(r'">[\w\d ,.!?%]+</')
        # [2:-2] drops the leading '">' and the trailing '</' of the match.
        return [m_high_text.findall(span)[0][2:-2].strip() for span in s_matches]

    def cleanSentence(self, s, disp=False):
        """Remove markup tags, tabs and repeated spaces from ``s``.

        If ``disp`` is true every removed tag is printed.
        """
        m_crap = re.compile(r'<[\w\d "=/]+>')
        s_crap_free = s
        for pattern in m_crap.findall(s):
            if disp: print(pattern)
            s_crap_free = s_crap_free.replace(pattern,'')

        s_crap_free = s_crap_free.replace('\t',' ') # removes tabs
        s_crap_free = re.sub(' +',' ', s_crap_free) # removes excess spaces
        return s_crap_free.strip()

    def getHighlight_indices(self,s,s_highlighted):
        """Locate each highlighted text in the cleaned sentence ``s``.

        Returns a list of ``[start, end]`` pairs, one per highlight found. Each
        pair is widened to whole words (from the preceding space to the next
        space), which corrects partially highlighted words. Highlights that
        cannot be found are reported on stdout and skipped.
        """

        def getLeadingSpace(s,start_idx):
            # Index just after the last space at or before start_idx (0 if none).
            if start_idx < 0:
                return 0
            return s.rfind(' ', 0, start_idx+1) + 1

        def getTailingSpace(s,end_idx):
            # Index of the first space at or after end_idx (len(s) if none).
            if end_idx >= len(s):
                return len(s)
            space_idx = s.find(' ', end_idx)
            return len(s) if space_idx == -1 else space_idx

        indices = []
        for m in s_highlighted:

            if m == '?':
                print('Annotation bug. A single question mark was annotated..')
                continue

            # The highlight is literal text, not a pattern: escape it so that
            # characters such as '?' and '.' only match themselves.
            match = re.search(re.escape(m), s)
            if match:
                indices.append([getLeadingSpace(s, match.start()),
                                getTailingSpace(s, match.end())])
            else:
                # Diagnostics for a highlight that is missing from the sentence.
                print(match)
                print(m)
                print(s_highlighted)
                print(s+'\n')

        return indices

    def getCleanedProgramSentences(self, sentences):
        """Clean every sentence and extract its claims.

        Returns three lists aligned with ``sentences``: the cleaned sentences,
        the claim text (``None`` when there is no claim; several claims are
        joined by ' [new claim]: ') and the list of ``[start, end]`` claim
        indices into the cleaned sentence.
        """
        sentences_processed = [None]*len(sentences)
        sentences_highlight = [None]*len(sentences)
        sentences_highlight_ind = [None]*len(sentences)

        for i, sen in enumerate(sentences):
            text_highlights = self.extractHighlights(self.findHighlights(sen))

            cleaned = self.cleanSentence(sen)
            indices_highlights = self.getHighlight_indices(cleaned, text_highlights)

            sentences_processed[i] = cleaned
            sentences_highlight_ind[i] = indices_highlights

            for start_id, end_id in indices_highlights:
                if sentences_highlight[i]:
                    sentences_highlight[i] = (sentences_highlight[i] + self._CLAIM_SEPARATOR
                                              + cleaned[start_id:end_id])
                else:
                    sentences_highlight[i] = cleaned[start_id:end_id]

        return sentences_processed, sentences_highlight, sentences_highlight_ind

    # EXPERIMENTAL!!! Processing multi-claim paragraphs
    def processMultiClaim(self,s,idx):
        """Merge adjacent/overlapping claim spans and build the claim string.

        ``idx`` is a list of ``[start, end]`` pairs into ``s``, in order of
        appearance. A span that starts at most one character after the end of
        the previous span is considered part of the same claim and is merged
        into it. Returns ``(claim_text, merged_idx)`` where the texts of the
        remaining claims are joined by ' [new claim]: '. The input is not
        modified; an empty ``idx`` gives ``('', [])``.
        """
        merged = []
        for start_id, end_id in idx:
            if merged and merged[-1][1] - start_id >= -1:  # It is the same claim
                merged[-1][1] = max(merged[-1][1], end_id)
            else:
                merged.append([start_id, end_id])

        new_s = self._CLAIM_SEPARATOR.join(s[start_id:end_id] for start_id, end_id in merged)
        return new_s, merged

    @staticmethod
    def _percentage(part, whole):
        # Share of `part` in `whole` in percent; 0.0 when `whole` is zero.
        return part/float(whole)*100 if whole else 0.0

    def getAllCleanedProgramSentences(self,disp=False):
        """Clean all annotated files and return them as one table.

        Returns ``(data, labels)``. ``data`` holds one row per sentence with the
        columns named in ``labels``: program id, 1-based position of the
        sentence in its file, cleaned sentence, list of claim indices and claim
        text (the last two are ``None`` when the sentence has no claim).
        If ``disp`` is true, per-program and total claim statistics are printed.
        """
        all_program_id = []
        all_sentences = []
        all_highlights = []
        all_highlights_ind = []

        total_claims = 0
        total_sentences = 0

        for f_path in self.getFilePaths():
            # The ids stored in the files are not used (see the note below).
            program_id, _, sentences = self.getProgramAndSentences(f_path)
            if disp: print('Program id {:s}'.format(program_id))

            cleaned, highlights, highlights_ind = self.getCleanedProgramSentences(sentences)
            all_program_id.append(program_id)
            all_sentences.append(cleaned)
            all_highlights.append(highlights)
            all_highlights_ind.append(highlights_ind)

            num_claims = len(list(filter(None, highlights)))
            if disp:
                print('\tThere were {:d} claims out of {:d} sentences ({:2.2f}%)'.format(
                    num_claims, len(sentences), self._percentage(num_claims, len(sentences))))

            total_claims = total_claims + num_claims
            total_sentences = total_sentences + len(sentences)

        if disp:
            print('\nIn total there were {:d} claims out of {:d} sentences ({:2.2f}%)'.format(
                total_claims, total_sentences, self._percentage(total_claims, total_sentences)))

        labels = ['program_id', 'sentence_id', 'sentence', 'claim_idx', 'claim']

        data = []
        for p, program_id in enumerate(all_program_id):

            for si, sentence in enumerate(all_sentences[p]):
                row = [None]*len(labels)
                row[0] = program_id
                # The sentence id is not correct in all files, so the position
                # within the file is used instead.
                row[1] = si+1
                row[2] = sentence

                claim_idx = all_highlights_ind[p][si]
                if len(claim_idx) == 1:
                    row[3] = claim_idx
                    row[4] = all_highlights[p][si]

                elif claim_idx:
                    # Several claims in one paragraph: report and try to merge.
                    print('HELP')
                    print(program_id)
                    print(claim_idx)
                    print(all_highlights[p][si])
                    new_s, new_idx = self.processMultiClaim(sentence, claim_idx)

                    print('Trying to handle this multi-claim, is the output correct?')
                    print(new_idx)
                    print(new_s)
                    print()

                    row[3] = new_idx
                    row[4] = new_s

                data.append(row)

        return data, labels

#### Load directory

In [5]:
# Cleaner reading from the annotated-subtitles directory (no output location set).
annotatedData = DebattenAnnotatedDatacleaner(loc_ann_data)
# Paths of all annotated *.txt files found in that directory.
file_paths = annotatedData.getFilePaths()

##### Test of file-loading

In [6]:
# Manual smoke test of the individual cleaning steps on one paragraph.
# NOTE(review): file_paths[1] and sentences[111] are positional picks; which
# file ends up at index 1 presumably depends on the glob order -- confirm.
program_id, sentences_id, sentences = annotatedData.getProgramAndSentences(file_paths[1])
print('Id is '+program_id)

s = sentences[111]

# Highlighted spans including their surrounding <span> tags.
match_high = annotatedData.findHighlights(s)
print(match_high)

# Text of the highlighted spans only.
match_text_high = annotatedData.extractHighlights(match_high)
print(match_text_high)

# disp=True prints every tag that is removed from the paragraph.
s_crap_free = annotatedData.cleanSentence(s, disp=True)
print('\n\n' +s_crap_free)

# [start, end] indices of the claim inside the cleaned paragraph.
print(annotatedData.getHighlight_indices(s_crap_free, match_text_high))

Id is 10
['<span id="highlight48" class="highlight">Så når de store lande vælger sådan en mand er det, fordi han står for deres synspunkter.</span>']
['Så når de store lande vælger sådan en mand er det, fordi han står for deres synspunkter.']
<span id="highlight48" class="highlight">
</span>
</p>


Det er dem, der støtter ham. Så når de store lande vælger sådan en mand er det, fordi han står for deres synspunkter.
[[29, 117]]


## Process all files

In [7]:
# Clean every annotated file; `data` has one row per paragraph with the columns
# named in `labels`. disp=True prints the claim statistics per program.
data, labels = annotatedData.getAllCleanedProgramSentences(disp=True)

Program id 8568658
	There were 33 claims out of 523 sentences (6.31%)
Program id 10
	There were 7 claims out of 231 sentences (3.03%)
Program id 8689224
	There were 33 claims out of 541 sentences (6.10%)
Program id 7
	There were 12 claims out of 307 sentences (3.91%)
Program id 1
	There were 46 claims out of 516 sentences (8.91%)
Program id 8720741
	There were 36 claims out of 541 sentences (6.65%)
Program id 9
	There were 22 claims out of 307 sentences (7.17%)
Program id 8567181
	There were 29 claims out of 324 sentences (8.95%)
Program id 5
	There were 42 claims out of 309 sentences (13.59%)
Program id 8568906
	There were 28 claims out of 518 sentences (5.41%)
Program id 4
	There were 30 claims out of 317 sentences (9.46%)
Program id 3
	There were 20 claims out of 442 sentences (4.52%)
Program id 9284846
	There were 42 claims out of 577 sentences (7.28%)
Program id 8665813
	There were 32 claims out of 561 sentences (5.70%)
Program id 6
	There were 22 claims out of 334 sentences (6.59

# Combine timestamps from earlier

In [8]:
# Load the sample programs saved earlier (keyed 'program1', 'program2', ...).
# NOTE: pickle.load executes arbitrary code from the file -- only load trusted files.
with Path(save_loc, 'sample_programs.pickle').open('rb') as f:
    sample_dict = pickle.load(f)
    
print(sample_dict.keys())
print()
# Fields stored for a single program.
print(sample_dict['program1'].keys())

dict_keys(['program1', 'program10', 'program8', 'program5', 'program7', 'program4', 'program9', 'program6', 'program3', 'program2'])

dict_keys(['end time', 'start time', 'sentences'])


In [9]:
# Load the dictionary of all pre-annotated programs (keyed by program id).
# NOTE(review): this path is hard-coded while the other cells go through
# ProjectPaths; it looks equivalent to <deep_fact_dir>/preannotated -- confirm.
# NOTE: pickle.load executes arbitrary code from the file -- only load trusted files.
with Path('../../data/DeepFactData/preannotated/all_programs.pickle').open('rb') as f:
    full_dict = pickle.load(f)

In [10]:
# Number of observations (paragraphs) in the cleaned data.
N = len(data)

# Column names of the final data matrix: the two time columns come first,
# followed by the labels of the cleaned data.
features = ['start time', 'end time']
features.extend(labels)

print(features)

['start time', 'end time', 'program_id', 'sentence_id', 'sentence', 'claim_idx', 'claim']


In [11]:
# Program ids available in the full (pre-annotated) dictionary.
full_dict.keys()

dict_keys(['2222122', '4042060', '4056784', '4914022', '2020168', '4254890', '5559662', '8567636', '2593416', '4879003', '1767521', '4753167', '8415978', '4810657', '2304494', '4121766', '8524981', '4624779', '4717985', '3445438', '7103349', '6500671', '8568658', '4171702', '2271997', '2370770', '2337314', '6528165', '7115504', '3588676', '3550014', '7186318', '8490432', '3693287', '4845589', '2466477', '1748819', '3632763', '1425182', '1777290', '8564875', '4103318', '6066238', '3648942', '2294023', '2455625', '2250789', '5300885', '2261432', '6129385', '6570340', '6486894', '3505973', '2669333', '4223161', '6113249', '2359717', '2523604', '8364372', '1932144', '4071354', '1817762', '6556275', '1739323', '3411204', '2500720', '6443809', '1989107', '5714306', '5171937', '8610238', '2092024', '1425144', '2535266', '2040383', '5085179', '2282900', '5742743', '2061692', '7127382', '6032988', '4266979', '9284846', '2488632', '5694754', '4155377', '2443022', '6542606', '1837743', '2560372',

In [12]:
# Get the time from the processed data
start_times = []
end_times = []
processed_programs = []

sample_previousID = ['1', '2', '3','4','5','6','7','8','9','10']
#sample_programID = ['7308025','2294023','2315222','2337314','2359717',\
#                  '2304494','2348260', '3411204', '3570949', '3662558']


for i in range(N):  
    pro_id = data[i][0]
    
    # Note sample_dict[pro_id]['start time'] is a list of start times for all sentences
    if pro_id not in processed_programs:
        
        if pro_id in sample_previousID:
            [start_times.append(t) for t in sample_dict['program'+pro_id]['start time']]
            [end_times.append(t) for t in sample_dict['program'+pro_id]['end time']]
            
        else:
            [start_times.append(t) for t in full_dict[pro_id]['start time']]
            [end_times.append(t) for t in full_dict[pro_id]['end time']]
        
        processed_programs.append(pro_id)
# Sanity check        
assert(len(start_times) == len(end_times))
assert(len(start_times) == N)

# Concat data
X = np.concatenate((np.asarray([start_times,end_times]).T, np.asarray(data)),axis=1)

In [13]:
# Number of collected start times (asserted above to equal N).
len(start_times)

8466

# Ret manglende bindestreger i både de observerede sætninger og de annoterede sætninger

Grundet en fejl i præprocesseringen er fx 'Europa-Parlamentet' blevet til 'EuropaParlamentet'. Det skal rettes.

Derudover skal program1, program2, ... rettes til deres programID i stedet.

In [14]:
# Load the version of the sample programs that still contains the hyphens.
# NOTE: pickle.load executes arbitrary code from the file -- only load trusted files.
with Path(save_loc, 'sample_programs-mbindestreg.pickle').open('rb') as f:
    sample_dict_mbind = pickle.load(f)

In [15]:
def getLeadingSpace(s,start_idx):
    """Return the index just after the last space at or before ``start_idx``.

    This is the start of the word that contains position ``start_idx`` in
    ``s``. Returns 0 when there is no such space or when ``start_idx`` is
    negative. Positions past the end of ``s`` are treated as the last position.
    """
    if start_idx < 0:
        return 0
    # rfind returns -1 when no space is found, which maps to index 0. Searching
    # iteratively (instead of recursing one character at a time) also avoids
    # hitting the recursion limit on long strings without spaces.
    return s.rfind(' ', 0, start_idx+1) + 1

def getTailingSpace(s,end_idx):
    """Return the index of the first space at or after ``end_idx`` in ``s``.

    This is the end (exclusive) of the word that contains position
    ``end_idx``. Returns ``len(s)`` when there is no such space or when
    ``end_idx`` is at or past the end of ``s``. Negative values are treated
    as 0.
    """
    if end_idx >= len(s):
        return len(s)
    # Searching iteratively (instead of recursing one character at a time)
    # avoids hitting the recursion limit on long strings without spaces.
    space_idx = s.find(' ', max(end_idx, 0))
    return len(s) if space_idx == -1 else space_idx

In [16]:
# Programs available in the hyphen-corrected sample dictionary.
sample_dict_mbind.keys()

dict_keys(['program1', 'program10', 'program8', 'program3', 'program7', 'program4', 'program9', 'program6', 'program5', 'program2'])

In [17]:
# Programs available in the original sample dictionary (for comparison).
sample_dict.keys()

dict_keys(['program1', 'program10', 'program8', 'program5', 'program7', 'program4', 'program9', 'program6', 'program3', 'program2'])

In [18]:
# Distinct program ids currently stored in column 2 of the data matrix.
set(X[:,2])

{'1',
 '10',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '8567181',
 '8567636',
 '8568658',
 '8568906',
 '8610238',
 '8635201',
 '8665813',
 '8689224',
 '8720741',
 '9',
 '9284846'}

In [19]:
# Convert the fake program names to real program ids
bugged_programs = ['program1','program2','program3','program4','program5',\
                  'program6','program7', 'program8', 'program9', 'program10']
#bugged_programs = ['1', '2', '3','4','5','6','7','8','9','10']

actual_programID = ['7308025','2294023','2315222','2337314','2359717',\
                  '2304494','2348260', '3411204', '3570949', '3662558']
program_mapping = dict(zip(bugged_programs, actual_programID))

## FIX inconsistencies related to the inclusion of "-" in the paragraphs
for program in bugged_programs: # Fix each of the bugged programs
    
    idx_X = np.where('program'+X[:,2] == program)[0] #Index in X
    
    for elem in range(idx_X.shape[0]): # For each paragraph
        
        X[idx_X[elem],2] = program_mapping[program]
        
        para_bugged = X[idx_X[elem], 4]
        para_true = sample_dict_mbind[program]['sentences'][elem]
        # Replace the bugged sentence with the corrected one
        X[idx_X[elem], 4] = para_true
        
        
        if X[idx_X[elem],6]: # If there is a claim
            #print(X[idx_X[elem],5])
            
            if len(X[idx_X[elem],5]) == 1:
                start_id = X[idx_X[elem],5][0][0]
                end_id = X[idx_X[elem],5][0][1]
                
                claim_idx = [getLeadingSpace(para_true,start_id),\
                             getTailingSpace(para_true,end_id)]
                
                X[idx_X[elem],5][0] = claim_idx
                X[idx_X[elem],6] = para_true[claim_idx[0]:claim_idx[1]]
            else:
                claim_idx = []
                
                print('Found:\n%s' %X[idx_X[elem],6])
                print(X[idx_X[elem],5])
                
                for c in range(len(X[idx_X[elem],5])):
                    
                    start_id = X[idx_X[elem],5][c][0]
                    end_id = X[idx_X[elem],5][c][1]

                    claim_idx.append([getLeadingSpace(para_true,start_id),\
                                 getTailingSpace(para_true,end_id)])
                    
                for idx in claim_idx:
                    print(para_true[idx[0]:idx[1]]+'\n')

                X[idx_X[elem],5] = claim_idx
                #X[idx_X[elem],6] = ?????
                
                print()
                print(para_true)
                
                print()
                print(claim_idx)
                print()
            
        # Replace the bugged claim with the corrected one
        # Correct the claim index

Found:
Først når vi får udviklet elmotoren, kan vi knække den belastning som det er på bilerne. Den udvikling dækker 15% af den samlede CO2udledning. [new claim]: vi kan gøre noget ved der dækker det lidt over halvdelen. [new claim]: De dækker 56% i alt.
[[0, 142], [155, 212], [285, 305]]
Først når vi får udviklet elmotoren, kan vi knække den belastning som det er på bilerne. Den udvikling dækker 15% af den samlede CO2-udledning.

vi kan gøre noget ved der dækker det lidt over halvdelen.

De dækker 5-6% i alt.


Først når vi får udviklet elmotoren, kan vi knække den belastning som det er på bilerne. Den udvikling dækker 15% af den samlede CO2-udledning. Af den del, vi kan gøre noget ved der dækker det lidt over halvdelen. Det er der, nøglen ligger. Jo, Svend. Husholdningerne er også med. Nej. De dækker 5-6% i alt.

[[0, 143], [156, 213], [286, 307]]



In [20]:
# Row indices of all paragraphs that contain a claim (column 6 is not None).
all_claims = np.where([elem is not None for elem in X[:,6]])[0]

### Two claims were actually just mistakes

In [21]:
# Row of paragraph number 87 (1-based) of program 7308025.
remove_claim_idx = np.where(X[:,2]=='7308025')[0][87-1]

if not X[remove_claim_idx,6]:
    print('No claim to remove\n')
else:
    print('An errornous annotation')
    print(X[remove_claim_idx,:])
    print('The claim is removed')
    # Make sure it is the expected annotation before it is dropped.
    assert('diskuterer,'==X[remove_claim_idx,6])
    X[remove_claim_idx,5] = None
    X[remove_claim_idx,6] = None
    print(X[remove_claim_idx,:])
    print()

An errornous annotation
['20:09:03:23' '20:09:10:03' '7308025' 87
 'Man diskuterer, om DONG skal privatiseres eller styres offentligt.'
 list([[4, 15]]) 'diskuterer,']
The claim is removed
['20:09:03:23' '20:09:10:03' '7308025' 87
 'Man diskuterer, om DONG skal privatiseres eller styres offentligt.' None
 None]



In [22]:
# Row of paragraph number 120 (1-based) of program 3411204.
remove_claim_idx = np.where(X[:,2]=='3411204')[0][120-1]

if not X[remove_claim_idx,6]:
    print('No claim to remove\n')
else:
    print('An errornous annotation')
    print(X[remove_claim_idx,:])
    print('The claim is removed')
    # Make sure it is the expected annotation before it is dropped.
    assert('ledigheden'==X[remove_claim_idx,6])
    X[remove_claim_idx,5] = None
    X[remove_claim_idx,6] = None
    print(X[remove_claim_idx,:])
    print()

An errornous annotation
['00:13:13:24' '00:13:22:19' '3411204' 120
 'For et år siden sagde Fogh, at vi ville komme til at mangle hænder. Jeg sagde, ledigheden ville stige. Fogh er her ikke mere.'
 list([[79, 89]]) 'ledigheden']
The claim is removed
['00:13:13:24' '00:13:22:19' '3411204' 120
 'For et år siden sagde Fogh, at vi ville komme til at mangle hænder. Jeg sagde, ledigheden ville stige. Fogh er her ikke mere.'
 None None]



In [23]:
# Disabled debugging aid: the loop below is wrapped in a string literal so it is
# not executed (the trailing ';' suppresses the cell output). When enabled it
# prints program, sentence number, paragraph, claim indices and claim text of
# every claim.
'''for i in all_claims:
    print('Program %s, \t sentence %i'%(X[i,2], X[i,3]))
    print(X[i,4])
    print(X[i,5])
    print(X[i,6])
    print()
    ''';

In [24]:
def remove_claims(X, r_program, r_sentence, r_claim):
    """Remove erroneous claim annotations from the data matrix ``X``.

    The three lists are aligned: entry ``i`` names a program id (matched
    against column 2), the 1-based position of the paragraph among the rows of
    that program, and the claim text that is expected in column 6. If the
    paragraph has a claim, the expected text is asserted and columns 5 (claim
    indices) and 6 (claim text) are set to ``None``. Progress is printed.

    ``X`` is modified in place and also returned.
    """
    separator = '---------------------------------------------------------'

    for i, program in enumerate(r_program):
        program_rows = np.where(X[:,2]==program)[0]
        row = program_rows[r_sentence[i]-1]

        if not X[row,6]:
            print('No claim to remove\n')
        else:
            print('An errornous annotation')
            print(X[row,:])
            print(r_claim[i])
            # Guard against removing a different annotation than intended.
            assert(r_claim[i]==X[row,6])
            X[row,5] = None
            X[row,6] = None
            print('The claim is removed')
            print(X[row,:])
            print()

        print(separator)

    return X

# Erroneous annotations to drop, one entry per claim:
# (program id, 1-based paragraph number within the program, expected claim text)
claims_to_remove = [
    ('8567636', 62, 'Klarlund,'),
    ('8665813', 123, 'Vil'),
    ('8665813', 2, 'Hvorfor'),
    ('9284846', 14, 'DF.'),
    ('8567181', 12, 'jeg har'),
    ('8567181', 2, 'udrensninger'),
    ('8567181', 1, 'Sukker, [new claim]: Hvedemel,'),
    ('7308025', 185, 'Vi accepterer en struktur,'),
]

programs_id_of_the_claim = [entry[0] for entry in claims_to_remove]
sentence_id_of_the_claim = [entry[1] for entry in claims_to_remove]
the_claim_to_be_removed = [entry[2] for entry in claims_to_remove]

X = remove_claims(X, programs_id_of_the_claim, sentence_id_of_the_claim,
                  the_claim_to_be_removed)

An errornous annotation
['00:05:59:18' '00:06:03:02' '8567636' 62
 'Bente Klarlund, du forsker i fedme. Er det de fedes egen skyld?'
 list([[6, 15]]) 'Klarlund,']
Klarlund,
The claim is removed
['00:05:59:18' '00:06:03:02' '8567636' 62
 'Bente Klarlund, du forsker i fedme. Er det de fedes egen skyld?' None
 None]

---------------------------------------------------------
An errornous annotation
['13:43:28:20' '13:43:34:07' '8665813' 123
 'Vil vi opdage, at dem, der ikke får 4-taller, er fra ikke-boglige hjem?'
 list([[0, 3]]) 'Vil']
Vil
The claim is removed
['13:43:28:20' '13:43:34:07' '8665813' 123
 'Vil vi opdage, at dem, der ikke får 4-taller, er fra ikke-boglige hjem?'
 None None]

---------------------------------------------------------
An errornous annotation
['13:31:04:15' '13:31:17:13' '8665813' 2
 'Men nu vil regeringen indføre karakterkrav til gymnasiet så nogle vil måske blive valgt fra. Hvorfor er det i Danmarks interesse at have færre unge med en studentereksamen?'
 list(

## Save results

In [25]:
# Text of the readme that is saved next to the data. The two '{:d}' fields are
# filled with the number of paragraphs and the number of programs. The attribute
# names listed here must match the entries of `features`.
description = """Claim detection in the television program DR Debatten

In several Debatten programs interesting/relevant claims were annotated by DR.
All sentences (or paragraphs) of these programs were extracted and marked as containing a
claim or not. The claims themselves are also extracted.

Currently the data contains N={:d} sentences/paragraphs from {:d} programs. The data is represented
as a matrix of size N x M, where M is a number of attributes for each sentence.

These attributes are:
    'start time'      The start time of a sentence/paragraph (h:m:s:ms)
    'end time'        The end time of a sentence/paragraph
    'program_id'      Indicates which Debatten program is the origin of the sentence
    'sentence_id'     Indicates which sentence in the program it is
                      (ordered from 1 to 2 to .. to the last sentence)
    'sentence'        A string with the full sentence/paragraph
    'claim_idx'       [start, end]-index of the claim (in the sentence)
    'claim'           A string with the claim

The data is available as a .csv file and .pickle file (python3).

Copyright(R): This data is made available in connection with the course "02456 Deep Learning" 
at the Technical University of Denmark, during the Fall 2017. Redistribution or commercial use
of the dataset is not allowed without prior agreement.


-------- Python3 Example: Load data, transform to Bag-of-Words and fit a logistic regression ----
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import linear_model


with open("data_matrix_sample_programs.pickle",'rb') as f:
        data = pickle.load(f)

X = data['data'][:,4]
y = data['data'][:,6]

# Now convert y to a binary indicator matrix (1 is claim, 0 no claim)
y = np.asarray([y[i] is not None for i in range(len(X))])       

# Make a Bag-of-Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)
words = vectorizer.get_feature_names()

# Fit the logit model
logistic = linear_model.LogisticRegression()
logistic.fit(X=X_bow,y=y)
ypred = logistic.predict(X_bow) 

# CM on traning data
confusion_matrix(y, ypred)

""".format(N,len(processed_programs))


# Summary statistics: a paragraph counts as a claim when column 6 is not None.
n_claims = sum(elem is not None for elem in X[:,6])
n_paragraphs = len(X[:,6])

print('Total number of claims: %i'%(n_claims))
print('Total number of paragraphs: %i'%(n_paragraphs))
print('Procentage claims is {:2.2f}%'.format(n_claims/float(n_paragraphs)*100))

Total number of claims: 568
Total number of paragraphs: 8466
Procentage claims is 6.71%


In [26]:
# Readme describing the data set.
readme_path = Path(save_loc, 'readme_sample_programs.txt')
with readme_path.open('w') as f:
    f.write(description)

# CSV version: the feature names form the first row, followed by the data.
header_row = np.asarray(features).reshape(1,-1)
Y = np.concatenate((header_row, X), axis=0)

csv_path = Path(save_loc, "data_matrix_sample_programs.csv")
with csv_path.open("w", newline="\n", encoding="utf-8") as file:
    writer = csv.writer(file, delimiter=",")
    writer.writerows(Y)

# Pickle version: the data matrix and the feature names in one dictionary.
pickle_path = Path(save_loc, "data_matrix_sample_programs.pickle")
with pickle_path.open('wb') as f:
    pickle.dump({'data': X, 'features': features}, f)