In [17]:
%%writefile ../util/DebattenAnnotatedDatacleaner.py
import os
import re
import numpy as np

class DebattenAnnotatedDatacleaner:
    """
    Takes the annotated programs, ...
    """
    
    # Initialises class and input, output locations
    def __init__(self, loc_ann=[], loc_out=[]):
        self.loc_ann_subtitles = loc_ann
        self.loc_out_subtitles = loc_out
    
    def setAnnotatedFilesLocation(self, new_loc):
        self.loc_ann_subtitles = new_loc
        
    def setOutputFilesLocation(self, new_loc):
        self.loc_out_subtitles = new_loc
    
    def getFileLocation(self, disp=True):
        
        if disp:
            if not self.loc_ann_subtitles:
                print('Annotated subtitles are not specified!')
            else:
                print('Annotated subtitles are loaded from "{:s}"'.format(self.loc_ann_subtitles))

            if not self.loc_out_subtitles:
                print('Save location is not specified!')
            else:
                print('Save location is "{:s}"'.format(self.loc_out_subtitles))
       
        return self.loc_ann_subtitles, self.loc_out_subtitles
    
    def getFilePaths(self):
        files = os.listdir(self.loc_ann_subtitles)
        return [self.loc_ann_subtitles+f for f in files]
    
    
    def getProgramAndSentences(self,f_path):
        """Gets the program id, sentences id and sentences from a document"""
        with open(f_path,'r') as f:
            doc = f.read()

        #Find program id
        m_program_id = re.compile('program[\d]+')
        m = re.search(m_program_id, doc)
        program_id = m.group()

        
        sentences = doc.split('<p ')
        m_sentence_id = re.compile('id="[\d]+">')

        # Finds the sentence ids and removes html stuff from the begining of each sentence
        sentences_id = []
        for i in range(len(sentences)):
            match = re.search(m_sentence_id, sentences[i])
            if not match:
                sentences[i] = None
            else:
                sentences_id.append(int(match.group()[4:-2]))

                start_from = sentences[i].find('>')+1
                sentences[i] = sentences[i][start_from:]

        sentences = list(filter(None, sentences)) # Remove None elements
        assert(len(sentences)==len(sentences_id))

        return program_id, sentences_id, sentences
    
    # Finds highligted text including its surrounding patttern
    def findHighlights(self,s):
        m_highlight = re.compile('<span id="highlight["\w\d ]+class="highlight[\w"]+>[\w\d. ,!?%]+</span>')
        return re.findall(m_highlight, s)
    
    # Extracts highlighted text only
    def extractHighlights(self, s_matches):#Extracted the text highlighted
        m_high_text = re.compile('">[\w\d ,.!?%]+</')
        high_text = [re.findall(m_high_text, s_matches[i])[0][2:-2] for i in range(len(s_matches))]
        return [s.lstrip().rstrip() for s in high_text]
    
    # Removes html tags (and crap) from the string.
    def cleanSentence(self, s, disp=False):

        m_crap = re.compile('<[\w\d "=/]+>')
        s_crap_free = s
        for pattern in re.findall(m_crap, s): 
            if disp: print(pattern)
            s_crap_free = s_crap_free.replace(pattern,'')

        #s_crap_free = re.sub('id="[\d]+">','',s_crap_free) # only during dev

        s_crap_free = s_crap_free.replace('\t',' ') # removes tabs
        s_crap_free = re.sub(' +',' ', s_crap_free) # removes excess spaces
        return s_crap_free.lstrip().rstrip()

    def getHighlight_indices(self,s,s_highlighted):
        
        # Two heuristic for correcting partially highlighted words.
        def getLeadingSpace(s,start_idx):
            # Finds first leading space before index "start_idx" in s
            if start_idx < 0:
                return 0
            elif s[start_idx] is ' ' :
                return start_idx+1
            else:
                return getLeadingSpace(s,start_idx-1)

        def getTailingSpace(s,end_idx):
            # Finds first trailing space after index "end_idx" in s
            if end_idx >= len(s):
                return len(s)
            elif s[end_idx] is ' ' or end_idx == len(s)-1:
                return end_idx
            else:
                return getTailingSpace(s,end_idx+1)
        
        # Find the indicies of highlighted words
        indices = []
        # Get matched indices
        for m in s_highlighted:
            m_pattern = re.compile(m)
            match = re.search(m_pattern, s)
            if match:
                indices.append([getLeadingSpace(s, match.start()), 
                                getTailingSpace(s, match.end())])
            else:
                print(match)
                print(m)
                print(s_highlighted)
                print(s+'\n')
                
        #print('\n\n')
        return indices
    
    def getCleanedProgramSentences(self, sentences): 
        sentences_processed = [None]*len(sentences)
        sentences_highlight = [None]*len(sentences)
        sentences_highlight_ind = [None]*len(sentences)
        
        for i in range(len(sentences)):
            sen = sentences[i]
            raw_highlights = self.findHighlights(sen)
            text_highlights = self.extractHighlights(raw_highlights)
            
            #Crap free verion
            sentences_processed[i] = self.cleanSentence(sen)
            #print('cleaned: '+sentences_processed[i])
            indices_highlights = self.getHighlight_indices(sentences_processed[i], 
                                                                         text_highlights)
            sentences_highlight_ind[i] = indices_highlights
            
            for idx in indices_highlights:
                if sentences_highlight[i]:
                     sentences_highlight[i] = sentences_highlight[i]+ ' [new claim]: '\
                                              +sentences_processed[i][idx[0]:idx[1]]
                else:
                    sentences_highlight[i] = sentences_processed[i][idx[0]:idx[1]]
            
            
        return sentences_processed, sentences_highlight, sentences_highlight_ind
    
    def getAllCleanedProgramSentences(self,disp=False):
        file_paths = self.getFilePaths()

        all_program_id = [None]*len(file_paths)
        all_sentences = [None]*len(file_paths)
        all_sentences_id = [None]*len(file_paths)
        all_highlights = [None]*len(file_paths)
        all_highlights_ind = [None]*len(file_paths)
        
        total_claims = 0;
        total_sentences = 0;
        
        for f in range(len(file_paths)):
            all_program_id[f], all_sentences_id[f], sentences = \
                        self.getProgramAndSentences(file_paths[f])
            if disp: print('Program id {:s}'.format(all_program_id[f]))
            
            all_sentences[f], all_highlights[f], all_highlights_ind[f] = \
                        self.getCleanedProgramSentences(sentences)
            
            num_claims = len(list(filter(None,all_highlights[f])))
            if disp: print('\tThere were {:d} claims out of {:d} sentences ({:2.2f}%)'.format(num_claims\
                    ,len(sentences), num_claims/float(len(sentences))*100))
                
            total_claims = total_claims+num_claims
            total_sentences = total_sentences + len(sentences)
            
        if disp: print('\nIn total there were {:d} claims out of {:d} sentences ({:2.2f}%)'.format(total_claims\
                , total_sentences, total_claims/float(total_sentences)*100))
        
        # ...
        labels = ['program_id', 'sentence_id', 'sentence', 'claim_idx', 'claim']
        
        data = [ [None]*len(labels) for i in range(total_sentences)]
        
        
        i = 0
        for p in range(len(all_program_id)):
            
            for si in range(len(all_sentences[p])):
                data[i][0] = all_program_id[p]
                
                data[i][1] = all_sentences_id[p][si]
                data[i][2] = all_sentences[p][si]
                
                if len(all_highlights_ind[p][si]) == 1:
                    data[i][3] = all_highlights_ind[p][si]
                    data[i][4] = all_highlights[p][si]
                    
                elif all_highlights_ind[p][si]:
                    print('HELP')
                    print(all_program_id[p])
                    print(all_highlights_ind[p][si])
                    print(all_highlights[p][si])
                
                i = i+1
            
        return data, labels

Overwriting ../util/DebattenAnnotatedDatacleaner.py


In [2]:
import sys
sys.path.append("../util")

In [3]:
from DebattenAnnotatedDatacleaner import DebattenAnnotatedDatacleaner

loc_ann_data = '/home/jehi/Dropbox/DRDetektorAutomaticFactChecking/annotatorProgram/annotatedSubtitles/'
annotatedData = DebattenAnnotatedDatacleaner(loc_ann_data)

In [4]:
file_paths = annotatedData.getFilePaths()

program_id, sentences_id, sentences = annotatedData.getProgramAndSentences(file_paths[2])
print('Id is '+program_id)

Id is program1


In [5]:
file_paths

['/home/jehi/Dropbox/DRDetektorAutomaticFactChecking/annotatorProgram/annotatedSubtitles/program10.txt',
 '/home/jehi/Dropbox/DRDetektorAutomaticFactChecking/annotatorProgram/annotatedSubtitles/program7.txt',
 '/home/jehi/Dropbox/DRDetektorAutomaticFactChecking/annotatorProgram/annotatedSubtitles/program1.txt',
 '/home/jehi/Dropbox/DRDetektorAutomaticFactChecking/annotatorProgram/annotatedSubtitles/program9.txt',
 '/home/jehi/Dropbox/DRDetektorAutomaticFactChecking/annotatorProgram/annotatedSubtitles/program5.txt',
 '/home/jehi/Dropbox/DRDetektorAutomaticFactChecking/annotatorProgram/annotatedSubtitles/program4.txt',
 '/home/jehi/Dropbox/DRDetektorAutomaticFactChecking/annotatorProgram/annotatedSubtitles/program3.txt',
 '/home/jehi/Dropbox/DRDetektorAutomaticFactChecking/annotatorProgram/annotatedSubtitles/program6.txt',
 '/home/jehi/Dropbox/DRDetektorAutomaticFactChecking/annotatorProgram/annotatedSubtitles/program8.txt',
 '/home/jehi/Dropbox/DRDetektorAutomaticFactChecking/annotatorP

In [6]:
s = sentences[6]+sentences[47]
s

' <span id="highlight0" class="highlightGreen"><span id="highlight1" class="highlightYellow">Programmet blev debatteret af flere end 32.000 danskere på nettet.</span></span> </p>\t Finanseksperter vurderer at d<span id="highlight3" class="highlight">e kan tjene 100% af den investering, de laver, på få år.</span> </p>\t'

In [7]:
match_high = annotatedData.findHighlights(s)
print(match_high)

['<span id="highlight1" class="highlightYellow">Programmet blev debatteret af flere end 32.000 danskere på nettet.</span>', '<span id="highlight3" class="highlight">e kan tjene 100% af den investering, de laver, på få år.</span>']


In [8]:
match_text_high = annotatedData.extractHighlights(match_high)
print(match_text_high)

['Programmet blev debatteret af flere end 32.000 danskere på nettet.', 'e kan tjene 100% af den investering, de laver, på få år.']


In [9]:
s_crap_free = annotatedData.cleanSentence(s, disp=True)
print('\n\n' +s_crap_free)

<span id="highlight0" class="highlightGreen">
<span id="highlight1" class="highlightYellow">
</span>
</span>
</p>
<span id="highlight3" class="highlight">
</span>
</p>


Programmet blev debatteret af flere end 32.000 danskere på nettet. Finanseksperter vurderer at de kan tjene 100% af den investering, de laver, på få år.


In [10]:
annotatedData.getHighlight_indices(s_crap_free, match_text_high)

[[0, 66], [95, 152]]

## Process all files

In [11]:
data, labels = annotatedData.getAllCleanedProgramSentences(disp=True)

Program id program10
	There were 7 claims out of 231 sentences (3.03%)
Program id program7
	There were 12 claims out of 307 sentences (3.91%)
Program id program1
	There were 46 claims out of 516 sentences (8.91%)
Program id program9
	There were 22 claims out of 307 sentences (7.17%)
Program id program5
	There were 42 claims out of 309 sentences (13.59%)
Program id program4
	There were 30 claims out of 317 sentences (9.46%)
Program id program3
	There were 20 claims out of 442 sentences (4.52%)
Program id program6
	There were 22 claims out of 334 sentences (6.59%)
Program id program8
	There were 32 claims out of 406 sentences (7.88%)
Program id program2
	There were 33 claims out of 320 sentences (10.31%)

In total there were 266 claims out of 3489 sentences (7.62%)
HELP
program5
[[0, 71], [72, 112]]
Det Europæiske Energiagentur har gjort op at Danmark er et af de lande, [new claim]: der er længst væk fra at opfylde målene.
HELP
program5
[[0, 142], [155, 212], [285, 305]]
Først når vi får

# Combine timestamps from earlier

In [12]:
import pickle
save_loc = '/home/jehi/Dropbox/DTU/DeepFactData/annotated/'

with open(save_loc+'sample_programs.pickle', 'rb') as f:
    sample_dict = pickle.load(f)
    
print(sample_dict.keys())
print()
print(sample_dict['program1'].keys())

dict_keys(['program8', 'program9', 'program2', 'program4', 'program7', 'program6', 'program5', 'program10', 'program1', 'program3'])

dict_keys(['start time', 'end time', 'sentences'])


In [13]:
N = len(data) # Observations
features = ['start time', 'end time']
[features.append(lab) for lab in labels]

print(features)

['start time', 'end time', 'program_id', 'sentence_id', 'sentence', 'claim_idx', 'claim']


In [14]:
import numpy as np
# Get the time from the processed data
start_times = []
end_times = []
processed_programs = []

for i in range(N):  
    pro_id = data[i][0]
    
    # Note sample_dict[pro_id]['start time'] is a list of start times for all sentences
    if pro_id not in processed_programs:
        [start_times.append(t) for t in sample_dict[pro_id]['start time']]
        [end_times.append(t) for t in sample_dict[pro_id]['end time']]
        
        processed_programs.append(pro_id)
# Sanity check        
assert(len(start_times) == len(end_times))
assert(len(start_times) == N)

#Concat data
X = np.concatenate((np.asarray([start_times,end_times]).T, np.asarray(data)),axis=1)

## Save results

In [15]:
description = """Claim detection in the television program DR Debatten

In several Debatten programs interesting/relevant claims were annotated by DR.
All sentences (or paragraphs) of these programs were extracted and marked as containing a
claim or not. The claims themselves are also extracted.

Currently the data contains N={:d} sentences from {:d} programs. The data is represented
as a matrix of size N x M, where M is a number of attributes for each sentence.

These attributes are:
    'start time'      The start time of a sentence/paragraph (h:m:s:ms)
    'end time'        The end time of a sentence/paragraph
    'program_id'      Indicates which Debatten program is the origin of the sentence
    'sentences_id'    Indicates which sentence in the program it is
                      (ordered from 1 to 2 to .. to the last sentence)
    'sentence'        A string with the full sentence/paragraph
    'claim_idx'       [start, end]-index of the claim (in the sentence)
    'claim'           A string with the claim

The data is available as a .csv file and .pickle file (python3).

Copyright(R): This data is made available in connection with the course "02456 Deep Learning" 
at the Technical University of Denmark, during the Fall 2017. Redistribution or commercial use
of the dataset is not allowed without prior agreement.


-------- Python3 Example: Load data, transform to Bag-of-Words and fit a logistic regression ----
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import linear_model


with open("data_matrix_sample_programs.pickle",'rb') as f:
        data = pickle.load(f)

X = data['data'][:,4]
y = data['data'][:,6]

# Now convert y to a binary indicator matrix (1 is claim, 0 no claim)
y = np.asarray([y[i] is not None for i in range(len(X))])       

# Make a Bag-of-Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)
words = vectorizer.get_feature_names()

# Fit the logit model
logistic = linear_model.LogisticRegression()
logistic.fit(X=X_bow,y=y)
ypred = logistic.predict(X_bow) 

# CM on traning data
confusion_matrix(y, ypred)

""".format(N,len(processed_programs))

In [16]:
with open(save_loc+'readme_sample_programs.txt','w') as f:
    f.write(description)

#Add features as top row
Y = np.concatenate((np.asarray(features).reshape(1,-1),X), axis=0);
np.savetxt(save_loc+"data_matrix_sample_programs.csv", Y , delimiter=",", fmt='%s')

with open(save_loc+"data_matrix_sample_programs.pickle",'wb') as f:
        pickle.dump(dict(zip(['data','features'],[X, features])), f)