# Cleaning HTML and outputting preannotation files

In [1]:
%%writefile DebattenDatacleaner.py

import os
import re

class DebattenDatacleaner:
    """
    Takes raw html files and extractes the sentences and their timestamps.
    """
    
    # Initialises class and input, output locations
    def __init__(self, loc_raw=[], loc_pro=[]):
        self.loc_raw_subtitles = loc_raw
        self.loc_pro_subtitles = loc_pro
    
    def setRawFilesLocation(self, new_loc):
        self.loc_raw_subtitles = new_loc
        
    def setProcessedFilesLocation(self, new_loc):
        self.loc_pro_subtitles = new_loc
    
    def getFileLocation(self, disp=True):
        
        if disp:
            if not self.loc_raw_subtitles:
                print('Raw subtitles are not specified!')
            else:
                print('Raw subtitles are loaded from "{:s}"'.format(self.loc_raw_subtitles))

            if not self.loc_pro_subtitles:
                print('Save location for processed subtitles is not specified!')
            else:
                print('Save location for processed subtitles is "{:s}"'.format(self.loc_pro_subtitles))
       
        return self.loc_raw_subtitles, self.loc_pro_subtitles
    
    # Returns the full (or relative) path all the raw files
    def getRawFilePaths(self, filename='page.html', subset=None):
        
        program_ids = os.listdir(self.loc_raw_subtitles)
        files = ['{:s}{:s}/{:s}'.format(self.loc_raw_subtitles, p_id, filename) for p_id in program_ids]
        
        if subset and abs(subset) < len(files):
            if subset < 0:
                files = files[subset:]
                program_ids = program_ids[subset:]
            else:
                files = files[:subset]
                program_ids = program_ids[:subset]
        
        return files, program_ids
    
    
    def html_decode(self, s):
        """
        Returns the ASCII decoded version of the given HTML string. This does
        NOT remove normal HTML tags like <p>.
        """
        htmlCodes = (
                ("'", '&#39;'),
                ('"', '&quot;'),#("'", '&quot;'),#
                ('>', '&gt;'),
                ('<', '&lt;'),
                ('&', '&amp;')
            )
        for code in htmlCodes:
            s = s.replace(code[1], code[0])
        return s

    # Extract the relevant HTML text for each sentence
    def getHtmlSentences(self, file_loc, re_subs_pattern='category="Undertekster"[\S\W<>]+<script src="/',
                                split_pattern='<span class="digits ng-binding">'):
        subtitle_file = open(file_loc,'r', encoding='iso-8859-1')
        #subtitle_file = open(file_loc,'r', encoding='utf-8', errors='ignore')
        doc = subtitle_file.read()
        doc = self.html_decode(doc)

        subtitle_file.close()

        # Find part of the html which contain the subtitles
        p_subs = re.compile(re_subs_pattern)
        match = re.search(p_subs,doc)
        doc_subs = match.group()

        #Split into the sentences
        sentences = doc_subs.split(split_pattern)
        sentences = [sentences[i] for i in range(1,len(sentences))]

        print('\tProgram has {:d} sentences'.format(len(sentences)))
        return sentences
    
    
    def getTimeAndText(self, text, re_time='[\d:]+', re_text='ma-highlight="[\w?%!:-; \',.\d-]+'):
        p_time = re.compile(re_time)
        p_text = re.compile(re_text)
        
        text_at_this_timepoint = re.finditer(p_text,text)
        s = ''
        for match in text_at_this_timepoint:
            s=s+' '+(match.group()[14:])

        s = re.sub(' +',' ',s) # Removes repeating whitespaces
        s=s[1:] # Removes first white space

        nums = re.finditer(p_time,text)
        nums = [n.group() for n in nums]

        time_start = nums[0]
        time_end = nums[1]

        return s, time_start, time_end
    
    def getCleanedSentences(self, sentences):
        program = []
        start_times = []
        end_times = []

        for sen in sentences:
            text, start, end = self.getTimeAndText(sen)

            if len(program) is 0:
                program.append(text)
                start_times.append(start)
                end_times.append(end)
            elif len(text)>0:
                try:

                    if text[0] is '-':
                        last_text = program.pop()
                        s = last_text+text
                        s = re.sub('- ',' ',s)
                        s = re.sub(' -',' ',s)
                        s = re.sub(' +',' ',s)

                        program.append(s)

                        end_times.pop()
                        end_times.append(end)

                    else:
                        program.append(text)
                        start_times.append(start)
                        end_times.append(end)

                except Exception as e:
                    print('Woops.. ')
                    print('"',text,'"')
                    print(len(text))

        print('\tProgram has {:d} cleaned sentences'.format(len(program)))
        return dict(zip(['sentences','start time','end time'],[program, start_times, end_times]))


    def getCleanedPrograms(self, file_paths, program_ids):
        total_sent = 0
        all_programs = dict()
        count = 1;
        for i in range(len(file_paths)):
            print('Program {:d} of {:d} ({:s})'.format(i+1,len(file_paths),program_ids[i]))
            program_sentences = self.getHtmlSentences(file_paths[i])
            all_programs[program_ids[i]] = self.getCleanedSentences(program_sentences)
            total_sent += len(all_programs[program_ids[i]]['sentences'])
            
        print('\nA total of {:d} sentences was found.'.format(total_sent))
            
        return all_programs
    
    def export_debatten_programs(self, all_programs, program_ids ):
                                 
        for p_id in all_programs:
            sentenc_id = 1
            with open(self.loc_pro_subtitles+'program'+str(p_id), 'w') as f:
                f.write('<span id="program '+str(p_id)+'">\n')

                for s in all_programs[p_id]['sentences']:
                    f.write('\t<p id="'+str(p_id)+'"> '+s+' </p>\n')
                    sentenc_id+=1

                f.write('</span>')
            print('Sucessfully wrote program'+str(p_id))
    
    
#for i in range(len(all_programs)):
    #export_debatten_program(all_programs['program-'+directories[i]],directories[i])

Overwriting DebattenDatacleaner.py


In [2]:
from DebattenDatacleaner import DebattenDatacleaner

Datacleaner = DebattenDatacleaner('/home/jehi/Dropbox/DTU/DeepFactData/Subtitles Crawl/',
                                  '../Results/DeepFact/2017-10-17-Instance/')
Datacleaner.getFileLocation();

Raw subtitles are loaded from "/home/jehi/Dropbox/DTU/DeepFactData/Subtitles Crawl/"
Save location for processed subtitles is "../Results/DeepFact/2017-10-17-Instance/"


In [3]:
files, program_id = Datacleaner.getRawFilePaths()

print('The ten first examples are')
Datacleaner.getRawFilePaths(subset=10)

The ten first examples are


(['/home/jehi/Dropbox/DTU/DeepFactData/Subtitles Crawl/6047910/page.html',
  '/home/jehi/Dropbox/DTU/DeepFactData/Subtitles Crawl/5779637/page.html',
  '/home/jehi/Dropbox/DTU/DeepFactData/Subtitles Crawl/7162934/page.html',
  '/home/jehi/Dropbox/DTU/DeepFactData/Subtitles Crawl/7115504/page.html',
  '/home/jehi/Dropbox/DTU/DeepFactData/Subtitles Crawl/4027193/page.html',
  '/home/jehi/Dropbox/DTU/DeepFactData/Subtitles Crawl/6066238/page.html',
  '/home/jehi/Dropbox/DTU/DeepFactData/Subtitles Crawl/8610238/page.html',
  '/home/jehi/Dropbox/DTU/DeepFactData/Subtitles Crawl/2512262/page.html',
  '/home/jehi/Dropbox/DTU/DeepFactData/Subtitles Crawl/5984995/page.html',
  '/home/jehi/Dropbox/DTU/DeepFactData/Subtitles Crawl/6556275/page.html'],
 ['6047910',
  '5779637',
  '7162934',
  '7115504',
  '4027193',
  '6066238',
  '8610238',
  '2512262',
  '5984995',
  '6556275'])

In [4]:
# For debugging
files[0] = '/home/jehi/Dropbox/DTU/DeepFactData/Subtitles Crawl/9024801/page.html'
program_id[0] = '9024801'

In [5]:
program_sentences = Datacleaner.getHtmlSentences(files[0])

	Program has 3424 sentences


In [6]:
Datacleaner.getTimeAndText(program_sentences[0])

('Politikerne tog fejl, både demokrater og republikanere.',
 '21:00:25:23',
 '21:00:26:18')

In [7]:
cleaned_sentences = Datacleaner.getCleanedSentences(program_sentences)

	Program has 3314 cleaned sentences


In [8]:
all_programs = Datacleaner.getCleanedPrograms(files, program_id)

Program 1 of 198 (9024801)
	Program has 3424 sentences
	Program has 3314 cleaned sentences
Program 2 of 198 (5779637)
	Program has 473 sentences
	Program has 360 cleaned sentences
Program 3 of 198 (7162934)
	Program has 736 sentences
	Program has 529 cleaned sentences
Program 4 of 198 (7115504)
	Program has 742 sentences
	Program has 538 cleaned sentences
Program 5 of 198 (4027193)
	Program has 510 sentences
	Program has 391 cleaned sentences
Program 6 of 198 (6066238)
	Program has 630 sentences
	Program has 457 cleaned sentences
Program 7 of 198 (8610238)
	Program has 792 sentences
	Program has 526 cleaned sentences
Program 8 of 198 (2512262)
	Program has 461 sentences
	Program has 338 cleaned sentences
Program 9 of 198 (5984995)
	Program has 577 sentences
	Program has 426 cleaned sentences
Program 10 of 198 (6556275)
	Program has 609 sentences
	Program has 429 cleaned sentences
Program 11 of 198 (1808661)
	Program has 429 sentences
	Program has 321 cleaned sentences
Program 12 of 198

In [9]:
[(len(all_programs[program]['sentences']), program) for program in all_programs]

[(352, '2271997'),
 (307, '3570949'),
 (295, '1767521'),
 (495, '7198080'),
 (507, '7103349'),
 (359, '4071354'),
 (3314, '9024801'),
 (426, '5984995'),
 (559, '8490432'),
 (324, '8567181'),
 (334, '2304494'),
 (326, '5803916'),
 (352, '2250789'),
 (493, '4121766'),
 (305, '1739323'),
 (406, '3411204'),
 (304, '2547764'),
 (321, '1808661'),
 (413, '5270879'),
 (306, '2282900'),
 (391, '4027193'),
 (419, '5694754'),
 (398, '6443809'),
 (491, '4254890'),
 (428, '5762858'),
 (541, '8720741'),
 (523, '7174872'),
 (439, '4859285'),
 (509, '4207467'),
 (514, '7077142'),
 (327, '1817762'),
 (385, '4810657'),
 (396, '5964390'),
 (469, '5675832'),
 (1, '8573626'),
 (294, '1942591'),
 (426, '6628084'),
 (436, '6542606'),
 (366, '4042060'),
 (510, '6909844'),
 (311, '2455625'),
 (561, '8665813'),
 (432, '5728721'),
 (403, '5158985'),
 (593, '8524981'),
 (433, '4753167'),
 (301, '1292098'),
 (374, '2222122'),
 (300, '1012335'),
 (399, '6458522'),
 (313, '1962590'),
 (319, '2716313'),
 (382, '64868

# Output til Philip

In [10]:
!ls ../Results/DeepFact/2017-10-17-Instance/

program1012335	program2315222	program4027193	program5559662	program6642870
program1037984	program2325751	program4042060	program5578714	program6909844
program1292098	program2337314	program4056784	program5624194	program7077142
program1335758	program2348260	program4071354	program5639601	program7090401
program1347559	program2359717	program4087367	program5655525	program7103349
program1369970	program2370770	program4103318	program5675832	program7115504
program1374585	program2443022	program4121766	program5694754	program7127382
program1425144	program2455625	program4155377	program5714306	program7139464
program1425182	program2466477	program4171702	program5728721	program7146484
program1739323	program2488632	program4207467	program5742743	program7162934
program1748819	program2500720	program4223161	program5762858	program7174872
program1758460	program2512262	program4254890	program5779637	program7186318
program1767521	program2523604	program4266979	program5803916	program7198080
program17772

In [11]:
Datacleaner.export_debatten_programs(all_programs, program_id)

Sucessfully wrote program2271997
Sucessfully wrote program3570949
Sucessfully wrote program1767521
Sucessfully wrote program7198080
Sucessfully wrote program7103349
Sucessfully wrote program4071354
Sucessfully wrote program9024801
Sucessfully wrote program5984995
Sucessfully wrote program8490432
Sucessfully wrote program8567181
Sucessfully wrote program2304494
Sucessfully wrote program5803916
Sucessfully wrote program2250789
Sucessfully wrote program4121766
Sucessfully wrote program1739323
Sucessfully wrote program3411204
Sucessfully wrote program2547764
Sucessfully wrote program1808661
Sucessfully wrote program5270879
Sucessfully wrote program2282900
Sucessfully wrote program4027193
Sucessfully wrote program5694754
Sucessfully wrote program6443809
Sucessfully wrote program4254890
Sucessfully wrote program5762858
Sucessfully wrote program8720741
Sucessfully wrote program7174872
Sucessfully wrote program4859285
Sucessfully wrote program4207467
Sucessfully wrote program7077142
Sucessfull

In [12]:
!ls -l -h ../Results/DeepFact/2017-10-17-Instance/

total 9.3M
-rw-r--r-- 1 jehi unixusers  30K Oct 17 15:18 program1012335
-rw-r--r-- 1 jehi unixusers  30K Oct 17 15:18 program1037984
-rw-r--r-- 1 jehi unixusers  31K Oct 17 15:18 program1292098
-rw-r--r-- 1 jehi unixusers  31K Oct 17 15:18 program1335758
-rw-r--r-- 1 jehi unixusers  31K Oct 17 15:18 program1347559
-rw-r--r-- 1 jehi unixusers  30K Oct 17 15:18 program1369970
-rw-r--r-- 1 jehi unixusers  32K Oct 17 15:18 program1374585
-rw-r--r-- 1 jehi unixusers  32K Oct 17 15:18 program1425144
-rw-r--r-- 1 jehi unixusers  33K Oct 17 15:18 program1425182
-rw-r--r-- 1 jehi unixusers  32K Oct 17 15:18 program1739323
-rw-r--r-- 1 jehi unixusers  33K Oct 17 15:18 program1748819
-rw-r--r-- 1 jehi unixusers  32K Oct 17 15:18 program1758460
-rw-r--r-- 1 jehi unixusers  31K Oct 17 15:18 program1767521
-rw-r--r-- 1 jehi unixusers  40K Oct 17 15:18 program1777290
-rw-r--r-- 1 jehi unixusers  31K Oct 17 15:18 program1787519
-rw-r--r-- 1 jehi unixusers  31K Oct 17 15:18 program17976