#### Imports

In [1]:
import os
import re
import numpy as np
import sys
from pathlib import Path
import pickle
import csv
from pathlib import Path
sys.path.append("../util")

#### Paths

In [2]:
save_loc = Path('../../data/DeepFactData/annotated/')
loc_ann_data = Path('../../data/DRDetektorAutomaticFactChecking/annotatorProgram/annotatedSubtitles').resolve()

#### Data Cleaner Class

In [3]:
class DebattenAnnotatedDatacleaner:
    """
    Takes the annotated programs, ...
    """
    
    # Initialises class and input, output locations
    def __init__(self, loc_ann=None, loc_out=None):
        self.loc_ann_subtitles = loc_ann
        self.loc_out_subtitles = loc_out
    
    def setAnnotatedFilesLocation(self, new_loc):
        self.loc_ann_subtitles = new_loc
        
    def setOutputFilesLocation(self, new_loc):
        self.loc_out_subtitles = new_loc
    
    def getFileLocation(self, disp=True):
        
        if disp:
            if self.loc_ann_subtitles is None:
                print('Annotated subtitles are not specified!')
            else:
                print('Annotated subtitles are loaded from "{:s}"'.format(self.loc_ann_subtitles))

            if self.loc_out_subtitles is None:
                print('Save location is not specified!')
            else:
                print('Save location is "{:s}"'.format(self.loc_out_subtitles))
       
        return self.loc_ann_subtitles, self.loc_out_subtitles
    
    def getFilePaths(self):
        files = self.loc_ann_subtitles.glob("*.txt")
        return list(files)
    
    def getProgramAndSentences(self, f_path):
        """Gets the program id, sentences id and sentences from a document"""
        with f_path.open('r', encoding="utf-8") as f:
            doc = f.read()

        #Find program id
        m_program_id = re.compile('program[\d]+')
        m = re.search(m_program_id, doc)
        program_id = m.group()

        sentences = doc.split('<p ')
        m_sentence_id = re.compile('id="[\d]+">')

        # Finds the sentence ids and removes html stuff from the begining of each sentence
        sentences_id = []
        for i in range(len(sentences)):
            match = re.search(m_sentence_id, sentences[i])
            if not match:
                sentences[i] = None
            else:
                sentences_id.append(int(match.group()[4:-2]))

                start_from = sentences[i].find('>')+1
                sentences[i] = sentences[i][start_from:]

        sentences = list(filter(None, sentences)) # Remove None elements
        assert(len(sentences)==len(sentences_id))

        return program_id, sentences_id, sentences
    
    # Finds highligted text including its surrounding patttern
    def findHighlights(self,s):
        m_highlight = re.compile('<span id="highlight["\w\d ]+class="highlight[\w"]+>[\w\d. ,!?%]+</span>')
        return re.findall(m_highlight, s)
    
    # Extracts highlighted text only
    def extractHighlights(self, s_matches):#Extracted the text highlighted
        m_high_text = re.compile('">[\w\d ,.!?%]+</')
        high_text = [re.findall(m_high_text, s_matches[i])[0][2:-2] for i in range(len(s_matches))]
        return [s.lstrip().rstrip() for s in high_text]
    
    # Removes html tags (and crap) from the string.
    def cleanSentence(self, s, disp=False):

        m_crap = re.compile('<[\w\d "=/]+>')
        s_crap_free = s
        for pattern in re.findall(m_crap, s): 
            if disp: print(pattern)
            s_crap_free = s_crap_free.replace(pattern,'')

        #s_crap_free = re.sub('id="[\d]+">','',s_crap_free) # only during dev

        s_crap_free = s_crap_free.replace('\t',' ') # removes tabs
        s_crap_free = re.sub(' +',' ', s_crap_free) # removes excess spaces
        return s_crap_free.lstrip().rstrip()

    def getHighlight_indices(self,s,s_highlighted):
        
        # Two heuristic for correcting partially highlighted words.
        def getLeadingSpace(s,start_idx):
            # Finds first leading space before index "start_idx" in s
            if start_idx < 0:
                return 0
            elif s[start_idx] is ' ' :
                return start_idx+1
            else:
                return getLeadingSpace(s,start_idx-1)

        def getTailingSpace(s,end_idx):
            # Finds first trailing space after index "end_idx" in s
            if end_idx >= len(s):
                return len(s)
            elif s[end_idx] is ' ' or end_idx == len(s):
                return end_idx
            else:
                return getTailingSpace(s,end_idx+1)
        
        # Find the indicies of highlighted words
        indices = []
        # Get matched indices
        for m in s_highlighted:
            m_pattern = re.compile(m)
            match = re.search(m_pattern, s)
            if match:
                indices.append([getLeadingSpace(s, match.start()), 
                                getTailingSpace(s, match.end())])
            else:
                print(match)
                print(m)
                print(s_highlighted)
                print(s+'\n')
                
        #print('\n\n')
        return indices
    
    def getCleanedProgramSentences(self, sentences): 
        sentences_processed = [None]*len(sentences)
        sentences_highlight = [None]*len(sentences)
        sentences_highlight_ind = [None]*len(sentences)
        
        for i in range(len(sentences)):
            sen = sentences[i]
            raw_highlights = self.findHighlights(sen)
            text_highlights = self.extractHighlights(raw_highlights)
            
            #Crap free verion
            sentences_processed[i] = self.cleanSentence(sen)
            #print('cleaned: '+sentences_processed[i])
            indices_highlights = self.getHighlight_indices(sentences_processed[i], 
                                                                         text_highlights)
            sentences_highlight_ind[i] = indices_highlights
            
            for idx in indices_highlights:
                if sentences_highlight[i]:
                     sentences_highlight[i] = sentences_highlight[i]+ ' [new claim]: '\
                                              +sentences_processed[i][idx[0]:idx[1]]
                else:
                    sentences_highlight[i] = sentences_processed[i][idx[0]:idx[1]]
            
            
        return sentences_processed, sentences_highlight, sentences_highlight_ind
    
    # EXPERIMENTAL!!! Processing multi-claim paragraphs
    def processMultiClaim(self,s,idx):
        merge_claims = []
        for c in range(len(idx)-1):
            if abs(idx[c][1]-idx[c+1][0]) == 1: #It is the same claim
                merge_claims.append(True)
            else:
                merge_claims.append(False)

        new_s = []
        new_idx = []
        for c in range(len(idx)-1):
            if merge_claims[c]:
                start_id = idx[c][0]
                end_id = idx[c+1][1]
                new_idx.append([start_id, end_id])
                new_s.append(s[start_id:end_id])
            else:
                if c > 0:
                    new_s.append(' [new claim]: ')

                start_id = idx[c][0]
                end_id = idx[c][1]
                new_idx.append([start_id, end_id])
                new_s.append(s[start_id:end_id])

        if not merge_claims[-1]:

            new_s.append(' [new claim]: ')

            start_id = idx[-1][0]
            end_id = idx[-1][1]
            new_idx.append([start_id, end_id])
            new_s.append(s[start_id:end_id])


        new_s = ''.join(new_s)
        return new_s, new_idx
    
    def getAllCleanedProgramSentences(self,disp=False):
        file_paths = self.getFilePaths()

        all_program_id = [None]*len(file_paths)
        all_sentences = [None]*len(file_paths)
        all_sentences_id = [None]*len(file_paths)
        all_highlights = [None]*len(file_paths)
        all_highlights_ind = [None]*len(file_paths)
        
        total_claims = 0;
        total_sentences = 0;
        
        for f in range(len(file_paths)):
            all_program_id[f], all_sentences_id[f], sentences = \
                        self.getProgramAndSentences(file_paths[f])
            if disp: print('Program id {:s}'.format(all_program_id[f]))
            
            all_sentences[f], all_highlights[f], all_highlights_ind[f] = \
                        self.getCleanedProgramSentences(sentences)
            
            num_claims = len(list(filter(None,all_highlights[f])))
            if disp: print('\tThere were {:d} claims out of {:d} sentences ({:2.2f}%)'.format(num_claims\
                    ,len(sentences), num_claims/float(len(sentences))*100))
                
            total_claims = total_claims+num_claims
            total_sentences = total_sentences + len(sentences)
            
        if disp: print('\nIn total there were {:d} claims out of {:d} sentences ({:2.2f}%)'.format(total_claims\
                , total_sentences, total_claims/float(total_sentences)*100))
        
        # ...
        labels = ['program_id', 'sentence_id', 'sentence', 'claim_idx', 'claim']
        
        data = [ [None]*len(labels) for i in range(total_sentences)]
        
        
        i = 0
        for p in range(len(all_program_id)):
            
            for si in range(len(all_sentences[p])):
                data[i][0] = all_program_id[p]
                
                data[i][1] = all_sentences_id[p][si]
                data[i][2] = all_sentences[p][si]
                
                if len(all_highlights_ind[p][si]) == 1:
                    data[i][3] = all_highlights_ind[p][si]
                    data[i][4] = all_highlights[p][si]
                    
                elif all_highlights_ind[p][si]:
                    
                    print('HELP')
                    print(all_program_id[p])
                    #print(all_sentences[p][si])
                    print(all_highlights_ind[p][si])
                    print(all_highlights[p][si])
                    new_s, new_idx = self.processMultiClaim(all_sentences[p][si],\
                                                      all_highlights_ind[p][si])
                    
                    print('Trying to handle this multi-claim, is the output correct?')
                    print(new_idx)
                    print(new_s)
                    print()
                    
                    data[i][3] = new_idx
                    data[i][4] = new_s
                
                i = i+1
            
        return data, labels

#### Load directory

In [4]:
annotatedData = DebattenAnnotatedDatacleaner(loc_ann_data)
file_paths = annotatedData.getFilePaths()

##### Test of file-loading

In [5]:
program_id, sentences_id, sentences = annotatedData.getProgramAndSentences(file_paths[1])
print('Id is '+program_id)

s = sentences[111]

match_high = annotatedData.findHighlights(s)
print(match_high)

match_text_high = annotatedData.extractHighlights(match_high)
print(match_text_high)

s_crap_free = annotatedData.cleanSentence(s, disp=True)
print('\n\n' +s_crap_free)

print(annotatedData.getHighlight_indices(s_crap_free, match_text_high))

Id is program10
['<span id="highlight48" class="highlight">Så når de store lande vælger sådan en mand er det, fordi han står for deres synspunkter.</span>']
['Så når de store lande vælger sådan en mand er det, fordi han står for deres synspunkter.']
<span id="highlight48" class="highlight">
</span>
</p>


Det er dem, der støtter ham. Så når de store lande vælger sådan en mand er det, fordi han står for deres synspunkter.
[[29, 117]]


## Process all files

In [6]:
data, labels = annotatedData.getAllCleanedProgramSentences(disp=True)

Program id program1
	There were 46 claims out of 516 sentences (8.91%)
Program id program10
	There were 7 claims out of 231 sentences (3.03%)
Program id program2
	There were 33 claims out of 320 sentences (10.31%)
Program id program3
	There were 20 claims out of 442 sentences (4.52%)
Program id program4
	There were 30 claims out of 317 sentences (9.46%)
Program id program5
	There were 42 claims out of 309 sentences (13.59%)
Program id program6
	There were 22 claims out of 334 sentences (6.59%)
Program id program7
	There were 12 claims out of 307 sentences (3.91%)
Program id program8
	There were 32 claims out of 406 sentences (7.88%)
Program id program9
	There were 22 claims out of 307 sentences (7.17%)

In total there were 266 claims out of 3489 sentences (7.62%)
HELP
program3
[[0, 18], [19, 60]]
Jeg kan frygte, at [new claim]: DR1 bliver en slags TV2 i middelmådighed.
Trying to handle this multi-claim, is the output correct?
[[0, 60]]
Jeg kan frygte, at DR1 bliver en slags TV2 i midde

# Combine timestamps from earlier

In [7]:
with Path(save_loc, 'sample_programs.pickle').open('rb') as f:
    sample_dict = pickle.load(f)
    
print(sample_dict.keys())
print()
print(sample_dict['program1'].keys())

dict_keys(['program4', 'program2', 'program6', 'program5', 'program3', 'program7', 'program1', 'program10', 'program8', 'program9'])

dict_keys(['start time', 'sentences', 'end time'])


In [8]:
N = len(data) # Observations
features = ['start time', 'end time']
[features.append(lab) for lab in labels]

print(features)

['start time', 'end time', 'program_id', 'sentence_id', 'sentence', 'claim_idx', 'claim']


In [9]:
# Get the time from the processed data
start_times = []
end_times = []
processed_programs = []

for i in range(N):  
    pro_id = data[i][0]
    
    # Note sample_dict[pro_id]['start time'] is a list of start times for all sentences
    if pro_id not in processed_programs:
        [start_times.append(t) for t in sample_dict[pro_id]['start time']]
        [end_times.append(t) for t in sample_dict[pro_id]['end time']]
        
        processed_programs.append(pro_id)
# Sanity check        
assert(len(start_times) == len(end_times))
assert(len(start_times) == N)

# Concat data
X = np.concatenate((np.asarray([start_times,end_times]).T, np.asarray(data)),axis=1)

# Ret manglende bindestreg i både de observeret sætning og de annoteret sætninger

Grundet en fejl i præprocessering, er fx 'Europa-Parlementet' blevet til 'EuropaParlementet". Det skal rettes.

Derudover skal program1, program2, ... rettes til deres programID i stedet.

In [10]:
with Path(save_loc, 'sample_programs-mbindestreg.pickle').open('rb') as f:
    sample_dict_mbind = pickle.load(f)

In [11]:
def getLeadingSpace(s,start_idx):
            # Finds first leading space before index "start_idx" in s
            if start_idx < 0:
                return 0
            elif s[start_idx] is ' ' :
                return start_idx+1
            else:
                return getLeadingSpace(s,start_idx-1)

def getTailingSpace(s,end_idx):
    # Finds first trailing space after index "end_idx" in s
    if end_idx >= len(s):
        return len(s)
    elif s[end_idx] is ' ' or end_idx == len(s):
        return end_idx
    else:
        return getTailingSpace(s,end_idx+1)

In [12]:
# Convert the fake program names to real program ids
bugged_programs = ['program1','program2','program3','program4','program5',\
                  'program6','program7', 'program8', 'program9', 'program10']

actual_programID = ['7308025','2294023','2315222','2337314','2359717',\
                  '2304494','2348260', '3411204', '3570949', '3662558']
program_mapping = dict(zip(bugged_programs, actual_programID))

## FIX inconsistencies related to the inclusion of "-" in the paragraphs
for program in bugged_programs: # Fix each of the bugged programs
    
    idx_X = np.where(X[:,2] == program)[0] #Index in X
    
    for elem in range(idx_X.shape[0]): # For each paragraph
        
        X[idx_X[elem],2] = program_mapping[program]
        
        para_bugged = X[idx_X[elem], 4]
        para_true = sample_dict_mbind[program]['sentences'][elem]
        # Replace the bugged sentence with the corrected one
        X[idx_X[elem], 4] = para_true
        
        
        if X[idx_X[elem],6]: # If there is a claim
            #print(X[idx_X[elem],5])
            
            if len(X[idx_X[elem],5]) == 1:
                start_id = X[idx_X[elem],5][0][0]
                end_id = X[idx_X[elem],5][0][1]
                
                claim_idx = [getLeadingSpace(para_true,start_id),\
                             getTailingSpace(para_true,end_id)]
                
                X[idx_X[elem],5][0] = claim_idx
            else:
                claim_idx = []
                
                print('Found:\n%s' %X[idx_X[elem],6])
                print(X[idx_X[elem],5])
                
                for c in range(len(X[idx_X[elem],5])):
                    
                    start_id = X[idx_X[elem],5][c][0]
                    end_id = X[idx_X[elem],5][c][1]

                    claim_idx.append([getLeadingSpace(para_true,start_id),\
                                 getTailingSpace(para_true,end_id)])
                    
                for idx in claim_idx:
                    print(para_true[idx[0]:idx[1]]+'\n')

                X[idx_X[elem],5] = claim_idx
                
                print()
                print(para_true)
                
                print()
                print(claim_idx)
                print()
            
        # Replace the bugged claim with the corrected one
        # Correct the claim index

Found:
Først når vi får udviklet elmotoren, kan vi knække den belastning som det er på bilerne. Den udvikling dækker 15% af den samlede CO2udledning. [new claim]: vi kan gøre noget ved der dækker det lidt over halvdelen. [new claim]: De dækker 56% i alt.
[[0, 142], [155, 212], [285, 305]]
Først når vi får udviklet elmotoren, kan vi knække den belastning som det er på bilerne. Den udvikling dækker 15% af den samlede CO2-udledning.

vi kan gøre noget ved der dækker det lidt over halvdelen.

De dækker 5-6% i alt.


Først når vi får udviklet elmotoren, kan vi knække den belastning som det er på bilerne. Den udvikling dækker 15% af den samlede CO2-udledning. Af den del, vi kan gøre noget ved der dækker det lidt over halvdelen. Det er der, nøglen ligger. Jo, Svend. Husholdningerne er også med. Nej. De dækker 5-6% i alt.

[[0, 143], [156, 213], [286, 307]]



In [13]:
all_claims = np.where([elem is not None for elem in X[:,6]])[0]

### Two claims where actually just mistakes

In [14]:
remove_claim_idx = np.where(X[:,2]=='7308025')[0][87-1]

if X[remove_claim_idx,6]:
    print('An errornous annotation')
    print(X[remove_claim_idx,:])
    print('The claim is removed')
    assert('diskuterer,'==X[remove_claim_idx,6])
    X[remove_claim_idx,5] = None
    X[remove_claim_idx,6] = None
    print(X[remove_claim_idx,:])
    print()
else:
    print('No claim to remove\n')

An errornous annotation
['20:09:03:23' '20:09:10:03' '7308025' 87
 'Man diskuterer, om DONG skal privatiseres eller styres offentligt.'
 list([[4, 15]]) 'diskuterer,']
The claim is removed
['20:09:03:23' '20:09:10:03' '7308025' 87
 'Man diskuterer, om DONG skal privatiseres eller styres offentligt.' None
 None]



In [15]:
remove_claim_idx = np.where(X[:,2]=='3411204')[0][120-1]

if X[remove_claim_idx,6]:
    print('An errornous annotation')
    print(X[remove_claim_idx,:])
    print('The claim is removed')
    assert('ledigheden'==X[remove_claim_idx,6])
    X[remove_claim_idx,5] = None
    X[remove_claim_idx,6] = None
    print(X[remove_claim_idx,:])
    print()
else:
    print('No claim to remove\n')

An errornous annotation
['00:13:13:24' '00:13:22:19' '3411204' 120
 'For et år siden sagde Fogh, at vi ville komme til at mangle hænder. Jeg sagde, ledigheden ville stige. Fogh er her ikke mere.'
 list([[79, 89]]) 'ledigheden']
The claim is removed
['00:13:13:24' '00:13:22:19' '3411204' 120
 'For et år siden sagde Fogh, at vi ville komme til at mangle hænder. Jeg sagde, ledigheden ville stige. Fogh er her ikke mere.'
 None None]



## Save results

In [16]:
description = """Claim detection in the television program DR Debatten

In several Debatten programs interesting/relevant claims were annotated by DR.
All sentences (or paragraphs) of these programs were extracted and marked as containing a
claim or not. The claims themselves are also extracted.

Currently the data contains N={:d} sentences from {:d} programs. The data is represented
as a matrix of size N x M, where M is a number of attributes for each sentence.

These attributes are:
    'start time'      The start time of a sentence/paragraph (h:m:s:ms)
    'end time'        The end time of a sentence/paragraph
    'program_id'      Indicates which Debatten program is the origin of the sentence
    'sentences_id'    Indicates which sentence in the program it is
                      (ordered from 1 to 2 to .. to the last sentence)
    'sentence'        A string with the full sentence/paragraph
    'claim_idx'       [start, end]-index of the claim (in the sentence)
    'claim'           A string with the claim

The data is available as a .csv file and .pickle file (python3).

Copyright(R): This data is made available in connection with the course "02456 Deep Learning" 
at the Technical University of Denmark, during the Fall 2017. Redistribution or commercial use
of the dataset is not allowed without prior agreement.


-------- Python3 Example: Load data, transform to Bag-of-Words and fit a logistic regression ----
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import linear_model


with open("data_matrix_sample_programs.pickle",'rb') as f:
        data = pickle.load(f)

X = data['data'][:,4]
y = data['data'][:,6]

# Now convert y to a binary indicator matrix (1 is claim, 0 no claim)
y = np.asarray([y[i] is not None for i in range(len(X))])       

# Make a Bag-of-Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)
words = vectorizer.get_feature_names()

# Fit the logit model
logistic = linear_model.LogisticRegression()
logistic.fit(X=X_bow,y=y)
ypred = logistic.predict(X_bow) 

# CM on traning data
confusion_matrix(y, ypred)

""".format(N,len(processed_programs))

In [27]:
with Path(save_loc, 'readme_sample_programs.txt').open('w') as f:
    f.write(description)

# Add features as top row
Y = np.concatenate((np.asarray(features).reshape(1,-1), X), axis=0);

with Path(save_loc, "data_matrix_sample_programs.csv").open("w", newline="\n", encoding="utf-8") as file:
    writer = csv.writer(file, delimiter=",")
    for row in Y:
        # row = [str(val).encode("utf-8") for val in row]
        print(row[4])
        # row[4] = row[4].encode("utf-8")
        # print("   ", row[4])
        # print("   ", row[4].decode())
        writer.writerow(row)


with Path(save_loc, "data_matrix_sample_programs.pickle").open('wb') as f:
        pickle.dump(dict(zip(['data','features'],[X, features])), f)

sentence
Er det stadig danske politikere, der bestemmer over samfundets udvikling?
Eller er vi i dag fuldstændig underlagt internationale vilkår?
I aften diskuterer vi salget af DONG og EU.
Dette er Debatten direkte fra Aarhus.
Vi sender fra Studie 12. Det plejede at være Danmarks største tv-studie.
Blandt debattørerne er Stine Brix fra EL, Mette Bock fra LA modtager af 284.000 stemmer ved det sidste valg til Europa-Parlamentet Morten Messerschmidt fra DF tilbage i dansk politik Morten Helveg Petersen fra R forhenværende journalist og tv-vært Morten Løkkegaard og chef for Verdensnaturfonden, WWF, i Danmark Gitte Seeberg.
Programmet blev debatteret af flere end 32.000 danskere på nettet.
I aften er det Morten Dahlin, byrådsmedlem i Greve og Camilla Schwalbe, formand for DSU der undervejs fortæller om alt det, de hører.
Der er to emner: Goldman Sachs og EU. Gå ind og bland dig nu i debatten på facebook og dr.dk
Det bedste bud var det, der kom fra Goldman Sachs.
Det er en berygtet kapital

De står over for hinanden og giver et udbud på hver sin måde.
Det ville være morsomt, hvis de konkurrerede på public service.
Men begge to svigter mere og mere. TV2 har det efterhånden ikke mere.
Hvis man læser deres program, ville man få et chok.
Det kommer vi til nu. Indholdet.
I denne måned kunne TV2 fejre 20 år med masser af kendisser og minder om de gode år efter monopolbruddet.
Det er ikke Deres fjernsyn, der er noget galt med.
Godaften. TV2 med 
Vi byder seerne velkommen til vores første nyhedsudsendelse.
TV2 brød DR landsdækkende tv den 1. oktober 88.
Det tog kun kanalen to år at blive landets største.
DR måtte lære ordet 
Vi arbejder på at anlægge en mere vedkommende, journalistisk linje.
Folkeligt holdt hjulet danskerne klistret til skærmen i 14 år.
Et andet vartegn var 
Programmet gjorde fredag aften til TV2
Fredag aften er fortsat en aften, hvor DR skal kæmpe hårdt for seerne.
Det lykkedes med men i denne sæson ligger TV2 i førertrøjen med 
Sørine, det var statsministerfrue

Demonstranterne prøvede at spærre for bussen. Det kom til kamp hvor politiet brugte peberspray og knipler.
Søvndal, en rydning af Brorsons Kirke var på Kjærsgaards ønskeliste da hun svingede pisken over regeringens rygstykker.
Var det i orden, at politiet ryddede den kirke?
Man skal ikke have asyl i Danmark bare fordi man har haft foden på dansk jord.
Den her sag er speciel, fordi det er irakere. Forløbet har været langt.
Lene og Claus har trykket på knappen, da Danmark gik i krig i Irak.
Derfor har vi ansvar for, at det blev et for langt forløb.
Den krig sendte 4 mio. mennesker ud af 24 mio. på flugt.
Vi har et ansvar for, hvad vi sender irakerne hjem til.
Vi har opfordret integrationsministeren til at overveje sagerne i lyset af, hvor længe folk har været her men også i lyset af situationen i Irak.
Det kan være svært i den ophidsede debat med det nuancerede synspunkt.
Du svarer ikke på spørgsmålet. Var det i orden at rydde kirken?
Den måde, det foregik på, har vi jo taget afstand fra

In [18]:
"".join([str(val) for val in Y.flatten()]).encode("latin1")

b'start timeend timeprogram_idsentence_idsentenceclaim_idxclaim20:00:01:0120:00:05:2073080251Er det stadig danske politikere, der bestemmer over samfundets udvikling?NoneNone20:00:05:2320:00:11:1373080252Eller er vi i dag fuldst\xe6ndig underlagt internationale vilk\xe5r?NoneNone20:00:11:1620:00:14:1873080253I aften diskuterer vi salget af DONG og EU.NoneNone20:00:14:2120:00:17:1673080254Dette er Debatten direkte fra Aarhus.NoneNone20:00:26:0020:00:32:2173080255Vi sender fra Studie 12. Det plejede at v\xe6re Danmarks st\xf8rste tv-studie.NoneNone20:00:32:2420:01:03:2473080256Blandt debatt\xf8rerne er Stine Brix fra EL, Mette Bock fra LA modtager af 284.000 stemmer ved det sidste valg til Europa-Parlamentet Morten Messerschmidt fra DF tilbage i dansk politik Morten Helveg Petersen fra R forhenv\xe6rende journalist og tv-v\xe6rt Morten L\xf8kkegaard og chef for Verdensnaturfonden, WWF, i Danmark Gitte Seeberg.NoneNone20:01:04:0220:01:09:0473080257Programmet blev debatteret af flere end 3