# Datasets preporation

The zip file contains a large json file structured as follows. There is a 'segments' key that contains a list of speeches from members of parliament from 1948 to 2020. There is a lot of meta-data associated with each speech.. For the project, all you need is the field 'text'. I suggest you split the file into multiple files one for each legislature. Consider only the speeches with the field 'score' greater than 2.5 (the other speeches are meaningless).

In [1]:
import pandas as pd
import numpy as np
import re
from IPython.display import clear_output

## Text Dataset

In [2]:
import json
with open('../speeches.json', "r") as read_file:
    j = json.load(read_file)
    
text = pd.json_normalize(j, record_path =['segments'])
text = text[text['score']>2.5]
text = text.reset_index(drop=True)
#dataset: text
del j

## Person Dataset (Info about deputy (Legislator) (dfp), senators (dfs) and ministers (dfm))

In [3]:
# load info about deputy
df1 = pd.read_csv('parlamentari0x.csv')
df2 = pd.read_csv('parlamentari1x.csv')
dfp = pd.concat([df1, df2], ignore_index=True)
# transform date from int to date format
dfp['inizioMandato'] = pd.to_datetime(dfp['inizioMandato'], format="%Y%m%d")
dfp['fineMandato'] = dfp['fineMandato'].fillna(20990101)  # if person is currently active use placeholder as end date 
dfp['fineMandato'] = pd.to_datetime(dfp['fineMandato'], format="%Y%m%d")
# loan info about senators
df1 = pd.read_csv('senato0x.csv')
df2 = pd.read_csv('senato1x.csv')
dfs = pd.concat([df1, df2], ignore_index=True)
dfs['inizioMandato'] = pd.to_datetime(dfs['inizioMandato'], format="%Y%m%d")
dfs['fineMandato'] = dfs['fineMandato'].fillna(20990101)  # if person is currently active use placeholder as end date 
dfs['fineMandato'] = pd.to_datetime(dfs['fineMandato'], format="%Y%m%d")
# load info about ministers
df1 = pd.read_csv('ministri0x.csv')
df2 = pd.read_csv('ministri1x.csv')
dfm = pd.concat([df1, df2], ignore_index=True)
# rename some colums (to be abble to concat)
dfm = dfm.rename(columns={"d": "persona"})
dfp = dfp.rename(columns={"inizioMandato": "dataInizio", "fineMandato": "dataFine"})
dfs = dfs.rename(columns={"inizioMandato": "dataInizio", "fineMandato": "dataFine"})
# replace ministers link to it's personal link (have same numerical number, need only to change prefix and remove '_xx')
for i, d in dfm.iterrows():
    m = re.search('deputato.rdf/d(.+?)_', d['persona'])
    dfm.at[i,'persona'] = 'http://dati.camera.it/ocd/persona.rdf/p'+m.group(1)
# transform date from int to date format   
dfm['dataInizio'] = pd.to_datetime(dfm['dataInizio'], format="%Y%m%d")
dfm['dataFine'] = dfm['dataFine'].fillna(20990101)
dfm['dataFine'] = pd.to_datetime(dfm['dataFine'], format="%Y%m%d")
# need only some colums
dfp = dfp[['persona','cognome','nome','dataInizio','dataFine']]
dfm = dfm[['persona','cognome','nome','dataInizio','dataFine','carica','nomeOrganoGoverno']]
# final concat into one df
filtered_personnel = pd.concat([dfp, dfm, dfs], ignore_index=True)
filtered_personnel = filtered_personnel.rename(columns={"cognome": "surname", "nome": "name"})
filtered_personnel =  filtered_personnel[['persona','surname','name','dataInizio','dataFine','carica','nomeOrganoGoverno']]
# dataset: filtered_personnel
del df1, df2, dfp, dfs, dfm

## Make a sample test dataset

Just to make it easier to work or test

In [4]:
text2 = text[['text','persona','date','surname','name']].sample(1000)
text2 = text2.reset_index(drop=True)

# First Method

In [8]:
from collections import Counter
def words(text): 
    text = text.upper().replace("À","A'").replace("È","E'").replace("É","E'").replace("Ì","I'").replace("Í","I'").replace("Ò","O'").replace("Ó","O'").replace("Ù","U'").replace("Ú","U'").replace("Ü","U'")
    return re.findall(r"[A-Z'-]+", text.upper())
SURNAME = Counter(words(' '.join(filtered_personnel['surname'])))
print("SURNAME done")
NAME = Counter(words(' '.join(filtered_personnel['name'])))
print("NAME done")
t = ' '.join(text['text'])

TEXT = Counter(words(t))
print("TEXT done")
WORDS = SURNAME
del t

SURNAME done
NAME done
TEXT done


In [9]:
def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = "abcdefghijklmnopqrstuvwxyz'".upper()
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [10]:
def find_by_surname(words,personnel, corr):
    if len(words) > 2:
        sur = personnel[(personnel["surname"] == words[0].upper()) #first word is surname
                       | (personnel["surname"] == words[1].upper()) #second word is surname
                       | (personnel["surname"] == words[2].upper()) #third word is surname
                       | (personnel["surname"] == (words[0]+' '+words[1]).upper()) #surname by multiple words
                       | (personnel["surname"] == (words[1]+' '+words[2]).upper())] #surname by multiple words
    elif len(words) == 2:
        sur = personnel[(personnel["surname"] == words[0].upper()) #first word is surname
                       | (personnel["surname"] == words[1].upper()) #second word is surname
                       | (personnel["surname"] == (words[0]+' '+words[1]).upper())] #surname by multiple words
    else:
        sur = personnel[(personnel["surname"] == words[0].upper())] #first word is surname
    if len(sur.index) == 0 and not corr:
        corr = True
        WORDS = SURNAME
        words[0] = correction(words[0].upper())
        if len(words) == 2:
            words[1] = correction(words[1].upper())
        elif len(words) > 2:
            words[2] = correction(words[2].upper())
        sur = find_by_surname(words,personnel,corr)
    return sur

def find_by_name(words,personnel,corr):
    if len(words) > 2:
        sur = personnel[(personnel["name"] == words[0].upper()) #first word is name
                       | (personnel["name"] == words[1].upper()) #second word is name
                       | (personnel["name"] == words[2].upper()) #third word is name
                       | (personnel["name"] == (words[0]+' '+words[1]).upper()) #name by multiple words
                       | (personnel["name"] == (words[1]+' '+words[2]).upper())] #name by multiple words
    elif len(words) == 2:
        sur = personnel[(personnel["name"] == words[0].upper()) #first word is name
                       | (personnel["name"] == words[1].upper()) #second word is name
                       | (personnel["name"] == (words[0]+' '+words[1]).upper())] #name by multiple words
    else:
        sur = personnel[(personnel["name"] == words[0].upper())] #first word is name
    if len(pd.unique(sur["name"])) > 1 and len(words) > 1: # if more then one surename in result, reduce up to first in text
        sur = find_by_name(words[0:-1],sur, False)
    if len(sur.index) == 0 and not corr:
        corr = True
        WORDS = NAME
        words[0] = correction(words[0].upper())
        if len(words) == 2:
            words[1] = correction(words[1].upper())
        elif len(words) > 2:
            words[2] = correction(words[2].upper())
        sur = find_by_surname(words,personnel,corr)
    return sur

def find_person(text, date, personnel):
    global phase  # used for debug and error log
    global l # size of text dataset
    phase = "char_replace"
    text = text.replace("À","A'")
    text = text.replace("È","E'")
    text = text.replace("É","E'")
    text = text.replace("Ì","I'")
    text = text.replace("Ò","O'")
    text = text.replace("Ó","O'")
    text = text.replace("Ù","U'")
    phase = "words"
    words = regex.sub('', text).split(maxsplit = 4)
    if len(words)==0:   # text is empty
        l -= 1
        return None
    while (not words[0].isalpha()):  # use only words.
        words.pop(0)
        if len(words) == 0:
            return None
        t = ' '.join(words)    # this and next line is used just because i limit split up to 4
        words = regex.sub('', t).split(maxsplit = 4)
    phase = "date_search"
    date = personnel[(personnel["dataInizio"] <= date) & (personnel["dataFine"] >= date)]  # start with persons in possitions in a date of speech
    if len(date.index) >= 1:
        phase = "sur_search"
        sur = find_by_surname(words, date, False)
        #print(sur)
        if len(sur.index) == 1:
            return sur.iloc[0]['persona']
        elif len(sur.index) > 1:
            if len(pd.unique(sur['persona'])) == 1:
                return sur.iloc[0]['persona']
            elif len(words) > 1:
                phase = "name_search"
                name = find_by_name(words, sur, False)
                #print('-------------------')
                #print(name)
                if len(name.index) == 1 or (len(name.index) > 1 and len(pd.unique(name['persona']))):
                    return name.iloc[0]['persona']
    else:#No person found with that surename with correction
        return None

I use here a sumple of the text dataset.

To use whole dataset, change 2 lines:

l = len(text2.index)

to 

l = len(text.index)



and 



for index, row in text2.iterrows():

to 

for index, row in text.iterrows():

In [11]:
l = len(text2.index)
res = []
errors = []

regex = re.compile("[^a-zA-Z -']")

ind = 0

for index, row in text2.iterrows():
    global phase
    try:
        person = find_person(row['text'], row['date'], filtered_personnel)
    except:
        errors.append({'index':index, 'text':row['text'], 'phase':phase})
        continue
    predicted = filtered_personnel.loc[filtered_personnel['persona'] == person]
    if person is not None:
        pred_surname = predicted.iloc[0]['surname']
        pred_name = predicted.iloc[0]['name']
    else:
        pred_surname = None
        pred_name = None
    if row["persona"] == person:
        correct = True
    else:
        correct = False
    
    res.append({"True persona": row["persona"], "True name": row["name"], "True surname": row["surname"],
                "Text":row["text"],  "Correct":correct, "Predicted persona": person,
                "Predicted surname": pred_surname, 
                "Predicted name": pred_name})
    
    clear_output(wait=True)
    print('Done record '+str(index))
    ind += 1
    print(str(ind)+'/'+str(l)+'    '+str((ind/l)*100)+'%')
            
clear_output(wait=True)
print('Done')
result = pd.DataFrame.from_dict(res)
result['Correct'].describe()

Done


count     1000
unique       2
top       True
freq       786
Name: Correct, dtype: object

# Second Method

In [26]:
from collections import Counter
def words(text): 
    text = text.upper().replace("À","A'").replace("È","E'").replace("É","E'").replace("Ì","I'").replace("Í","I'").replace("Ò","O'").replace("Ó","O'").replace("Ù","U'").replace("Ú","U'").replace("Ü","U'")
    return re.findall(r"[A-Z'-]+", text.upper())
SURNAME = Counter(words(' '.join(filtered_personnel['surname'])))
print("SURNAME done")
NAME = Counter(words(' '.join(filtered_personnel['name'])))
print("NAME done")
t = ' '.join(text['text'])

TEXT = Counter(words(t))
print("TEXT done")
WORDS = SURNAME
del t

SURNAME done
NAME done
TEXT done


In [27]:
def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = "abcdefghijklmnopqrstuvwxyz'".upper()
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

create one-gramms used in word-segmentation

In [28]:
with open("one-grams.txt", 'w') as f:
    c = TEXT+SURNAME+NAME
    for k,v in  c.most_common():
        f.write( "{}\t{}\n".format(k.lower(),v) )

In [29]:
import segment
def find_by_surname(text,personnel, corr):
    import pandas as pd
    data = []
    for i, s in personnel.iterrows():
        if s['surname'] in text:
            data.append(s)
    sur = pd.DataFrame(data)
    #if len(pd.unique(sur["surname"])) > 1 and len(words) > 1: # if more then one surename in result, reduce up to first in text
        #len(words) just as hedge against error
    #    sur = find_by_surname(words[0:-1],sur, False)
    if len(sur.index) == 0 and not corr:
        WORDS = SURNAME+TEXT
        word = regex.sub('', text).split()
        for i, w in enumerate(word):
            word[i] = correction(w)
        sur = find_by_surname(' '.join(word),personnel,True)
    return sur

def find_by_name(text,personnel,corr):
    import pandas as pd
    data = []
    for i, s in personnel.iterrows():
        if s['name'] in text:
            data.append(s)
    sur = pd.DataFrame(data)
    return sur

def find_person(text, date, personnel):
    import pandas as pd
    import segment
    global phase
    global l
    phase = "char_replace"
    text = text.replace("À","A'")
    text = text.replace("È","E'")
    text = text.replace("É","E'")
    text = text.replace("Ì","I'")
    text = text.replace("Ò","O'")
    text = text.replace("Ó","O'")
    text = text.replace("Ù","U'")
    
    # take only up to 20 first words
    word = regex.sub('', text).split()
    average = sum(len(w) for w in word) / len(word)
    if average < 4:
        if len(word) >= 120: #avarage 6-8 letters in italian word (6*20 = 120), but segment have limit of 100
            seg = segment.segment(''.join(word[0:60]))
        else:
            seg = segment.segment(''.join(word))
        text = ' '.join(seg)
    elif len(word) >= 20:
        text = ' '.join(word[0:20])
    else:
        text = ' '.join(word)
    #print(text)
    phase = "date_search"
    date = personnel[(personnel["dataInizio"] <= date) & (personnel["dataFine"] >= date)]
    if len(date.index) >= 1:
        phase = "sur_search"
        sur = find_by_surname(text.upper(), date, False)
        phase = "after_sur_search"
        #print(sur)
        if len(sur.index) == 0:
            return None
        elif (len(sur.index) == 1) or (len(pd.unique(sur['persona'])) == 1):
            return sur.iloc[0]['persona']
        else:
            if len(sur['surname'].value_counts()) > 1:
                word = regex.sub('', text.upper()).split()
                sur2 = pd.DataFrame({'A' : []})
                i = 0
                t = ''
                while(sur2.empty and (i < 20 and i < len(word))): #limit of frase up to 20 words
                    t = t + ' ' + word[i]
                    sur2 = find_by_surname(t,sur,False) # check what surname is appering first in text 
                    i+=1
                sur = sur2
                if sur.empty:
                    return None
            #print(sur)
            if len(pd.unique(sur['persona'])) == 1:
                return sur.iloc[0]['persona']
            else:
                phase = "name_search"
                name = find_by_name(text.upper(), sur, False)
                #print('-------------------')
                #print(name)
                if len(name.index) == 1 or (len(name.index) > 1 and len(pd.unique(name['persona']))==1):
                    return name.iloc[0]['persona']
    else:#No person found with that surename with correction
        return None

In [30]:
l = len(text2.index)
res = []
errors = []

regex = re.compile("[^a-zA-Z -']")

ind = 0

for index, row in text2.iterrows():
    global phase
    try:
        person = find_person(row['text'], row['date'], filtered_personnel)
    except:
        errors.append({'index':index, 'text':row['text'], 'phase':phase})
        continue
    predicted = filtered_personnel.loc[filtered_personnel['persona'] == person]
    if person is not None:
        pred_surname = predicted.iloc[0]['surname']
        pred_name = predicted.iloc[0]['name']
    else:
        pred_surname = None
        pred_name = None
    if row["persona"] == person:
        correct = True
    else:
        correct = False
    
    res.append({"True persona": row["persona"], "True name": row["name"], "True surname": row["surname"],
                "Text":row["text"],  "Correct":correct, "Predicted persona": person,
                "Predicted surname": pred_surname, 
                "Predicted name": pred_name})
    
    clear_output(wait=True)
    print('Done record '+str(index))
    ind += 1
    print(str(ind)+'/'+str(l)+'    '+str((ind/l)*100)+'%')
            
clear_output(wait=True)
print('Done')
result = pd.DataFrame.from_dict(res)
result['Correct'].describe()

Done


count      999
unique       2
top       True
freq       728
Name: Correct, dtype: object

# Third Method

In [5]:
from collections import Counter
def words(text): 
    text = text.upper().replace("À","A'").replace("È","E'").replace("É","E'").replace("Ì","I'").replace("Í","I'").replace("Ò","O'").replace("Ó","O'").replace("Ù","U'").replace("Ú","U'").replace("Ü","U'")
    return re.findall(r"[A-Z'-]+", text.upper())
SURNAME = Counter(words(' '.join(filtered_personnel['surname'])))
print("SURNAME done")
NAME = Counter(words(' '.join(filtered_personnel['name'])))
print("NAME done")
t = ' '.join(text['text'])

TEXT = Counter(words(t))
print("TEXT done")
WORDS = SURNAME

SURNAME done
NAME done
TEXT done


In [6]:
def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = "abcdefghijklmnopqrstuvwxyz'".upper()
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [8]:
s = filtered_personnel["surname"].value_counts()
with open("surname2-one-grams.txt", 'w') as f:
    for k,v in s.iteritems():
        k = k.replace("À","A'").replace("È","E'").replace("É","E'").replace("Ì","I'").replace("Í","I'").replace("Ò","O'").replace("Ó","O'").replace("Ù","U'").replace("Ú","U'").replace("Ü","U'")
        string = k.lower() + "\t\t" + str(v) + "\n"
        f.write(string)

In [9]:
import segment2_sur

def find_by_surname(text,personnel, corr):
    import pandas as pd
    data = []
    for i, s in personnel.iterrows():
        if s['surname'] in text:
            data.append(s)
    sur = pd.DataFrame(data)
    if len(sur.index) == 0 and not corr:
        WORDS = SURNAME+TEXT
        word = regex.sub('', text).split()
        for i, w in enumerate(word):
            word[i] = correction(w)
        sur = find_by_surname(' '.join(word),personnel,True)
    return sur

def find_by_surname_2(text,personnel, corr):
    import segment2_sur
    import pandas as pd
    global phase
    phase = 'sur_start'
    #print(phase)
    data = []
    phase = 'sur_segment'
    #print(phase)
    seg = segment2_sur.segment(text)
    sur = pd.DataFrame({'A' : []})
    #print(seg)
    phase = 'sur_loop'
    #print(phase)
    for s in seg:
        p = personnel[(personnel["surname"] == s.upper())]
        sur = pd.concat([sur,p])
    phase = 'sur_loop_end'
    #print(phase)
    if len(sur.index) == 0 and not corr:
        phase = 'sur_corr'
        #print(phase)
        WORDS = SURNAME+TEXT
        word = regex.sub('', text).split()
        for i, w in enumerate(word):
            word[i] = correction(w)
        sur = find_by_surname_2(' '.join(word),personnel,True)
    return sur

def find_by_name(text,personnel,corr):
    import pandas as pd
    data = []
    for i, s in personnel.iterrows():
        if s['name'] in text:
            data.append(s)
    sur = pd.DataFrame(data)
    return sur

def find_person(text, date, personnel):
    global phase
    global l
    import re
    import segment
    import pandas as pd
    phase = "char_replace"
    text = text.upper().replace("À","A'")
    text = text.replace("È","E'")
    text = text.replace("É","E'")
    text = text.replace("Ì","I'")
    text = text.replace("Ò","O'")
    text = text.replace("Ó","O'")
    text = text.replace("Ù","U'")
    
    # take only up to 20 first words
    phase = "segment_start"
    #print(phase)
    regex = re.compile("[^a-zA-Z -']")
    phase = "error1"
    #print(phase)
    word = regex.sub('', text).split()
    phase = "error2"
    #print(phase)
    average = sum(len(w) for w in word) / len(word)
    phase = "error3"
    #print(phase)
    if average < 4:
        phase = "segment"
        #print(phase)
        if len(word) >= 60: #avarage 6-8 letters in italian word (6*20 = 120), but segment have limit of 60
            seg = segment.segment(''.join(word[0:60]))
        else:
            seg = segment.segment(''.join(word))
        text = ' '.join(seg)
    elif len(word) >= 20:
        phase = "segment2"
        #print(phase)
        text = ' '.join(word[0:20])
    else:
        text = ' '.join(word)
    #print(text)
    phase = "date_search"
    #print(phase)
    date = personnel[(personnel["dataInizio"] <= date) & (personnel["dataFine"] >= date)]
    if len(date.index) >= 1:
        phase = "sur_search"
        #print(phase)
        sur = find_by_surname_2(text.upper(), date, False)
        phase = "sur_search_end"
        #print(phase)
        #print(sur)
        if len(sur.index) == 0:
            return None
        elif (len(sur.index) == 1) or (len(pd.unique(sur['persona'])) == 1):
            return sur.iloc[0]['persona']
        else:
            phase = "name_search"
            #print(phase)
            name = find_by_name(text.upper(), sur, False)
            #print('-------------------')
            #print(name)
            if len(name.index) == 1 or (len(name.index) > 1 and len(pd.unique(name['persona']))==1):
                return name.iloc[0]['persona']
            elif (len(name.index) > 1 and len(name['surname'].value_counts()) > 1):
                phase = "sur_search_2"
                #print(phase)
                word = regex.sub('', text.upper()).split()
                sur2 = pd.DataFrame({'A' : []})
                i = 0
                while(sur2.empty and (i < 20 and i < len(word))): #limit of frase up to 20 words
                    sur2 = find_by_surname(' '.join(word[0:i]),name,False) # check what surname is appering first in text 
                    i+=1
                sur = sur2
                #print("----------")
                #print(sur)
                if len(sur.index) == 0:
                    return None
                elif (len(sur.index) == 1) or (len(pd.unique(sur['persona'])) == 1):
                    return sur.iloc[0]['persona']
                else: return None
            else: #difference between this else and elif is in dataframe used (in elif 'name', in else 'sur')
                phase = "sur_search_3"
                #print(phase)
                word = regex.sub('', text.upper()).split()
                sur2 = pd.DataFrame({'A' : []})
                i = 0
                while(sur2.empty and (i < 20 and i < len(word))): #limit of frase up to 20 words
                    sur2 = find_by_surname(' '.join(word[0:i]),sur,False) # check what surname is appering first in text 
                    i+=1
                sur = sur2
                #print("----------")
                #print(sur)
                if len(sur.index) == 0:
                    return None
                elif (len(sur.index) == 1) or (len(pd.unique(sur['persona'])) == 1):
                    return sur.iloc[0]['persona']
                else: return None
    return None #No person found with that surename with correction

In [10]:
l = len(text2.index)
res = []
errors = []

regex = re.compile("[^a-zA-Z -']")

ind = 0

for index, row in text2.iterrows():
    global phase
    try:
        person = find_person(row['text'], row['date'], filtered_personnel)
    except:
        errors.append({'index':index, 'text':row['text'], 'phase':phase})
        continue
    predicted = filtered_personnel.loc[filtered_personnel['persona'] == person]
    if person is not None:
        pred_surname = predicted.iloc[0]['surname']
        pred_name = predicted.iloc[0]['name']
    else:
        pred_surname = None
        pred_name = None
    if row["persona"] == person:
        correct = True
    else:
        correct = False
    
    res.append({"True persona": row["persona"], "True name": row["name"], "True surname": row["surname"],
                "Text":row["text"],  "Correct":correct, "Predicted persona": person,
                "Predicted surname": pred_surname, 
                "Predicted name": pred_name})
    
    clear_output(wait=True)
    print('Done record '+str(index))
    ind += 1
    print(str(ind)+'/'+str(l)+'    '+str((ind/l)*100)+'%')
            
clear_output(wait=True)
print('Done')
result = pd.DataFrame.from_dict(res)
result['Correct'].describe()

Done


count     1000
unique       2
top       True
freq       647
Name: Correct, dtype: object