The zip file contains a large json file structured as follows. There is a 'segments' key that contains a list of speeches from members of parliament from 1948 to 2020. There is a lot of meta-data associated with each speech.. For the project, all you need is the field 'text'. I suggest you split the file into multiple files one for each legislature. Consider only the speeches with the field 'score' greater than 2.5 (the other speeches are meaningless).

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_json('../speeches.json',orient='records')
df

Unnamed: 0,segments
0,"{'segment': 0, 'tag': 'RESOCONTO', 'start': 'R..."
1,"{'segment': 1, 'tag': 'SEDUTA', 'start': 'SEDU..."
2,"{'segment': 2, 'tag': 'PRESIDENZA', 'start': '..."
3,"{'segment': 3, 'tag': 'RESOCONTO', 'start': 'R..."
4,"{'segment': 4, 'tag': 'XVII', 'start': 'XVII L..."
...,...
2348714,"{'segment': 182, 'tag': 'LEGISLATURA', 'start'..."
2348715,"{'segment': 183, 'tag': 'PRESIDENTE', 'start':..."
2348716,"{'segment': 184, 'tag': 'BERLINGUER', 'start':..."
2348717,"{'segment': 186, 'tag': 'PRESIDENTE', 'start':..."


In [3]:
df.iloc[0].to_dict()['segments']

{'segment': 0,
 'tag': 'RESOCONTO',
 'start': 'RESOCONTO STENOGRAFICO',
 'president': 0.128916308276516,
 'page': 0.032191545690914,
 'len': 2,
 'score': 1.484956310433453,
 'year': 2015,
 'month': 6,
 'day': 8,
 'leg': 17,
 'id': 370,
 'convocation': 25231,
 'date': '2015-06-08 00:00:00',
 'title': 'Seduta di lunedì 8 giugno 2015',
 'text': 'RESOCONTO STENOGRAFICO 437. ',
 'presidenza': False,
 'speech': False}

{"segment": 0, "tag": "RESOCONTO", "start": "RESOCONTO STENOGRAFICO", "president": 0.12891630827651634, "page": 0.032191545690914675, "len": 2, "score": 1.4849563104334538, "year": 2015, "month": 6, "day": 8, "leg": 17, "id": 370, "convocation": 25231, "date": "2015-06-08 00:00:00", "title": "Seduta di luned\u00ec 8 giugno 2015", "text": "RESOCONTO STENOGRAFICO 437. ", "presidenza": false, "speech": false},

In [4]:
from IPython.display import clear_output

data = []
l = len(df.index)
for index in range(l):
    info = df.iloc[index].to_dict()['segments']
    data.append(info)
    if index%1000==0:
        clear_output(wait=True)
        print('done '+str(index)+'/'+str(l)+'    '+str((index/l)*100)+'%')

clear_output(wait=False)
text = pd.DataFrame.from_dict(data)
text = text[text['score']>2.5]

In [5]:
text.iloc[121235]

segment                                                        352
tag                                                      FRANCESCO
start                               FRANCESCO MONACO. La prima do-
president                                                 0.055345
page                                                       0.06436
len                                                             20
score                                                     4.471383
year                                                          2000
month                                                            9
day                                                             26
leg                                                             13
id                                                              33
convocation                                                  22923
date                                           2000-09-26 00:00:00
title                   Seduta n. 777 di martedì 26 settembre 

## Info about deputy (Legislator) and ministers

In [6]:
# load info about deputy
df1 = pd.read_csv('parlamentari0x.csv')
df2 = pd.read_csv('parlamentari1x.csv')
dfp = pd.concat([df1, df2], ignore_index=True)
# transform date from int to date format
dfp['inizioMandato'] = pd.to_datetime(dfp['inizioMandato'], format="%Y%m%d")
dfp['fineMandato'] = dfp['fineMandato'].fillna(20990101)  # if person is currently active use placeholder as end date 
dfp['fineMandato'] = pd.to_datetime(dfp['fineMandato'], format="%Y%m%d")
# loan info about senators
df1 = pd.read_csv('senato0x.csv')
df2 = pd.read_csv('senato1x.csv')
dfs = pd.concat([df1, df2], ignore_index=True)
dfs['inizioMandato'] = pd.to_datetime(dfs['inizioMandato'], format="%Y%m%d")
dfs['fineMandato'] = dfs['fineMandato'].fillna(20990101)  # if person is currently active use placeholder as end date 
dfs['fineMandato'] = pd.to_datetime(dfs['fineMandato'], format="%Y%m%d")
# load info about ministers
df1 = pd.read_csv('ministri0x.csv')
df2 = pd.read_csv('ministri1x.csv')
dfm = pd.concat([df1, df2], ignore_index=True)
# rename some colums (to be abble to concat)
dfm = dfm.rename(columns={"d": "persona"})
dfp = dfp.rename(columns={"inizioMandato": "dataInizio", "fineMandato": "dataFine"})
dfs = dfs.rename(columns={"inizioMandato": "dataInizio", "fineMandato": "dataFine"})
# replace ministers link to it's personal link (have same numerical number, need only to change prefix and remove '_xx')
for i, d in dfm.iterrows():
    m = re.search('deputato.rdf/d(.+?)_', d['persona'])
    dfm.at[i,'persona'] = 'http://dati.camera.it/ocd/persona.rdf/p'+m.group(1)
# transform date from int to date format   
dfm['dataInizio'] = pd.to_datetime(dfm['dataInizio'], format="%Y%m%d")
dfm['dataFine'] = dfm['dataFine'].fillna(20990101)
dfm['dataFine'] = pd.to_datetime(dfm['dataFine'], format="%Y%m%d")
# need only some colums
dfp = dfp[['persona','cognome','nome','dataInizio','dataFine']]
dfm = dfm[['persona','cognome','nome','dataInizio','dataFine','carica','nomeOrganoGoverno']]
# final concat into one df
filtered_personnel = pd.concat([dfp, dfm, dfs], ignore_index=True)
filtered_personnel = filtered_personnel.rename(columns={"cognome": "surname", "nome": "name"})
filtered_personnel =  filtered_personnel[['persona','surname','name','dataInizio','dataFine','carica','nomeOrganoGoverno']]
filtered_personnel

Unnamed: 0,persona,surname,name,dataInizio,dataFine,carica,nomeOrganoGoverno
0,http://dati.camera.it/ocd/persona.rdf/p10000,RESCIGNO,MATTEO,1948-04-27,1953-06-24,,
1,http://dati.camera.it/ocd/persona.rdf/p1000,BONOMI,PAOLO,1948-04-27,1953-06-24,,
2,http://dati.camera.it/ocd/persona.rdf/p1000,BONOMI,PAOLO,1953-06-15,1958-06-11,,
3,http://dati.camera.it/ocd/persona.rdf/p1000,BONOMI,PAOLO,1958-06-03,1963-05-15,,
4,http://dati.camera.it/ocd/persona.rdf/p10010,RICCI,MARIO,1948-04-23,1953-06-24,,
...,...,...,...,...,...,...,...
21016,http://dati.camera.it/ocd/persona.rdf/p306702,TOSATO,PAOLO,2014-07-02,2099-01-01,,
21017,http://dati.camera.it/ocd/persona.rdf/p306900,BOCCARDI,MICHELE,2015-09-08,2099-01-01,,
21018,http://dati.camera.it/ocd/persona.rdf/p307896,SEGRE,LILIANA,2018-01-19,2099-01-01,,
21019,http://dati.camera.it/ocd/persona.rdf/p308320,ALESSANDRINI,VALERIA,2020-03-12,2099-01-01,,


In [7]:
text2 = text[['text','persona','date','surname','name']].sample(1000)
text2 = text2.reset_index(drop=True)

In [8]:
text = text.reset_index(drop=True)

In [9]:
filtered_personnel.loc[filtered_personnel['surname'] == "SCIORILLI BORRELLI"]

Unnamed: 0,persona,surname,name,dataInizio,dataFine,carica,nomeOrganoGoverno
45,http://dati.camera.it/ocd/persona.rdf/p10330,SCIORILLI BORRELLI,RAFFAELE,1953-06-17,1958-06-11,,
46,http://dati.camera.it/ocd/persona.rdf/p10330,SCIORILLI BORRELLI,RAFFAELE,1958-06-18,1963-05-15,,


Method 3: http://norvig.com/spell-correct.html

In [10]:
from collections import Counter
def words(text): return re.findall(r'\w+', text.upper())
SURNAME = Counter(words(' '.join(filtered_personnel['surname'])))
NAME = Counter(words(' '.join(filtered_personnel['name'])))
WORDS = SURNAME

In [11]:
def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = "abcdefghijklmnopqrstuvwxyz'".upper()
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [12]:
correction('R0SSI')

'ROSSI'

## search with spelling correction

In [29]:
#def find_by_surname(words,personnel, corr):
    if len(words) > 2:
        sur = personnel[(personnel["surname"] == words[0].upper()) #first word is surname
                       | (personnel["surname"] == words[1].upper()) #second word is surname
                       | (personnel["surname"] == words[2].upper()) #third word is surname
                       | (personnel["surname"] == (words[0]+' '+words[1]).upper()) #surname by multiple words
                       | (personnel["surname"] == (words[1]+' '+words[2]).upper())] #surname by multiple words
    elif len(words) == 2:
        sur = personnel[(personnel["surname"] == words[0].upper()) #first word is surname
                       | (personnel["surname"] == words[1].upper()) #second word is surname
                       | (personnel["surname"] == (words[0]+' '+words[1]).upper())] #surname by multiple words
    else:
        sur = personnel[(personnel["surname"] == words[0].upper())] #first word is surname
    #if len(pd.unique(sur["surname"])) > 1 and len(words) > 1: # if more then one surename in result, reduce up to first in text
        #len(words) just as hedge against error
    #    sur = find_by_surname(words[0:-1],sur, False)
    if len(sur.index) == 0 and not corr:
        corr = True
        WORDS = SURNAME
        words[0] = correction(words[0].upper())
        if len(words) == 2:
            words[1] = correction(words[1].upper())
        elif len(words) > 2:
            words[2] = correction(words[2].upper())
        sur = find_by_surname(words,personnel,corr)
    return sur

#def find_by_name(words,personnel,corr):
    if len(words) > 2:
        sur = personnel[(personnel["name"] == words[0].upper()) #first word is name
                       | (personnel["name"] == words[1].upper()) #second word is name
                       | (personnel["name"] == words[2].upper()) #third word is name
                       | (personnel["name"] == (words[0]+' '+words[1]).upper()) #name by multiple words
                       | (personnel["name"] == (words[1]+' '+words[2]).upper())] #name by multiple words
    elif len(words) == 2:
        sur = personnel[(personnel["name"] == words[0].upper()) #first word is name
                       | (personnel["name"] == words[1].upper()) #second word is name
                       | (personnel["name"] == (words[0]+' '+words[1]).upper())] #name by multiple words
    else:
        sur = personnel[(personnel["name"] == words[0].upper())] #first word is name
    if len(pd.unique(sur["name"])) > 1 and len(words) > 1: # if more then one surename in result, reduce up to first in text
        #len(words) just as hedge against error
        sur = find_by_name(words[0:-1],sur, False)
    if len(sur.index) == 0 and not corr:
        corr = True
        WORDS = NAME
        words[0] = correction(words[0].upper())
        if len(words) == 2:
            words[1] = correction(words[1].upper())
        elif len(words) > 2:
            words[2] = correction(words[2].upper())
        sur = find_by_surname(words,personnel,corr)
    return sur

IndentationError: unexpected indent (2713420564.py, line 2)

In [28]:
#def find_person(text, date, personnel):
    global phase
    global l
    phase = "char_replace"
    text = text.replace("À","A'")
    text = text.replace("È","E'")
    text = text.replace("É","E'")
    text = text.replace("Ì","I'")
    text = text.replace("Ò","O'")
    text = text.replace("Ó","O'")
    text = text.replace("Ù","U'")
    phase = "words"
    words = regex.sub('', text).split(maxsplit = 4)
    if len(words)==0:
        l -= 1
        return None
    while (not words[0].isalpha()):
        words.pop(0)
        if len(words) == 0:
            return None
        t = ' '.join(words)
        words = regex.sub('', t).split(maxsplit = 4)
    phase = "date_search"
    date = personnel[(personnel["dataInizio"] <= date) & (personnel["dataFine"] >= date)]
    if len(date.index) >= 1:
        phase = "sur_search"
        sur = find_by_surname(words, date, False)
        #print(sur)
        if len(sur.index) == 1:
            return sur.iloc[0]['persona']
        elif len(sur.index) > 1:
            if len(pd.unique(sur['persona'])) == 1:
                return sur.iloc[0]['persona']
            elif len(words) > 1:
                phase = "name_search"
                name = find_by_name(words, sur, False)
                #print('-------------------')
                #print(name)
                if len(name.index) == 1 or (len(name.index) > 1 and len(pd.unique(name['persona']))):
                    return name.iloc[0]['persona']
    else:#No person found with that surename with correction
        return None

IndentationError: unexpected indent (1186805189.py, line 2)

In [16]:
l = len(text.index)
res = []

regex = re.compile("[^a-zA-Z -']")

for index, row in text.iterrows():
    #DEBUG
    #if index%100==0:
    clear_output(wait=True)
    print('done '+str(index+1)+'/'+str(l)+'    '+str(((index+1)/l)*100)+'%')
    #DEBUG
    global phase
    phase = "search"
    person = find_person(row['text'], row['date'], filtered_personnel)
    phase = "take person data"
    predicted = filtered_personnel.loc[filtered_personnel['persona'] == person]
    if person is not None:
        pred_surname = predicted.iloc[0]['surname']
        pred_name = predicted.iloc[0]['name']
    else:
        pred_surname = None
        pred_name = None
    phase = "check if correct"
    if row["persona"] == person:
        correct = True
    else:
        correct = False
    phase = "append"
    
    res.append({"True persona": row["persona"], "True name": row["name"], "True surname": row["surname"],
                "Text":row["text"],  "Correct":correct, "Predicted persona": person,
                "Predicted surname": pred_surname, 
                "Predicted name": pred_name})
            
clear_output(wait=True)
print('DONE')
result = pd.DataFrame.from_dict(res)
result['Correct'].describe()

done 1190217/1197003    99.43308412760871%


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [17]:
result['Correct'].describe()

count     1197023
unique          2
top          True
freq       935183
Name: Correct, dtype: object

In [None]:
pd.set_option("display.max_rows", None)
result[result['Correct'] == False]

In [None]:
find_person(text2.iloc[63]['text'], text2.iloc[63]['date'], filtered_personnel)

In [37]:
text2.iloc[11]['text']

"ABBATANGELO. Onorevole Presidente, onorevoli colleghi, signori del Governo, mi sarei meravigliato se fosse avvenuto il contrario, cioè se a questo dibattito importantissimo avessero partecipato tut ti i deputati della Campania e della Basilicata, per portare il loro contributo d i esperienza, di professionalità, di vita vissuta, direi, per cercare di emendare e d i migliorare un disegno di legge che, a parer nostro, certamente non soddisfa e non soddisferà mai le reali esigenze dei disoccupati napoletani e della Basilicata. Questo provvedimento è stato emanato , non tanto sotto l'incalzare degli eventi , non tanto in seguito alle manifestazioni d i protesta dei disoccupati napoletani, m a sotto forma di decreto-legge affinché potesse essere sperimentato e si potessero accontentare, con la sperimentazione , quelle forze politiche e sindacali che, con l'attuazione del provvedimento n . 760, si accingeranno in maniera monolitica, qua si in condizioni di monopolio, a gestire i l mercato d

In [36]:
text2.iloc[0]['date']

'1991-01-15 00:00:00'

In [64]:
date = filtered_personnel[(filtered_personnel["dataInizio"] <= '2005-06-29') & (filtered_personnel["dataFine"] >= '2005-06-29')]
find_by_surname(["GIUSEPPE", "VEGAS", "Viceministro"],date,False)

Unnamed: 0,persona,surname,name,dataInizio,dataFine,carica,nomeOrganoGoverno


In [75]:
sum = 0
n = ''
for i, r in filtered_personnel.iterrows():
    s = len(r['surname'].split(maxsplit = 10))# + len(r['name'].split(maxsplit = 10))
    if s > sum:
        sum = s
        n = r['surname']# + ' ' + r['name']
print(sum)
print(n)

6
DI SAN MARTINO LORENZATO DI IVREA


### Other find_surmane
(check if candidate is substring of a text)

In [13]:
from collections import Counter
def words(text): 
    text = text.upper().replace("À","A'").replace("È","E'").replace("É","E'").replace("Ì","I'").replace("Í","I'").replace("Ò","O'").replace("Ó","O'").replace("Ù","U'").replace("Ú","U'").replace("Ü","U'")
    return re.findall(r"[A-Z'-]+", text.upper())
SURNAME = Counter(words(' '.join(filtered_personnel['surname'])))
print("SURNAME done")
NAME = Counter(words(' '.join(filtered_personnel['name'])))
print("NAME done")
t = ' '.join(text['text'])

#t = t.replace("À","A'")
#t = t.replace("È","E'")
#t = t.replace("È","E'")
#t = t.replace("É","E'")
#t = t.replace("Ì","I'")
#t = t.replace("Ò","O'")
#t = t.replace("Ó","O'")
#t = t.replace("Ù","U'")
TEXT = Counter(words(t))
print("TEXT done")
WORDS = SURNAME

SURNAME done
NAME done
TEXT done


In [14]:
with open("one-grams.txt", 'w') as f:
    c = TEXT+SURNAME+NAME
    for k,v in  c.most_common():
        f.write( "{}\t{}\n".format(k.lower(),v) )
with open("surname-one-grams.txt", 'w') as f:
    for k,v in  SURNAME.most_common():
        f.write( "{}\t{}\n".format(k.lower(),v) )

In [15]:
import segment
import segment_sur

In [16]:
def find_by_surname(text,personnel, corr):
    import pandas as pd
    data = []
    for i, s in personnel.iterrows():
        if s['surname'] in text:
            data.append(s)
    sur = pd.DataFrame(data)
    #if len(pd.unique(sur["surname"])) > 1 and len(words) > 1: # if more then one surename in result, reduce up to first in text
        #len(words) just as hedge against error
    #    sur = find_by_surname(words[0:-1],sur, False)
    if len(sur.index) == 0 and not corr:
        WORDS = SURNAME+TEXT
        word = regex.sub('', text).split()
        for i, w in enumerate(word):
            word[i] = correction(w)
        sur = find_by_surname(' '.join(word),personnel,True)
    return sur

def find_by_surname_2(text,personnel, corr):
    import pandas as pd
    data = []
    seg = segment2_sur.segment(text)
    sur = pd.DataFrame({'A' : []})
    #print(seg)
    for i, s in enumerate(seg):
        p = personnel[(personnel["surname"] == s.upper())]
        sur = pd.concat([sur,p])
    #if len(pd.unique(sur["surname"])) > 1 and len(words) > 1: # if more then one surename in result, reduce up to first in text
        #len(words) just as hedge against error
    #    sur = find_by_surname(words[0:-1],sur, False)
    if len(sur.index) == 0 and not corr:
        WORDS = SURNAME+TEXT
        word = regex.sub('', text).split()
        for i, w in enumerate(word):
            word[i] = correction(w)
        sur = find_by_surname(' '.join(word),personnel,True)
    return sur

def find_by_name(text,personnel,corr):
    import pandas as pd
    data = []
    for i, s in personnel.iterrows():
        if s['name'] in text:
            data.append(s)
    sur = pd.DataFrame(data)
    #if len(pd.unique(sur["surname"])) > 1 and len(words) > 1: # if more then one surename in result, reduce up to first in text
        #len(words) just as hedge against error
    #    sur = find_by_surname(words[0:-1],sur, False)
    #if len(sur.index) == 0 and not corr:
    #    WORDS = NAME+TEXT
    #    word = regex.sub('', text).split()
    #    for i, w in enumerate(word):
    #        word[i] = correction(w)
    #    sur = find_by_surname(' '.join(word),personnel,True)
    return sur

In [17]:
def find_person(text, date, personnel):
    import pandas as pd
    import segment
    global phase
    global l
    phase = "char_replace"
    text = text.replace("À","A'")
    text = text.replace("È","E'")
    text = text.replace("É","E'")
    text = text.replace("Ì","I'")
    text = text.replace("Ò","O'")
    text = text.replace("Ó","O'")
    text = text.replace("Ù","U'")
    
    # take only up to 20 first words
    word = regex.sub('', text).split()
    average = sum(len(w) for w in word) / len(word)
    if average < 4:
        if len(word) >= 120: #avarage 6-8 letters in italian word (6*20 = 120), but segment have limit of 100
            seg = segment.segment(''.join(word[0:60]))
        else:
            seg = segment.segment(''.join(word))
        text = ' '.join(seg)
    elif len(word) >= 20:
        text = ' '.join(word[0:20])
    else:
        text = ' '.join(word)
    #print(text)
    phase = "date_search"
    date = personnel[(personnel["dataInizio"] <= date) & (personnel["dataFine"] >= date)]
    if len(date.index) >= 1:
        phase = "sur_search"
        sur = find_by_surname(text.upper(), date, False)
        phase = "after_sur_search"
        #print(sur)
        if len(sur.index) == 0:
            return None
        elif (len(sur.index) == 1) or (len(pd.unique(sur['persona'])) == 1):
            return sur.iloc[0]['persona']
        else:
            if len(sur['surname'].value_counts()) > 1:
                word = regex.sub('', text.upper()).split()
                sur2 = pd.DataFrame({'A' : []})
                i = 0
                t = ''
                while(sur2.empty and (i < 20 and i < len(word))): #limit of frase up to 20 words
                    t = t + ' ' + word[i]
                    sur2 = find_by_surname(t,sur,False) # check what surname is appering first in text 
                    i+=1
                sur = sur2
                if sur.empty:
                    return None
            #print(sur)
            if len(pd.unique(sur['persona'])) == 1:
                return sur.iloc[0]['persona']
            else:
                phase = "name_search"
                name = find_by_name(text.upper(), sur, False)
                #print('-------------------')
                #print(name)
                if len(name.index) == 1 or (len(name.index) > 1 and len(pd.unique(name['persona']))==1):
                    return name.iloc[0]['persona']
    else:#No person found with that surename with correction
        return None

In [None]:
l = len(text2.index)
res = []

regex = re.compile("[^a-zA-Z -']")

for index, row in text2.iterrows():
    #DEBUG
    #if index%100==0:
    clear_output(wait=True)
    print('done '+str(index)+'/'+str(l)+'    '+str((index/l)*100)+'%')
    #DEBUG
    global phase
    phase = "search"
    person = find_person(row['text'], row['date'], filtered_personnel)
    phase = "take person data"
    predicted = filtered_personnel.loc[filtered_personnel['persona'] == person]
    if person is not None:
        pred_surname = predicted.iloc[0]['surname']
        pred_name = predicted.iloc[0]['name']
    else:
        pred_surname = None
        pred_name = None
    phase = "check if correct"
    if row["persona"] == person:
        correct = True
    else:
        correct = False
    phase = "append"
    
    res.append({"True persona": row["persona"], "True name": row["name"], "True surname": row["surname"],
                "Text":row["text"],  "Correct":correct, "Predicted persona": person,
                "Predicted surname": pred_surname, 
                "Predicted name": pred_name})
            
clear_output(wait=True)
print('done '+str(index)+'/'+str(l)+'    '+str(((index)/l)*100)+'%')
result = pd.DataFrame.from_dict(res)
result['Correct'].describe()

done 118/1000    11.799999999999999%
FRANCESCO GIULIO BAGHINO Signor Presidente noi voteremo a favor e dell'emendamento Luigi d'Amato Tab A Con esso infatti si viene
                                            persona  surname  \
6031   http://dati.camera.it/ocd/persona.rdf/p18490  D'AMATO   
6298   http://dati.camera.it/ocd/persona.rdf/p22610    AMATO   
6727   http://dati.camera.it/ocd/persona.rdf/p29420  D'AMATO   
10256    http://dati.camera.it/ocd/persona.rdf/p410  BAGHINO   

                   name dataInizio   dataFine  carica  nomeOrganoGoverno  
6031              LUIGI 1987-07-02 1992-04-22     NaN                NaN  
6298           GIULIANO 1987-06-25 1992-04-22     NaN                NaN  
6727              CARLO 1987-06-27 1992-04-22     NaN                NaN  
10256  FRANCESCO GIULIO 1987-06-26 1992-04-22     NaN                NaN  


In [None]:
pd.set_option("display.max_rows", None)
result[result['Correct'] == False]

In [51]:
find_person(text2.iloc[107]['text'], text2.iloc[107]['date'], filtered_personnel)

STEFANO APUZZO. S t iamo pres idiando l ' inquinatore! 
                                             persona  surname     name  \
8838    http://dati.camera.it/ocd/persona.rdf/p32180   APUZZO  STEFANO   
19023  http://dati.camera.it/ocd/persona.rdf/p200914  STEFANO  IPPAZIO   

      dataInizio   dataFine  carica  nomeOrganoGoverno  
8838  1992-04-21 1994-04-14     NaN                NaN  
19023 1992-04-05 1994-04-14     NaN                NaN  
                                             persona  surname     name  \
19023  http://dati.camera.it/ocd/persona.rdf/p200914  STEFANO  IPPAZIO   

      dataInizio   dataFine  carica  nomeOrganoGoverno  
19023 1992-04-05 1994-04-14     NaN                NaN  


'http://dati.camera.it/ocd/persona.rdf/p200914'

In [86]:
s = filtered_personnel["surname"].value_counts()
with open("surname2-one-grams.txt", 'w') as f:
    for k,v in s.iteritems():
        k = k.replace("À","A'").replace("È","E'").replace("É","E'").replace("Ì","I'").replace("Í","I'").replace("Ò","O'").replace("Ó","O'").replace("Ù","U'").replace("Ú","U'").replace("Ü","U'")
        string = k.lower() + "\t" + str(v) + "\n"
        clear_output(wait=True)
        print(string)
        f.write(string)

fasiolo	1



In [35]:
date = filtered_personnel[(filtered_personnel["dataInizio"] <= text2.iloc[30]['date']) & (filtered_personnel["dataFine"] >= text2.iloc[30]['date'])]
t = text2.iloc[30]['text']
sur = find_by_surname(t.upper(),date,False)
print('----')
print(sur)
word = regex.sub('', text2.iloc[30]['text']).split()
if len(word) >=20:
    print(segment_sur.segment(' '.join(word[0:20])))
    f = find_by_surname_2(' '.join(word[0:20]),sur, False)
else:
    print(segment_sur.segment(' '.join(word)))
    f = find_by_surname_2(' '.join(word),sur, False)
f

----
                                           persona  surname       name  \
97    http://dati.camera.it/ocd/persona.rdf/p10790  VILLANI  VITTORINO   
4751   http://dati.camera.it/ocd/persona.rdf/p6130    VILLA    RUGGERO   

     dataInizio   dataFine  carica  nomeOrganoGoverno  
97   1953-06-15 1958-06-11     NaN                NaN  
4751 1953-06-15 1958-06-11     NaN                NaN  
['villani', ' non vern questa p una sua ammissione implicita']
['villani', ' non vern questa p una sua ammissione implicita']


Unnamed: 0,A,persona,surname,name,dataInizio,dataFine,carica,nomeOrganoGoverno
97,,http://dati.camera.it/ocd/persona.rdf/p10790,VILLANI,VITTORINO,1953-06-15,1958-06-11,,


In [98]:
import segment2_sur
i = 404
print(text2.iloc[i]['text'])
print('----------------')
date = filtered_personnel[(filtered_personnel["dataInizio"] <= text2.iloc[i]['date']) & (filtered_personnel["dataFine"] >= text2.iloc[i]['date'])]
word = regex.sub('', text2.iloc[i]['text']).split()
if len(word) >=20:
    print(segment2_sur.segment(' '.join(word[0:20])))
    f = find_by_surname_2(' '.join(word[0:20]),date, False)
else:
    print(segment2_sur.segment(' '.join(word)))
    f = find_by_surname_2(' '.join(word),date, False)
f

RAFFAELE DELLA V\LLE. Sì, ma avrei potuto quanto meno accennare a cosa avrei voluto dire... (Commenti). Lei mi fa un processo a futura memoria! 
----------------
['raffa', 'ele della vlle s ma avrei potuto quanto meno accennare a cosa avrei voluto dire commenti lei mi fa un']
['raffa', 'ele della vlle s ma avrei potuto quanto meno accennare a cosa avrei voluto dire commenti lei mi fa un']


Unnamed: 0,persona,surname,name,dataInizio,dataFine,carica,nomeOrganoGoverno
9735,http://dati.camera.it/ocd/persona.rdf/p36280,DELLA VALLE,RAFFAELE,1994-04-02,1996-05-08,,
10120,http://dati.camera.it/ocd/persona.rdf/p38370,RAFFAELLI,PAOLO,1994-04-01,1996-05-08,,


In [38]:
text2.iloc[1]['text']

'« SCIORILLI BORRELLI » . « Il sottoscritto chiede d\'interrogare il ministro della pubblica istruzione, per sapere se sia ammissibile che il provveditore agl i studi di Milano, rispondendo alle istanze di un gruppo di insegnanti che chiedevano legittimamente il pagamento delle indennit à dovute per la partecipazione agli esami d i abilitazione tecnica commerciale della sessione autunnale 1959 presso l\'istituto Cattane o di Milano, abbia non soltanto rifiutato tale pagamento, ma abbia perfino invitato i professori richiedenti a " non inviare continui solleciti relativi ai pagamenti delle indennità suddette " (circolare del 15 maggio 1950 , n . 21751/147) . (13941) '

In [39]:
text2.iloc[1]['date']

'1960-09-07 00:00:00'

In [42]:
date = filtered_personnel[(filtered_personnel["dataInizio"] <= text2.iloc[1]['date']) & (filtered_personnel["dataFine"] >= text2.iloc[1]['date'])]
t = text2.iloc[1]['text']
sur = find_by_surname(t.upper(),date,False)
sur

Unnamed: 0,persona,surname,name,dataInizio,dataFine,carica,nomeOrganoGoverno
46,http://dati.camera.it/ocd/persona.rdf/p10330,SCIORILLI BORRELLI,RAFFAELE,1958-06-18,1963-05-15,,
348,http://dati.camera.it/ocd/persona.rdf/p11510,GITTI,SALVATORE ANGELO,1958-05-31,1963-05-15,,
5836,http://dati.camera.it/ocd/persona.rdf/p9980,RE,GIUSEPPINA,1958-06-06,1963-05-15,,
16072,http://dati.camera.it/ocd/persona.rdf/p300707,BO,GIORGIO,1958-06-12,1963-05-15,,
16531,http://dati.camera.it/ocd/persona.rdf/p300919,CONTI,ALFREDO,1958-06-02,1963-05-15,,


In [43]:
date = filtered_personnel[(filtered_personnel["dataInizio"] <= text2.iloc[1]['date']) & (filtered_personnel["dataFine"] >= text2.iloc[1]['date'])]
sur = find_by_surname(t.upper(),date,False)
print(sur)
if len(sur['surname'].value_counts()) > 1:
    word = regex.sub('', t.upper()).split()
    sur2 = pd.DataFrame({'A' : []})
    i = 0
    while(sur2.empty and i < 20 and i < len(word)):
        i+=1
        sur2 = find_by_surname(' '.join(word[0:i]),sur, False)
    sur = sur2
sur

                                             persona             surname  \
46      http://dati.camera.it/ocd/persona.rdf/p10330  SCIORILLI BORRELLI   
348     http://dati.camera.it/ocd/persona.rdf/p11510               GITTI   
5836     http://dati.camera.it/ocd/persona.rdf/p9980                  RE   
16072  http://dati.camera.it/ocd/persona.rdf/p300707                  BO   
16531  http://dati.camera.it/ocd/persona.rdf/p300919               CONTI   

                   name dataInizio   dataFine  carica  nomeOrganoGoverno  
46             RAFFAELE 1958-06-18 1963-05-15     NaN                NaN  
348    SALVATORE ANGELO 1958-05-31 1963-05-15     NaN                NaN  
5836         GIUSEPPINA 1958-06-06 1963-05-15     NaN                NaN  
16072           GIORGIO 1958-06-12 1963-05-15     NaN                NaN  
16531           ALFREDO 1958-06-02 1963-05-15     NaN                NaN  


Unnamed: 0,persona,surname,name,dataInizio,dataFine,carica,nomeOrganoGoverno
46,http://dati.camera.it/ocd/persona.rdf/p10330,SCIORILLI BORRELLI,RAFFAELE,1958-06-18,1963-05-15,,
5836,http://dati.camera.it/ocd/persona.rdf/p9980,RE,GIUSEPPINA,1958-06-06,1963-05-15,,
16072,http://dati.camera.it/ocd/persona.rdf/p300707,BO,GIORGIO,1958-06-12,1963-05-15,,


In [112]:
find_by_surname(' '.join(word[0:1]),sur,False)

Unnamed: 0,persona,surname,name,dataInizio,dataFine,carica,nomeOrganoGoverno
4798,http://dati.camera.it/ocd/persona.rdf/p6490,ARCAINI,GIUSEPPE,1953-06-16,1958-06-11,,
12335,http://dati.camera.it/ocd/persona.rdf/p6490,ARCAINI,GIUSEPPE,1954-02-11,1955-07-06,SOTTOSEGRETARIO DI STATO,MINISTERO DEL TESORO


## KL-divergence

In [None]:
# vocabulary - list of all words or can i use a list of names/surnames
def s(e,m,vocab):
    sum = 0
    for w in vocab:
        #pm = p(w|theta_m)
        #pe = p(w|theta_e)
        #sum += pm * log(pm/pe)
    sum = sum*(-1)
theta_m = 1
theta_e = 1

## Maximum entropy Markov model
aka Maximum entropy

aka maxent

(also in nltk)

In [18]:
split_df = np.array_split(text, 1000)

In [26]:
def evaluation(row):
    import pandas as pd
    import numpy as np
    import re
    import os
    import sys
    sys.path.insert(0, 'Desktop/Unimi/information retriveal/project/corpus (Alfio)/inforet-project/')
    #os.chdir('Desktop/Unimi/information retriveal/project/corpus (Alfio)/inforet-project/')
    import segment
    import segment_sur
    import segment2_sur
    from collections import Counter
    
    
    #l = len(text.index)
    res = []
    errors = []
    
    global regex
    regex = re.compile("[^a-zA-Z -']")
    

    #for index, row in text.iterrows():
    #DEBUG
    #if index%100==0:
    #clear_output(wait=True)
    #print('done '+str(index)+'/'+str(l)+'    '+str((index/l)*100)+'%')
    #DEBUG
    global phase
    try:
        person = find_person(row['text'], row['date'], filtered_personnel)
    except Exception as ex:
        #errors.append({'index':index, 'text':row['text'], 'phase':phase, 'ex':ex})
        errors.append({'text':row['text'], 'phase':phase, 'ex':ex})
        return pd.DataFrame.from_dict(res), pd.DataFrame.from_dict(errors)
        #continue
    predicted = filtered_personnel.loc[filtered_personnel['persona'] == person]
    if person is not None:
        pred_surname = predicted.iloc[0]['surname']
        pred_name = predicted.iloc[0]['name']
    else:
        pred_surname = None
        pred_name = None
    if row["persona"] == person:
        correct = True
    else:
        correct = False

    res.append({"True persona": row["persona"], "True name": row["name"], "True surname": row["surname"],
                "Text":row["text"],  "Correct":correct, "Predicted persona": person,
                "Predicted surname": pred_surname, 
                "Predicted name": pred_name})

    #clear_output(wait=True)
    #print('Done')
    return pd.DataFrame.from_dict(res), pd.DataFrame.from_dict(errors)

In [20]:
import ipyparallel as ipp
rc = ipp.Client()
#dv = rc[:]
#dv.push({'filtered_personnel':filtered_personnel})
rc.ids

[]

In [23]:
rc.ids

[0, 1, 2, 3, 4, 5, 6, 7]

dview = rc[:]
def par_init():
    import os
    import sys
    sys.path.insert(0, 'Desktop/Unimi/information retriveal/project/corpus (Alfio)/inforet-project/')
    os.chdir('Desktop/Unimi/information retriveal/project/corpus (Alfio)/inforet-project/')
dview.apply(par_init)
dview.wait()

In [24]:
rc[:].push(dict(
    words = words,
    P = P,
    correction = correction,
    candidates = candidates,
    known = known,
    edits1 = edits1,
    edits2 = edits2,
    filtered_personnel=filtered_personnel,
    find_person = find_person,
    find_by_surname = find_by_surname,
    find_by_surname_2 = find_by_surname_2,
    find_by_name = find_by_name,
    SURNAME = SURNAME,
    NAME = NAME,
    TEXT = TEXT,
    WORDS = WORDS
))
view = rc.load_balanced_view()

In [27]:
%%time

for i, df in enumerate(split_df[501:]):
    ar = view.map_async(evaluation, [row for index, row in df.iterrows()])
    ar.wait() # Wait until all tasks are done.
    #async_results.append(ar)
    results = pd.DataFrame({'A' : []})
    for res in ar:
        #if len(res[1])>0:
            #print(res[1])
        results = pd.concat([results, res[0]], ignore_index=True)
    results = results.drop(columns=['A'])
    file_name = './result2/res'+str(i+501)+'.csv'
    results.to_csv(file_name,index=False)
    clear_output(wait=True)
    print('Done: ' + str(i+501) + '/' + str(len(split_df)))

Done: 999/1000
CPU times: total: 1h 39s
Wall time: 8h 57min 51s
