The zip file contains a large json file structured as follows. There is a 'segments' key that contains a list of speeches from members of parliament from 1948 to 2020. There is a lot of meta-data associated with each speech.. For the project, all you need is the field 'text'. I suggest you split the file into multiple files one for each legislature. Consider only the speeches with the field 'score' greater than 2.5 (the other speeches are meaningless).

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_json('speeches.json',orient='records')

In [3]:
df.iloc[0].to_dict()['segments']

{'segment': 0,
 'tag': 'RESOCONTO',
 'start': 'RESOCONTO STENOGRAFICO',
 'president': 0.128916308276516,
 'page': 0.032191545690914,
 'len': 2,
 'score': 1.484956310433453,
 'year': 2015,
 'month': 6,
 'day': 8,
 'leg': 17,
 'id': 370,
 'convocation': 25231,
 'date': '2015-06-08 00:00:00',
 'title': 'Seduta di lunedì 8 giugno 2015',
 'text': 'RESOCONTO STENOGRAFICO 437. ',
 'presidenza': False,
 'speech': False}

{"segment": 0, "tag": "RESOCONTO", "start": "RESOCONTO STENOGRAFICO", "president": 0.12891630827651634, "page": 0.032191545690914675, "len": 2, "score": 1.4849563104334538, "year": 2015, "month": 6, "day": 8, "leg": 17, "id": 370, "convocation": 25231, "date": "2015-06-08 00:00:00", "title": "Seduta di luned\u00ec 8 giugno 2015", "text": "RESOCONTO STENOGRAFICO 437. ", "presidenza": false, "speech": false},

In [4]:
from IPython.display import clear_output

data = []
l = len(df.index)
for index in range(l):
    info = df.iloc[index].to_dict()['segments']
    data.append(info)
    if index%1000==0:
        clear_output(wait=True)
        print('done '+str(index)+'/'+str(l)+'    '+str((index/l)*100)+'%')

clear_output(wait=False)
text = pd.DataFrame.from_dict(data)
text = text[text['score']>2.5]

In [5]:
text.iloc[121235]

segment                                                        352
tag                                                      FRANCESCO
start                               FRANCESCO MONACO. La prima do-
president                                                 0.055345
page                                                       0.06436
len                                                             20
score                                                     4.471383
year                                                          2000
month                                                            9
day                                                             26
leg                                                             13
id                                                              33
convocation                                                  22923
date                                           2000-09-26 00:00:00
title                   Seduta n. 777 di martedì 26 settembre 

In [6]:
text = text[['text','persona','date','surname','name']]

## Info about deputy (Legislator) and ministers

In [7]:
# load info about deputy
df1 = pd.read_csv('parlamentari0x.csv')
df2 = pd.read_csv('parlamentari1x.csv')
dfp = pd.concat([df1, df2], ignore_index=True)
# transform date from int to date format
dfp['inizioMandato'] = pd.to_datetime(dfp['inizioMandato'], format="%Y%m%d")
dfp['fineMandato'] = dfp['fineMandato'].fillna(20990101)  # if person is currently active use placeholder as end date 
dfp['fineMandato'] = pd.to_datetime(dfp['fineMandato'], format="%Y%m%d")
# loan info about senators
df1 = pd.read_csv('senato0x.csv')
df2 = pd.read_csv('senato1x.csv')
dfs = pd.concat([df1, df2], ignore_index=True)
dfs['inizioMandato'] = pd.to_datetime(dfs['inizioMandato'], format="%Y%m%d")
dfs['fineMandato'] = dfs['fineMandato'].fillna(20990101)  # if person is currently active use placeholder as end date 
dfs['fineMandato'] = pd.to_datetime(dfs['fineMandato'], format="%Y%m%d")
# load info about ministers
df1 = pd.read_csv('ministri0x.csv')
df2 = pd.read_csv('ministri1x.csv')
dfm = pd.concat([df1, df2], ignore_index=True)
# rename some colums (to be abble to concat)
dfm = dfm.rename(columns={"d": "persona"})
dfp = dfp.rename(columns={"inizioMandato": "dataInizio", "fineMandato": "dataFine"})
dfs = dfs.rename(columns={"inizioMandato": "dataInizio", "fineMandato": "dataFine"})
# replace ministers link to it's personal link (have same numerical number, need only to change prefix and remove '_xx')
for i, d in dfm.iterrows():
    m = re.search('deputato.rdf/d(.+?)_', d['persona'])
    dfm.at[i,'persona'] = 'http://dati.camera.it/ocd/persona.rdf/p'+m.group(1)
# transform date from int to date format   
dfm['dataInizio'] = pd.to_datetime(dfm['dataInizio'], format="%Y%m%d")
dfm['dataFine'] = dfm['dataFine'].fillna(20990101)
dfm['dataFine'] = pd.to_datetime(dfm['dataFine'], format="%Y%m%d")
# need only some colums
dfp = dfp[['persona','cognome','nome','dataInizio','dataFine']]
dfm = dfm[['persona','cognome','nome','dataInizio','dataFine','carica','nomeOrganoGoverno']]
# final concat into one df
filtered_personnel = pd.concat([dfp, dfm, dfs], ignore_index=True)
filtered_personnel = filtered_personnel.rename(columns={"cognome": "surname", "nome": "name"})
filtered_personnel =  filtered_personnel[['persona','surname','name','dataInizio','dataFine','carica','nomeOrganoGoverno']]
filtered_personnel

Unnamed: 0,persona,surname,name,dataInizio,dataFine,carica,nomeOrganoGoverno
0,http://dati.camera.it/ocd/persona.rdf/p10000,RESCIGNO,MATTEO,1948-04-27,1953-06-24,,
1,http://dati.camera.it/ocd/persona.rdf/p1000,BONOMI,PAOLO,1948-04-27,1953-06-24,,
2,http://dati.camera.it/ocd/persona.rdf/p1000,BONOMI,PAOLO,1953-06-15,1958-06-11,,
3,http://dati.camera.it/ocd/persona.rdf/p1000,BONOMI,PAOLO,1958-06-03,1963-05-15,,
4,http://dati.camera.it/ocd/persona.rdf/p10010,RICCI,MARIO,1948-04-23,1953-06-24,,
...,...,...,...,...,...,...,...
21016,http://dati.camera.it/ocd/persona.rdf/p306702,TOSATO,PAOLO,2014-07-02,2099-01-01,,
21017,http://dati.camera.it/ocd/persona.rdf/p306900,BOCCARDI,MICHELE,2015-09-08,2099-01-01,,
21018,http://dati.camera.it/ocd/persona.rdf/p307896,SEGRE,LILIANA,2018-01-19,2099-01-01,,
21019,http://dati.camera.it/ocd/persona.rdf/p308320,ALESSANDRINI,VALERIA,2020-03-12,2099-01-01,,


In [8]:
text2 = text[['text','persona','date','surname','name']].sample(1000)
text2 = text2.reset_index(drop=True)

In [9]:
filtered_personnel.loc[filtered_personnel['surname'] == "NARO"]

Unnamed: 0,persona,surname,name,dataInizio,dataFine,carica,nomeOrganoGoverno
6865,http://dati.camera.it/ocd/persona.rdf/p300260,NARO,GIUSEPPE,2001-05-21,2006-04-27,,
7046,http://dati.camera.it/ocd/persona.rdf/p300260,NARO,GIUSEPPE,2008-04-23,2013-03-14,,
19294,http://dati.camera.it/ocd/persona.rdf/p300260,NARO,GIUSEPPE,2006-04-21,2008-04-28,,


## search with spelling correction

correction with http://norvig.com/spell-correct.htmlhttp://norvig.com/spell-correct.html

### Other find_surmane
(check if candidate is substring of a text)

In [10]:
from collections import Counter
def words(text): 
    text = text.upper().replace("À","A'").replace("È","E'").replace("É","E'").replace("Ì","I'").replace("Í","I'").replace("Ò","O'").replace("Ó","O'").replace("Ù","U'").replace("Ú","U'").replace("Ü","U'")
    return re.findall(r"[A-Z'-]+", text.upper())
SURNAME = Counter(words(' '.join(filtered_personnel['surname'])))
print("SURNAME done")
NAME = Counter(words(' '.join(filtered_personnel['name'])))
print("NAME done")
t = ' '.join(text['text'])

#t = t.replace("À","A'")
#t = t.replace("È","E'")
#t = t.replace("È","E'")
#t = t.replace("É","E'")
#t = t.replace("Ì","I'")
#t = t.replace("Ò","O'")
#t = t.replace("Ó","O'")
#t = t.replace("Ù","U'")
TEXT = Counter(words(t))
print("TEXT done")
WORDS = SURNAME

SURNAME done
NAME done
TEXT done


In [11]:
def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = "abcdefghijklmnopqrstuvwxyz'".upper()
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [12]:
correction('R0SSI')

'ROSSI'

In [13]:
with open("one-grams.txt", 'w') as f:
    c = TEXT+SURNAME+NAME
    for k,v in  c.most_common():
        f.write( "{}\t{}\n".format(k.lower(),v) )
with open("surname-one-grams.txt", 'w') as f:
    for k,v in  SURNAME.most_common():
        f.write( "{}\t{}\n".format(k.lower(),v) )
s = filtered_personnel["surname"].value_counts()
with open("surname2-one-grams.txt", 'w') as f:
    for k,v in s.iteritems():
        k = k.replace("À","A'").replace("È","E'").replace("É","E'").replace("Ì","I'").replace("Í","I'").replace("Ò","O'").replace("Ó","O'").replace("Ù","U'").replace("Ú","U'").replace("Ü","U'")
        string = k.lower() + "\t\t" + str(v) + "\n"
        f.write(string)

In [14]:
import segment
import segment_sur
import segment2_sur

In [15]:
def find_by_surname(text,personnel, corr):
    data = []
    for i, s in personnel.iterrows():
        if s['surname'] in text:
            data.append(s)
    sur = pd.DataFrame(data)
    #if len(pd.unique(sur["surname"])) > 1 and len(words) > 1: # if more then one surename in result, reduce up to first in text
        #len(words) just as hedge against error
    #    sur = find_by_surname(words[0:-1],sur, False)
    if len(sur.index) == 0 and not corr:
        WORDS = SURNAME+TEXT
        word = regex.sub('', text).split()
        for i, w in enumerate(word):
            word[i] = correction(w)
        sur = find_by_surname(' '.join(word),personnel,True)
    return sur

def find_by_surname_2(text,personnel, corr):
    data = []
    seg = segment2_sur.segment(text)
    sur = pd.DataFrame({'A' : []})
    #print(seg)
    for s in seg:
        p = personnel[(personnel["surname"] == s.upper())]
        sur = pd.concat([sur,p])
    #if len(pd.unique(sur["surname"])) > 1 and len(words) > 1: # if more then one surename in result, reduce up to first in text
        #len(words) just as hedge against error
    #    sur = find_by_surname(words[0:-1],sur, False)
    if len(sur.index) == 0 and not corr:
        WORDS = SURNAME+TEXT
        word = regex.sub('', text).split()
        for i, w in enumerate(word):
            word[i] = correction(w)
        sur = find_by_surname_2(' '.join(word),personnel,True)
    return sur

def find_by_name(text,personnel,corr):
    data = []
    for i, s in personnel.iterrows():
        if s['name'] in text:
            data.append(s)
    sur = pd.DataFrame(data)
    #if len(pd.unique(sur["surname"])) > 1 and len(words) > 1: # if more then one surename in result, reduce up to first in text
        #len(words) just as hedge against error
    #    sur = find_by_surname(words[0:-1],sur, False)
    #if len(sur.index) == 0 and not corr:
    #    WORDS = NAME+TEXT
    #    word = regex.sub('', text).split()
    #    for i, w in enumerate(word):
    #        word[i] = correction(w)
    #    sur = find_by_surname(' '.join(word),personnel,True)
    return sur

In [16]:
def find_person(text, date, personnel):
    global phase
    global l
    phase = "char_replace"
    text = text.replace("À","A'")
    text = text.replace("È","E'")
    text = text.replace("É","E'")
    text = text.replace("Ì","I'")
    text = text.replace("Ò","O'")
    text = text.replace("Ó","O'")
    text = text.replace("Ù","U'")
    
    # take only up to 20 first words
    word = regex.sub('', text).split()
    #average = sum(len(w) for w in word) / len(word)
    #if average < 4:
    #    if len(word) >= 60: #avarage 6-8 letters in italian word (6*20 = 120), but segment have limit of 60
    #        seg = segment.segment(''.join(word[0:60]))
    #    else:
    #        seg = segment.segment(''.join(word))
    #    text = ' '.join(seg)
    #elif len(word) >= 20:
    text = ' '.join(word[0:20])
    #print(text)
    phase = "date_search"
    date = personnel[(personnel["dataInizio"] <= date) & (personnel["dataFine"] >= date)]
    if len(date.index) >= 1:
        phase = "sur_search"
        sur = find_by_surname_2(text.upper(), date, False)
        #print(sur)
        if len(sur.index) == 0:
            return None
        elif (len(sur.index) == 1) or (len(pd.unique(sur['persona'])) == 1):
            return sur.iloc[0]['persona']
        else:
            phase = "name_search"
            name = find_by_name(text.upper(), sur, False)
            #print('-------------------')
            #print(name)
            if len(name.index) == 1 or (len(name.index) > 1 and len(pd.unique(name['persona']))==1):
                return name.iloc[0]['persona']
            elif (len(name.index) > 1 and len(name['surname'].value_counts()) > 1):
                word = regex.sub('', text.upper()).split()
                sur2 = pd.DataFrame({'A' : []})
                i = 0
                while(sur2.empty and (i < 20 and i < len(word))): #limit of frase up to 20 words
                    sur2 = find_by_surname(' '.join(word[0:i]),name,False) # check what surname is appering first in text 
                    i+=1
                sur = sur2
                #print("----------")
                #print(sur)
                if len(sur.index) == 0:
                    return None
                elif (len(sur.index) == 1) or (len(pd.unique(sur['persona'])) == 1):
                    return sur.iloc[0]['persona']
                else: return None
            else:
                word = regex.sub('', text.upper()).split()
                sur2 = pd.DataFrame({'A' : []})
                i = 0
                while(sur2.empty and (i < 20 and i < len(word))): #limit of frase up to 20 words
                    sur2 = find_by_surname(' '.join(word[0:i]),sur,False) # check what surname is appering first in text 
                    i+=1
                sur = sur2
                #print("----------")
                #print(sur)
                if len(sur.index) == 0:
                    return None
                elif (len(sur.index) == 1) or (len(pd.unique(sur['persona'])) == 1):
                    return sur.iloc[0]['persona']
                else: return None
    return None #No person found with that surename with correction

In [18]:
l = len(text2.index)
res = []

regex = re.compile("[^a-zA-Z -']")

for index, row in text2.iterrows():
    #DEBUG
    #if index%100==0:
    clear_output(wait=True)
    print('done '+str(index)+'/'+str(l)+'    '+str((index/l)*100)+'%')
    #DEBUG
    global phase
    phase = "search"
    person = find_person(row['text'], row['date'], filtered_personnel)
    phase = "take person data"
    predicted = filtered_personnel.loc[filtered_personnel['persona'] == person]
    if person is not None:
        pred_surname = predicted.iloc[0]['surname']
        pred_name = predicted.iloc[0]['name']
    else:
        pred_surname = None
        pred_name = None
    phase = "check if correct"
    if row["persona"] == person:
        correct = True
    else:
        correct = False
    phase = "append"
    
    res.append({"True persona": row["persona"], "True name": row["name"], "True surname": row["surname"],
                "Text":row["text"],  "Correct":correct, "Predicted persona": person,
                "Predicted surname": pred_surname, 
                "Predicted name": pred_name})
            
clear_output(wait=True)
print('done '+str(index)+'/'+str(l)+'    '+str(((index)/l)*100)+'%')
result = pd.DataFrame.from_dict(res)
result['Correct'].describe()

done 999/1000    99.9%


count     1000
unique       2
top       True
freq       654
Name: Correct, dtype: object

In [18]:
#import importlib
#importlib.reload(segment2_sur)

In [19]:
pd.set_option("display.max_rows", None)
result[result['Correct'] == False]

Unnamed: 0,True persona,True name,True surname,Text,Correct,Predicted persona,Predicted surname,Predicted name
1,http://dati.camera.it/ocd/persona.rdf/p50373,STEFANO,STEFANI,"STEFANO CUSUMANO, ribadita la netta contrariet...",False,,,
17,http://dati.camera.it/ocd/persona.rdf/p50353,FILIPPO,MANCUSO,FILIPPO MANCUSO ritiene che il disposto normat...,False,,,
20,http://dati.camera.it/ocd/persona.rdf/p38440,EUGENIO,RICCIO,"EUGENIO RICCIO. Signor Presidente, signor sott...",False,,,
23,http://dati.camera.it/ocd/persona.rdf/p17300,VITTORINO,CARRA,"VITTORINO, BIANCHI FORTUNATO ,",False,http://dati.camera.it/ocd/persona.rdf/p830,BIANCHI,FORTUNATO
26,http://dati.camera.it/ocd/persona.rdf/p307424,FEDERICO,MOLLICONE,"FEDERICO FORNARO (LEU). No, spiace perché…",False,http://dati.camera.it/ocd/persona.rdf/p306398,FORNARO,FEDERICO
27,http://dati.camera.it/ocd/persona.rdf/p15580,VINCENZO,GATTO,GATTO. So come sia difficile per me trasferirm...,False,,,
29,http://dati.camera.it/ocd/persona.rdf/p9830,TULLIO,PIETROBONO,TROBONO. — Al Presidente del Consiglio de i mi...,False,,,
30,http://dati.camera.it/ocd/persona.rdf/p9980,GIUSEPPINA,RE,"RE GIUSEPPINA, Segretario, legge . (V. stampat...",False,,,
33,http://dati.camera.it/ocd/persona.rdf/p305528,IVAN,CATALANO,IVAN CATALANO. Grazie Presidente. Il collega Q...,False,,,
34,http://dati.camera.it/ocd/persona.rdf/p50024,LUIGI,MASSA,LUIGI MASSA dichiara il voto convintamente fav...,False,,,


In [None]:
find_person(text2.iloc[25]['text'], text2.iloc[25]['date'], filtered_personnel)

In [None]:
import importlib
importlib.reload(segment2_sur)
i = 25
word = regex.sub('', text2.iloc[i]['text']).split()
#for w in word:
segment2_sur.segmentWithProb(' '.join(word[0:20]))

In [None]:
import segment2_sur
i = 404
print(text2.iloc[i]['text'])
print('----------------')
date = filtered_personnel[(filtered_personnel["dataInizio"] <= text2.iloc[i]['date']) & (filtered_personnel["dataFine"] >= text2.iloc[i]['date'])]
word = regex.sub('', text2.iloc[i]['text']).split()
if len(word) >=20:
    print(segment2_sur.segment(' '.join(word[0:20])))
    f = find_by_surname_2(' '.join(word[0:20]),date, False)
else:
    print(segment2_sur.segment(' '.join(word)))
    f = find_by_surname_2(' '.join(word),date, False)
f

In [20]:
text2.iloc[1]['text']

'STEFANO CUSUMANO, ribadita la netta contrarietà alla dottrina della guerra preventiva, ritiene che l’attuale situazione irachena non consenta il ritiro del contingente militare italiano impegnato in delicate operazioni; preannunzia quindi l’astensione sulla mozione Violante n. 401 ed il voto contrario sulla mozione Elio Vito n. 402. '

In [None]:
text2.iloc[1]['date']

In [None]:
date = filtered_personnel[(filtered_personnel["dataInizio"] <= text2.iloc[1]['date']) & (filtered_personnel["dataFine"] >= text2.iloc[1]['date'])]
t = text2.iloc[1]['text']
sur = find_by_surname(t.upper(),date,False)
sur

In [None]:
date = filtered_personnel[(filtered_personnel["dataInizio"] <= text2.iloc[1]['date']) & (filtered_personnel["dataFine"] >= text2.iloc[1]['date'])]
sur = find_by_surname(t.upper(),date,False)
print(sur)
if len(sur['surname'].value_counts()) > 1:
    word = regex.sub('', t.upper()).split()
    sur2 = pd.DataFrame({'A' : []})
    i = 0
    while(sur2.empty and i < 20 and i < len(word)):
        i+=1
        sur2 = find_by_surname(' '.join(word[0:i]),sur, False)
    sur = sur2
sur

In [None]:
find_by_surname(' '.join(word[0:1]),sur,False)

## Maximum entropy Markov model
aka Maximum entropy

aka maxent

(also in nltk)
nltk hmm

In [32]:
import random
dataset = []
for index, row in text.iterrows():
    if len(row['text']) == 0:
        continue
    s = row['surname'] + " " + row['name']
    dataset.append((row['text'], s))
random.shuffle(dataset)

In [20]:
import nltk
from nltk.tag import hmm
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(trainset)

NameError: name 'trainset' is not defined

In [30]:
def merge(customerreview, reviewrating): 
    merged_list = []
    for i in range(0, len(customerreview)):
        if len(customerreview) == 0: continue
    merged_list.append((customerreview[i], reviewrating[i]))
    return merged_list 

dataset = (merge(text['text'].tolist(), (text['surname'] + ' ' + text['name']).tolist())) 

In [33]:
round(len(dataset)*0.7)

837904

In [35]:
random.shuffle(dataset)
trainset = dataset[:round(len(dataset)*0.7)]
testset = dataset[round(len(dataset)*0.7):]

In [36]:
testset[0][0]

'SCALIA F F F F F F '

In [None]:
tagger.tag(testset[0][0])

[('B', 'U'),
 ('E', 'U'),
 ('N', 'U'),
 ('E', 'U'),
 ('D', 'U'),
 ('E', 'U'),
 ('T', 'U'),
 ('T', 'U'),
 ('O', 'L'),
 (' ', 'L'),
 ('D', 'L'),
 ('E', 'L'),
 ('L', 'L'),
 ('L', 'L'),
 ('A', 'L'),
 (' ', 'L'),
 ('V', 'L'),
 ('E', 'L'),
 ('D', 'L'),
 ('O', 'L'),
 ('V', 'L'),
 ('A', 'L'),
 (',', 'L'),
 (' ', 'L'),
 ('S', 'L'),
 ('o', 'L'),
 ('t', 'L'),
 ('t', 'L'),
 ('o', 'L'),
 ('s', 'L'),
 ('e', 'L'),
 ('g', 'L'),
 ('r', 'L'),
 ('e', 'L'),
 ('t', 'L'),
 ('a', 'L'),
 ('r', 'L'),
 ('i', 'L'),
 ('o', 'L'),
 (' ', 'L'),
 ('d', 'L'),
 ('i', 'L'),
 (' ', 'L'),
 ('S', 'L'),
 ('t', 'L'),
 ('a', 'L'),
 ('t', 'L'),
 ('o', 'L'),
 (' ', 'L'),
 ('p', 'L'),
 ('e', 'L'),
 ('r', 'L'),
 (' ', 'L'),
 ('g', 'L'),
 ('l', 'L'),
 ('i', 'L'),
 (' ', 'L'),
 ('a', 'L'),
 ('f', 'L'),
 ('f', 'L'),
 ('a', 'L'),
 ('r', 'L'),
 ('i', 'L'),
 (' ', 'L'),
 ('e', 'L'),
 ('s', 'L'),
 ('t', 'L'),
 ('e', 'L'),
 ('r', 'L'),
 ('i', 'L'),
 (' ', 'L'),
 ('e', 'L'),
 (' ', 'L'),
 ('l', 'L'),
 ('a', 'L'),
 (' ', 'L'),
 ('c', 'L'),

In [47]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Anton\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


ORGANIZATION BENEDETTO
ORGANIZATION DELLA
GPE Sottosegretario
PERSON Stato
PERSON Signor
PERSON Valente
GPE Ventimiglia
GPE Chiede
GPE Italia
GPE Si
GPE La
GPE Ventimiglia
GPE Italia
PERSON Francia
PERSON Paesi
GPE Europa
GPE Ventimiglia
GPE Imperia
GPE Nel
ORGANIZATION No Border
GPE Italia
PERSON Francia
GSP Con
GPE Ventimiglia
GPE La
GPE Nel
ORGANIZATION Contestualmente
PERSON Rete Ferroviaria Italiana
GPE Ciò
PERSON Sant
GPE Ventimiglia
PERSON Gianchette
GPE Per
PERSON Croce Rossa Italiana
PERSON Caritas


In [56]:
i = 3
for sent in nltk.sent_tokenize(testset[i][0]):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
        if hasattr(chunk, 'label'):
            print(chunk.label(), ' '.join(c[0] for c in chunk))
print('----------------')
print(testset[i][1])

ORGANIZATION BRUNO
GPE SOLAROLI
GPE Sottosegretario
PERSON Stato
ORGANIZATION Contento
----------------
SOLAROLI BRUNO


In [57]:
testset[i][0]

'BRUNO SOLAROLI, Sottosegretario di Stato per il tesoro, il bilancio e la programmazione economica. Il Governo concorda con il parere espresso dal relatore per la maggioranza e deve fare una osservazione per quanto riguarda gli identici emendamenti Contento 75.3, Possa 75.4 e Pezzoli 75.5, il cui parere subordina alla condizione che al comma 1, tabella 1, voce: « Legge n. 194 del 1998, articolo 2, comma 6: Trasporti ... » rimanga l’indicazione di 50 miliardi come previsto e poi si aggiunga: « Legge n. 194 del 1998, articolo 2, comma 5: » con l’indicazione di 30 miliardi, che è la cifra già indicata sotto. È chiaro ? '

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
sw = stopwords.words('italian')

#corpus = text['text'].tolist()

tokens = []

for i in range(len(dataset)):
    t = dataset[i][0]
    tokens.append([x for x in word_tokenize(t) if x not in sw and x not in string.punctuation])
    clear_output(wait=True)
    print('done '+str(i)+'/'+str(len(dataset))+'    '+str(((i)/len(dataset))*100)+'%')

#tokens = [[x for x in word_tokenize(t) if x not in sw and x not in string.punctuation] for t in corpus]
tokens

done 349459/1197006    29.19442341976565%


In [22]:
def build_vocab(token_batches, min_count=15) -> dict:
    '''Build vocabulary and filter rare words.
    
    Args:
        token_batches(list): Token batches.
        min_count(int, optional, default=15): Minimum frequency threshold.
    Returns:
        vocab(dict): Dictionary containing token and frequencies.
    '''
    
    counts = Counter([token for tokens in token_batches for token in tokens])
    vocab = list(counts.keys())
    vocab = {k:v for k, v in counts.items() if v > min_count} # Filtering
    return vocab

vocab = build_vocab(tokens, min_count=10)
vocab_size = len(vocab)

In [23]:
def mapping(vocab: list) -> tuple:
    '''
    Generate mapping dictionaries.
    
    Args:
        vocab(dict): Dictionary containing token and frequencies.
    Returns:
        token_to_id(dict): Mapping token -> id. 
        id_to_token(dict): Mapping id -> token. 
    '''
    token_to_id = dict()
    id_to_token = dict()

    for i, token in enumerate(vocab):
        token_to_id[token] = i
        id_to_token[i] = token

    return token_to_id, id_to_token

token_to_id, id_to_token = mapping(vocab)

In [None]:
def softmax(x):
    x = x.astype(float)
    return np.exp(x) / np.sum(np.exp(x), axis=0)

In [None]:
from gensim.models import Word2Vec
import nltk
sentences = []
for doc in corpus:
    for s in nltk.tokenize.sent_tokenize(doc):
        sentences.append(s.lower())
tokens = [[x for x in word_tokenize(t) if x not in sw and x not in 
           string.punctuation] for t in sentences]

In [None]:
model = Word2Vec(sentences=tokens, vector_size=100, 
                 window=6, epochs=20, min_count=1, workers=4)

In [None]:
model.wv.similar_by_word('SOLAROLI')