In [111]:
import pandas as pd
import numpy as np
import re
# Input text file; it is opened and read line by line by the data-import step.
path = 'beau_monde_1908.txt'

In [2]:
# Import data
# The context manager closes the file handle even if reading raises,
# which the previous manual open()/close() pair did not guarantee.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used — confirm the file's encoding and pin it if the notebook must be portable.
with open(path, 'r') as r:
    raw_data = r.readlines()
raw_data = raw_data[23:] # Skipping intro of the directory

In [3]:
raw_data

['A\n',
 '\n',
 '\n',
 "Aage (cl’) voir Colmet (I'Aage.\n",
 '\n',
 '\n',
 'ABRADIE-d’ARRAST (Michel-Robert d’), 5S bis, rue\n',
 'Jouffroy (xvu*) — et d’Elhorriaga, S Saint-\n',
 'Jcan-de-Luz (Basses-Pyrénées).\n',
 '\n',
 '\n',
 'ABBATUCCI, née de FORESTA (M-), 103, rue La\n',
 'Boétie (vin*) [35 321.S7] — et fljj Rey de Foresta, SI\n',
 "[s ' â [A] Montmorency (Seine-et-Oise).\n",
 'ABBATUCCI (C“ Jacques), 103, rue La Boétie (mi‘)\n',
 '[35 521.57] — et même A . Sp-RR\n',
 '\n',
 '\n',
 'ABBATUCCI (C1""*), 32, rue Washington (vni\'j.\n',
 'ABEILLE (Adolphe),27, Faubourg-Saint-Honoré (vm*)\n',
 'et haras de Viroflay, CK! feâ Virollay [35 3)\n',
 '\n',
 '\n',
 '(Seine-et-Oise). RR-UA-T\n',
 '\n',
 '\n',
 'ABOILARI) Georges), U. #,[et M“" née BONUS], 46,\n',
 'avenue de Breteuil (vu*) [35 707.11] — et de\n',
 'Peyrieu, Kl \\p= eS Peyrieu (Ain). AC-M-YC\n',
 "ABOVILLE (V‘* H. d’), O.iK, Colonel du 131' de ligne\n",
 'à Orléans [et V*,“‘ née de GOUVELLO de KÉ-\n',
 'RIAVAL], à d® Kerentr

# Load into DataFrame

In [4]:
# Street-type abbreviations and the full word each one expands to.
_ADDRESS_ABBREVIATIONS = {
    "av.": "avenue",
    "boul.": "boulevard",
    "b.": "boulevard",
    "r.": "rue",
}
# An abbreviation is only expanded when no letter stands right before it, so
# a word that merely ends in "r." or "b." (e.g. "Alger.") is left untouched.
# "boul." is listed before "b." so the longer form always wins.
_ADDRESS_ABBREVIATION_RE = re.compile(r"(?<![^\W\d_])(?:av\.|boul\.|b\.|r\.)")


def cleanAddress(address):
    """Expand abbreviated street types and drop "¬" characters.

    Parameters
    ----------
    address : str
        Raw address text.

    Returns
    -------
    str
        The address with "av.", "boul.", "b." and "r." replaced by
        "avenue", "boulevard", "boulevard" and "rue" respectively, and
        every "¬" character removed. Matching is case-sensitive.
    """
    expanded = _ADDRESS_ABBREVIATION_RE.sub(
        lambda match: _ADDRESS_ABBREVIATIONS[match.group(0)], address
    )
    return expanded.replace("¬", "")

def getAddress(data):
    """Extract the address from the comma-separated fields of one entry.

    The address starts at the first field that is purely numeric (spaces
    ignored) or contains 'bis', and runs to the end of the entry. Newlines
    are turned into spaces and everything from the first "(" onwards is
    discarded before the result is passed through ``cleanAddress``.

    Parameters
    ----------
    data : list of str
        The fields of one directory entry.

    Returns
    -------
    str
        The cleaned address, or "unknown" when no field looks like the
        start of an address.
    """
    address = ""
    for i, field in enumerate(data):
        if field.replace(" ", "").isdigit() or 'bis' in field:
            address = "".join(part.replace("\n", " ") for part in data[i:])
            # Stop at the first match: continuing would append the tail of
            # the entry again for every later numeric field.
            break
    if len(address) < 1:
        return "unknown"
    return cleanAddress(address.split('(')[0])

In [5]:
# Group the raw lines into entries: consecutive non-blank lines form one
# entry, and each entry is stored as its comma-separated fields.
buffer = ""
all_people = []
for line in raw_data:
    if line == "\n":
        if buffer:
            all_people.append(buffer.split(','))
            buffer = ""
    else:
        buffer += line
# Flush the final entry: without this it is lost whenever the data does not
# end with a blank line.
if buffer:
    all_people.append(buffer.split(','))
    buffer = ""

In [6]:
# Keep only names and addresses:
# entries with a single field carry no address information and are skipped.
entries_with_fields = [entry for entry in all_people if len(entry) > 1]
names = [entry[0] for entry in entries_with_fields]
addresses = [getAddress(entry) for entry in entries_with_fields]

df_people = pd.DataFrame({"name": names, "address": addresses})

In [7]:
# Share of entries for which no address could be extracted
unknown = df_people[df_people["address"].eq("unknown")]
print(f"Unknown addresses: {len(unknown)/len(df_people)}")

Unknown addresses: 0.3064312736443884


In [8]:
# Drop unknown addresses (work on an independent copy of the remaining rows):
df_people_known = df_people.loc[df_people["address"].ne("unknown")].copy()
df_people_known

Unnamed: 0,name,address
0,ABRADIE-d’ARRAST (Michel-Robert d’),5S bis rue Jouffroy
1,ABBATUCCI,103 rue La Boétie
2,"ABBATUCCI (C1""""*)",32 rue Washington
3,ABOILARI) Georges),46 avenue de Breteuil
4,"ABOVILLE (B** Christian d’) [et B*»”"" née I1EN...",23 rue de Grenelle
...,...,...
11097,ZOGHEB (OGeorges de) [et C-- née SINANO],8 rue Alboni
11098,"ZOGRAPHOS (M. et M""“ Solon)",30 avenue Kléber
11099,Z0L0T0VITZ (Lubomir),103 boulevard Haussmann
11100,ZUYLEN de NYEVELT (R-- Étienne de) [et B*“* né...,70 avenue du Bois-de-Boulogne


In [9]:
#df_people_known.to_csv (r'./people.csv', index = None, header=True)

# Names cleaning

In [179]:
def flatten(array):
    """Concatenate the strings of ``array``, each followed by one space.

    The result therefore ends with a trailing space unless ``array`` is
    empty, in which case an empty string is returned.
    """
    return "".join(piece + " " for piece in array)

def get_family_name(str):
    """Extract the family name(s) from a raw name entry.

    The entry is split on spaces. Tokens longer than one character are kept
    when more than 60% of their characters are upper case, or when the token
    is exactly "de". When the token "née" is met, the remainder of the entry
    is processed recursively; if that yields a name, " et Mme née <name>" is
    appended and scanning stops.

    Finally "0" and "1" are replaced by "O" and "I", newlines are removed,
    and every character outside the allowed set is stripped.

    Parameters
    ----------
    str : str
        Raw name text of one entry.

    Returns
    -------
    str
        The extracted family name(s); empty when nothing qualifies.
    """
    tokens = str.split(" ")
    kept = []
    spouse_part = ""
    for position, token in enumerate(tokens):
        if len(token) < 2:
            continue
        uppercase_share = len([ch for ch in token if ch.isupper()]) / len(token)
        if token == "de" or uppercase_share > 0.6:
            kept.append(token)
        if token == "née":
            # Everything after "née" is searched for the maiden name.
            maiden_name = get_family_name(" ".join(tokens[position + 1:]))
            if maiden_name:
                spouse_part = " et Mme née " + maiden_name
                break

    combined = " ".join(kept) + spouse_part
    # Digit look-alikes become letters; newlines inside a token are dropped.
    combined = combined.translate({ord("0"): "O", ord("1"): "I", ord("\n"): None})
    return re.sub('[^a-zA-Z- éÉèÊêÈûÛüÜöÖ’]+', '', combined)

def get_first_name(str):
    """Extract the first name(s) from a raw name entry.

    Characters outside the allowed set are stripped first, then the text is
    split on spaces. A word is kept when it is longer than one character,
    starts with an upper-case letter, and more than 60% of its characters
    are lower case and more than 60% are alphabetic.

    Parameters
    ----------
    str : str
        Raw name text of one entry.

    Returns
    -------
    str
        The kept words joined by single spaces; empty when none qualifies.
    """
    def looks_like_first_name(candidate):
        size = len(candidate)
        if size < 2 or not candidate[0].isupper():
            return False
        lowercase = len([ch for ch in candidate if ch.islower()])
        alphabetic = len([ch for ch in candidate if ch.isalpha()])
        return lowercase / size > 0.6 and alphabetic / size > 0.6

    stripped = re.sub('[^a-zA-Z- éÉèÊêÈûÛüÜöÖ]+', '', str)
    return " ".join(
        candidate for candidate in stripped.split(" ")
        if looks_like_first_name(candidate)
    )

In [180]:
# Derive the two cleaned name columns from the raw "name" column; each
# assign() returns a new frame, so df_people_known is left untouched.
df_cleaned = (
    df_people_known
    .assign(**{"Last name": lambda frame: frame["name"].map(get_family_name)})
    .assign(**{"First name": lambda frame: frame["name"].map(get_first_name)})
)
df_cleaned

Unnamed: 0,name,address,Last name,First name
0,ABRADIE-d’ARRAST (Michel-Robert d’),5S bis rue Jouffroy,ABRADIE-d’ARRAST,Michel-Robert
1,ABBATUCCI,103 rue La Boétie,ABBATUCCI,
2,"ABBATUCCI (C1""""*)",32 rue Washington,ABBATUCCI,
3,ABOILARI) Georges),46 avenue de Breteuil,ABOILARI,Georges
4,"ABOVILLE (B** Christian d’) [et B*»”"" née I1EN...",23 rue de Grenelle,ABOVILLE et Mme née IIENNE-CART,Christian
...,...,...,...,...
11097,ZOGHEB (OGeorges de) [et C-- née SINANO],8 rue Alboni,ZOGHEB et Mme née SINANO,OGeorges
11098,"ZOGRAPHOS (M. et M""“ Solon)",30 avenue Kléber,ZOGRAPHOS,Solon
11099,Z0L0T0VITZ (Lubomir),103 boulevard Haussmann,ZOLOTOVITZ,Lubomir
11100,ZUYLEN de NYEVELT (R-- Étienne de) [et B*“* né...,70 avenue du Bois-de-Boulogne,ZUYLEN de NYEVELT de ROTHSCHILD,Étienne


In [181]:
# Reorder the columns so the cleaned names sit next to the raw name
df_cleaned = df_cleaned.loc[:, ["name", "Last name", "First name", "address"]]
df_cleaned

Unnamed: 0,name,Last name,First name,address
0,ABRADIE-d’ARRAST (Michel-Robert d’),ABRADIE-d’ARRAST,Michel-Robert,5S bis rue Jouffroy
1,ABBATUCCI,ABBATUCCI,,103 rue La Boétie
2,"ABBATUCCI (C1""""*)",ABBATUCCI,,32 rue Washington
3,ABOILARI) Georges),ABOILARI,Georges,46 avenue de Breteuil
4,"ABOVILLE (B** Christian d’) [et B*»”"" née I1EN...",ABOVILLE et Mme née IIENNE-CART,Christian,23 rue de Grenelle
...,...,...,...,...
11097,ZOGHEB (OGeorges de) [et C-- née SINANO],ZOGHEB et Mme née SINANO,OGeorges,8 rue Alboni
11098,"ZOGRAPHOS (M. et M""“ Solon)",ZOGRAPHOS,Solon,30 avenue Kléber
11099,Z0L0T0VITZ (Lubomir),ZOLOTOVITZ,Lubomir,103 boulevard Haussmann
11100,ZUYLEN de NYEVELT (R-- Étienne de) [et B*“* né...,ZUYLEN de NYEVELT de ROTHSCHILD,Étienne,70 avenue du Bois-de-Boulogne


In [182]:
# Persist the cleaned table without the index column, keeping the header row
df_cleaned.to_csv(
    r'./people.csv',
    index=None,
    header=True,
)

# CRF TEST to clean names

In [52]:
import random
from collections import Counter

from IPython.display import display

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer

import sklearn_crfsuite
from sklearn_crfsuite import metrics

import spacy

from utils.export import export_to_excel
from utils import tags_format
from utils.features import generate_features

# Seed both random number generators used in this notebook
random.seed(42)
np.random.seed(42)

# NOTE(review): an Italian pipeline is loaded although the entries shown in
# this notebook look French — confirm this is intended.
nlp = spacy.load('it_core_news_sm')

# Entity types to annotate, and the label set built from them:
# an I- and a B- variant per type, plus 'O'.
tags = ['FNAME', 'LNAME']
tags_prefixed = [f"{prefix}-{tag}" for tag in tags for prefix in 'IB'] + ['O']
print(tags_prefixed)

['I-FNAME', 'B-FNAME', 'I-LNAME', 'B-LNAME', 'O']


In [53]:
# Put the people in a dataframe: a single "text" column holding the raw
# name, indexed by the original row label under the index name "idx".
names = (
    df_people_known[["name"]]
    .rename(columns={"name": "text"})
    .rename_axis("idx")
)
names

Unnamed: 0_level_0,text
idx,Unnamed: 1_level_1
0,ABRADIE-d’ARRAST (Michel-Robert d’)
1,ABBATUCCI
2,"ABBATUCCI (C1""""*)"
3,ABOILARI) Georges)
4,"ABOVILLE (B** Christian d’) [et B*»”"" née I1EN..."
...,...
11097,ZOGHEB (OGeorges de) [et C-- née SINANO]
11098,"ZOGRAPHOS (M. et M""“ Solon)"
11099,Z0L0T0VITZ (Lubomir)
11100,ZUYLEN de NYEVELT (R-- Étienne de) [et B*“* né...


In [54]:
# Raphael's code to tokenize:
# Builds a long-format table with one row per (entry, token) pair.

# We tokenize the text into words
# (the 'parser', 'tagger' and 'ner' components are disabled for each call;
# every token is stored as a plain string)
df_tokens = names['text'].apply(lambda text: [
    str(token.text) for token in nlp(text, disable=['parser', 'tagger', 'ner'])])

# We stack the tokens to get a new token index
# (each token list is expanded into columns named 'tok_idx', then stacked so
# that 'tok_idx' becomes a second index level and the values form 'token')
df_tokens = df_tokens.apply(pd.Series).rename_axis('tok_idx', axis=1).stack().to_frame('token')

# Finally, we join with the previous dataframe to get the fulltext column
df_tokens = df_tokens.join(names)
# Keep the full text first, followed by the individual token
df_tokens = df_tokens[['text', 'token']]
display(df_tokens.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,text,token
idx,tok_idx,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,ABRADIE-d’ARRAST (Michel-Robert d’),ABRADIE
0,1,ABRADIE-d’ARRAST (Michel-Robert d’),-
0,2,ABRADIE-d’ARRAST (Michel-Robert d’),d’
0,3,ABRADIE-d’ARRAST (Michel-Robert d’),ARRAST
0,4,ABRADIE-d’ARRAST (Michel-Robert d’),(


In [55]:
# Export to excel: write the token table, together with the allowed tag
# labels, to the workbook used for annotation.
export_to_excel(
    df_tokens,
    './names_to_annotate.xlsx',
    tags=tags_prefixed,
)

In [77]:
# Reload
# Read the annotated workbook; its first two columns form the row index.
# NOTE(review): this reads './names_annotated.xlsx', not the
# './names_to_annotate.xlsx' file written by the export step — presumably the
# annotated copy is produced by hand in between; confirm.
df_annotations_small = pd.read_excel('./names_annotated.xlsx', index_col=[0,1])
# Normalise the tags to upper case
df_annotations_small['tag'] = df_annotations_small['tag'].str.upper()

# Boolean Series keyed by 'idx': True when every token of the entry has a tag
is_annotated = (df_annotations_small['tag'].fillna(False) # Fill all NaN with False
                .apply(lambda x: True if x else False) # Make filled tag True
                .reset_index().groupby('idx') # Transform into dataframe and group by entry
                .apply(lambda x: all(x['tag']))) # Check that all tags are filled
df_annotations_small = df_annotations_small.loc[is_annotated[is_annotated].index] # Only keep entries with all tags filled

# Collapse to one row per entry: the list of its tokens and the list of its tags
annotations_small = df_annotations_small.groupby(level=[0])[['token', 'tag']].agg(list).rename(columns={'token': 'tokens', 'tag': 'tags_iob2'})
# This expression is not the last one in the cell, so it displays nothing
annotations_small.head()

# Derive two more tag columns from the IOB2 tags with the project's tags_format helpers
annotations_small['tags_io'] = annotations_small['tags_iob2'].apply(tags_format.iob2_to_io)
annotations_small['tags_biluo'] = annotations_small['tags_iob2'].apply(tags_format.iob2_to_biluo)
annotations_small.head()

Unnamed: 0_level_0,tokens,tags_iob2,tags_io,tags_biluo
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[ABRADIE, -, d’, ARRAST, (, Michel, -, Robert,...","[B-LNAME, O, B-LNAME, I-LNAME, O, B-FNAME, O, ...","[I-LNAME, O, I-LNAME, I-LNAME, O, I-FNAME, O, ...","[U-LNAME, O, B-LNAME, L-LNAME, O, U-FNAME, O, ..."
1,[ABBATUCCI],[B-LNAME],[I-LNAME],[U-LNAME]
2,"[ABBATUCCI, (, C1, "", "", *, )]","[B-LNAME, O, O, O, O, O, O]","[I-LNAME, O, O, O, O, O, O]","[U-LNAME, O, O, O, O, O, O]"
3,"[ABOILARI, ), Georges, )]","[B-LNAME, O, B-FNAME, O]","[I-LNAME, O, I-FNAME, O]","[U-LNAME, O, U-FNAME, O]"
4,"[ABOVILLE, (, B, *, *, Christian, d, ’, ), [, ...","[B-LNAME, O, O, O, O, B-FNAME, O, O, O, O, O, ...","[I-LNAME, O, O, O, O, I-FNAME, O, O, O, O, O, ...","[U-LNAME, O, O, O, O, U-FNAME, O, O, O, O, O, ..."


In [78]:
# Split the annotated entries in half. No random_state is passed to the
# call, so the split is not pinned by this cell alone.
train_idx, test_idx = train_test_split(
    annotations_small.index,
    test_size=0.5,
)
print(f"There is {len(train_idx)} training samples and {len(test_idx)} test samples.")

There is 8 training samples and 8 test samples.


In [79]:

# Features for the current word: each key is a feature name, each value the
# callable applied to the word (None for the constant 'bias' entry).
default_features = {
    'bias': None,
    'word.lower': str.lower,
    'word.upper': str.upper,
    'word.alpha': str.isalpha,
}

# Features to be computed for surrounding words
# TODO add features
default_surrounding_features = {}

In [80]:
def generate_features_venetian(tokens):
    """Compute the features of ``tokens`` with this notebook's settings.

    Thin wrapper around ``generate_features`` that passes the notebook's
    ``default_features``, one surrounding word (``n_surrounding=1``) and
    ``default_surrounding_features``; both dictionaries are looked up at call
    time. Returns whatever ``generate_features`` returns.
    """
    settings = {
        'features': default_features,
        'n_surrounding': 1,
        'surrounding_features': default_surrounding_features,
    }
    return generate_features(tokens, **settings)

# Compute the features of every annotated entry from its token list
annotations_small['features'] = annotations_small['tokens'].map(generate_features_venetian)

In [81]:
tokens, features = annotations_small[['tokens', 'features']].iloc[0].values
# Show the first, third and last token of the first entry, each followed by
# its features, with a blank line between the groups.
for shown, position in enumerate((0, 2, -1)):
    if shown:
        print()
    print(tokens[position])
    print(features[position])

ABRADIE
['bias', 'word.lower=abradie', 'word.upper=ABRADIE', 'word.alpha=True', 'BOS']

d’
['bias', 'word.lower=d’', 'word.upper=D’', 'word.alpha=False']

)
['bias', 'word.lower=)', 'word.upper=)', 'word.alpha=False', 'EOS']


In [82]:
# TODO you can change the encoding scheme to see if it performs differently
tags_col = 'tags_io'

# Select the rows of each split, then pull out the feature and tag arrays
train, test = annotations_small.loc[train_idx], annotations_small.loc[test_idx]

X_train, y_train = train['features'].values, train[tags_col].values
X_test, y_test = test['features'].values, test[tags_col].values

In [83]:
# Model settings gathered in one place, then the model is fitted on the
# training split.
crf_settings = dict(
    algorithm='lbfgs',
    c1=3e-1,
    c2=1e-4,
    max_iterations=1000,
    all_possible_transitions=True,
)
crf = sklearn_crfsuite.CRF(**crf_settings)
crf.fit(X_train, y_train);

In [84]:
# Evaluate on every learned label except 'O'. Filtering, rather than calling
# list.remove, avoids a ValueError when the training split contains no 'O' tag.
labels = [label for label in crf.classes_ if label != 'O']

y_pred = crf.predict(X_test)
print(f"Weighted f1_score {metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels):.3f}")
# group B and I results: order by the text after the first character, then
# by the first character itself
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

Weighted f1_score 0.417
              precision    recall  f1-score   support

     I-FNAME      0.000     0.000     0.000         4
     I-LNAME      0.875     0.350     0.500        20

   micro avg      0.875     0.292     0.438        24
   macro avg      0.438     0.175     0.250        24
weighted avg      0.729     0.292     0.417        24



In [76]:
def print_state_features(state_features):
    """Print one line per state feature: weight, label, attribute.

    ``state_features`` is an iterable of ``((attr, label), weight)`` pairs.
    The weight is printed with six decimals and the label is left-justified
    in a field of eight characters.
    """
    for key, weight in state_features:
        attr, label = key
        print(f"{weight:.6f} {label!s:<8} {attr!s}")
        
# Rank all state features once, from the largest weight to the smallest
ranked_state_features = Counter(crf.state_features_).most_common()

print("Top positive:")
print_state_features(ranked_state_features[:10])

# The ten smallest weights, printed smallest first
print("\nTop negative:")
print_state_features(ranked_state_features[-10:][::-1])

Top positive:
3.736425 O        word.alpha=False
3.298295 I-LNAME  BOS
2.072419 O        word.lower=de
2.072419 O        word.upper=DE
1.698029 O        word.lower=d
1.698029 O        word.upper=D
1.602541 I-LNAME  word.lower=d’
1.602541 I-LNAME  word.upper=D’
1.325730 O        word.lower=née
1.325730 O        word.upper=NÉE

Top negative:
-0.495128 I-FNAME  bias
0.008198 I-LNAME  word.upper=ARRAST
0.008198 I-LNAME  word.lower=arrast
0.190428 I-LNAME  word.upper=O
0.190428 I-LNAME  word.lower=o
0.205255 O        word.upper=)
0.205255 O        word.lower=)
0.414878 O        word.upper=-
0.414878 O        word.lower=-
0.497918 O        word.upper=(


# Trials

In [144]:
# Exact strings recognised as address keywords
addr = [
    'avenue', 'av.',
    'rue', 'r.',
    'boulevard', 'b.',
    'mêmes adresses', 'même adresse',
    'villa', 'place',
]


def is_address(str):
    """Return True when ``str`` is exactly one of the entries of ``addr``.

    The comparison is case-sensitive and matches whole strings only.
    """
    return addr.count(str) > 0