In [73]:
import pandas as pd
import numpy as np
path = 'beau_monde_1908.txt'

In [74]:
# Import data
r = open(path, 'r')
raw_data = r.readlines()
r.close()
raw_data = raw_data[23:] # Skipping intro of the directory

In [75]:
raw_data

['A\n',
 '\n',
 '\n',
 "Aage (cl’) voir Colmet (I'Aage.\n",
 '\n',
 '\n',
 'ABRADIE-d’ARRAST (Michel-Robert d’), 5S bis, rue\n',
 'Jouffroy (xvu*) — et d’Elhorriaga, S Saint-\n',
 'Jcan-de-Luz (Basses-Pyrénées).\n',
 '\n',
 '\n',
 'ABBATUCCI, née de FORESTA (M-), 103, rue La\n',
 'Boétie (vin*) [35 321.S7] — et fljj Rey de Foresta, SI\n',
 "[s ' â [A] Montmorency (Seine-et-Oise).\n",
 'ABBATUCCI (C“ Jacques), 103, rue La Boétie (mi‘)\n',
 '[35 521.57] — et même A . Sp-RR\n',
 '\n',
 '\n',
 'ABBATUCCI (C1""*), 32, rue Washington (vni\'j.\n',
 'ABEILLE (Adolphe),27, Faubourg-Saint-Honoré (vm*)\n',
 'et haras de Viroflay, CK! feâ Virollay [35 3)\n',
 '\n',
 '\n',
 '(Seine-et-Oise). RR-UA-T\n',
 '\n',
 '\n',
 'ABOILARI) Georges), U. #,[et M“" née BONUS], 46,\n',
 'avenue de Breteuil (vu*) [35 707.11] — et de\n',
 'Peyrieu, Kl \\p= eS Peyrieu (Ain). AC-M-YC\n',
 "ABOVILLE (V‘* H. d’), O.iK, Colonel du 131' de ligne\n",
 'à Orléans [et V*,“‘ née de GOUVELLO de KÉ-\n',
 'RIAVAL], à d® Kerentr

# CRF TEST

In [16]:
import random
from collections import Counter

from IPython.display import display

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer

import sklearn_crfsuite
from sklearn_crfsuite import metrics

import spacy

from utils.export import export_to_excel
from utils import tags_format
from utils.features import generate_features

np.random.seed(42)
random.seed(42)

nlp = spacy.load('it_core_news_sm')

tags = [
    'FNAME',
    'LNAME',
    'ECC',
    'GOV'
]
tags_prefixed = ["-".join([prefix, tag]) for tag in tags for prefix in 'IB']
tags_prefixed.append('O')
print(tags_prefixed)

['I-FNAME', 'B-FNAME', 'I-LNAME', 'B-LNAME', 'I-ECC', 'B-ECC', 'I-GOV', 'B-GOV', 'O']


In [51]:
#Put the people in a dataframe

buffer = ""
people = []
for line in raw_data:
    if(line == "\n"):
        if(len(buffer) > 0):
            people.append(buffer.replace('\n', ''))
            buffer = ""
    else:
        buffer += line
idx = np.arange(len(people))
df = pd.DataFrame({"idx": idx, "text" : people})
df = df.set_index("idx")
df.head()

Unnamed: 0_level_0,text
idx,Unnamed: 1_level_1
0,A
1,Aage (cl’) voir Colmet (I'Aage.
2,"ABRADIE-d’ARRAST (Michel-Robert d’), 5S bis, r..."
3,"ABBATUCCI, née de FORESTA (M-), 103, rue LaBoé..."
4,"ABBATUCCI (C1""""*), 32, rue Washington (vni'j.A..."


In [52]:
# Raphael's code to toeknize:

# We tokenize the text into words
df_tokens = df['text'].apply(lambda text: [
    str(token.text) for token in nlp(text, disable=['parser', 'tagger', 'ner'])])

# We stack the tokens to get a new token index
df_tokens = df_tokens.apply(pd.Series).rename_axis('tok_idx', axis=1).stack().to_frame('token')

# Finally, we join with the previous dataframe to get the fulltext column
df_tokens = df_tokens.join(df)
df_tokens = df_tokens[['text', 'token']]
display(df_tokens.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,text,token
idx,tok_idx,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,A,A
1,0,Aage (cl’) voir Colmet (I'Aage.,Aage
1,1,Aage (cl’) voir Colmet (I'Aage.,(
1,2,Aage (cl’) voir Colmet (I'Aage.,cl
1,3,Aage (cl’) voir Colmet (I'Aage.,’


In [53]:
#Export to excel
export_to_excel(df_tokens, './propr_to_annotate.xlsx', tags=tags_prefixed)

In [None]:
# Reload

df_annotations_small = pd.read_excel('./propr_to_annotate.xlsx', index_col=[0,1])
df_annotations_small['tag'] = df_annotations_small['tag'].str.upper()

is_annotated = (df_annotations_small['tag'].fillna(False) # Fill all NaN with False
                .apply(lambda x: True if x else False) # Make filled tag True
                .reset_index().groupby('idx') # Transform into dataframe and group by entry
                .apply(lambda x: all(x['tag']))) # Check that all tags are filled
df_annotations_small = df_annotations_small.loc[is_annotated[is_annotated].index] # Only keep entries with all tags filled

annotations_small = df_annotations_small.groupby(level=[0])[['token', 'tag']].agg(list).rename(columns={'token': 'tokens', 'tag': 'tags_iob2'})
annotations_small.head()

In [None]:
train_idx, test_idx = train_test_split(annotations_small.index, test_size=0.5)
print(f"There is {len(train_idx)} training samples and {len(test_idx)} test samples.")

In [None]:
# Features for the current word
default_features = {
        'bias': None,
        'word.lower': lambda word: word.lower(),
        'word.lastChar': lambda word: word[-1:]
        # TODO add other features
    }
# Features to be computed for surrounding words
default_surrounding_features = {
        # TODO add features
}

In [None]:
tokens, features = annotations_small[['tokens', 'features']].iloc[0].values
print(tokens[0])
print(features[0])
print()
print(tokens[2])
print(features[2])
print()
print(tokens[-1])
print(features[-1])

In [None]:
train = annotations_small.loc[train_idx]
test = annotations_small.loc[test_idx]

# TODO you can change the encoding scheme to see if it performs diffently
tags_col = 'tags_io'

X_train = train['features'].values
y_train = train[tags_col].values
X_test = test['features'].values
y_test = test[tags_col].values

In [None]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=3e-1,
    c2=1e-4,
    max_iterations=1000,
    all_possible_transitions=True
)
crf.fit(X_train, y_train);

In [None]:
labels = list(crf.classes_)
labels.remove('O')

y_pred = crf.predict(X_test)
print(f"Weighted f1_score {metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels):.3f}")
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

# Splitting people

In [83]:
def getAddress(data):
    address=""
    for i, o in enumerate(data):
        if(o.replace(" ", "").isdigit() or 'bis' in o):
            for addr in data[i:]:
                address += addr.replace("\n", " ")
    if(len(address) < 1):
        return "unknown"
    return address.split('(')[0]

In [84]:
buffer = ""
all_people = []
for line in raw_data:
    if(line == "\n"):
        if(len(buffer) > 0):
            all_people.append(buffer.split(','))
            buffer = ""
    else:
        buffer += line
all_people

[['A\n'],
 ["Aage (cl’) voir Colmet (I'Aage.\n"],
 ['ABRADIE-d’ARRAST (Michel-Robert d’)',
  ' 5S bis',
  ' rue\nJouffroy (xvu*) — et d’Elhorriaga',
  ' S Saint-\nJcan-de-Luz (Basses-Pyrénées).\n'],
 ['ABBATUCCI',
  ' née de FORESTA (M-)',
  ' 103',
  ' rue La\nBoétie (vin*) [35 321.S7] — et fljj Rey de Foresta',
  " SI\n[s ' â [A] Montmorency (Seine-et-Oise).\nABBATUCCI (C“ Jacques)",
  ' 103',
  ' rue La Boétie (mi‘)\n[35 521.57] — et même A . Sp-RR\n'],
 ['ABBATUCCI (C1""*)',
  ' 32',
  " rue Washington (vni'j.\nABEILLE (Adolphe)",
  '27',
  ' Faubourg-Saint-Honoré (vm*)\net haras de Viroflay',
  ' CK! feâ Virollay [35 3)\n'],
 ['(Seine-et-Oise). RR-UA-T\n'],
 ['ABOILARI) Georges)',
  ' U. #',
  '[et M“" née BONUS]',
  ' 46',
  '\navenue de Breteuil (vu*) [35 707.11] — et de\nPeyrieu',
  ' Kl \\p= eS Peyrieu (Ain). AC-M-YC\nABOVILLE (V‘* H. d’)',
  ' O.iK',
  " Colonel du 131' de ligne\nà Orléans [et V*",
  '“‘ née de GOUVELLO de KÉ-\nRIAVAL]',
  ' à d® Kerentré',
  ' K {s= aË [0] A

In [85]:
# Keep only names and addresses:
names = []
addresses = []
for people in all_people:
    if(len(people) > 1):
        names.append(people[0])
        addresses.append(getAddress(people))

df_people = pd.DataFrame({"name": names, "address": addresses})

In [87]:
df_people

Unnamed: 0,name,address
0,ABRADIE-d’ARRAST (Michel-Robert d’),5S bis rue Jouffroy
1,ABBATUCCI,103 rue La Boétie
2,"ABBATUCCI (C1""""*)",32 rue Washington
3,ABOILARI) Georges),46 avenue de Breteuil
4,"ABOVILLE (B** Christian d’) [et B*»”"" née I1EN...",23 rue de Grenelle
...,...,...
11097,ZOGHEB (OGeorges de) [et C-- née SINANO],8 rue Alboni
11098,"ZOGRAPHOS (M. et M""“ Solon)",30 avenue Kléber
11099,Z0L0T0VITZ (Lubomir),103 boul. Haussmann
11100,ZUYLEN de NYEVELT (R-- Étienne de) [et B*“* né...,70 av. du Bois-de-Boulogne


In [89]:
unknown = df_people[[x == "unknown" for x in df_people["address"] ]]
print("Missing addresses: " + str(len(unknown)/len(df)))

Missing addresses: 0.25209336791404224


# First try:

In [71]:
# Merging people information and removing empty lines
buffer = ""
all_people = []
for line in raw_data:
    if(line == '\n'):
        if(len(buffer) > 0):
            all_people.append(buffer)
            buffer = ""
    elif(starts_with_family_name(line)):
        if(len(buffer) > 0):
            all_people.append(buffer)
        buffer = line
    else:
        buffer += line
all_people

['A\n',
 "Aage (cl’) voir Colmet (I'Aage.\n",
 'ABRADIE-d’ARRAST (Michel-Robert d’), 5S bis, rue\nJouffroy (xvu*) — et d’Elhorriaga, S Saint-\nJcan-de-Luz (Basses-Pyrénées).\n',
 "ABBATUCCI, née de FORESTA (M-), 103, rue La\nBoétie (vin*) [35 321.S7] — et fljj Rey de Foresta, SI\n[s ' â [A] Montmorency (Seine-et-Oise).\n",
 'ABBATUCCI (C“ Jacques), 103, rue La Boétie (mi‘)\n[35 521.57] — et même A . Sp-RR\n',
 'ABBATUCCI (C1""*), 32, rue Washington (vni\'j.\n',
 'ABEILLE (Adolphe),27, Faubourg-Saint-Honoré (vm*)\net haras de Viroflay, CK! feâ Virollay [35 3)\n',
 '(Seine-et-Oise). RR-UA-T\n',
 'ABOILARI) Georges), U. #,[et M“" née BONUS], 46,\navenue de Breteuil (vu*) [35 707.11] — et de\nPeyrieu, Kl \\p= eS Peyrieu (Ain). AC-M-YC\n',
 "ABOVILLE (V‘* H. d’), O.iK, Colonel du 131' de ligne\nà Orléans [et V*,“‘ née de GOUVELLO de KÉ-\nRIAVAL], à d® Kerentré, K {s= aË [0] Auray\n(Morbihan) — et de Rouville, K à\n[P.L.M.] Malesherbes (Loiret).\n",
 'Paris-Mondain — 1908\n',
 'ABOVILLE (B**

In [78]:
def getAddress(data):
    address=""
    for i, o in enumerate(data):
        if(o.replace(" ", "").isdigit() or 'bis' in o):
            for addr in data[i:]:
                address += addr.replace("\n", " ")
    if(len(address) < 1):
        return "unknown"
    return address

def is_family_name(str):
    if(len(str) < 2):
        return False
    nb_up = 0
    for c in str:
        if(c.isupper()):
            nb_up += 1
    
    return (nb_up/len(str) > 0.6)

def starts_with_family_name(line):
    if(" " not in line):
        return is_family_name(line)
    first_space = line.index(" ")
    if("]" in line[:first_space]):
        return False
    return is_family_name(line[:first_space])

def nom_jeune_fille(str):
    if("née" not in str):
        return ""
    name = ''
    nee_found = False
    for word in str.split(" "):
        if(word == "née"):
            nee_found = True
        if(is_family_name(word) and nee_found):
            name = "née "+ word.upper()
            break
    
    return name

In [72]:
people_split = []
names = []
addresses = []
for i, line in enumerate(all_people):
    data = line.split(',')
    if(len(data) > 1):
        if(starts_with_family_name(data[0])):
            names.append(data[0] + " " + nom_jeune_fille(data[1]))
            addresses.append(getAddress(data))
            people_split.append(data)
    
people_split

IndexError: string index out of range

In [13]:
df = pd.DataFrame({"name" : names, "address": addresses})
df

Unnamed: 0,name,address
0,ABRADIE-d’ARRAST (Michel-Robert d’),5S bis rue Jouffroy (xvu*) — et d’Elhorriaga ...
1,ABBATUCCI née FORESTA,103 rue La Boétie (vin*) [35 321.S7] — et flj...
2,ABBATUCCI (C“ Jacques),103 rue La Boétie (mi‘) [35 521.57] — et même...
3,"ABBATUCCI (C1""""*)",32 rue Washington (vni'j.
4,ABEILLE (Adolphe),27 Faubourg-Saint-Honoré (vm*) et haras de Vir...
5,ABOILARI) Georges),46 avenue de Breteuil (vu*) [35 707.11] — et ...
6,ABOVILLE (V‘* H. d’),unknown
7,"ABOVILLE (B** Christian d’) [et B*»”"" née I1EN...",23 rue de Grenelle (vu*)— et A deGlux K Chàte...
8,ABRANTÈS née JL'XOT,12 ave- nue Henri-Martin (xvi*) — et sêâ de B...
9,ABRANTÈS (Duc d’),12 avenue Henri- Martin (xvi*) —- même de Bai...


In [14]:
unknown = df[[x == "unknown" for x in df["address"] ]]
print("Missing addresses: " + str(len(unknown)/len(df)))

Missing addresses: 0.3093059042682433


# Trials

In [7]:
import pandas as pd
#import lxml.etree.ElementTree as ET
from lxml import etree
path = 'fdh-gallica/bpt6k205233j/alto/bpt6k205233j_038.xml'

In [147]:
tree = etree.parse(path)
root = tree.getroot()
full_name = ''
addr_found = False
complete_person = False
for elem in (root.findall(".//{http://bibnum.bnf.fr/ns/alto_prod}TextLine")):
    for line in elem.findall(".//{http://bibnum.bnf.fr/ns/alto_prod}String"):
        word = line.attrib['CONTENT']
        #print(word)
        if(complete_person):
            complete_person=False
            if(is_family_name(word)):
                print(full_name)
                full_name=''
                addr_found= False
                print()
        full_name += word
        full_name += ' '   
        if('.' in word and addr_found):
            complete_person = True
        if(is_address(word)):
            addr_found = True
        

ABZ AGIT ABZAC (C""1 Pauld'), 181, rue de La Pompe (xvi*). 

fACHER de MONTGASCON (B™ d') [et B"" née Jeanne de COURCY], 18, avenue d'Antin (vin"),– et de Villequier, ES è53 [Q], Caudebec-en-Caux (Seine- Inférieure) ^= Villequier. 

ACHERY de SAN" DONNINO (P" d'), Ex-camérier secret, 62, avenue de la Grande-Armée (xvn*) et eJj de Soisy-sous-Montmorency, El Ji fi [S] Soisy-sous-Montmorency (Seine-et-Oise). 

ACHON (B°n Renéd'), Capitaine au l"r chasseurs [et B™" née de LAMOLÈRE, C"– d'HUST], 124, rue de Provence (vin*), et ^5 de la Roche-de- Gennes, El Sf Gennes (Maine-et-Loire), gg [Or], Les Rosiers-sur-Loire. 

ACLOQUE (M™) née DUCHANOY, 26, avenue Mar- ceau (xvi*) – et j*5 de La Borde, Kl Saint-An- toiue-du-Rocher, j: (I au d N" S), âï Wettray (Indre-et-Loire). 

ACLOQUE (M™ A.), née SEBERT, 9, place des Etats- Unis (xvi1)- [3698.92]. 

ACLOQUE (Maurice), 19, rue de Presbourg (xvi1) et villa Monténégro [3°. 590], à Cannes (Alpes-Mari- times). 

AC-Y ACOLLAS (René), $(, Conseiller réf

In [144]:
addr = ['avenue', 'av.', 'rue', 'r.', 'boulevard', 'b.', 'mêmes adresses', 'même adresse', 'villa', 'place']
def is_address(str):
    return str in addr

In [130]:
def is_family_name(str):
    nb_up = 0
    for c in str:
        if(c.isupper()):
            nb_up += 1
    
    return (nb_up/len(str) > 0.6)

In [131]:
def nom_jeune_fille(str):
    if("née" not in str):
        return ""
    name = ''
    nee_found = False
    for word in str.split(" "):
        if(word == "née"):
            nee_found = True
        if(is_family_name(word) and nee_found):
            name = "née "+ word.upper()
            break
    
    return name
    

In [132]:
nom_jeune_fille("Arnaud CHAZEAU née GATeAU")

'née GATEAU'

In [68]:
s = "34"
s.isdigit()

True