In [7]:
import pandas as pd
import re
from collections import Counter
import spacy
from spacy.tokenizer import Tokenizer

**Utility functions**

In [8]:
#Function to get the number of elements
def count_elmt(df):
    return len(df.index)

# Text handling utilities
from string import punctuation
def lowercase_all(text):
    return text.lower()
def remove_punct(text):
    return ''.join([ch for ch in text if ch not in punctuation])

### Loading the data from the 1884 OCR

In [9]:
df_dict = pd.read_excel("data/raw/beau_monde_1884_tables.xlsx", header = None, sheet_name = None)

In [10]:
df_dict['table_1'].head()

Unnamed: 0,0,1,2,3
0,M. Me,,Aai.,"Av. Parmentier, 10."
1,M.,d’,Abadie Ghailes.,"Rue Vaneau, 32."
2,Me,,Abadie-Gasquin H. ...,"Rue de Grenelle, 94."
3,y te y se,d’,Abancourt.,"Rue Vézelay, 11."
4,M. Me,d’,Abannza Gh. ...,"B1 Montmartre, 19."


Let's count how many addresses we have to start with.

In [11]:
num_adr = 0
for i in range(167):
    table = 'table_' + str(i+1)
    num_adr += count_elmt(df_dict[table])
    
print('At the beginning, we have %d addresses.' %num_adr)

At the beginning, we have 5709 addresses.


How many rows with missing values (or nan values) do we have? NEED TO CHECK WHERE THEY ACTUALLY ARE!!!

In [12]:
counter = 0
for table in df_dict:
    counter += df_dict[table].isna().sum().sum()
print('There are %d missing values in our data.' %(counter))

There are 5938 missing values in our data.


Since there a too many values to fill them in by hand, we will just get rid of the corresponding rows (at least for now)

In [20]:
for table in df_dict:
    df_dict[table].fillna('',inplace=True)
    
counter = 0
for table in df_dict:
    counter += df_dict[table].isna().sum().sum()
print('Now, there are %d missing values in our data.' %(counter))

Now, there are 0 missing values in our data.


Let's now harmonize the format of all the dataframes. We want to end up with one dataframe with two colums: Names and Addresses

In [21]:
cnt = Counter()
for i in range(167):
    table = 'table_' + str(i+1)
    cnt[str(len(df_dict[table].columns)) + ' columns'] += 1

cnt

Counter({'4 columns': 30, '3 columns': 131, '2 columns': 4, '5 columns': 2})

There seem to be different formats for the dataframes.

In [22]:
df_2 = pd.DataFrame(columns = [0,1])
df_3 = pd.DataFrame(columns = [0,1,2])
df_4 = pd.DataFrame(columns = [0,1,2,3])
df_5 = pd.DataFrame(columns = [0,1,2,3,4])

for i in range(167):
    table = 'table_' + str(i+1)
    if len(df_dict[table].columns) == 2: 
        df_2 = df_2.append(df_dict[table])
    elif len(df_dict[table].columns) == 3: 
        df_3 = df_3.append(df_dict[table])
    elif len(df_dict[table].columns) == 4:
        df_4 = df_4.append(df_dict[table])
    elif len(df_dict[table].columns) == 5:
        df_5 = df_5.append(df_dict[table])

Let's see those with 2 columns.

In [23]:
df_2.head() #Just names, we can get rid of this

Unnamed: 0,0,1
0,M.,Amilhau Paul .
1,Gte d’,Amilly Auguste.
2,O Cse d’,Amilly Jean.
3,C'e d',Amu ly Maurice.
4,M.,Anargyrou.


Let's see those with 3 columns.

In [24]:
df_3['Names'] = df_3[0] + ' ' + df_3[1]
df_3['Addresses'] = df_3[2]
df_3.drop(labels = [0,1,2], axis = 1, inplace = True)
df_3.reset_index(drop = True, inplace = True)
df_3.head()

Unnamed: 0,Names,Addresses
0,M° Adam.,"Av. Champs-Elysées, 53."
1,M. Adam Alfred.,"Rue Monceau, 67."
2,M° Adam Edmond .,"BJ Poissonnière, 23."
3,Mlle Addenet.,"Rue Blairault, 19."
4,Bno d’ Adelsward.,"Rue de la Bienfaisance,44."


Let's see those with 4 columns.

In [25]:
df_4.head()

Unnamed: 0,0,1,2,3
0,M. Me,,Aai.,"Av. Parmentier, 10."
1,M.,d’,Abadie Ghailes.,"Rue Vaneau, 32."
2,Me,,Abadie-Gasquin H. ...,"Rue de Grenelle, 94."
3,y te y se,d’,Abancourt.,"Rue Vézelay, 11."
4,M. Me,d’,Abannza Gh. ...,"B1 Montmartre, 19."


In [26]:
df_4['Names'] = df_4[0].map(lambda x: str(x)) + ' ' + df_4[1].map(lambda x: str(x)) + ' ' + df_4[2].map(lambda x: str(x)) 
df_4['Addresses'] = df_4[3]
df_4.drop(labels = [0,1,2,3], axis = 1, inplace = True)
df_4.reset_index(drop = True, inplace = True)
df_4.head()

Unnamed: 0,Names,Addresses
0,M. Me Aai.,"Av. Parmentier, 10."
1,M. d’ Abadie Ghailes.,"Rue Vaneau, 32."
2,Me Abadie-Gasquin H. ...,"Rue de Grenelle, 94."
3,y te y se d’ Abancourt.,"Rue Vézelay, 11."
4,M. Me d’ Abannza Gh. ...,"B1 Montmartre, 19."


Let's see those with 5 columns.

In [32]:
df_5.head()

Unnamed: 0,Names,Addresses
0,"Bon Bne de Sérèville. R lie dc Grenelle, 89.",■' !
1,M. de Sérèville Roger.'•,"Rue Fabert, 38."
2,M. le SERGEANT DE MONNE -,
3,COVE F.,"Rue S‘-Florentin, 4."
4,yse de Sernanceles.,"Rue Pauquet, 23."


In [28]:
df_5['Names'] = df_5[0].map(lambda x: str(x)) + ' ' +  df_5[1].map(lambda x: str(x)) + ' ' + df_5[2].map(lambda x: str(x)) 
df_5['Addresses'] = df_5[3]
df_5.drop(labels = [0,1,2,3,4], axis = 1, inplace = True)
df_5.reset_index(drop = True, inplace = True)
df_5.head()

Unnamed: 0,Names,Addresses
0,"Bon Bne de Sérèville. R lie dc Grenelle, 89.",■' !
1,M. de Sérèville Roger.'•,"Rue Fabert, 38."
2,M. le SERGEANT DE MONNE -,
3,COVE F.,"Rue S‘-Florentin, 4."
4,yse de Sernanceles.,"Rue Pauquet, 23."


We can now combine them all in one single dataframe.

In [29]:
df = pd.concat([df_3, df_4, df_5])
pre_cleaned = count_elmt(df)
num_lost_adr = num_adr - pre_cleaned
print('Before cleaning the strings, we have %d addresses.' %pre_cleaned)
print('We have therefore lost %d%% addresses due to missing values.' %(100*num_lost_adr/num_adr))

Before cleaning the strings, we have 5590 addresses.
We have therefore lost 2% addresses due to missing values.


In [30]:
df.to_csv('data/data_1884.csv', index = False)

In [31]:
df = pd.read_csv('data/data_1884.csv')

### Spelling Corrector

Spelling Corrector based on the work of Peter Norvig: http://norvig.com/spell-correct.html

In [34]:
def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('data/utils/list_addresses.txt', encoding='utf-8').read())) 
#list_addresses.txt is a hand-corrected list of addresses

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if (w.isalpha() and w.lower() in WORDS))

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

### Cleaning the addresses

In [35]:
nlp = spacy.load('fr_core_news_sm')

infix_re = re.compile(r'''[-~]''') #find hyphens
suffix_re = re.compile(r'''[,."']$''') #find , or . at end of word
def customize_tokenizer(nlp):
# Adds support to use `-` as the delimiter for tokenization
    return Tokenizer(nlp.vocab, 
                     infix_finditer=infix_re.finditer,
                     suffix_search=suffix_re.search, 
                     token_match=None)

nlp.tokenizer = customize_tokenizer(nlp)

In [40]:
def correct_adrs(adrs):
    clean_adrs = ''
    
    #Tokenize the address using SpaCy tokenizer
    adrs_parts = nlp(str(adrs))
    
    #Find the street number
    last = 1
    if str(adrs_parts[-1]) in punctuation:
        number = adrs_parts[-2]
        last = 2
    else:
        number = adrs_parts[-1]
        
    #Correction of errors
    for i in range(len(adrs_parts)-last):
        if adrs_parts[i].text in punctuation:
            if adrs_parts[i].text == '-':
                clean_adrs = clean_adrs[:-1] + adrs_parts[i].text
        else:
            clean_adrs += correction(adrs_parts[i].text).capitalize()
            clean_adrs += ' '
            
    return clean_adrs + str(number)

convert_adrs = {'av':'Avenue', 
        'r':'Rue', 
        'bd':'Boulevard',
        'pl':'Place',
        'fr':'Faubourg'}

def clean_adrs(adrs):
    adrs = correct_adrs(str(adrs))
    adrs_part_punct = adrs.split()
    adrs_part = remove_punct(adrs).split()
    for i in range(len(adrs_part)):
        if lowercase_all(adrs_part[i]) in convert_adrs:
            adrs_part_punct[i] = convert_adrs[lowercase_all(adrs_part[i])]
    adrs = ' '.join(adrs_part_punct)
    return adrs

The execution time of this cell is quite long. Directly load the cleaned data.

In [41]:
df.Addresses = df.Addresses.apply(clean_adrs)

In [42]:
#df = pd.read_csv('data/addresses_cleaned.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Names,Addresses
0,0,M° Adam.,Avenue Champs-Elysées 53
1,1,M. Adam Alfred.,Rue Monceau 67
2,2,M° Adam Edmond .,Boulevard Poissonnière 23
3,3,Mlle Addenet.,Rue Blairault 19
4,4,Bno d’ Adelsward.,"Rue De La Bienfaisance,44"


In [54]:
df.drop(labels = ['Unnamed: 0'], axis = 1, inplace = True)

In [55]:
df.head()

Unnamed: 0,Names,Addresses
0,M° Adam.,Avenue Champs-Elysées 53
1,M. Adam Alfred.,Rue Monceau 67
2,M° Adam Edmond .,Boulevard Poissonnière 23
3,Mlle Addenet.,Rue Blairault 19
4,Bno d’ Adelsward.,"Rue De La Bienfaisance,44"


In [56]:
df.to_csv('data/data_1884_cleaned.csv', index = False)

In [57]:
df = pd.read_csv('data/data_1884_cleaned.csv')

In [58]:
df.head()

Unnamed: 0,Names,Addresses
0,M° Adam.,Avenue Champs-Elysées 53
1,M. Adam Alfred.,Rue Monceau 67
2,M° Adam Edmond .,Boulevard Poissonnière 23
3,Mlle Addenet.,Rue Blairault 19
4,Bno d’ Adelsward.,"Rue De La Bienfaisance,44"


### Cleaning the names

In [59]:
df.head(10)

Unnamed: 0,Names,Addresses
0,M° Adam.,Avenue Champs-Elysées 53
1,M. Adam Alfred.,Rue Monceau 67
2,M° Adam Edmond .,Boulevard Poissonnière 23
3,Mlle Addenet.,Rue Blairault 19
4,Bno d’ Adelsward.,"Rue De La Bienfaisance,44"
5,Bon Bne cl' Adelsward Gustave...,Boulevard Courcelles 65
6,Bon Jjne J’ Adelsward Axel.,Rue Royale 10
7,M. Adenis de la Roserie ...,Rue Tronchet 27
8,Cso d’ Adhémar.,Rue Lamennais 15
9,Ysed' Adhémard.,Rue Cle Bourgogne 63


Let's define a dict to map all the abbreviations to the corresponding titles (we also include 'Monsieur', 'Madame' and 'Mademoiselle).

In [47]:
titres = {'Cte':'Comte', 'Cse':'Comtesse', 'Vte':'Vicomte', 'Vse':'Vicomtesse', 'Dc':'Duc', 'Dse':'Duchesse',
         'Bon':'Baron', 'Bne':'Baronne', 'Mis':'Marquis', 'Mse':'Marquise', 'Pce':'Prince', 'M':'Monsieur', 'M°':'Monsieur', 
          'Me':'Madame', 'Mlle':'Mademoiselle'}

TO DO: add regex to find patterns of type "' voyelle" to get "d'" (ex: "cl ' Adelsward" doit donner "d'Adelsward")

In [48]:
def clean_name(name):
    name_parts = nlp(name)
    cleaned_name = ''
    titre = ''
    #print ([token.text for token in name_parts])
    if name_parts[-1].text in punctuation:
        name_parts = name_parts[:-1]
    for i in range(len(name_parts)):
        if name_parts[i].text in titres:
            #print(name_parts[i].text)
            titre += titres[remove_punct(name_parts[i].text)] #we add the corresponding title using the dict titres
            titre += ' '
        else: 
            cleaned_name += name_parts[i].text
        cleaned_name += ' '
    #print ([token.text for token in name_parts])
    
    return (titre,cleaned_name)
    

In [49]:
test = 'Bon Bne cl\' Adelsward Gustave'
clean_name(test)

('Baron Baronne ', "  cl ' Adelsward Gustave ")

In [50]:
df.head(10).Names.apply(clean_name)

0                                (Monsieur ,  Adam )
1                       (Monsieur ,  . Adam Alfred )
2                         (Monsieur ,  Adam Edmond )
3                         (Mademoiselle ,  Addenet )
4                              (, Bno d’ Adelsward )
5    (Baron Baronne ,   cl ' Adelsward Gustave . . )
6                 (Baron ,  Jjne J’ Adelsward Axel )
7          (Monsieur ,  . Adenis de la Roserie . . )
8                                (, Cso d’ Adhémar )
9                               (, Ysed ' Adhémard )
Name: Names, dtype: object