In [1]:
import pandas as pd
import re
from collections import Counter
import spacy
from spacy.tokenizer import Tokenizer

**Utility functions**

In [2]:
#Function to get the number of elements
def count_elmt(df):
    return len(df.index)

# Text handling utilities
from string import punctuation
def lowercase_all(text):
    return text.lower()
def remove_punct(text):
    return ''.join([ch for ch in text if ch not in punctuation])

### Loading the data from the 1884 OCR

In [3]:
df_dict = pd.read_excel("data/beau_monde_1884_tables.xlsx", header = None, sheet_name = None)

In [4]:
df_dict['table_1'].head()

Unnamed: 0,0,1,2,3
0,M. Me,,Aai.,"Av. Parmentier, 10."
1,M.,d’,Abadie Ghailes.,"Rue Vaneau, 32."
2,Me,,Abadie-Gasquin H. ...,"Rue de Grenelle, 94."
3,y te y se,d’,Abancourt.,"Rue Vézelay, 11."
4,M. Me,d’,Abannza Gh. ...,"B1 Montmartre, 19."


Let's count how many addresses we have to start with.

In [5]:
num_adr = 0
for i in range(167):
    table = 'table_' + str(i+1)
    num_adr += count_elmt(df_dict[table])
    
print('At the beginning, we have %d addresses.' %num_adr)

At the beginning, we have 5709 addresses.


How many rows with missing values (or nan values) do we have? NEED TO CHECK WHERE THEY ACTUALLY ARE!!!

In [6]:
counter = 0
for table in df_dict:
    counter += df_dict[table].isna().sum().sum()
print('There are %d missing values in our data.' %(counter))

There are 5938 missing values in our data.


Since there a too many values to fill them in by hand, we will just get rid of the corresponding rows (at least for now)

In [7]:
for table in df_dict:
    df_dict[table].dropna(inplace=True)
    
counter = 0
for table in df_dict:
    counter += df_dict[table].isna().sum().sum()
print('Now, there are %d missing values in our data.' %(counter))

Now, there are 0 missing values in our data.


Let's now harmonize the format of all the dataframes. We want to end up with one dataframe with two colums: Names and Addresses

In [8]:
cnt = Counter()
for i in range(167):
    table = 'table_' + str(i+1)
    cnt[str(len(df_dict[table].columns)) + ' columns'] += 1

cnt

Counter({'4 columns': 30, '3 columns': 131, '2 columns': 4, '5 columns': 2})

There seem to be different formats for the dataframes.

In [9]:
df_2 = pd.DataFrame(columns = [0,1])
df_3 = pd.DataFrame(columns = [0,1,2])
df_4 = pd.DataFrame(columns = [0,1,2,3])
df_5 = pd.DataFrame(columns = [0,1,2,3,4])

for i in range(167):
    table = 'table_' + str(i+1)
    if len(df_dict[table].columns) == 2: 
        df_2 = df_2.append(df_dict[table])
    elif len(df_dict[table].columns) == 3: 
        df_3 = df_3.append(df_dict[table])
    elif len(df_dict[table].columns) == 4:
        df_4 = df_4.append(df_dict[table])
    elif len(df_dict[table].columns) == 5:
        df_5 = df_5.append(df_dict[table])

Let's see those with 2 columns.

In [10]:
df_2.head() #Just names, we can get rid of this

Unnamed: 0,0,1
0,M.,Amilhau Paul .
1,Gte d’,Amilly Auguste.
2,O Cse d’,Amilly Jean.
3,C'e d',Amu ly Maurice.
4,M.,Anargyrou.


Let's see those with 3 columns.

In [11]:
df_3['Names'] = df_3[0] + ' ' + df_3[1]
df_3['Addresses'] = df_3[2]
df_3.drop(labels = [0,1,2], axis = 1, inplace = True)
df_3.reset_index(drop = True, inplace = True)
df_3.head()

Unnamed: 0,Names,Addresses
0,M° Adam.,"Av. Champs-Elysées, 53."
1,M. Adam Alfred.,"Rue Monceau, 67."
2,M° Adam Edmond .,"BJ Poissonnière, 23."
3,Mlle Addenet.,"Rue Blairault, 19."
4,Bno d’ Adelsward.,"Rue de la Bienfaisance,44."


Let's see those with 4 columns.

In [12]:
df_4.head()

Unnamed: 0,0,1,2,3
1,M.,d’,Abadie Ghailes.,"Rue Vaneau, 32."
3,y te y se,d’,Abancourt.,"Rue Vézelay, 11."
4,M. Me,d’,Abannza Gh. ...,"B1 Montmartre, 19."
5,M,de,Abaroa Claudio.,"Rue Sl-Lazare, 81."
6,M. M°,d',Abbadie.,"Rue du Bac, 120."


In [13]:
df_4['Names'] = df_4[0].map(lambda x: str(x)) + ' ' + df_4[1].map(lambda x: str(x)) + ' ' + df_4[2].map(lambda x: str(x)) 
df_4['Addresses'] = df_4[3]
df_4.drop(labels = [0,1,2,3], axis = 1, inplace = True)
df_4.reset_index(drop = True, inplace = True)
df_4.head()

Unnamed: 0,Names,Addresses
0,M. d’ Abadie Ghailes.,"Rue Vaneau, 32."
1,y te y se d’ Abancourt.,"Rue Vézelay, 11."
2,M. Me d’ Abannza Gh. ...,"B1 Montmartre, 19."
3,M de Abaroa Claudio.,"Rue Sl-Lazare, 81."
4,M. M° d' Abbadie.,"Rue du Bac, 120."


Let's see those with 5 columns.

In [14]:
df_5

Unnamed: 0,0,1,2,3,4
5,Dse,de,Serran.,"Av. de Tourville, 12.",\
6,Cte,de,Serrant.,"Rue d’Iéna, 15.",)
12,Mse,de,Serres de Mesplès ....,"Av. Marceau, 39.",1
23,Cte Csc,de,Sesmaisons R... .,"Rue de Marignan, 17.",■ .


In [15]:
df_5['Names'] = df_5[0].map(lambda x: str(x)) + ' ' +  df_5[1].map(lambda x: str(x)) + ' ' + df_5[2].map(lambda x: str(x)) 
df_5['Addresses'] = df_5[3]
df_5.drop(labels = [0,1,2,3,4], axis = 1, inplace = True)
df_5.reset_index(drop = True, inplace = True)
df_5.head()

Unnamed: 0,Names,Addresses
0,Dse de Serran.,"Av. de Tourville, 12."
1,Cte de Serrant.,"Rue d’Iéna, 15."
2,Mse de Serres de Mesplès ....,"Av. Marceau, 39."
3,Cte Csc de Sesmaisons R... .,"Rue de Marignan, 17."


We can now combine them all in one single dataframe.

In [16]:
df = pd.concat([df_3, df_4, df_5])
pre_cleaned = count_elmt(df)
num_lost_adr = num_adr - pre_cleaned
print('Before cleaning the strings, we have %d addresses.' %pre_cleaned)
print('We have therefore lost %d%% addresses due to missing values.' %(100*num_lost_adr/num_adr))

Before cleaning the strings, we have 4758 addresses.
We have therefore lost 16% addresses due to missing values.


In [17]:
df.to_csv('data/data_1884.csv')

In [18]:
df = pd.read_csv('data/data_1884.csv')

### Spelling Corrector

Spelling Corrector based on the work of Peter Norvig: http://norvig.com/spell-correct.html

In [23]:
def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('data/list_addresses.txt', encoding='utf-8').read())) 
#list_addresses.txt is a hand-corrected list of addresses

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if (w.isalpha() and w.lower() in WORDS))

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

### Cleaning the addresses

In [24]:
nlp = spacy.load('fr_core_news_sm')

infix_re = re.compile(r'''[-~]''') #find hyphens
suffix_re = re.compile(r'''[,."']$''') #find , or . at end of word
def customize_tokenizer(nlp):
# Adds support to use `-` as the delimiter for tokenization
    return Tokenizer(nlp.vocab, 
                     infix_finditer=infix_re.finditer,
                     suffix_search=suffix_re.search, 
                     token_match=None)

nlp.tokenizer = customize_tokenizer(nlp)

In [25]:
def correct_adrs(adrs):
    clean_adrs = ''
    
    #Tokenize the address using SpaCy tokenizer
    adrs_parts = nlp(adrs)
    #print ([token.text for token in adrs_parts])
    
    #Find the street number
    last = 1
    if str(adrs_parts[-1]) in punctuation:
        number = adrs_parts[-2]
        last = 2
    else:
        number = adrs_parts[-1]
        
    #Correction of errors
    for i in range(len(adrs_parts)-last):
        if adrs_parts[i].text in punctuation:
            if adrs_parts[i].text == '-':
                clean_adrs = clean_adrs[:-1] + adrs_parts[i].text
        else:
            clean_adrs += correction(adrs_parts[i].text).capitalize()
            clean_adrs += ' '
            
    return clean_adrs + str(number)

convert_adrs = {'av':'Avenue', 
        'r':'Rue', 
        'bd':'Boulevard',
        'pl':'Place',
        'fr':'Faubourg'}

def clean_adrs(adrs):
    adrs = correct_adrs(adrs)
    adrs_part_punct = adrs.split()
    adrs_part = remove_punct(adrs).split()
    for i in range(len(adrs_part)):
        if lowercase_all(adrs_part[i]) in convert_adrs:
            adrs_part_punct[i] = convert_adrs[lowercase_all(adrs_part[i])]
    adrs = ' '.join(adrs_part_punct)
    return adrs

The execution time of this cell is quite long. Directly load the cleaned data.

In [26]:
#df.Addresses = df.Addresses.apply(clean_adrs)

In [20]:
df = pd.read_csv('data/addresses_cleaned.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Names,Addresses
0,0,0,M° Adam.,Avenue Champs-Elysées 53
1,1,1,M. Adam Alfred.,Rue Monceau 67
2,2,2,M° Adam Edmond .,Boulevard Poissonnière 23
3,3,3,Mlle Addenet.,Rue Blairault 19
4,4,4,Bno d’ Adelsward.,"Rue De La Bienfaisance,44"


In [21]:
df.drop(labels = ['Unnamed: 0', 'Unnamed: 0.1'], axis = 1, inplace = True)

In [22]:
df.head()

Unnamed: 0,Names,Addresses
0,M° Adam.,Avenue Champs-Elysées 53
1,M. Adam Alfred.,Rue Monceau 67
2,M° Adam Edmond .,Boulevard Poissonnière 23
3,Mlle Addenet.,Rue Blairault 19
4,Bno d’ Adelsward.,"Rue De La Bienfaisance,44"


In [23]:
df.to_csv('data/data_1884_cleaned.csv')

In [28]:
df = pd.read_csv('data/data_1884_cleaned.csv')

In [29]:
df.head(20)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Names,Addresses
0,0,0,M° Adam.,Avenue Champs-Elysées 53
1,1,1,M. Adam Alfred.,Rue Monceau 67
2,2,2,M° Adam Edmond .,Boulevard Poissonnière 23
3,3,3,Mlle Addenet.,Rue Blairault 19
4,4,4,Bno d’ Adelsward.,"Rue De La Bienfaisance,44"
5,5,5,Bon Bne cl' Adelsward Gustave...,Boulevard Courcelles 65
6,6,6,Bon Jjne J’ Adelsward Axel.,Rue Royale 10
7,7,7,M. Adenis de la Roserie ...,Rue Tronchet 27
8,8,8,Cso d’ Adhémar.,Rue Lamennais 15
9,9,9,Ysed' Adhémard.,Rue Cle Bourgogne 63


Let's add some other preprocessing for the addresses.

In [30]:
def remove_accent(string):
    string = string.replace('é','e')
    string = string.replace('è','e')
    string = string.replace('ê','e')
    string = string.replace('ë','e')
    string = string.replace('à','a')
    string = string.replace('â','a')
    string = string.replace('ô','o')
    return string

def simplest(string): #Return the simplest form (no punctuation, all lowercase, no accents) of a string
    new_string = ''
    if type(string) == str:
        for c in string:
            if c.isalpha():
                new_string += c
    return remove_punct(lowercase_all(remove_accent(new_string)))

def simplest_adr(string): #Format: Avenue St-Honoré 21 -> avenuesthonore21
    num = ''
    if type(string) == str:
        for c in string:
            if c.isnumeric():
                num += c   
    return(simplest(string)+num)

In [31]:
df['Simplest'] = df['Addresses'].apply(simplest_adr)

Let's take a chunck of those addresses for testing.

In [32]:
df_proto = df.head(100)

In [33]:
df_proto.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Names,Addresses,Simplest
0,0,0,M° Adam.,Avenue Champs-Elysées 53,avenuechampselysees53
1,1,1,M. Adam Alfred.,Rue Monceau 67,ruemonceau67
2,2,2,M° Adam Edmond .,Boulevard Poissonnière 23,boulevardpoissonniere23
3,3,3,Mlle Addenet.,Rue Blairault 19,rueblairault19
4,4,4,Bno d’ Adelsward.,"Rue De La Bienfaisance,44",ruedelabienfaisance44


**Paris streets names**

Let's load a list of Paris addresses with the corresponding coordinates.

In [34]:
coord = pd.read_csv('data/All_nums.csv')
coord['Simplest'] = coord['nom_entier'] + coord['num'].map(lambda x: str(x))
coord['Simplest'] = coord['Simplest'].apply(simplest_adr)
coord.head()

Unnamed: 0,id,type,article,nom,nom_entier,num,debut,fin,source,Y,X,Simplest
0,7646,allee,d',antin,Allée d'Antin,23,,,Vasserot,48.868123,2.309918,alleedantin23
1,7647,allee,d',antin,Allée d'Antin,21,,,Vasserot,48.867949,2.309923,alleedantin21
2,7648,allee,d',antin,Allée d'Antin,19,,,Vasserot,48.867724,2.309931,alleedantin19
3,7649,allee,d',antin,Allée d'Antin,17,,,Vasserot,48.867548,2.309941,alleedantin17
4,7650,allee,d',antin,Allée d'Antin,15,,,Vasserot,48.867392,2.309942,alleedantin15


We can now merge with the Paris addresses coordinates database.

In [35]:
df_coord = df.merge(coord[['Simplest', 'Y', 'X']], on = 'Simplest')
df_coord.drop(labels = ['Unnamed: 0', 'Unnamed: 0.1'], axis = 1, inplace = True)

In [36]:
print("We have %d addresses with coordinates." %count_elmt(df_coord))

We have 2765 addresses with coordinates.


In [37]:
df_coord.to_csv('data/adr_coord.csv')

In [38]:
df_coord = pd.read_csv('data/adr_coord.csv')

In [39]:
df_coord.head()

Unnamed: 0.1,Unnamed: 0,Names,Addresses,Simplest,Y,X
0,0,M° Adam Edmond .,Boulevard Poissonnière 23,boulevardpoissonniere23,48.871206,2.344183
1,1,M° Adam Edmond .,Boulevard Poissonnière 23,boulevardpoissonniere23,48.871136,2.344168
2,2,M. M° Faure Antoine-Xavier.,Boulevard Poissonnière 23,boulevardpoissonniere23,48.871206,2.344183
3,3,M. M° Faure Antoine-Xavier.,Boulevard Poissonnière 23,boulevardpoissonniere23,48.871136,2.344168
4,4,Bno d’ Adelsward.,"Rue De La Bienfaisance,44",ruedelabienfaisance44,48.876625,2.314868


Let's create a second dataframe with all the addresses for which we don't have yet coordinates.

In [40]:
df_no_coord = pd.concat([df,df_coord], sort = True).drop_duplicates(subset = 'Simplest', keep = False)
df_no_coord.head()

Unnamed: 0.2,Addresses,Names,Simplest,Unnamed: 0,Unnamed: 0.1,X,Y
1,Rue Monceau 67,M. Adam Alfred.,ruemonceau67,1,1.0,,
3,Rue Blairault 19,Mlle Addenet.,rueblairault19,3,3.0,,
5,Boulevard Courcelles 65,Bon Bne cl' Adelsward Gustave...,boulevardcourcelles65,5,5.0,,
9,Rue Cle Bourgogne 63,Ysed' Adhémard.,rueclebourgogne63,9,9.0,,
10,Rue Cle Rennes 127,Gt0 Affrede S1-Rome Denis,rueclerennes127,10,10.0,,


In [41]:
print("We still have %d addresses without coordinates." %count_elmt(df_no_coord))

We still have 1497 addresses without coordinates.


In [42]:
df_no_coord.to_csv('data/adr_no_coord.csv')

In [43]:
df_no_coord.head(20)

Unnamed: 0.2,Addresses,Names,Simplest,Unnamed: 0,Unnamed: 0.1,X,Y
1,Rue Monceau 67,M. Adam Alfred.,ruemonceau67,1,1.0,,
3,Rue Blairault 19,Mlle Addenet.,rueblairault19,3,3.0,,
5,Boulevard Courcelles 65,Bon Bne cl' Adelsward Gustave...,boulevardcourcelles65,5,5.0,,
9,Rue Cle Bourgogne 63,Ysed' Adhémard.,rueclebourgogne63,9,9.0,,
10,Rue Cle Rennes 127,Gt0 Affrede S1-Rome Denis,rueclerennes127,10,10.0,,
11,Rue St-Placide 62,Mc d' Affry de i.a Monnoye.,ruestplacide62,11,11.0,,
14,Rue De Villejust 38,l> Aguado A.,ruedevillejust38,14,14.0,,
16,Rue Aguesseau 20,Mis Ms° de F Aigle.,rueaguesseau20,16,16.0,,
17,Rue Astorg 12,Cte CSJ de 1’Aigle.,rueastorg12,17,17.0,,
21,Rue Du D Septembre 28,C°> d' Aillières.,ruedudseptembre28,21,21.0,,


### Cleaning the names

In [46]:
df.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Names,Addresses,Simplest
0,0,0,M° Adam.,"Av. Champs-Elysées, 53.",avchampselysees53
1,1,1,M. Adam Alfred.,"Rue Monceau, 67.",ruemonceau67
2,2,2,M° Adam Edmond .,"BJ Poissonnière, 23.",bjpoissonniere23
3,3,3,Mlle Addenet.,"Rue Blairault, 19.",rueblairault19
4,4,4,Bno d’ Adelsward.,"Rue de la Bienfaisance,44.",ruedelabienfaisance44
5,5,5,Bon Bne cl' Adelsward Gustave...,"Bd Courcelles, 65.",bdcourcelles65
6,6,6,Bon Jjne J’ Adelsward Axel.,"Rue Royale, 10.",rueroyale10
7,7,7,M. Adenis de la Roserie ...,"Rue Tronchet, 27.",ruetronchet27
8,8,8,Cso d’ Adhémar.,"Rue Lamennais, 15.",ruelamennais15
9,9,9,Ysed' Adhémard.,"Rue cle Bourgogne, 63.",rueclebourgogne63


Let's define a dict to map all the abbreviations to the corresponding titles (we also include 'Monsieur', 'Madame' and 'Mademoiselle).

In [47]:
titres = {'Cte':'Comte', 'Cse':'Comtesse', 'Vte':'Vicomte', 'Vse':'Vicomtesse', 'Dc':'Duc', 'Dse':'Duchesse',
         'Bon':'Baron', 'Bne':'Baronne', 'Mis':'Marquis', 'Mse':'Marquise', 'Pce':'Prince', 'M':'Monsieur', 'M°':'Monsieur', 
          'Me':'Madame', 'Mlle':'Mademoiselle'}

TO DO: add regex to find patterns of type "' voyelle" to get "d'" (ex: "cl ' Adelsward" doit donner "d'Adelsward")

In [48]:
def clean_name(name):
    name_parts = nlp(name)
    cleaned_name = ''
    titre = ''
    #print ([token.text for token in name_parts])
    if name_parts[-1].text in punctuation:
        name_parts = name_parts[:-1]
    for i in range(len(name_parts)):
        if name_parts[i].text in titres:
            #print(name_parts[i].text)
            titre += titres[remove_punct(name_parts[i].text)] #we add the corresponding title using the dict titres
            titre += ' '
        else: 
            cleaned_name += name_parts[i].text
        cleaned_name += ' '
    #print ([token.text for token in name_parts])
    
    return (titre,cleaned_name)
    

In [49]:
test = 'Bon Bne cl\' Adelsward Gustave'
clean_name(test)

('Baron Baronne ', "  cl ' Adelsward Gustave ")

In [50]:
df.head(10).Names.apply(clean_name)

0                                (Monsieur ,  Adam )
1                       (Monsieur ,  . Adam Alfred )
2                         (Monsieur ,  Adam Edmond )
3                         (Mademoiselle ,  Addenet )
4                              (, Bno d’ Adelsward )
5    (Baron Baronne ,   cl ' Adelsward Gustave . . )
6                 (Baron ,  Jjne J’ Adelsward Axel )
7          (Monsieur ,  . Adenis de la Roserie . . )
8                                (, Cso d’ Adhémar )
9                               (, Ysed ' Adhémard )
Name: Names, dtype: object