In [None]:
import re
import unicodedata
from collections import Counter

import pandas as pd
import spacy
from spacy.tokenizer import Tokenizer

**Utility functions**

In [None]:
def count_elmt(df):
    """Return the number of rows in the dataframe `df`."""
    n_rows = df.shape[0]
    return n_rows

### Loading the data from the 1884 OCR

In [None]:
# sheet_name=None loads every sheet of the workbook into a dict keyed by sheet name;
# header=None keeps the first row of each sheet as data, so columns are numbered 0, 1, 2, ...
df_dict = pd.read_excel("beau_monde_1884_tables.xlsx", header = None, sheet_name = None)

In [None]:
df_dict['table_1'].head()

Let's count how many addresses we have to start with.

In [None]:
# Count the rows of every loaded sheet. Iterating over the dict itself avoids
# assuming that the workbook holds exactly 167 sheets named 'table_1' ... 'table_167'.
num_adr = sum(count_elmt(table) for table in df_dict.values())

print('At the beginning, we have %d addresses.' %num_adr)

How many missing (NaN) values do we have?

In [None]:
# Total number of NaN cells over all sheets (per-column counts, summed per sheet).
counter = sum(sheet.isna().sum().sum() for sheet in df_dict.values())
print('There are %d missing values in our data.' %(counter))

Since there are too many missing values to fill in by hand, we will simply drop the corresponding rows (at least for now).

In [None]:
# Drop, in place, every row that contains at least one NaN.
for sheet in df_dict.values():
    sheet.dropna(inplace=True)

# Recount the NaN cells to confirm that none are left.
counter = sum(sheet.isna().sum().sum() for sheet in df_dict.values())
print('Now, there are %d missing values in our data.' %(counter))

Let's now harmonize the format of all the dataframes. We want to end up with a single dataframe with two columns: Names and Addresses.

In [None]:
# Tally how many sheets have each column count. Every loaded sheet is inspected,
# rather than a hard-coded number of tables.
cnt = Counter(str(len(table.columns)) + ' columns' for table in df_dict.values())

cnt

There seem to be different formats for the dataframes.

In [None]:
# Group the tables by their number of columns. `DataFrame.append` was removed in
# pandas 2.0 (and copies the whole frame on every call), so the tables are collected
# in lists and concatenated once per group.
tables_by_width = {2: [], 3: [], 4: [], 5: []}

# The tables are visited by explicit name so that the row order stays 'table_1', 'table_2', ...
for i in range(167):
    table = df_dict['table_' + str(i+1)]
    width = len(table.columns)
    if width in tables_by_width:  # tables with any other width are ignored
        tables_by_width[width].append(table)

def _stack_tables(width):
    """Concatenate all tables with `width` columns, or return an empty frame with those columns."""
    tables = tables_by_width[width]
    if not tables:
        return pd.DataFrame(columns = list(range(width)))
    return pd.concat(tables)

df_2 = _stack_tables(2)
df_3 = _stack_tables(3)
df_4 = _stack_tables(4)
df_5 = _stack_tables(5)

Let's see those with 2 columns.

In [None]:
df_2.head() #Just names, we can get rid of this

Let's see those with 3 columns.

In [None]:
# Merge the two name columns into 'Names' and keep the third column as 'Addresses'.
# The values are cast to str first, as is done for the 4- and 5-column tables, so that
# a non-string cell (e.g. a number read from Excel) cannot break the concatenation.
df_3['Names'] = df_3[0].map(str) + ' ' + df_3[1].map(str)
df_3['Addresses'] = df_3[2]
df_3 = df_3.drop(columns = [0,1,2]).reset_index(drop = True)
df_3.head()

Let's see those with 4 columns.

In [None]:
df_4.head()

In [None]:
# Columns 0-2 hold the parts of the name (joined with single spaces); column 3 is the address.
df_4['Names'] = (
    df_4[0].map(str)
    + ' ' + df_4[1].map(str)
    + ' ' + df_4[2].map(str)
)
df_4['Addresses'] = df_4[3]
df_4.drop(columns = [0,1,2,3], inplace = True)
df_4.reset_index(drop = True, inplace = True)
df_4.head()

Let's see those with 5 columns.

In [None]:
df_5

In [None]:
# Columns 0-2 are joined into the name and column 3 is used as the address.
# NOTE(review): column 4 is dropped without being used — confirm it never holds the address.
df_5['Names'] = (
    df_5[0].map(str)
    + ' ' + df_5[1].map(str)
    + ' ' + df_5[2].map(str)
)
df_5['Addresses'] = df_5[3]
df_5.drop(columns = [0,1,2,3,4], inplace = True)
df_5.reset_index(drop = True, inplace = True)
df_5.head()

We can now combine them all in one single dataframe.

In [None]:
# Stack the harmonized 3-, 4- and 5-column frames (the 2-column tables are left out).
frames_with_addresses = [df_3, df_4, df_5]
df = pd.concat(frames_with_addresses)

# Compare the remaining row count with the initial one.
pre_cleaned = count_elmt(df)
num_lost_adr = num_adr - pre_cleaned
print('Before cleaning the strings, we have %d addresses.' % pre_cleaned)
print('We have therefore lost %d%% addresses due to missing values.' % (100*num_lost_adr/num_adr))

In [None]:
df.to_csv('data/data_to_clean.csv')

In [None]:
# Read back the file written by the previous cell, which saves it under 'data/'.
df = pd.read_csv('data/data_to_clean.csv')

### Spelling Corrector

Spelling Corrector based on the work of Peter Norvig: http://norvig.com/spell-correct.html

In [None]:
# Text handling utilities
from string import punctuation
def lowercase_all(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered
def remove_punct(text):
    """Return `text` without any character that belongs to `string.punctuation`."""
    kept = filter(lambda ch: ch not in punctuation, text)
    return ''.join(kept)

In [None]:
def words(text):
    """Return the list of lower-cased word tokens (runs of word characters) found in `text`."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# list_addresses.txt is a hand-corrected list of addresses.
# It is opened in a `with` block so that the file handle is closed once it has been read.
with open('list_addresses.txt', encoding='utf-8') as address_file:
    WORDS = Counter(words(address_file.read()))

def P(word, N=sum(WORDS.values())): 
    """Probability of `word`: its count in WORDS divided by the total count `N`.

    The lookup is case-insensitive. WORDS only contains lower-cased keys (it is
    built with `words`, which lower-cases its input), while `known` returns
    candidates with their original casing; without the `lower()` every
    capitalized candidate would score 0 and could not be ranked.

    `N` defaults to the total number of word occurrences, computed once when the
    function is defined.
    """
    return WORDS[word.lower()] / N

def correction(word):
    """Return the candidate correction of `word` that has the highest probability `P`."""
    options = candidates(word)
    best = max(options, key=P)
    return best

def candidates(word):
    """Return the possible corrections of `word`.

    The first non-empty set wins: the word itself if it is known, then the known
    words one edit away, then the known words two edits away. If none is found,
    the word is returned unchanged in a one-element list.
    """
    exact = known([word])
    if exact:
        return exact
    one_edit = known(edits1(word))
    if one_edit:
        return one_edit
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits
    return [word]

def known(words):
    """Return the set of purely alphabetic items of `words` whose lower-cased form is in WORDS.

    The items are returned with their original casing.
    """
    return {w for w in words if w.isalpha() and w.lower() in WORDS}

def edits1(word):
    """Return the set of strings that are one edit away from `word`.

    An edit is a deletion, a transposition of two adjacent characters, a
    replacement or an insertion. Replacements and insertions only use the 26
    unaccented lowercase letters.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    edits = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion of a letter at the cut position
        for ch in alphabet:
            edits.add(head + ch + tail)
        if not tail:
            continue
        # Deletion of the character right after the cut
        edits.add(head + tail[1:])
        # Replacement of that character by each letter
        for ch in alphabet:
            edits.add(head + ch + tail[1:])
        # Transposition of the two characters after the cut
        if len(tail) > 1:
            edits.add(head + tail[1] + tail[0] + tail[2:])
    return edits

def edits2(word):
    """Return a generator over the strings that are two edits away from `word`."""
    one_away = edits1(word)
    return (second for first in one_away for second in edits1(first))

### Cleaning the addresses

We will use SpaCy to tokenize the addresses. We just need to add some rules to deal with special cases (like the hyphen in St-Honoré or commas at the end of a word).

In [None]:
nlp = spacy.load('fr_core_news_sm')

infix_re = re.compile(r'''[-~]''') # matches a hyphen or a tilde inside a word
suffix_re = re.compile(r'''[,."']$''') # matches one of , . " ' at the end of a word
def customize_tokenizer(nlp):
    """Build a Tokenizer on `nlp.vocab` that uses only the infix and suffix patterns above."""
    # Adds support to use `-` as the delimiter for tokenization
    return Tokenizer(nlp.vocab, 
                     infix_finditer=infix_re.finditer,
                     suffix_search=suffix_re.search, 
                     token_match=None)

# Replace the pipeline's tokenizer with the customized one.
nlp.tokenizer = customize_tokenizer(nlp)

We can now define some functions to clean the addresses. The first one corrects spelling errors and the second one harmonizes the results by expanding the street-type abbreviations.

In [None]:
def correct_adrs(adrs):
    """Spell-correct the street part of an address and re-append its street number.

    The address is tokenized with `nlp`. The last token is taken as the street
    number, or the one before it when the address ends with a punctuation token,
    and it is appended unchanged at the end of the result. Every other token is
    passed through `correction` and capitalized. Punctuation tokens are dropped,
    except '-', which is glued to the preceding word.

    Returns the corrected address as a string; an empty address gives ''.
    """
    tokens = nlp(adrs)
    if len(tokens) == 0:
        # Nothing to correct; indexing an empty document would raise IndexError.
        return ''

    # Find the street number (skipping one trailing punctuation token, if any).
    if len(tokens) > 1 and tokens[-1].text in punctuation:
        number = tokens[-2]
        n_trailing = 2
    else:
        number = tokens[-1]
        n_trailing = 1

    # Correction of errors
    corrected = ''
    for i in range(len(tokens) - n_trailing):
        text = tokens[i].text
        if text in punctuation:
            if text == '-':
                # Replace the space added after the previous word by the hyphen.
                corrected = corrected[:-1] + text
        elif any(ch.isdigit() for ch in text):
            # Tokens containing digits (e.g. the "4" of "Rue du 4 Septembre") are kept
            # as they are: the spelling corrector only knows alphabetic words and
            # would replace them with an unrelated short word.
            corrected += text.capitalize() + ' '
        else:
            corrected += correction(text).capitalize() + ' '

    return corrected + str(number)

# Street-type abbreviations (lower case, without punctuation) and their expanded form.
# NOTE(review): 'fr' -> 'Faubourg' looks unusual for this abbreviation — confirm against the source data.
convert_adrs = {'av':'Avenue', 
        'r':'Rue', 
        'bd':'Boulevard',
        'pl':'Place',
        'fr':'Faubourg'}

def clean_adrs(adrs):
    """Spell-correct an address and expand its street-type abbreviations.

    The address is first passed through `correct_adrs`. Each whitespace-separated
    token is then compared, lower-cased and without punctuation, with the keys of
    `convert_adrs`; a matching token is replaced by the expanded form.

    Each token is normalized on its own. Stripping the punctuation from the whole
    string before splitting, as was done previously, shifts the positions whenever
    a token consists only of punctuation, so the wrong token could be replaced.
    """
    tokens = correct_adrs(adrs).split()
    for i, token in enumerate(tokens):
        key = lowercase_all(remove_punct(token))
        if key in convert_adrs:
            tokens[i] = convert_adrs[key]
    return ' '.join(tokens)

The cleaning cell below takes a long time to run, so it is left commented out; load the already cleaned data directly instead.

In [None]:
#df.Addresses = df.Addresses.apply(clean_adrs)

In [None]:
# NOTE(review): the cleaning call in the cell above is commented out, so on a fresh run this
# saves the addresses without the clean_adrs pass — confirm that this is intended.
df.to_csv('data/addresses_cleaned.csv')

In [None]:
# NOTE(review): this reads 'addresses_cleaned.csv' from the working directory, whereas the
# cell above writes 'data/addresses_cleaned.csv' — confirm which file holds the cleaned data.
df = pd.read_csv('addresses_cleaned.csv')

In [None]:
df.head()

Let's add some other preprocessing for the addresses.

In [None]:
def remove_accent(string):
    """Return `string` with the diacritical marks removed from its letters.

    The string is decomposed (NFD) so that each accented letter becomes a base
    letter followed by combining marks; the combining marks are dropped and the
    result is recomposed (NFC). This covers every accented letter, upper or lower
    case (é, È, î, ç, ü, ...), rather than a fixed list of lowercase letters.
    """
    decomposed = unicodedata.normalize('NFD', string)
    stripped = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return unicodedata.normalize('NFC', stripped)

def simplest(string): #Return the simplest form (no punctuation, all lowercase, no accents) of a string
    """Return the letters of `string`, lower-cased and without accents.

    Every non-alphabetic character (digits, spaces, punctuation) is discarded.
    A value that is not a str gives ''.

    The letters are lower-cased before the accents are removed, so that an
    upper-case accented letter such as 'É' is also reduced to 'e'. No separate
    punctuation pass is needed: only alphabetic characters are kept.
    """
    if type(string) != str:
        return ''
    letters = ''.join(c for c in string if c.isalpha())
    return remove_accent(lowercase_all(letters))

def simplest_adr(string): #Format: Avenue St-Honoré 21 -> avenuesthonore21
    """Return the simplest form of an address: its simplified letters followed by all its digits.

    A value that is not a str contributes no digits.
    """
    digits = ''
    if type(string) == str:
        digits = ''.join(c for c in string if c.isnumeric())
    return simplest(string) + digits

In [None]:
df['Simplest'] = df['Addresses'].apply(simplest_adr)

Let's take a chunk of those addresses for testing.

In [None]:
df_proto = df.head(100)

In [None]:
df_proto.head()

**Paris street names**

Let's load a list of Paris addresses with the corresponding coordinates.

In [None]:
coord = pd.read_csv('All_nums.csv')
# Build the matching key from the full street name followed by the house number,
# reduced to the same simplest form as the addresses of the directory.
full_address = coord['nom_entier'] + coord['num'].map(str)
coord['Simplest'] = full_address.apply(simplest_adr)
coord.head()

We can now merge with the Paris addresses coordinates database.

In [None]:
# Inner join on the simplified address: only the addresses found in the coordinates table remain.
df_coord = df.merge(coord[['Simplest', 'Y', 'X']], on = 'Simplest')
# Remove the index columns left over from earlier CSV round trips. errors='ignore'
# keeps the cell working when df does not carry these columns.
df_coord = df_coord.drop(columns = ['Unnamed: 0', 'Unnamed: 0.1'], errors = 'ignore')

In [None]:
print("We have %d addresses with coordinates." %count_elmt(df_coord))

In [None]:
df_coord.to_csv('data/adr_coord.csv')

In [None]:
df_coord = pd.read_csv('data/adr_coord.csv')

In [None]:
df_coord.head()

Let's create a second dataframe with all the addresses for which we do not yet have coordinates.

In [None]:
# Keep every row of df whose simplified address has no match in df_coord.
# Dropping all duplicated keys from the concatenation of both frames also removed the
# addresses without coordinates that occur more than once in df (e.g. two people living
# at the same address); filtering with isin keeps them.
# The columns are the sorted union of both frames' columns, as the concatenation produced.
df_no_coord = (
    df[~df['Simplest'].isin(df_coord['Simplest'])]
    .reindex(columns = sorted(set(df.columns) | set(df_coord.columns)))
)
df_no_coord.head()

In [None]:
print("We still have %d addresses without coordinates." %count_elmt(df_no_coord))

In [None]:
df_no_coord.to_csv('data/adr_no_coord.csv')

### Cleaning the names

In [None]:
df.head(10)

Let's define a dict to map all the abbreviations to the corresponding titles (we also include 'Monsieur', 'Madame' and 'Mademoiselle').

In [None]:
# Title abbreviations and their full form. clean_name compares the text of each token
# with these keys, so the match is case-sensitive.
titres = {'Cte':'Comte', 'Cse':'Comtesse', 'Vte':'Vicomte', 'Vse':'Vicomtesse', 'Dc':'Duc', 'Dse':'Duchesse',
         'Bon':'Baron', 'Bne':'Baronne', 'Mis':'Marquis', 'Mse':'Marquise', 'Pce':'Prince', 'M':'Monsieur', 'M°':'Monsieur', 
          'Me':'Madame', 'Mlle':'Mademoiselle'}

TODO: add a regex that detects patterns of the form "' + vowel" and rebuilds the particle "d'" (e.g. "cl ' Adelsward" should become "d'Adelsward").

In [None]:
def clean_name(name):
    """Split a raw name into its titles and the rest of the name.

    The name is tokenized with `nlp`. A trailing punctuation token is ignored.
    Each token whose text is a key of `titres` is replaced by the full title;
    all other tokens are kept as they are.

    Returns a tuple `(titre, cleaned_name)`: the full titles and the remaining
    tokens, each joined with single spaces. Joining avoids the leading, doubled
    and trailing spaces produced by appending a space after every token. An
    empty name gives `('', '')`.
    """
    tokens = list(nlp(name))
    # Ignore a punctuation token at the very end of the name.
    if tokens and tokens[-1].text in punctuation:
        tokens = tokens[:-1]

    titles = []
    name_tokens = []
    for token in tokens:
        text = token.text
        if text in titres:
            # The text is exactly one of the keys, so it can be used for the lookup as is.
            titles.append(titres[text])
        else:
            name_tokens.append(text)

    return (' '.join(titles), ' '.join(name_tokens))
    

In [None]:
test = 'Bon Bne cl\' Adelsward Gustave'
clean_name(test)

In [None]:
df.head(10).Names.apply(clean_name)