In [1]:
import pandas as pd
import re

**Loading the data from the 1884 OCR**

In [2]:
df = pd.read_excel("beau_monde_1884_tables.xlsx",header = None, sheet_name = None)
column_names = ['Gender', 'de', 'Name', 'Adress']

In [172]:
df['table_11'].head()

Unnamed: 0,0,1,2
0,Cte,de Beaucaire.,"B1 Haussman, 111."
1,Bon,de Beaucaire.,". Rue de Rigny, 5."
2,M. Me,de Beauchamp.,". R. de la Bienfaisance, 17."
3,M. Ml!,Beauchamp.,". Rue JoufTroy, 81."
4,Gse,de Beauchamp.,. Rue de l’Université; 70.


**Cleaning the addresses**

In [4]:
# Text handling utilities
from string import punctuation
def lowercase_all(text):
    return text.lower()
def remove_punct(text):
    return ''.join([ch for ch in text if ch not in punctuation])

Spelling Corrector based on the work of Peter Norvig: http://norvig.com/spell-correct.html

In [152]:
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('list_addresses.txt', encoding='utf-8').read())) 
#list_addresses.txt is a hand-corrected list of addresses

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if (w.isalpha() and w.lower() in WORDS))

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

We will use SpaCy to tokenize the addresses. We just need to add some rules to deal with special cases (like the hyphen in St-Honoré or commas at the end of a word).

In [153]:
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load('fr_core_news_sm')

infix_re = re.compile(r'''[-~]''') #find hyphens
suffix_re = re.compile(r'''[,."']$''') #find , or . at end of word
def customize_tokenizer(nlp):
# Adds support to use `-` as the delimiter for tokenization
    return Tokenizer(nlp.vocab, 
                     infix_finditer=infix_re.finditer,
                     suffix_search=suffix_re.search, 
                     token_match=None)

nlp.tokenizer = customize_tokenizer(nlp)

We can now define some functions to clean the addresses. The first one corrects spelling error and the second one harmonizes all the results.

In [158]:
def correct_adrs(adrs):
    clean_adrs = ''
    
    #Tokenize the address using SpaCy tokenizer
    adrs_parts = nlp(adrs)
    #print ([token.text for token in adrs_parts])
    
    #Find the street number
    last = 1
    if str(adrs_parts[-1]) in punctuation:
        number = adrs_parts[-2]
        last = 2
    else:
        number = adrs_parts[-1]
        
    #Correction of errors
    for i in range(len(adrs_parts)-last):
        if str(adrs_parts[i]) in punctuation:
            if str(adrs_parts[i]) == '-':
                clean_adrs = clean_adrs[:-1] + str(adrs_parts[i])
        else:
            clean_adrs += correction(str(adrs_parts[i])).capitalize()
            clean_adrs += ' '
            
    return clean_adrs + str(number)

convert_adrs = {'av':'Avenue', 
        'r':'Rue', 
        'bd':'Boulevard',
        'pl':'Place',
        'fr':'Faubourg'}

def clean_adrs(adrs):
    adrs = correct_adrs(adrs)
    adrs_part_punct = adrs.split()
    adrs_part = remove_punct(adrs).split()
    for i in range(len(adrs_part)):
        if lowercase_all(adrs_part[i]) in convert_adrs:
            adrs_part_punct[i] = convert_adrs[lowercase_all(adrs_part[i])]
    adrs = ' '.join(adrs_part_punct)
    return adrs

Let's have some tests (still need to get rid of NaN values)

In [159]:
test_adrs = 'Ff S\'-llonoré. 21'
clean_adrs(test_adrs)

'Faubourg St-Honoré 21'

In [174]:
for adr in df['table_11'][2]:
        print(adr + ' : ' + clean_adrs(adr))

B1 Haussman, 111. : Boulevard Haussman 111
. Rue de Rigny, 5. : Rue De Rigny 5
. R. de la Bienfaisance, 17. : Rue De La Bienfaisance 17
. Rue JoufTroy, 81. : Rue Jouftroy 81
. Rue de l’Université; 70. : Rue De L’université; 70
. Av. d’Antin , 1. : Avenue Antin 1
. Ba Malesherbes, 81. : La Malesherbes 81
. Rue Bayard, 20. : Rue Bayard 20
. Rue Miromesnil, 20. : Rue Miromesnil 20
. Rue de Sèvres. 85. : Rue De Sèvres 85
. Rue de Sèvres, 85. : Rue De Sèvres 85
. Rue de Sèvres, 85. : Rue De Sèvres 85
. Rue de Sèvres, 85. : Rue De Sèvres 85
. Rue Royale, 8. : Rue Royale 8
. Bd Malesherbes, 8. : Boulevard Malesherbes 8
Rue Sl-Lazare, 89. : Rue L-Lazare 89
Rue Ville-rÉvêque, 25. : Rue Ville-Révêque 25
Rue Barbet-de-Jouy, 30. : Rue Barbet-De-Jouy 30
Rue de Yerneuil, 43. : Rue De Yerneuil 43
Cité Martignac, 6. : Cité Martignac 6
Rue de Grenelle, 125. : Rue De Grenelle 125
Rue Blanche, 44. : Rue Blanche 44
Rue Miromesnil, 72. : Rue Miromesnil 72
Bd Latour-Maubourg, 02. : Boulevard Latour-Maubourg

TypeError: unsupported operand type(s) for +: 'float' and 'str'