In [23]:
import pandas as pd
import re
from collections import Counter
import spacy
from spacy.tokenizer import Tokenizer
from geopy.geocoders import Nominatim
from tqdm.autonotebook import tqdm



### Utility functions

In [24]:
#Function to get the number of elements
def count_elmt(df):
    return len(df.index)

# Text handling utilities
from string import punctuation
def lowercase_all(text):
    return text.lower()
def remove_punct(text):
    return ''.join([ch for ch in text if ch not in punctuation])

# Loading the data cleaned in the respective parser

In [25]:
df_1884 = pd.read_csv('data/data_1884_cleaned.csv')
df_1908 = pd.read_csv('data/data_1908_cleaned.csv')

# Preprocessing

In [26]:
def remove_accent(string):
    string = string.replace('é','e')
    string = string.replace('è','e')
    string = string.replace('ê','e')
    string = string.replace('ë','e')
    string = string.replace('à','a')
    string = string.replace('â','a')
    string = string.replace('ô','o')
    return string

def simplest(string): #Return the simplest form (no punctuation, all lowercase, no accents) of a string
    new_string = ''
    if type(string) == str:
        for c in string:
            if c.isalpha():
                new_string += c
    return remove_punct(lowercase_all(remove_accent(new_string)))

def simplest_adr(string): #Format: Avenue St-Honoré 21 -> avenuesthonore21
    num = ''
    if type(string) == str:
        for c in string:
            if c.isnumeric():
                num += c   
    return(simplest(string)+num)

In [27]:
df_1884['Simplest'] = df_1884['Addresses'].apply(simplest_adr)
df_1908['Simplest'] = df_1908['Addresses'].apply(simplest_adr)

# Getting the coordinates

### Paris street names

In [28]:
coord = pd.read_csv('data/All_nums.csv')
coord['Simplest'] = coord['nom_entier'] + coord['num'].map(lambda x: str(x))
coord['Simplest'] = coord['Simplest'].apply(simplest_adr)
coord.head()

Unnamed: 0,id,type,article,nom,nom_entier,num,debut,fin,source,Y,X,Simplest
0,7646,allee,d',antin,Allée d'Antin,23,,,Vasserot,48.868123,2.309918,alleedantin23
1,7647,allee,d',antin,Allée d'Antin,21,,,Vasserot,48.867949,2.309923,alleedantin21
2,7648,allee,d',antin,Allée d'Antin,19,,,Vasserot,48.867724,2.309931,alleedantin19
3,7649,allee,d',antin,Allée d'Antin,17,,,Vasserot,48.867548,2.309941,alleedantin17
4,7650,allee,d',antin,Allée d'Antin,15,,,Vasserot,48.867392,2.309942,alleedantin15


In [29]:
df_1884_coord = df_1884.merge(coord[['Simplest', 'Y', 'X']], on = 'Simplest')

In [30]:
df_1908_coord = df_1908.merge(coord[['Simplest', 'Y', 'X']], on = 'Simplest')

In [31]:
print("For the year 1884, we have %d addresses with coordinates." %count_elmt(df_1884_coord))
print("For the year 1908, we have %d addresses with coordinates." %count_elmt(df_1908_coord))

For the year 1884, we have 2765 addresses with coordinates.
For the year 1908, we have 5594 addresses with coordinates.


In [32]:
df_1884_no_coord = pd.concat([df_1884,df_1884_coord], sort = True).drop_duplicates(subset = 'Simplest', keep = False)
df_1908_no_coord = pd.concat([df_1908,df_1908_coord], sort = True).drop_duplicates(subset = 'Simplest', keep = False)

In [42]:
df_1884_no_coord = df_1884_no_coord.reset_index()[["Names", "Addresses"]]
df_1884_no_coord.head()

Unnamed: 0,Names,Addresses
0,M. Adam Alfred.,Rue Monceau 67
1,Mlle Addenet.,Rue Blairault 19
2,Bon Bne cl' Adelsward Gustave...,Boulevard Courcelles 65
3,Ysed' Adhémard.,Rue Cle Bourgogne 63
4,Gt0 Affrede S1-Rome Denis,Rue Cle Rennes 127


In [43]:
df_1908_no_coord = df_1908_no_coord.reset_index()[["Names", "Addresses"]]
df_1908_no_coord.head()

Unnamed: 0,Names,Addresses
0,Michel-Robert ABRADIE-d’ARRAST,5S bis rue Jouffroy
1,René ACOLLS,29 avenue Friedland
2,Achille ADAM,21 avenue d’Antin
3,Hippolytel ADAM,47 bis rue Ampère
4,AGlEN,74 rue Michel-Ange [Ht 239.81]—et fljj le Pri...


In [44]:
print("For the year 1884, we still have %d addresses without coordinates." %count_elmt(df_1884_no_coord))
print("For the year 1908, we still have %d addresses without coordinates." %count_elmt(df_1908_no_coord))

For the year 1884, we still have 1497 addresses without coordinates.
For the year 1908, we still have 2103 addresses without coordinates.


### Geopy

In [45]:
def get_coord(df):

    notfound_addresses = 0
    people = df.copy()
    people["latitude"] = "0"
    people["longitude"] = "0"
    geolocator = Nominatim(user_agent="agent")
    coord = None
    for i in tqdm(range(len(people))):
        try:
            coord = geolocator.geocode(people['Addresses'][i] + ", Paris")
        except: 
            print("Try again...")
            try:
                coord = geolocator.geocode(people['Addresses'][i] + ", Paris")
            except:
                print("I give up")
        if(coord):
            people["latitude"][i] = coord.latitude
            people["longitude"][i] = coord.longitude
        else: 
            notfound_addresses += 1
    print("Missed " + str(notfound_addresses) + "/" + str(len(people)) + \
          " addresses (= " + str(notfound_addresses/len(people)*100) + "%)")

    return people

In [46]:
get_coord(df_1908_no_coord[:30])

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

Try again...
Try again...
Try again...
Try again...

Missed 23/30 addresses (= 76.66666666666667%)


Unnamed: 0,Names,Addresses,latitude,longitude
0,Michel-Robert ABRADIE-d’ARRAST,5S bis rue Jouffroy,0.0,0.0
1,René ACOLLS,29 avenue Friedland,0.0,0.0
2,Achille ADAM,21 avenue d’Antin,0.0,0.0
3,Hippolytel ADAM,47 bis rue Ampère,48.8852,2.30681
4,AGlEN,74 rue Michel-Ange [Ht 239.81]—et fljj le Pri...,0.0,0.0
5,AIGNAN,60 rue François-lor,48.3937,-4.46959
6,ALBERT-LAMBERT,224 rue de Rivoli IJ- 243.89]. ALBERT-MORIX,0.0,0.0
7,ALBIGNAC et Mme née d’OIfFEUILLE,6 impasse des Gendarmes à Versailles — et ^ d...,0.0,0.0
8,ALBRECHT MARTENS,3 rue St Didier,48.8661,2.2887
9,ALBUFÉRA SUCHT,35 rue St-Bominique,0.0,0.0


In [47]:
get_coord(df_1884_no_coord[:30])

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

Try again...
Try again...
Try again...
Try again...

Missed 13/30 addresses (= 43.333333333333336%)


Unnamed: 0,Names,Addresses,latitude,longitude
0,M. Adam Alfred.,Rue Monceau 67,48.8796,2.30816
1,Mlle Addenet.,Rue Blairault 19,0.0,0.0
2,Bon Bne cl' Adelsward Gustave...,Boulevard Courcelles 65,48.8794,2.30436
3,Ysed' Adhémard.,Rue Cle Bourgogne 63,0.0,0.0
4,Gt0 Affrede S1-Rome Denis,Rue Cle Rennes 127,0.0,0.0
5,Mc d' Affry de i.a Monnoye.,Rue St-Placide 62,48.8471,2.32648
6,l> Aguado A.,Rue De Villejust 38,48.8716,2.28836
7,Mis Ms° de F Aigle.,Rue Aguesseau 20,0.0,0.0
8,Cte CSJ de 1’Aigle.,Rue Astorg 12,0.0,0.0
9,C°> d' Aillières.,Rue Du D Septembre 28,0.0,0.0


# Final output

In [None]:
df_1884.to_csv('data/data_1884_coord.csv')
df_1908.to_csv('data/data_1908_coord.csv')