In [1]:
import pandas as pd
import re
from collections import Counter
import spacy
from spacy.tokenizer import Tokenizer
from geopy.geocoders import Nominatim
from tqdm.autonotebook import tqdm



### Utility functions

In [2]:
#Function to get the number of elements
def count_elmt(df):
    return len(df.index)

# Text handling utilities
from string import punctuation
def lowercase_all(text):
    return text.lower()
def remove_punct(text):
    return ''.join([ch for ch in text if ch not in punctuation])

# Loading the data cleaned in the respective parser

In [3]:
df_1884 = pd.read_csv('data/data_1884_cleaned.csv')
df_1908 = pd.read_csv('data/data_1908_cleaned.csv')

# Preprocessing

In [4]:
def remove_accent(string):
    string = string.replace('é','e')
    string = string.replace('è','e')
    string = string.replace('ê','e')
    string = string.replace('ë','e')
    string = string.replace('à','a')
    string = string.replace('â','a')
    string = string.replace('ô','o')
    return string

def simplest(string): #Return the simplest form (no punctuation, all lowercase, no accents) of a string
    new_string = ''
    if type(string) == str:
        for c in string:
            if c.isalpha():
                new_string += c
    return remove_punct(lowercase_all(remove_accent(new_string)))

def simplest_adr(string): #Format: Avenue St-Honoré 21 -> avenuesthonore21
    num = ''
    if type(string) == str:
        for c in string:
            if c.isnumeric():
                num += c   
    return(simplest(string)+num)

In [5]:
df_1884['Simplest'] = df_1884['Addresses'].apply(simplest_adr)
df_1908['Simplest'] = df_1908['Addresses'].apply(simplest_adr)

# Getting the coordinates

### Paris street names

In [6]:
coord = pd.read_csv('data/All_nums.csv')
coord['Simplest'] = coord['nom_entier'] + coord['num'].map(lambda x: str(x))
coord['Simplest'] = coord['Simplest'].apply(simplest_adr)
coord.head()

Unnamed: 0,id,type,article,nom,nom_entier,num,debut,fin,source,Y,X,Simplest
0,7646,allee,d',antin,Allée d'Antin,23,,,Vasserot,48.868123,2.309918,alleedantin23
1,7647,allee,d',antin,Allée d'Antin,21,,,Vasserot,48.867949,2.309923,alleedantin21
2,7648,allee,d',antin,Allée d'Antin,19,,,Vasserot,48.867724,2.309931,alleedantin19
3,7649,allee,d',antin,Allée d'Antin,17,,,Vasserot,48.867548,2.309941,alleedantin17
4,7650,allee,d',antin,Allée d'Antin,15,,,Vasserot,48.867392,2.309942,alleedantin15


In [7]:
df_1884_coord = df_1884.merge(coord[['Simplest', 'Y', 'X']], on = 'Simplest')
df_1908_coord = df_1908.merge(coord[['Simplest', 'Y', 'X']], on = 'Simplest')

In [8]:
df_1884_coord = df_1884_coord.reset_index()[["Names", "Addresses"]]
df_1908_coord = df_1908_coord.reset_index()[["Names", "Addresses"]]

In [9]:
print("For the year 1884, we have %d addresses with coordinates." %count_elmt(df_1884_coord))
print("For the year 1908, we have %d addresses with coordinates." %count_elmt(df_1908_coord))

For the year 1884, we have 2765 addresses with coordinates.
For the year 1908, we have 5594 addresses with coordinates.


In [10]:
df_1884_no_coord = pd.concat([df_1884,df_1884_coord], sort = True).drop_duplicates(subset = 'Simplest', keep = False)
df_1908_no_coord = pd.concat([df_1908,df_1908_coord], sort = True).drop_duplicates(subset = 'Simplest', keep = False)

In [11]:
df_1884_no_coord = df_1884_no_coord.reset_index()[["Names", "Addresses"]]
df_1884_no_coord.head()

Unnamed: 0,Names,Addresses
0,M. Adam Alfred.,Rue Monceau 67
1,Mlle Addenet.,Rue Blairault 19
2,Bno d’ Adelsward.,"Rue De La Bienfaisance,44"
3,Bon Bne cl' Adelsward Gustave...,Boulevard Courcelles 65
4,M. Adenis de la Roserie ...,Rue Tronchet 27


In [12]:
df_1908_no_coord = df_1908_no_coord.reset_index()[["Names", "Addresses"]]
df_1908_no_coord.head()

Unnamed: 0,Names,Addresses
0,Michel-Robert ABRADIE-d’ARRAST,5S bis rue Jouffroy
1,Georges ABOILARI,46 avenue de Breteuil
2,Paul ABZAC,181 rue de La Pompe
3,ACHERY de SAS DONNINO,62 avenue de la Grande-Armée
4,ACLOQLE SEBERT,9 place des Etats- Unis


In [13]:
print("For the year 1884, we still have %d addresses without coordinates." %count_elmt(df_1884_no_coord))
print("For the year 1908, we still have %d addresses without coordinates." %count_elmt(df_1908_no_coord))

For the year 1884, we still have 2352 addresses without coordinates.
For the year 1908, we still have 3779 addresses without coordinates.


### Geopy

In [14]:
def get_coord(df):

    notfound_addresses = 0
    people = df.copy()
    failed = pd.DataFrame(columns = ['Names','Addresses'])
    success = pd.DataFrame(columns = ['Names','Addresses'])
    people["latitude"] = "0"
    people["longitude"] = "0"
    geolocator = Nominatim(user_agent="agent")
    coord = None
    for i in tqdm(range(len(people))):
        try:
            coord = geolocator.geocode(people['Addresses'][i] + ", Paris")
        except: 
            try:
                coord = geolocator.geocode(people['Addresses'][i] + ", Paris")
            except:
                print("I give up")
                failed = failed.append(people.iloc[[i]])
                
        if(coord):
            people["latitude"][i] = coord.latitude
            people["longitude"][i] = coord.longitude
            success = success.append(people.iloc[[i]])
        else: 
            notfound_addresses += 1
    print("Missed " + str(notfound_addresses) + "/" + str(len(people)) + \
          " addresses (= " + str(notfound_addresses/len(people)*100) + "%)")

    return (success, failed)

In [15]:
(people_1908 ,failed) = get_coord(df_1908_no_coord)
n_iter = 0
while (len(failed)>0 and n_iter < 10):
    (success,failed) = get_coord(failed)
    people_1908 = people_1908.append(success, sort = False)
    n_iter += 1

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,



Missed 12/30 addresses (= 40.0%)


In [17]:
(people_1884 ,failed) = get_coord(df_1884_no_coord)
n_iter = 0
while (len(failed)>0 and n_iter < 10):
    (success,failed) = get_coord(failed)
    people_1884 = people_1884.append(success, sort = False)
    n_iter += 1

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))


Missed 10/30 addresses (= 33.33333333333333%)


In [19]:
final_1908 = people_1908.append(df_1908_coord, sort = False)
final_1884 = people_1884.append(df_1884_coord, sort = False)

# Final output

In [20]:
final_1884.head()

Unnamed: 0,Addresses,Names,latitude,longitude
0,Rue Monceau 67,M. Adam Alfred.,48.8796,2.30816
2,"Rue De La Bienfaisance,44",Bno d’ Adelsward.,48.8766,2.31487
3,Boulevard Courcelles 65,Bon Bne cl' Adelsward Gustave...,48.8794,2.30436
4,Rue Tronchet 27,M. Adenis de la Roserie ...,48.8727,2.32617
7,Rue St-Placide 62,Mc d' Affry de i.a Monnoye.,48.8471,2.32648


In [21]:
final_1908.head()

Unnamed: 0,Addresses,Names,latitude,longitude
1,46 avenue de Breteuil,Georges ABOILARI,48.8501,2.31147
2,181 rue de La Pompe,Paul ABZAC,48.871,2.28325
3,62 avenue de la Grande-Armée,ACHERY de SAS DONNINO,48.8769,2.2863
4,9 place des Etats- Unis,ACLOQLE SEBERT,48.8678,2.29397
7,47 bis rue Ampère,Hippolytel ADAM,48.8852,2.30681


In [None]:
final_1884.to_csv('data/final_1884.csv')
final_1908.to_csv('data/final_1908.csv')