In [1]:
import pandas as pd
import re
from collections import Counter
import spacy
from spacy.tokenizer import Tokenizer
from geopy.geocoders import Nominatim
from tqdm.autonotebook import tqdm
import time



### Utility functions

In [2]:
#Function to get the number of elements
def count_elmt(df):
    return len(df.index)

# Text handling utilities
from string import punctuation
def lowercase_all(text):
    return text.lower()
def remove_punct(text):
    return ''.join([ch for ch in text if ch not in punctuation])

# Loading the data cleaned in the respective parser

In [3]:
df_1884 = pd.read_csv('data/data_1884_cleaned.csv')
df_1908 = pd.read_csv('data/data_1908_cleaned.csv')

# Preprocessing

In [4]:
def remove_accent(string):
    string = string.replace('é','e')
    string = string.replace('è','e')
    string = string.replace('ê','e')
    string = string.replace('ë','e')
    string = string.replace('à','a')
    string = string.replace('â','a')
    string = string.replace('ô','o')
    return string

def simplest(string): #Return the simplest form (no punctuation, all lowercase, no accents) of a string
    new_string = ''
    if type(string) == str:
        for c in string:
            if c.isalpha():
                new_string += c
    return remove_punct(lowercase_all(remove_accent(new_string)))

def simplest_adr(string): #Format: Avenue St-Honoré 21 -> avenuesthonore21
    num = ''
    if type(string) == str:
        for c in string:
            if c.isnumeric():
                num += c   
    return(simplest(string)+num)

In [5]:
df_1884['Simplest'] = df_1884['Addresses'].apply(simplest_adr)
df_1908['Simplest'] = df_1908['Addresses'].apply(simplest_adr)

# Getting the coordinates

### Paris street names

In [6]:
coord = pd.read_csv('data/utils/All_nums.csv')
coord['Simplest'] = coord['nom_entier'] + coord['num'].map(lambda x: str(x))
coord['Simplest'] = coord['Simplest'].apply(simplest_adr)
coord.head()

Unnamed: 0,id,type,article,nom,nom_entier,num,debut,fin,source,Y,X,Simplest
0,7646,allee,d',antin,Allée d'Antin,23,,,Vasserot,48.868123,2.309918,alleedantin23
1,7647,allee,d',antin,Allée d'Antin,21,,,Vasserot,48.867949,2.309923,alleedantin21
2,7648,allee,d',antin,Allée d'Antin,19,,,Vasserot,48.867724,2.309931,alleedantin19
3,7649,allee,d',antin,Allée d'Antin,17,,,Vasserot,48.867548,2.309941,alleedantin17
4,7650,allee,d',antin,Allée d'Antin,15,,,Vasserot,48.867392,2.309942,alleedantin15


In [7]:
df_1884_coord = df_1884.merge(coord[['Simplest', 'Y', 'X']], on = 'Simplest')
df_1908_coord = df_1908.merge(coord[['Simplest', 'Y', 'X']], on = 'Simplest')

In [8]:
df_1884_coord = df_1884_coord.reset_index()[["Names", "Addresses", "Y", "X"]]
df_1884_coord = df_1884_coord.rename(columns = {"Y": "latitude", "X":"longitude"})
df_1908_coord = df_1908_coord.reset_index()[["Names", "Addresses", "Y", "X"]]
df_1908_coord = df_1908_coord.rename(columns = {"Y": "latitude", "X":"longitude"})

In [9]:
print("For the year 1884, we have %d addresses with coordinates." %count_elmt(df_1884_coord))
print("For the year 1908, we have %d addresses with coordinates." %count_elmt(df_1908_coord))

For the year 1884, we have 3502 addresses with coordinates.
For the year 1908, we have 6572 addresses with coordinates.


In [10]:
df_1884_no_coord = pd.concat([df_1884,df_1884_coord], sort = True).drop_duplicates(subset = 'Simplest', keep = False)
df_1908_no_coord = pd.concat([df_1908,df_1908_coord], sort = True).drop_duplicates(subset = 'Simplest', keep = False)

In [11]:
df_1884_no_coord = df_1884_no_coord.reset_index()[["Names", "Addresses"]]
df_1884_no_coord.head()

Unnamed: 0,Names,Addresses
0,M. Adam Alfred.,Rue Monceau 67
1,Mlle Addenet.,Rue Blairault 19
2,Bno d’ Adelsward.,"Rue De La Bienfaisance,44"
3,Bon Bne cl' Adelsward Gustave...,Boulevard Courcelles 65
4,M. Adenis de la Roserie ...,Rue Tronchet 27


In [12]:
df_1908_no_coord = df_1908_no_coord.reset_index()[["Names", "Addresses"]]
df_1908_no_coord.head()

Unnamed: 0,Names,Addresses
0,Michel-Robert ABRADIE-d’ARRAST,5S bis rue Jouffroy
1,Adolphe ABEILLE,27 Faubourg-Saint-Honoré
2,Georges ABOILARI,46 avenue de Breteuil
3,Paul ABZAC,181 rue de La Pompe
4,Jeannede fACHER de MONTGASCON et Mme née COURCY,18 avenue d’Antin


In [13]:
print("For the year 1884, we still have %d addresses without coordinates." %count_elmt(df_1884_no_coord))
print("For the year 1908, we still have %d addresses without coordinates." %count_elmt(df_1908_no_coord))

For the year 1884, we still have 2606 addresses without coordinates.
For the year 1908, we still have 4193 addresses without coordinates.


### Geopy

In [14]:
def get_coord(df):

    notfound_addresses = 0
    people = df.copy()
    failed = pd.DataFrame(columns = ['Names','Addresses'])
    success = pd.DataFrame(columns = ['Names','Addresses'])
    people["latitude"] = "0"
    people["longitude"] = "0"
    geolocator_name = "0"
    geolocator = Nominatim(user_agent=geolocator_name)
    nb_failed = 0
    last_failed = False
    coord = None
    for i in tqdm(range(len(people))):
        if(i%10 == 0):
            # We don't want the server to freeze
            time.sleep(1)
            geolocator_name = str(i)
            geolocator = Nominatim(user_agent=geolocator_name)
        try:
            coord = geolocator.geocode(people['Addresses'][i] + ", Paris")
        except: 
            try:
                coord = geolocator.geocode(people['Addresses'][i] + ", Paris")
            except:
                print("Failed")
                if(last_failed):
                    nb_failed +=1
                else:
                    nb_failed = 1
                    last_failed = True
                if(nb_failed >= 15): # after 15 failures in a row we assume that the server blocks us
                    failed = failed.append(people[i:])
                    return (success, failed)
                failed = failed.append(people.iloc[[i]])
                
                
        if(coord):
            people["latitude"][i] = coord.latitude
            people["longitude"][i] = coord.longitude
            last_failed = False
            success = success.append(people.iloc[[i]])
        else: 
            notfound_addresses += 1
    score = 0
    if(len(people) > 0):
        score = notfound_addresses/len(people)*100
    print("Missed " + str(notfound_addresses) + "/" + str(len(people)) + \
          " addresses (= " + str(score) + "%)")

    return (success, failed)

# 1908

In [15]:
(people_1908 ,failed) = get_coord(df_1908_no_coord)

HBox(children=(IntProgress(value=0, max=4193), HTML(value='')))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed


In [16]:
failed = failed.reset_index().drop("index", axis=1)
failed.to_csv('data/failed_1908.csv', index = None, header = True)

people_1908 = people_1908.reset_index().drop("index", axis=1)
people_1908.to_csv('data/people_1908_temporary.csv', index = None, header = True)

In [22]:
failed_ = pd.read_csv('data/failed_1908.csv')
people_1908 = pd.read_csv('data/people_1908_temporary.csv')

(success,failed) = get_coord(failed_)
people_1908 = people_1908.append(success, sort = False)
print("Still " + str(len(failed))+ " failed")

failed = failed.reset_index().drop("index", axis=1)
failed.to_csv('data/failed_1908.csv', index = None, header = True)
people_1908.to_csv('data/people_1908_temporary.csv', index = None, header = True)

HBox(children=(IntProgress(value=0, max=317), HTML(value='')))


Missed 130/317 addresses (= 41.00946372239748%)
Still 0 failed


# 1884

In [15]:
(people_1884 ,failed) = get_coord(df_1884_no_coord)

failed = failed.reset_index().drop("index", axis=1)
failed.to_csv('data/failed_1884.csv', index = None, header = True)

people_1884 = people_1884.reset_index().drop("index", axis=1)
people_1884.to_csv('data/people_1884_temporary.csv', index = None, header = True)

HBox(children=(IntProgress(value=0, max=2606), HTML(value='')))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed
Failed

In [50]:
failed_ = pd.read_csv('data/failed_1884.csv')
people_1884 = pd.read_csv('data/people_1884_temporary.csv')

(success,failed) = get_coord(failed_)
people_1884 = people_1884.append(success, sort = False)
print("Still " + str(len(failed))+ " failed")

failed = failed.reset_index().drop("index", axis=1)
failed.to_csv('data/failed_1884.csv', index = None, header = True)
people_1884.to_csv('data/people_1884_temporary.csv', index = None, header = True)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Missed 0/0 addresses (= 0%)
Still 0 failed


# Final output

In [23]:
final_1908 = people_1908.append(df_1908_coord, sort = False)
final_1908 = final_1908.reset_index()[["Addresses", "Names", "latitude", "longitude"]]
final_1908.to_csv('data/final_1908.csv')

In [52]:
final_1884 = people_1884.append(df_1884_coord, sort = False)
final_1884 = final_1884.reset_index()[["Addresses", "Names", "latitude", "longitude"]]
final_1884.to_csv('data/final_1884.csv', header=True, index=None)

# Statistics and post-adjustments

In [24]:
found = len(final_1908)/len(df_1908)*100
print("We found %0.2f%% of addresses coordinates of 1908." %found)

We found 91.72% of addresses coordinates of 1908.


In [25]:
# We noticed that some addresses from 1908 and 1884 were only numbers and were still found with geopy. 
# Here is an adjustment
final_1908 = pd.read_csv('data/final_1908.csv')
final_1908 = final_1908[[len(x.replace(' ', '')) > 3 for x in final_1908["Addresses"]]]
final_1908 = final_1908.reset_index()[["Addresses", "Names", "latitude", "longitude"]]
final_1908.to_csv('data/final_1908.csv', header=True, index=None)



In [None]:
final_1884 = pd.read_csv('data/final_1884.csv')
final_1884 = final_1884[[len(x.replace(' ', '')) > 4 for x in final_1884["Addresses"]]]
final_1884 = final_1884.reset_index()[["Addresses", "Names", "latitude", "longitude"]]
final_1884.to_csv('data/final_1884.csv', header=True, index=None)

In [68]:
found = len(final_1884)/len(df_1884)*100
print("We found %0.2f%% of addresses coordinates of 1884." %found)

We found 106.31% of addresses coordinates of 1884.


Hum... after a quick look it seems that we have duplicate people and addresses with slightly different coordinates... Let's fix that.

In [71]:
final_1884 = final_1884.drop_duplicates()
final_1884 = final_1884.groupby(["Addresses", "Names"]).mean().reset_index().sort_values("Addresses")



In [26]:
final_1908 = final_1908.drop_duplicates()
final_1908 = final_1908.groupby(["Addresses", "Names"]).mean().reset_index().sort_values("Addresses")

In [72]:
final_1884.head()

Unnamed: 0,Addresses,Names,latitude,longitude
0,-Cité De Varenne 4,Gse de Rougé Louis.,48.853836,2.321465
1,-Rue Royale 6,Bon de Noirmont-Dunoyer .,48.863103,2.336913
2,Av.bois-De-Boulogne 62,M. Verberckmoës.,48.868152,2.288788
3,Av.champs-Elysées 129,Mis Mse Amelot de Chaillou. ..,48.862215,2.318372
4,Avenue Antin 14,M. Me Aclocque ..,48.872149,2.329517


In [27]:
final_1908.head()

Unnamed: 0,Addresses,Names,latitude,longitude
0,0 avenue des Chasseurs,BARRIAS,48.886772,2.305545
1,0 place du Palais-Bourbon,VALDRIMEY d’AYOLT,48.86037,2.318281
2,0 rue ' de Phalsbourg,Henri DESAINS et Mme née de BENNETOT,48.882707,2.308982
3,0 rue Daubigny,Fernand LE ESNE,48.885405,2.30808
4,0 rue Monsieur,HAVENEAU et Mme née de MAN VILLE,48.849934,2.316642


In [85]:
# We also group df_1884 and df_1908 by Addresses and Names just in case
found = len(final_1884)/len(df_1884.groupby(["Addresses", "Names"]))*100
print("We found %0.2f%% of addresses coordinates of 1884." %found)

We found 64.03% of addresses coordinates of 1884.
We found 63.46% of addresses coordinates of 1908.


In [28]:
found = len(final_1908)/len(df_1908.groupby(["Addresses", "Names"]))*100
print("We found %0.2f%% of addresses coordinates of 1908." %found)

We found 64.20% of addresses coordinates of 1908.


In [86]:
final_1884.to_csv('data/final_1884.csv', header=True, index=None)

In [29]:
final_1908.to_csv('data/final_1908.csv', header=True, index=None)