In [1]:
import pandas as pd
import re
from collections import Counter
import spacy
from spacy.tokenizer import Tokenizer
from geopy.geocoders import Nominatim
from tqdm.autonotebook import tqdm



### Utility functions

In [2]:
#Function to get the number of elements
def count_elmt(df):
    return len(df.index)

# Text handling utilities
from string import punctuation
def lowercase_all(text):
    return text.lower()
def remove_punct(text):
    return ''.join([ch for ch in text if ch not in punctuation])

# Loading the data cleaned in the respective parser

In [3]:
df_1884 = pd.read_csv('data/data_1884_cleaned.csv')
df_1908 = pd.read_csv('data/data_1908_cleaned.csv')

# Preprocessing

In [4]:
def remove_accent(string):
    string = string.replace('é','e')
    string = string.replace('è','e')
    string = string.replace('ê','e')
    string = string.replace('ë','e')
    string = string.replace('à','a')
    string = string.replace('â','a')
    string = string.replace('ô','o')
    return string

def simplest(string): #Return the simplest form (no punctuation, all lowercase, no accents) of a string
    new_string = ''
    if type(string) == str:
        for c in string:
            if c.isalpha():
                new_string += c
    return remove_punct(lowercase_all(remove_accent(new_string)))

def simplest_adr(string): #Format: Avenue St-Honoré 21 -> avenuesthonore21
    num = ''
    if type(string) == str:
        for c in string:
            if c.isnumeric():
                num += c   
    return(simplest(string)+num)

In [5]:
df_1884['Simplest'] = df_1884['Addresses'].apply(simplest_adr)
df_1908['Simplest'] = df_1908['Addresses'].apply(simplest_adr)

# Getting the coordinates

### Paris street names

In [6]:
coord = pd.read_csv('data/All_nums.csv')
coord['Simplest'] = coord['nom_entier'] + coord['num'].map(lambda x: str(x))
coord['Simplest'] = coord['Simplest'].apply(simplest_adr)
coord.head()

Unnamed: 0,id,type,article,nom,nom_entier,num,debut,fin,source,Y,X,Simplest
0,7646,allee,d',antin,Allée d'Antin,23,,,Vasserot,48.868123,2.309918,alleedantin23
1,7647,allee,d',antin,Allée d'Antin,21,,,Vasserot,48.867949,2.309923,alleedantin21
2,7648,allee,d',antin,Allée d'Antin,19,,,Vasserot,48.867724,2.309931,alleedantin19
3,7649,allee,d',antin,Allée d'Antin,17,,,Vasserot,48.867548,2.309941,alleedantin17
4,7650,allee,d',antin,Allée d'Antin,15,,,Vasserot,48.867392,2.309942,alleedantin15


In [7]:
df_1884_coord = df_1884.merge(coord[['Simplest', 'Y', 'X']], on = 'Simplest')

In [8]:
df_1908_coord = df_1908.merge(coord[['Simplest', 'Y', 'X']], on = 'Simplest')

In [9]:
print("For the year 1884, we have %d addresses with coordinates." %count_elmt(df_1884_coord))
print("For the year 1908, we have %d addresses with coordinates." %count_elmt(df_1908_coord))

For the year 1884, we have 2765 addresses with coordinates.
For the year 1908, we have 5594 addresses with coordinates.


In [10]:
df_1884_no_coord = pd.concat([df_1884,df_1884_coord], sort = True).drop_duplicates(subset = 'Simplest', keep = False)
df_1908_no_coord = pd.concat([df_1908,df_1908_coord], sort = True).drop_duplicates(subset = 'Simplest', keep = False)

In [17]:
df_1884_no_coord.head()

Unnamed: 0.1,Addresses,Names,Simplest,Unnamed: 0,X,Y
1,Rue Monceau 67,M. Adam Alfred.,ruemonceau67,1,,
3,Rue Blairault 19,Mlle Addenet.,rueblairault19,3,,
5,Boulevard Courcelles 65,Bon Bne cl' Adelsward Gustave...,boulevardcourcelles65,5,,
9,Rue Cle Bourgogne 63,Ysed' Adhémard.,rueclebourgogne63,9,,
10,Rue Cle Rennes 127,Gt0 Affrede S1-Rome Denis,rueclerennes127,10,,


In [12]:
print("For the year 1884, we still have %d addresses without coordinates." %count_elmt(df_1884_no_coord))
print("For the year 1908, we still have %d addresses without coordinates." %count_elmt(df_1908_no_coord))

For the year 1884, we still have 1497 addresses without coordinates.
For the year 1908, we still have 2103 addresses without coordinates.


### Geopy

In [20]:
def get_coord(df):

    notfound_addresses = 0
    people = df.copy()
    people["latitude"] = "0"
    people["longitude"] = "0"
    geolocator = Nominatim(user_agent="agent")
    coord = None
    for i in tqdm(range(len(people))):
        try:
            coord = geolocator.geocode(people['Addresses'][i] + ", Paris")
        except: 
            print("Try again...")
            try:
                coord = geolocator.geocode(people['Addresses'][i] + ", Paris")
            except:
                print("I give up")
        if(coord):
            people["latitude"][i] = coord.latitude
            people["longitude"][i] = coord.longitude
        else: 
            notfound_addresses += 1
    print("Missed " + str(notfound_addresses) + "/" + str(len(people)) + \
          " addresses (= " + str(notfound_addresses/len(people)*100) + "%)")

    return people

In [23]:
get_coord(df_1884_no_coord[:30])

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

Try again...
I give up


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up
Try again...
I give up

Missed 15/30 addresses (= 50.0%)


Unnamed: 0.1,Addresses,Names,Simplest,Unnamed: 0,X,Y,latitude,longitude
1,Rue Monceau 67,M. Adam Alfred.,ruemonceau67,1,,,48.8796,2.30816
3,Rue Blairault 19,Mlle Addenet.,rueblairault19,3,,,0.0,0.0
5,Boulevard Courcelles 65,Bon Bne cl' Adelsward Gustave...,boulevardcourcelles65,5,,,0.0,0.0
9,Rue Cle Bourgogne 63,Ysed' Adhémard.,rueclebourgogne63,9,,,0.0,0.0
10,Rue Cle Rennes 127,Gt0 Affrede S1-Rome Denis,rueclerennes127,10,,,0.0,0.0
11,Rue St-Placide 62,Mc d' Affry de i.a Monnoye.,ruestplacide62,11,,,0.0,0.0
14,Rue De Villejust 38,l> Aguado A.,ruedevillejust38,14,,,0.0,0.0
16,Rue Aguesseau 20,Mis Ms° de F Aigle.,rueaguesseau20,16,,,0.0,0.0
17,Rue Astorg 12,Cte CSJ de 1’Aigle.,rueastorg12,17,,,0.0,0.0
21,Rue Du D Septembre 28,C°> d' Aillières.,ruedudseptembre28,21,,,0.0,0.0


# Final output

In [None]:
df_1884.to_csv('data/data_1884_coord.csv')
df_1908.to_csv('data/data_1908_coord.csv')