In [26]:
import pandas as pd
import re
from collections import Counter
import spacy
from spacy.tokenizer import Tokenizer
from geopy.geocoders import Nominatim
from tqdm.autonotebook import tqdm
import time
import os

In [27]:
path = os.path.join('.', 'data', 'locations.csv')
locations = pd.read_csv(path)
locations.columns = ["chronoscope", "place"]
locations = locations.set_index("place")
locations.head(5)

Unnamed: 0_level_0,chronoscope
place,Unnamed: 1_level_1
"National Museum of American History, Behring Center",chronoscope_13482
"National Museum of American History, Behring Center",chronoscope_10107
UNIL Lausanne,chronoscope_12692
"Science museum, London",chronoscope_13074
"National Museum of American History, Behring Center",chronoscope_13184


In [28]:
path = os.path.join('.', 'data', 'coordinates.csv')
coordinates = pd.read_csv(path)
coordinates = coordinates.reset_index().drop("Unnamed: 2", axis=1)
coordinates.columns = ["place", "latitude", "longitude"]
coordinates.head(5)

Unnamed: 0,place,latitude,longitude
0,"""Chicago""","""49.33725","11.42854"""
1,"""Munich""","""48.1372","11.5755"""
2,"""Neuchatel""","""46.990277777778","6.9305555555556"""
3,"""University of Chicago""","""41.789722222222","-87.599722222222"""
4,"""Washington""","""38.895","-77.036666666667"""


In [29]:
def clean(df): 
    df["place"] = df["place"].apply(lambda x: x.replace('"', ''))
    df["place"] = df["place"].apply(lambda x: x[1:-1])
    df["latitude"] = df["latitude"].apply(lambda x: float(x.replace('"', '')))
    df["longitude"] = df["longitude"].apply(lambda x: float(x.replace('"', '')))

In [30]:
clean(coordinates)
coordinates = coordinates.set_index("place")
coordinates.head(5)

Unnamed: 0_level_0,latitude,longitude
place,Unnamed: 1_level_1,Unnamed: 2_level_1
Chicago,49.33725,11.42854
Munich,48.1372,11.5755
Neuchatel,46.990278,6.930556
University of Chicago,41.789722,-87.599722
Washington,38.895,-77.036667


In [31]:
def get_coord(places, coordinates):
    completed = places.copy()
    geolocator = Nominatim(user_agent="Hipp")
    nb_failed = 0
    notfound = []
    coord = None
    completed["current_coordinates"] = ""
    all_places = coordinates.index.tolist()
    wiki_coord = 0
    geopy_coord = 0
    
    for i in tqdm(range(len(places))):
        place = completed.index[i]
        if place in all_places:
            completed["current_coordinates"][i] = str(coordinates.loc[place].latitude) + ", " + str(coordinates.loc[place].longitude)
            wiki_coord += 1
        else:
            try:
                coord = geolocator.geocode(place)
            except: 
                try:
                    coord = geolocator.geocode(place)
                except:
                    print("Failed")
                    nb_failed +=1
            
            if(coord):
                completed["current_coordinates"][i] = str(coord.latitude) + ", " + str(coord.longitude)
                geopy_coord += 1
            else: 
                notfound.append(place)
                
    print("Failed " + str(nb_failed) + " requests.")
    print("Missed " + str(len(notfound))+ " places.")
    print(str(wiki_coord)+ " coordinates were already here, found "+ str(geopy_coord) + " more.")
    
    return (completed, notfound)

In [32]:
path = os.path.join(".", "data", "production.csv")
production = pd.read_csv(path)
production.columns = ["chronoscope", "manufacter", "production_place", "place", "time_production"]
production = production.set_index("place")
production.head()

Unnamed: 0_level_0,chronoscope,manufacter,production_place,time_production
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"National Museum of American History, Behring Center",chronoscope_10107,Hipp_manufacture,Neuchatel,1875-1879
UNIL Lausanne,chronoscope_12692,Hipp_manufacture,Neuchatel,1886-1887
"Science museum, London",chronoscope_13074,Hipp_manufacture,Neuchatel,1888
"Dipartimento di Scienze Fisiche e Astronomiche, Palermo, Italy",chronoscope_2505,Hipp_manufacture,Neuchatel,1865
Université de Strasbourg,chronoscope_3716,Hipp_manufacture,Neuchatel,1850-1899


In [33]:
(production_coord, notfound_prod) = get_coord(production, coordinates)

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


Failed 0 requests.
Missed 5 places.
2 coordinates were already here, found 11 more.


In [34]:
production_coord.head()

Unnamed: 0_level_0,chronoscope,manufacter,production_place,time_production,current_coordinates
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"National Museum of American History, Behring Center",chronoscope_10107,Hipp_manufacture,Neuchatel,1875-1879,
UNIL Lausanne,chronoscope_12692,Hipp_manufacture,Neuchatel,1886-1887,"46.5225695, 6.58095049765429"
"Science museum, London",chronoscope_13074,Hipp_manufacture,Neuchatel,1888,"51.4973861, -0.17465652444465"
"Dipartimento di Scienze Fisiche e Astronomiche, Palermo, Italy",chronoscope_2505,Hipp_manufacture,Neuchatel,1865,"38.11023805, 13.3734100676556"
Université de Strasbourg,chronoscope_3716,Hipp_manufacture,Neuchatel,1850-1899,"48.5785039, 7.76360811797386"


In [35]:
notfound_prod

['National Museum of American History, Behring Center',
 'Musée Histoire des Sciences, Genève',
 'National Museum of American History, Behring Center',
 'Musée Histoire des Sciences, Genève',
 'National Museum of American History, Behring Center']

In [36]:
missing_places = pd.DataFrame({"place": notfound_prod}).drop_duplicates()["place"].tolist()
missing_places

['National Museum of American History, Behring Center',
 'Musée Histoire des Sciences, Genève']

In [37]:
renamed_places = pd.DataFrame({"place": ["Behring Center", "Genève"]
                               , "cuurent_coordinates": ""})
renamed_places = renamed_places.reset_index().drop("index", axis=1).set_index("place")
renamed_places

Unnamed: 0_level_0,cuurent_coordinates
place,Unnamed: 1_level_1
Behring Center,
Genève,


In [38]:
(last_places, _) = get_coord(renamed_places, coordinates)
last_places

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))


Failed 0 requests.
Missed 0 places.
0 coordinates were already here, found 2 more.


Unnamed: 0_level_0,cuurent_coordinates,current_coordinates
place,Unnamed: 1_level_1,Unnamed: 2_level_1
Behring Center,,"52.37785035, 9.73754758207386"
Genève,,"46.2017559, 6.1466014"


In [42]:
final = production_coord.copy()

for i in tqdm(range(len(final))):
        place = final.index[i]
        for j in range(len(missing_places)):
            if place == missing_places[j]:
                final["current_coordinates"][i] = str(last_places.loc[last_places.index[j]].current_coordinates)

final

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))




Unnamed: 0_level_0,chronoscope,manufacter,production_place,time_production,current_coordinates
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"National Museum of American History, Behring Center",chronoscope_10107,Hipp_manufacture,Neuchatel,1875-1879,"52.37785035, 9.73754758207386"
UNIL Lausanne,chronoscope_12692,Hipp_manufacture,Neuchatel,1886-1887,"46.5225695, 6.58095049765429"
"Science museum, London",chronoscope_13074,Hipp_manufacture,Neuchatel,1888,"51.4973861, -0.17465652444465"
"Dipartimento di Scienze Fisiche e Astronomiche, Palermo, Italy",chronoscope_2505,Hipp_manufacture,Neuchatel,1865,"38.11023805, 13.3734100676556"
Université de Strasbourg,chronoscope_3716,Hipp_manufacture,Neuchatel,1850-1899,"48.5785039, 7.76360811797386"
Rijksmuseum boerhaave,chronoscope_5370,Hipp_manufacture,Neuchatel,1870-1871,"52.1615338, 4.4889042"
Yale Peabody Museum of Natural History,chronoscope_7001,Hipp_manufacture,Neuchatel,1860-1875,"41.3160498, -72.9211317278171"
Humboldt-universität zu berlin,chronoscope_8319,Hipp_manufacture,Neuchatel,1875,"52.51875685, 13.3935604936378"
"Musée Histoire des Sciences, Genève",chronoscope_9258,Hipp_manufacture,Neuchatel,1880,"46.2017559, 6.1466014"
"National Museum of American History, Behring Center",chronoscope_13184,Hipp_manufacture,Neuchatel,1885-1889,"52.37785035, 9.73754758207386"


In [50]:
neuch_coord = "46.990278, 6.930556"
final["initial_coordinates"] = neuch_coord
final.head()

Unnamed: 0_level_0,chronoscope,manufacter,production_place,time_production,current_coordinates,initial_coordinates
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"National Museum of American History, Behring Center",chronoscope_10107,Hipp_manufacture,Neuchatel,1875-1879,"52.37785035, 9.73754758207386","46.990278, 6.930556"
UNIL Lausanne,chronoscope_12692,Hipp_manufacture,Neuchatel,1886-1887,"46.5225695, 6.58095049765429","46.990278, 6.930556"
"Science museum, London",chronoscope_13074,Hipp_manufacture,Neuchatel,1888,"51.4973861, -0.17465652444465","46.990278, 6.930556"
"Dipartimento di Scienze Fisiche e Astronomiche, Palermo, Italy",chronoscope_2505,Hipp_manufacture,Neuchatel,1865,"38.11023805, 13.3734100676556","46.990278, 6.930556"
Université de Strasbourg,chronoscope_3716,Hipp_manufacture,Neuchatel,1850-1899,"48.5785039, 7.76360811797386","46.990278, 6.930556"


In [51]:
path = os.path.join(".", "data", "production_coordinates.csv")
final.to_csv(path, index= True)