In [1]:
import pandas as pd
import re
from collections import Counter
import spacy
from spacy.tokenizer import Tokenizer
from geopy.geocoders import Nominatim
from tqdm.autonotebook import tqdm
import time
import os



In [2]:
path = os.path.join('.', 'data', 'locations.csv')
locations = pd.read_csv(path)
locations.columns = ["chronoscope", "place"]
locations = locations.set_index("place")
locations.head(5)

Unnamed: 0_level_0,chronoscope
place,Unnamed: 1_level_1
"National Museum of American History, Behring Center",chronoscope_13482
"National Museum of American History, Behring Center",chronoscope_10107
UNIL Lausanne,chronoscope_12692
"Science museum, London",chronoscope_13074
"National Museum of American History, Behring Center",chronoscope_13184


In [3]:
path = os.path.join('.', 'data', 'coordinates.csv')
coordinates = pd.read_csv(path)
coordinates = coordinates.reset_index().drop("Unnamed: 2", axis=1)
coordinates.columns = ["place", "latitude", "longitude"]
coordinates.head(5)

Unnamed: 0,place,latitude,longitude
0,"""Chicago""","""49.33725","11.42854"""
1,"""Munich""","""48.1372","11.5755"""
2,"""Neuchatel""","""46.990277777778","6.9305555555556"""
3,"""University of Chicago""","""41.789722222222","-87.599722222222"""
4,"""Washington""","""38.895","-77.036666666667"""


In [4]:
def clean(df): 
    df["place"] = df["place"].apply(lambda x: x.replace('"', ''))
    df["place"] = df["place"].apply(lambda x: x[1:-1])
    df["latitude"] = df["latitude"].apply(lambda x: float(x.replace('"', '')))
    df["longitude"] = df["longitude"].apply(lambda x: float(x.replace('"', '')))

In [5]:
clean(coordinates)
coordinates = coordinates.set_index("place")
coordinates.head(5)

Unnamed: 0_level_0,latitude,longitude
place,Unnamed: 1_level_1,Unnamed: 2_level_1
Chicago,49.33725,11.42854
Munich,48.1372,11.5755
Neuchatel,46.990278,6.930556
University of Chicago,41.789722,-87.599722
Washington,38.895,-77.036667


In [6]:
def get_coord(places, coordinates):
    completed = places.copy()
    geolocator = Nominatim(user_agent="Hipp")
    nb_failed = 0
    notfound = []
    coord = None
    completed["latitude"] = 0.0
    completed["longitude"] = 0.0
    all_places = coordinates.index.tolist()
    wiki_coord = 0
    geopy_coord = 0
    
    for i in tqdm(range(len(places))):
        place = completed.index[i]
        if place in all_places:
            completed["latitude"][i] = coordinates.loc[place].latitude
            completed["longitude"][i] = coordinates.loc[place].longitude
            wiki_coord += 1
        else:
            try:
                coord = geolocator.geocode(place)
            except: 
                try:
                    coord = geolocator.geocode(place)
                except:
                    print("Failed")
                    nb_failed +=1
            
            if(coord):
                completed["latitude"][i] = coord.latitude
                completed["longitude"][i] = coord.longitude
                geopy_coord += 1
            else: 
                notfound.append(place)
                
    print("Failed " + str(nb_failed) + " requests.")
    print("Missed " + str(len(notfound))+ " places.")
    print(str(wiki_coord)+ " coordinates were already here, found "+ str(geopy_coord) + " more.")
    
    return (completed, notfound)

In [7]:
(completed, notfound) = get_coord(locations, coordinates)
completed

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Failed 0 requests.
Missed 8 places.
3 coordinates were already here, found 13 more.


Unnamed: 0_level_0,chronoscope,latitude,longitude
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"National Museum of American History, Behring Center",chronoscope_13482,0.0,0.0
"National Museum of American History, Behring Center",chronoscope_10107,0.0,0.0
UNIL Lausanne,chronoscope_12692,46.52257,6.58095
"Science museum, London",chronoscope_13074,51.497386,-0.174657
"National Museum of American History, Behring Center",chronoscope_13184,0.0,0.0
"Musée Histoire des Sciences, Genève",chronoscope_13548,0.0,0.0
University of Toronto Scientific Instruments Collection,chronoscope_13919,43.662917,-79.395746
University of Toronto Scientific Instruments Collection,chronoscope_14453,43.662917,-79.395746
"National Museum of American History, Behring Center",chronoscope_14480,0.0,0.0
KU Leuven - Faculty of Psychology and Educational Sciences,chronoscope_15057,0.0,0.0


In [9]:
missing_places = pd.DataFrame({"place": notfound}).drop_duplicates()["place"].tolist()
missing_places

['National Museum of American History, Behring Center',
 'Musée Histoire des Sciences, Genève',
 'KU Leuven - Faculty of Psychology and Educational Sciences',
 'The Barnard College Psychology Department']

In [10]:
renamed_places = pd.DataFrame({"place": ["Behring Center", "Genève", "KU Leuven", "Barnard College"]
                               , "latitude": 0, "longitude": 0})
renamed_places = renamed_places.reset_index().drop("index", axis=1).set_index("place")
renamed_places

Unnamed: 0_level_0,latitude,longitude
place,Unnamed: 1_level_1,Unnamed: 2_level_1
Behring Center,0,0
Genève,0,0
KU Leuven,0,0
Barnard College,0,0


In [11]:
(last_places, _) = get_coord(renamed_places, coordinates)
last_places

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


Failed 0 requests.
Missed 0 places.
0 coordinates were already here, found 4 more.


Unnamed: 0_level_0,latitude,longitude
place,Unnamed: 1_level_1,Unnamed: 2_level_1
Behring Center,52.37785,9.737548
Genève,46.201756,6.146601
KU Leuven,50.873931,4.708316
Barnard College,40.809705,-73.963389


In [19]:
final = completed.copy()

for i in tqdm(range(len(final))):
        place = final.index[i]
        for j in range(len(missing_places)):
            if place == missing_places[j]:
                final["latitude"][i] = last_places.loc[last_places.index[j]].latitude
                final["longitude"][i] = last_places.loc[last_places.index[j]].longitude

final

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,chronoscope,latitude,longitude
place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"National Museum of American History, Behring Center",chronoscope_13482,52.37785,9.737548
"National Museum of American History, Behring Center",chronoscope_10107,52.37785,9.737548
UNIL Lausanne,chronoscope_12692,46.52257,6.58095
"Science museum, London",chronoscope_13074,51.497386,-0.174657
"National Museum of American History, Behring Center",chronoscope_13184,52.37785,9.737548
"Musée Histoire des Sciences, Genève",chronoscope_13548,46.201756,6.146601
University of Toronto Scientific Instruments Collection,chronoscope_13919,43.662917,-79.395746
University of Toronto Scientific Instruments Collection,chronoscope_14453,43.662917,-79.395746
"National Museum of American History, Behring Center",chronoscope_14480,52.37785,9.737548
KU Leuven - Faculty of Psychology and Educational Sciences,chronoscope_15057,50.873931,4.708316


In [20]:
path = os.path.join(".", "data", "exported.csv")
final.to_csv(path, index= True)