In [4]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim

In [14]:
def start(): 
    global API_URL, geolocator
    API_URL = "http://universities.hipolabs.com/search?country="
    pd.options.display.max_columns = None
    geolocator = Nominatim(user_agent='geo')

In [15]:
def get_universities_by_country(country):

    response = requests.get(url= API_URL + country)

    return pd.json_normalize(response.json())

In [16]:
def get_data():
    df_uni = get_universities_by_country('Canada')
    df_uni = pd.concat([df_uni, get_universities_by_country('United States')])
    df_uni = pd.concat([df_uni, get_universities_by_country('Argentina')])
    return df_uni

In [17]:
def standarize_data(df_uni):

    #Dict comprehension para modificar las columnas, rename para usar el dict, inplace=True para sobreescribir
    nuevas_columnas = {col: col.replace("-", "_").lower() for col in df_uni.columns}
    df_uni.rename(columns = nuevas_columnas, inplace = True)

    #Eliminamos domains al ser redundante
    df_uni = df_uni.drop('domains', axis=1)

    #Eliminamos los duplicados
    df_uni.explode('web_pages')
    df_uni.drop_duplicates(subset = ['name'], inplace = True)

    #Sustituimos None por np.NaN
    df_uni['state_province'].isnull().sum()
    df_uni['state_province'].fillna(np.NaN, inplace=True)

    #Sustituimos nulos por Unknown
    df_uni['state_province'].fillna('Unknown', inplace=True)

    #Estandarizamos los estados y usamos el replace con inplace=True
    new_states = {
        'NV':'Nevada', 
        'TX':'Texas',
        'IN':'Indianapolis',
        'CA':'California',
        'VA':'Virginia',
        'NY':'New York',
        'MI':'Michigan',
        'GA':'Georgia',
        'ND':'North Dakota',
        'New York, NY': 'New York',
        'Ciudad Autónoma de Buenos Aires': 'Buenos Aires'   
    }

    df_uni['state_province'].replace(new_states, inplace = True)

    #Hacemos un return de df_uni, el parámetro de la función, siendo el dataframe ahora modificado
    return df_uni


In [18]:
def geolocalize_data(df_uni):

    #Usando unique() en la columna state_province conseguimos los estados, lo convertimos a lista y
    #almacenamos en una variable, creamos un dataframe a partir de ella
    states_list = df_uni['state_province'].unique().tolist()
    df_geo = pd.DataFrame(states_list, columns= ['state'])

    #Recuperamos la latitud y longitud para cada estado iterando por el dataframe en sus filas usando 
    #iterrows
    for i, row in df_geo.iterrows():
        location = geolocator.geocode(row['state'])
        df_geo.at[i, 'latitude'] = location.latitude
        df_geo.at[i, 'longitude'] = location.longitude

    #Unimos los dataframes usando merge, con un inner, en las columnas state_province y state
    df_uni = df_uni.merge(df_geo, how = 'inner', left_on = 'state_province', right_on = 'state')

    #Eliminamos la columna state de df_geo ya que en df_uni tenemos state_province con la misma información
    df_uni = df_uni.drop('state', axis = 1)

    return df_uni


In [24]:
def main():
    start()
    data = get_data()
    data = standarize_data(data)
    data = geolocalize_data(data)

    return data

In [25]:
main()

Unnamed: 0,state_province,name,country,web_pages,alpha_two_code,latitude,longitude
0,Quebec,Cégep de Saint-Jérôme,Canada,"[https://www.cstj.qc.ca, https://ccmt.cstj.qc....",CA,52.476089,-71.825867
1,Quebec,Concordia University,Canada,[http://www.concordia.ca/],CA,52.476089,-71.825867
2,Quebec,"École nationale d'administration publique, Uni...",Canada,[http://www.enap.uquebec.ca/],CA,52.476089,-71.825867
3,Quebec,"École de technologie supérieure, Université du...",Canada,[http://www.etsmtl.ca/],CA,52.476089,-71.825867
4,Quebec,École des Hautes Études Commerciales,Canada,[http://www.hec.ca/],CA,52.476089,-71.825867
...,...,...,...,...,...,...,...
2502,La Rioja,Universidad Nacional de La Rioja,Argentina,[http://www.unlar.edu.ar/],AR,42.281464,-2.482805
2503,La Pampa,Universidad Nacional de La Pampa,Argentina,[http://www.unlpam.edu.ar/],AR,-37.231464,-65.397295
2504,San Juan,Universidad Nacional de San Juan,Argentina,[http://www.unsj.edu.ar/],AR,18.465299,-66.116666
2505,San Luis,Universidad Nacional de San Luis,Argentina,[http://www.unsl.edu.ar/],AR,-33.276220,-65.951555
