In [1]:
!pip install geopy
!pip install tqdm



In [2]:
import math
import os

import pandas as pd
from geopy.distance import vincenty
from geopy.geocoders import GoogleV3
from tqdm import tqdm

In [3]:
# Load the first sheet (index 0) of the workbook into a DataFrame.
# The context manager closes the workbook's file handle as soon as parsing is
# done; `spreadsheet` stays bound afterwards but refers to a closed workbook.
with pd.ExcelFile('imd_student_blind.xlsx') as spreadsheet:
    imd_student_blind_df = spreadsheet.parse(0)

In [4]:
# Preview the first rows to sanity-check the parsed sheet
imd_student_blind_df.head()

Unnamed: 0,a_ID,CEP,ano_ingresso,periodo_ingresso,status,ano_disciplina,periodo_disciplina,nota,disciplina_ID,status.disciplina
0,0,59015430,2014,1,CANCELADO,2014,2,2.6,0,Reprovado
1,0,59015430,2014,1,CANCELADO,2015,1,8.0,0,Aprovado
2,1,59073120,2014,1,CANCELADO,2014,2,0.1,0,Reprovado
3,2,59072580,2014,1,ATIVO,2014,2,6.1,0,Aprovado
4,3,59088150,2014,1,ATIVO,2014,1,3.0,0,Reprovado


In [5]:
# Column labels of the loaded sheet, as a plain Python list
imd_student_blind_df.columns.tolist()

['a_ID',
 'CEP',
 'ano_ingresso',
 'periodo_ingresso',
 'status',
 'ano_disciplina',
 'periodo_disciplina',
 'nota',
 'disciplina_ID',
 'status.disciplina']

In [6]:
# (rows, columns) of the loaded frame
imd_student_blind_df.shape

(4842, 10)

In [7]:
# Per-column dtypes and non-null counts, plus memory usage
imd_student_blind_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4842 entries, 0 to 4841
Data columns (total 10 columns):
a_ID                  4842 non-null int64
CEP                   4842 non-null int64
ano_ingresso          4842 non-null int64
periodo_ingresso      4842 non-null int64
status                4842 non-null object
ano_disciplina        4842 non-null int64
periodo_disciplina    4842 non-null int64
nota                  4842 non-null float64
disciplina_ID         4842 non-null int64
status.disciplina     4842 non-null object
dtypes: float64(1), int64(7), object(2)
memory usage: 416.1+ KB


In [8]:
# One row per (a_ID, CEP) pair, with the number of source rows that pair has.
# Columns are renamed to `id` / `zipcode` for the geolocation steps below.
students_geolocation_df = (
    imd_student_blind_df
    .groupby(['a_ID', 'CEP'])
    .size()
    .reset_index(name='total_entries')
    .rename(columns={'a_ID': 'id', 'CEP': 'zipcode'})
)

In [9]:
# Initialize the GoogleV3 geolocator from geopy.
# The API key is read from the GOOGLE_API_KEY environment variable so it never
# ends up in the notebook. When the variable is unset, api_key is None and the
# geolocator is created without a key, as before (a keyless run of this cell
# was rejected with GeocoderQuotaExceeded).
geolocator = GoogleV3(api_key=os.environ.get('GOOGLE_API_KEY'), timeout=10)

# Every request counts against the API quota, so geocode each distinct zipcode
# once instead of once per student row.
geocoded_rows = []
for zipcode in tqdm(students_geolocation_df['zipcode'].unique()):
    location = geolocator.geocode(str(int(zipcode)))

    if location:
        geocoded_rows.append(
            (zipcode, location.latitude, location.longitude, location.address)
        )
    else:
        # No match for this zipcode: keep it, with empty geolocation fields
        geocoded_rows.append((zipcode, None, None, None))

geocoded_df = pd.DataFrame(
    geocoded_rows,
    columns=['zipcode', 'lat', 'lng', 'address']
)

# Attach the results with a single left merge (row order is preserved).
# Any lat/lng/address columns left by a previous run are dropped first so the
# cell can be re-run without producing suffixed duplicate columns.
students_geolocation_df = (
    students_geolocation_df
    .drop(['lat', 'lng', 'address'], axis=1, errors='ignore')
    .merge(geocoded_df, on='zipcode', how='left')
)

print('Geocoding complete!')

  0%|          | 0/900 [00:00<?, ?it/s]


GeocoderQuotaExceeded: The given key has gone over the requests limit in the 24 hour period or has submitted too many requests in too short a period of time.

In [216]:
# Calculate the distance (in kilometers) between each student and IMD using
# Vincenty's formulae
# https://en.wikipedia.org/wiki/Vincenty's_formulae

# (latitude, longitude) of IMD, in the same order as the student coordinates
imd_coordinates = (-5.832151, -35.205397)

# Collect the distances and assign the column once, instead of writing one
# cell at a time through the `.ix` indexer (removed in pandas 1.0).
distances = []
for lat, lng in tqdm(
    zip(students_geolocation_df['lat'], students_geolocation_df['lng']),
    total=len(students_geolocation_df)
):
    # Rows that could not be geocoded hold missing coordinates. pandas stores
    # those as NaN, which an `is None` check would not catch.
    if pd.isnull(lat) or pd.isnull(lng):
        distances.append(float('nan'))
    else:
        distances.append(vincenty(imd_coordinates, (lat, lng)).kilometers)

students_geolocation_df['distance'] = distances

print('Distance complete!')

100%|██████████| 900/900 [00:01<00:00, 589.23it/s]

Distance complete!





In [218]:
# Preview the first rows, now including the lat/lng/address/distance columns
students_geolocation_df.head()

Unnamed: 0,id,zipcode,total_entries,lat,lng,address,distance
0,0,59015430,12,-5.816641,-35.200015,"Nova Descoberta, Natal - RN, 59015-430, Brazil",1.815783
1,1,59073120,6,-5.853337,-35.252804,"Planalto, Natal - RN, 59073-120, Brazil",5.749127
2,2,59072580,12,-5.832998,-35.242542,"Cidade Nova, Natal - RN, 59072-580, Brazil",4.114743
3,3,59088150,9,-5.872282,-35.2066,"Neópolis, Natal - RN, 59088-150, Brazil",4.439972
4,4,59064245,9,,,,


In [221]:
# Persist the geolocated students table as CSV, leaving out the row index
students_geolocation_df.to_csv(
    path_or_buf='students-geolocation.csv',
    index=False,
)