In [None]:
!pip install geopy
!pip install tqdm

In [None]:
import pandas as pd
from geopy.geocoders import GoogleV3
from geopy.distance import vincenty
from tqdm import tqdm
import math

In [None]:
# Open the Excel workbook that holds the student records.
# NOTE(review): this path is relative to the working directory ('datasets/...'),
# while the CSV export at the end of the notebook writes to '../datasets/...'.
# Confirm which of the two locations is the intended one.
spreadsheet = pd.ExcelFile('datasets/imd_student_blind.xlsx')

# Parse the first sheet (index 0) of the workbook into a DataFrame.
imd_student_blind_df = spreadsheet.parse(0)

In [None]:
# Preview the first five rows of the raw sheet.
imd_student_blind_df.head(n = 5)

In [None]:
# Show every column label of the raw sheet as a plain Python list.
[column for column in imd_student_blind_df.columns]

In [None]:
# Display the (rows, columns) dimensions of the raw sheet.
imd_student_blind_df.shape

In [None]:
# Print pandas' summary of the raw sheet (column dtypes and non-null counts).
imd_student_blind_df.info()

In [None]:
# Build one row per distinct (student id, zipcode) pair, together with the
# number of raw records that share that pair, and give the key columns
# shorter names.
students_geolocation_df = (
    imd_student_blind_df
        .groupby(['a_ID', 'CEP'])
        .size()
        .reset_index(name = 'total_entries')
        .rename(columns = {'a_ID': 'id', 'CEP': 'zipcode'})
)

In [None]:
# Initialize the GoogleV3 geolocator from geopy (10 second timeout per request).
geolocator = GoogleV3(timeout = 10)

# Geocode every row's zipcode and collect the results in plain lists, so the
# three new columns are created once at the end instead of being written
# cell-by-cell while they do not exist yet.
# Each distinct zipcode is sent to the geocoding service only once; rows that
# share a zipcode reuse the cached answer.
geocoded_by_zipcode = {}
latitudes, longitudes, addresses = [], [], []

for raw_zipcode in tqdm(students_geolocation_df['zipcode'].tolist()):
    # int() keeps the original normalisation of the value (e.g. it drops a
    # trailing ".0" if the column holds floats); str() hands the geocoder the
    # text query it expects.
    zipcode = str(int(raw_zipcode))

    if zipcode not in geocoded_by_zipcode:
        location = geolocator.geocode(zipcode)

        if location:
            geocoded_by_zipcode[zipcode] = (
                location.latitude,
                location.longitude,
                location.address
            )
        else:
            # No match: NaN keeps lat/lng numeric, None marks the missing address.
            geocoded_by_zipcode[zipcode] = (float('nan'), float('nan'), None)

    lat, lng, address = geocoded_by_zipcode[zipcode]
    latitudes.append(lat)
    longitudes.append(lng)
    addresses.append(address)

# The lists are in row order, so they can be assigned positionally.
students_geolocation_df['lat']     = latitudes
students_geolocation_df['lng']     = longitudes
students_geolocation_df['address'] = addresses

print('Geocoding complete!')

In [None]:
# Calculate the distance in kilometers between each student and IMD using the
# Vincenty formula.
# https://en.wikipedia.org/wiki/Vincenty's_formulae
# NOTE: `vincenty` was removed in geopy 2.0 (replaced by `geodesic`), so this
# cell requires geopy < 2.

# (latitude, longitude) pair used as the IMD reference point.
imd_coordinates = (-5.832151, -35.205397)

distances = []
student_coordinates_iter = zip(
    students_geolocation_df['lat'],
    students_geolocation_df['lng']
)

for lat, lng in tqdm(student_coordinates_iter, total = len(students_geolocation_df)):
    # Rows that could not be geocoded have no coordinates; pd.isnull catches
    # both None and NaN, whichever way the missing value ended up stored.
    if pd.isnull(lat) or pd.isnull(lng):
        distances.append(float('nan'))
    else:
        student_coordinates = (lat, lng)
        distances.append(vincenty(imd_coordinates, student_coordinates).kilometers)

# Assign the whole column at once (the removed `.ix` indexer is not needed),
# which also creates the column when the frame has no rows.
students_geolocation_df['distance'] = distances

In [None]:
# Preview the first five rows of the geocoded frame.
students_geolocation_df.head(n = 5)

In [None]:
# Write the geocoded frame to CSV, leaving the row index out of the file.
# NOTE(review): the output goes to '../datasets/', whereas the input workbook
# is read from 'datasets/' - confirm both paths are intended.
students_geolocation_df.to_csv(
    path_or_buf = '../datasets/students_geolocation.csv',
    index = False
)