In [24]:
!pip install folium



In [27]:
import pandas as pd
import numpy as np
import folium
from folium.plugins import HeatMap

In [2]:
# Open the workbook, keeping the handle available as `spreadsheet`
spreadsheet = pd.ExcelFile('imd_student_blind.xlsx')

# Read the first sheet (position 0) of that workbook into a DataFrame
imd_student_blind_df = pd.read_excel(spreadsheet, sheet_name=0)

In [35]:
# Preview the first rows of the student records DataFrame
imd_student_blind_df.head()

Unnamed: 0,a_ID,CEP,ano_ingresso,periodo_ingresso,status,ano_disciplina,periodo_disciplina,nota,disciplina_ID,status.disciplina
0,0,59015430,2014,1,CANCELADO,2014,2,2.6,0,Reprovado
1,0,59015430,2014,1,CANCELADO,2015,1,8.0,0,Aprovado
2,1,59073120,2014,1,CANCELADO,2014,2,0.1,0,Reprovado
3,2,59072580,2014,1,ATIVO,2014,2,6.1,0,Aprovado
4,3,59088150,2014,1,ATIVO,2014,1,3.0,0,Reprovado


In [4]:
# Column labels of the student records DataFrame
imd_student_blind_df.columns

Index(['a_ID', 'CEP', 'ano_ingresso', 'periodo_ingresso', 'status',
       'ano_disciplina', 'periodo_disciplina', 'nota', 'disciplina_ID',
       'status.disciplina'],
      dtype='object')

In [5]:
# (rows, columns) of the student records DataFrame
imd_student_blind_df.shape

(4842, 10)

In [6]:
# Tally the non-null a_ID values for each admission year (ano_ingresso).
# NOTE(review): this counts rows, so an a_ID that appears in several grade
# records is counted once per record — confirm whether distinct students
# were intended instead.
id_by_year = (
    imd_student_blind_df['a_ID']
    .groupby(imd_student_blind_df['ano_ingresso'])
    .count()
)
id_by_year

ano_ingresso
2014    1600
2015    1980
2016    1262
Name: a_ID, dtype: int64

In [7]:
# The bokeh.charts module was removed from Bokeh, so the bar chart is built
# with the bokeh.plotting interface instead.
from bokeh.io import output_notebook, show
from bokeh.plotting import figure

# A categorical x-axis needs string factors: one per admission year.
years = [str(year) for year in id_by_year.index]
counts = id_by_year.tolist()

p = figure(
    x_range=years,
    title="ID por ano",
    x_axis_label='ano_ingresso',
    y_axis_label='a_ID',
)
p.vbar(x=years, top=counts, width=0.8)

# Anchor the bars at zero so their heights are directly comparable.
p.y_range.start = 0

output_notebook()

show(p)

In [8]:
# Load the previously extracted geolocation data from CSV into a DataFrame
students_geolocation = pd.read_csv('students_geolocation.csv')

In [9]:
# Column dtypes and non-null counts of the geolocation DataFrame
students_geolocation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 7 columns):
id               900 non-null int64
zipcode          900 non-null int64
total_entries    900 non-null int64
lat              736 non-null float64
lng              736 non-null float64
address          736 non-null object
distance         736 non-null float64
dtypes: float64(3), int64(3), object(1)
memory usage: 49.3+ KB


In [10]:
# Preview the first rows of the geolocation DataFrame
students_geolocation.head()

Unnamed: 0,id,zipcode,total_entries,lat,lng,address,distance
0,0,59015430,12,-5.816641,-35.200015,"Nova Descoberta, Natal - RN, 59015-430, Brazil",1.815783
1,1,59073120,6,-5.853337,-35.252804,"Planalto, Natal - RN, 59073-120, Brazil",5.749127
2,2,59072580,12,-5.832998,-35.242542,"Cidade Nova, Natal - RN, 59072-580, Brazil",4.114743
3,3,59088150,9,-5.872282,-35.2066,"Neópolis, Natal - RN, 59088-150, Brazil",4.439972
4,4,59064245,9,,,,


In [22]:
# Group the grade records by student ID.
grouped_df = imd_student_blind_df.groupby('a_ID')

# Per-student summary: mean grade and number of non-null grades.
# Named aggregation is used because passing a renaming dict to
# SeriesGroupBy.agg was deprecated and later removed from pandas.
students_summary_df = (
    grouped_df['nota']
    .agg(grades_mean='mean', grades_amount='count')
    .reset_index()
    .rename(columns={'a_ID': 'id'})
)

# Attach each student's distance and coordinates by joining on `id`.
# A left join keeps every student in the summary; students without a
# geolocation row get NaN in the three added columns. The previous
# positional boolean mask only worked while both frames happened to be
# identically indexed and ordered.
# validate='many_to_one' makes the merge fail loudly if the geolocation
# table holds more than one row for the same id.
students_summary_df = students_summary_df.merge(
    students_geolocation[['id', 'distance', 'lat', 'lng']],
    on='id',
    how='left',
    validate='many_to_one',
)


In [23]:
# Preview the first rows of the per-student summary
students_summary_df.head()

Unnamed: 0,id,grades_mean,grades_amount,distance,lat,lng
0,0,2.916667,12,1.815783,-5.816641,-35.200015
1,1,3.216667,6,5.749127,-5.853337,-35.252804
2,2,4.116667,12,4.114743,-5.832998,-35.242542
3,3,5.588889,9,4.439972,-5.872282,-35.2066
4,4,3.422222,9,,,


In [18]:
# Upper bound (exclusive), in kilometers, on the distances kept for the
# grades-vs-distance comparison.
MAX_DISTANCE_KM = 50

# Keep only the two columns being compared and drop rows where either
# value is missing.
grades_x_distance = students_summary_df[['distance', 'grades_mean']].dropna()

# Keep only entries closer than MAX_DISTANCE_KM.
grades_x_distance = grades_x_distance[
    grades_x_distance['distance'] < MAX_DISTANCE_KM
]

In [19]:
# Column dtypes and non-null counts of the grades-vs-distance DataFrame
grades_x_distance.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 596 entries, 0 to 899
Data columns (total 2 columns):
distance       596 non-null float64
grades_mean    596 non-null float64
dtypes: float64(2)
memory usage: 14.0 KB


In [20]:
# Import from the Bokeh visualization library. The scatter plot is built
# with bokeh.plotting because the bokeh.charts module was removed from Bokeh.
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure

p_grades_x_distance = figure(
    title='Grades Mean vs Distance from IMD',
    x_axis_label='Grades Mean',
    y_axis_label='Distance from IMD in kilometers',
    width=800,
    height=400,
)

# One marker per row of grades_x_distance: mean grade on x, distance on y.
p_grades_x_distance.scatter(
    x='grades_mean',
    y='distance',
    source=ColumnDataSource(grades_x_distance),
)

output_notebook()

show(p_grades_x_distance)

In [38]:
# Build [lat, lng, weight] triples from the per-student summary and show them
# as a heatmap, using each student's mean grade as the weight.
#
# The DataFrame.ix indexer was removed from pandas, so the row-by-row loop is
# replaced by a column selection plus dropna(). Rows missing a coordinate are
# dropped as before. Rows missing the mean grade are dropped too, since a NaN
# weight cannot be rendered (NOTE(review): folium's HeatMap is expected to
# reject NaN values — confirm against the installed version).
coordinates = (
    students_summary_df[['lat', 'lng', 'grades_mean']]
    .dropna()
    .values
    .tolist()
)

f_map = folium.Map(
    location=[-5.791659, -35.28385],
    zoom_start=11,
)

HeatMap(coordinates).add_to(f_map)

f_map