# Big-Data

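Description: exploratory analysis of IMD student data — records per admission year, mean grade per course and academic term, and mean grade versus distance from IMD (scatter plot and heat map).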

In [None]:
# Install necessary libraries to run this IPython Notebook
!pip install folium

## Treatment of data

In [None]:
import pandas as pd
import numpy as np
import folium
from folium.plugins import HeatMap

In [None]:
# Open the Excel workbook; the handle stays bound to `spreadsheet`
spreadsheet = pd.ExcelFile('imd_student_blind.xlsx')

# Parse the first sheet (index 0) into a DataFrame
imd_student_blind_df = spreadsheet.parse(0)

In [None]:
# Preview the first rows of imd_student_blind_df
imd_student_blind_df.head()

In [None]:
# List the column names
imd_student_blind_df.columns

In [None]:
# (number of rows, number of columns)
imd_student_blind_df.shape

In [None]:
# Number of non-null 'a_ID' entries for each 'ano_ingresso' value.
# NOTE(review): count() counts rows, not distinct ids — the same 'a_ID' is
# grouped over several rows elsewhere in this notebook, so nunique() may be
# what was intended here; confirm.
id_by_year = imd_student_blind_df.groupby('ano_ingresso')['a_ID'].count()
id_by_year

In [None]:
from bokeh.charts import Bar, Histogram, output_notebook, show

# Bar chart of id_by_year; the chart options are collected in one mapping
# and expanded into the Bar call.
bar_options = {
    'values': 'a_ID',
    'title': "ID por ano",
    'color': 'a_ID',
}
p = Bar(id_by_year, **bar_options)

# Render inline in the notebook.
output_notebook()

show(p)

In [None]:
# Load the previously extracted geolocation data from CSV
students_geolocation = pd.read_csv('students_geolocation.csv')

In [None]:
# Summary of the geolocation DataFrame (columns, dtypes, non-null counts)
students_geolocation.info()

In [None]:
# Preview the first rows of the geolocation data
students_geolocation.head()

In [None]:
# Per-student summary: mean grade and number of grade entries, one row per
# 'a_ID' (exposed as 'id').
grouped_df = imd_student_blind_df.groupby('a_ID')

# A list of aggregations followed by rename() replaces the dict-renaming
# form of SeriesGroupBy.agg, which pandas deprecated and later removed.
students_summary_df = (
    grouped_df['nota']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'grades_mean', 'count': 'grades_amount'})
    .reset_index()
    .rename(columns={'a_ID': 'id'})
)

# Attach distance / lat / lng by matching on the student id.
# The previous boolean mask compared the two 'id' Series by row label, which
# does not pair rows by id and raises when the frames are labelled
# differently. A left merge keeps every student and leaves NaN where no
# geolocation row exists; duplicate ids in the geolocation data are collapsed
# first so the summary keeps exactly one row per student.
geolocation_columns = ['id', 'distance', 'lat', 'lng']
students_summary_df = students_summary_df.merge(
    students_geolocation[geolocation_columns].drop_duplicates(subset='id'),
    on='id',
    how='left',
)


In [None]:
# Preview the per-student summary
students_summary_df.head()

In [None]:
# Upper bound (exclusive), in kilometres, on the distance of the students
# kept for the grades-vs-distance plot. The filter has always used 50; the
# earlier comment mentioning 300 km did not match the code.
MAX_DISTANCE_KM = 50

# Keep only the two columns of interest and drop students missing either one.
grades_x_distance = students_summary_df[['distance', 'grades_mean']].dropna()

# Keep only entries closer than MAX_DISTANCE_KM.
grades_x_distance = grades_x_distance[grades_x_distance['distance'] < MAX_DISTANCE_KM]

In [None]:
# Summary of the filtered grades/distance data (dtypes, non-null counts)
grades_x_distance.info()

## Graphics

In [None]:
# Import modules from Bokeh visualization library
#import matplotlib.pyplot as plt
#from bokeh.models import HoverTool
from bokeh.charts import Scatter
#from bokeh.layouts import row, gridplot
from bokeh.io import output_notebook, show
from bokeh.plotting import figure, ColumnDataSource

In [None]:
# Line chart with the mean grade of every course in each academic term.
graph_lines = figure(plot_width=900, plot_height=300, title="Média de nota de cada disciplina, por período letivo")
colors = ['orange', 'blue', 'red', 'green', 'yellow', 'pink', 'purple', 'brown']

# Columns that identify an academic term.
ano_periodo_select = ['ano_disciplina', 'periodo_disciplina']

# Draw one line (plus point markers) per course, in order of first appearance.
for i, disciplina_ID in enumerate(imd_student_blind_df['disciplina_ID'].unique()):
    # Rows belonging to this course.
    data_disciplina = imd_student_blind_df[imd_student_blind_df['disciplina_ID'] == disciplina_ID]

    # Mean grade per (year, term). groupby only yields terms in which the
    # course actually has rows, sorted by year and then term; dropna() removes
    # terms whose grades are all missing. The former nested year x term loop
    # produced a NaN point for every combination the course never ran in.
    term_means = data_disciplina.groupby(ano_periodo_select)['nota'].mean().dropna()

    # X: the year, shifted by half a unit for the second term (any other term
    # value sits on the year itself, as before). Computed arithmetically
    # rather than by parsing "<year>.<digit>", which fails for float years.
    xline = [float(ano) + (0.5 if periodo == 2 else 0.0) for ano, periodo in term_means.index]
    # Y: the mean grade of that term.
    yline = [float(media) for media in term_means]

    # Data source backing the line glyph.
    line_disciplina_ds = ColumnDataSource(data=dict(x=xline, y=yline))

    # Wrap around the palette so more courses than colours cannot raise
    # an IndexError.
    color = colors[i % len(colors)]
    title = '{0}'.format(disciplina_ID)
    graph_lines.line('x', 'y', line_width=3, source=line_disciplina_ds, legend=title, color=color)
    graph_lines.circle(xline, yline, color=color, size=6, alpha=1.0)

output_notebook()
show(graph_lines)

In [None]:
# Scatter chart of grades_x_distance: 'grades_mean' on the x axis against
# 'distance' on the y axis. Options are gathered in one mapping and expanded
# into the Scatter call.
scatter_options = dict(
    x='grades_mean',
    y='distance',
    xlabel='Grades Mean',
    ylabel='Distance from IMD in kilometers',
    title='Grades Mean vs Distance from IMD',
    plot_width=800,
    plot_height=400,
)
p_grades_x_distance = Scatter(grades_x_distance, **scatter_options)

# Render inline in the notebook.
output_notebook()

show(p_grades_x_distance)

In [None]:
# Build the heat-map input: one [lat, lng, weight] entry per student, using
# the student's mean grade as the weight.
# Column selection + dropna() replaces the row-by-row loop over the `.ix`
# indexer, which is deprecated and was removed in pandas 1.0. Rows missing
# any of the three values are dropped: previously only lat/lng were checked,
# so a NaN mean grade could be passed on as a heat-map weight.
coordinates = (
    students_summary_df[['lat', 'lng', 'grades_mean']]
    .dropna()
    .values
    .tolist()
)

# Base map with its initial centre and zoom level.
f_map = folium.Map(
    location = [-5.791659, -35.28385],
    zoom_start = 11
)

HeatMap(coordinates).add_to(f_map)

# Last expression of the cell: displays the map in the notebook.
f_map