# Big-Data

Description

In [None]:
# Install the third-party library this notebook needs that may be missing from the environment.
# NOTE(review): the folium version is not pinned, so the installed version can differ between runs.
!pip install folium

## Treatment of data

In [None]:
import pandas as pd
import numpy as np
import folium
from folium.plugins import HeatMap

In [None]:
# Open the workbook through a context manager so its file handle is closed
# as soon as the sheet has been read (the bare ExcelFile was never closed).
with pd.ExcelFile('datasets/imd_student_blind.xlsx') as spreadsheet:
    # Load the first sheet (index 0) into a DataFrame
    imd_student_blind_df = spreadsheet.parse(0)

In [None]:
# Replace each numeric 'disciplina_ID' with its abbreviated subject name

classes_map = {
    0: 'ITP',
    1: 'PLE1',
    2: 'RPMTI',
    3: 'CDI1',
    4: 'PLE2',
    5: 'FMC1',
    6: 'VGA'
}

# Assign the result back to the column. Calling replace(inplace=True) on the
# column selection mutates an intermediate object, which pandas does not
# guarantee to propagate to the frame (and never does under copy-on-write).
# Re-running this cell is harmless: already-replaced names are not keys of
# classes_map.
imd_student_blind_df['disciplina_ID'] = imd_student_blind_df['disciplina_ID'].replace(classes_map)

In [None]:
# Show the first rows of imd_student_blind_df
imd_student_blind_df.head()

In [None]:
# List the column names of imd_student_blind_df
imd_student_blind_df.columns

In [None]:
# (number of rows, number of columns) of imd_student_blind_df
imd_student_blind_df.shape

In [None]:
# Load the previously extracted geolocation data.
# Later cells read its 'id', 'distance', 'lat' and 'lng' columns.
students_geolocation = pd.read_csv('datasets/students_geolocation.csv')

In [None]:
# Column dtypes and non-null counts of the geolocation data
students_geolocation.info()

In [None]:
# Show the first rows of the geolocation data
students_geolocation.head()

In [None]:
# Per-student summary: mean grade and number of recorded grades.
grouped_df = imd_student_blind_df.groupby('a_ID')

# Aggregate with a list of functions and rename afterwards. Passing a
# {new_name: func} dict to a single-column agg() is the deprecated
# "nested renamer" form, which pandas >= 1.0 rejects.
students_summary_df = (
    grouped_df['nota']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'grades_mean', 'count': 'grades_amount'})
    .reset_index()
    .rename(columns={'a_ID': 'id'})
)

# Attach each student's geolocation by joining on 'id'. The previous code
# compared the two 'id' columns element-wise, which matches rows by position
# or label rather than by student and fails when the frames differ in length.
# A left join keeps every student; those without geolocation get NaN.
# drop_duplicates guards against a student appearing twice in the geolocation
# file, which would otherwise duplicate summary rows.
geolocation_columns = ['id', 'distance', 'lat', 'lng']
students_summary_df = students_summary_df.merge(
    students_geolocation[geolocation_columns].drop_duplicates(subset='id'),
    on='id',
    how='left'
)


In [None]:
# Show the first rows of the per-student summary
students_summary_df.head()

In [None]:
# Pair each student's distance with their mean grade, discarding students
# for whom either value is missing
grades_x_distance = students_summary_df[['distance', 'grades_mean']].dropna()

# Keep only the students located less than 30 km away
within_range = grades_x_distance['distance'] < 30
grades_x_distance = grades_x_distance[within_range]

In [None]:
# Column dtypes and non-null counts after the NaN and distance filters
grades_x_distance.info()

## Analysis

In [None]:
# Import modules from Bokeh visualization library
from bokeh.charts import Scatter, BoxPlot, Bar
from bokeh.layouts import gridplot
from bokeh.io import output_notebook, show
from bokeh.plotting import figure, ColumnDataSource

In [None]:
# Count distinct students ('a_ID') for each 'ano_ingresso' value
imd_student_blind_df[['a_ID', 'ano_ingresso']].drop_duplicates().groupby('ano_ingresso').count()

# NOTE(review): 899 -- presumably the total number of distinct students across all years; confirm

In [None]:
# Distinct (student, 'ano_ingresso') pairs, counted per 'ano_ingresso'
students_and_years = imd_student_blind_df[['a_ID', 'ano_ingresso']].drop_duplicates()
id_by_year = students_and_years.groupby('ano_ingresso').count()

# Bar chart of the number of students per year
p = Bar(
    id_by_year,
    values='a_ID',
    xlabel='Ano',
    ylabel='Quantidade de estudantes',
    title="Estudantes por ano",
    color='a_ID'
)

output_notebook()
show(p)

In [None]:
# Chart: mean grade of each subject per academic term
graph_lines = figure(plot_width=900, plot_height=300, title="Média de nota de cada disciplina, por período letivo")
colors = ['orange', 'blue', 'red', 'green', 'yellow', 'pink', 'purple', 'brown']

# Columns that identify an academic term
ano_periodo_select = ['ano_disciplina', 'periodo_disciplina']

# Draw one line per subject, in order of first appearance in the data
for i, disciplina_ID in enumerate(imd_student_blind_df['disciplina_ID'].unique()):
    # Rows of this subject
    data_disciplina = imd_student_blind_df[imd_student_blind_df['disciplina_ID'] == disciplina_ID]

    # Mean grade for every (year, period) in which the subject actually has
    # rows. Grouping on the real combinations avoids the NaN points that the
    # year x period cross product produced for terms with no data.
    media_periodos = (
        data_disciplina
        .groupby(ano_periodo_select)['nota']
        .mean()
        .reset_index()
        .dropna(subset=['nota'])
        .sort_values(ano_periodo_select)
    )

    # X: the year, plus 0.5 when the period is 2 (any other period maps to .0)
    xline = [
        float(ano) + (0.5 if periodo == 2 else 0.0)
        for ano, periodo in zip(media_periodos['ano_disciplina'], media_periodos['periodo_disciplina'])
    ]
    # Y: mean grade of that term
    yline = [float(media) for media in media_periodos['nota']]

    # Data source used to draw the line
    line_disciplina_ds = ColumnDataSource(
        data=dict(
            x=xline,
            y=yline
        )
    )
    title = '{0}'.format(disciplina_ID)
    # Cycle through the palette so more subjects than colors cannot raise IndexError
    color = colors[i % len(colors)]
    graph_lines.line('x', 'y', line_width=3, source=line_disciplina_ds, legend=title, color=color)
    graph_lines.circle(xline, yline, color=color, size=6, alpha=1.0)

output_notebook()
show(graph_lines)

In [None]:
# Scatter plot of each student's mean grade (x) against distance (y)
scatter_options = dict(
    x='grades_mean',
    y='distance',
    xlabel='Grades Mean',
    ylabel='Distance from IMD in kilometers',
    title='Grades Mean vs Distance from IMD',
    plot_width=800,
    plot_height=400,
)
p_grades_x_distance = Scatter(grades_x_distance, **scatter_options)

output_notebook()

show(p_grades_x_distance)

In [None]:
# Build the heat-map points as [lat, lng, weight], weighting each student's
# location by their mean grade, then show the heat map.
#
# Column selection + dropna replaces the row-by-row `.ix` lookups: `.ix` was
# deprecated in pandas 0.20 and removed in 1.0. Rows missing a coordinate are
# dropped as before; rows with a missing weight are dropped too, since a NaN
# weight is not a usable heat-map value.
coordinates = (
    students_summary_df[['lat', 'lng', 'grades_mean']]
    .dropna()
    .values
    .tolist()
)

f_map = folium.Map(
    location = [-5.791659, -35.28385],
    zoom_start = 11
)

HeatMap(coordinates).add_to(f_map)

f_map

### RPMTI-dependent subjects analysis

In [None]:
def get_group_by_grade(grade):
    """Map a numeric grade to the label of its one-point-wide grade band.

    Bands are half-open: [5, 6) -> '5 - 6', [6, 7) -> '6 - 7',
    [7, 8) -> '7 - 8', [8, 9) -> '8 - 9'. Any grade of 9 or more maps to
    '9 - 10'. Grades below 5 (and NaN, for which every comparison is False)
    yield None.
    """
    # Written as a negation so that NaN falls out here as well
    if not grade >= 5:
        return None

    # (exclusive upper bound, label), in ascending order
    bands = (
        (6, '5 - 6'),
        (7, '6 - 7'),
        (8, '7 - 8'),
        (9, '8 - 9'),
    )
    for upper_bound, label in bands:
        if grade < upper_bound:
            return label

    return '9 - 10'

In [None]:
# Put the current working directory on sys.path (once) so that the local
# `lib` package, and SubjectConnections inside it, can be imported.

import os
import sys
module_path = os.path.abspath('.')
if module_path not in sys.path:
    sys.path.append(module_path)

from lib.subject_connections import SubjectConnections

In [None]:
# Boolean masks: rows of the RPMTI subject, and rows whose status is 'Aprovado'
RPMTI_student = imd_student_blind_df['disciplina_ID'] == 'RPMTI'
has_approved_status = imd_student_blind_df['status.disciplina'] == 'Aprovado'

# Keep only the rows that satisfy both masks
RPMTI_students_df = imd_student_blind_df[RPMTI_student & has_approved_status]

# NOTE(review): SubjectConnections is a project helper not visible here;
# presumably it keys the frame by 'a_ID' -- confirm in lib/subject_connections.
RPMTI_connections = SubjectConnections(RPMTI_students_df, 'a_ID')
# Presumably derives a 'grade_group' column by applying get_group_by_grade to 'nota' -- confirm.
RPMTI_connections.parse_column('nota', 'grade_group', get_group_by_grade)

In [None]:
def generate_RPMTI_boxplot(df, subject_key):
    """Build a box plot of one subject's grades, one box per RPMTI grade group.

    Parameters:
        df: data passed straight to BoxPlot; it must provide a
            '<subject_key>_grade' column and a 'grade_group' column.
        subject_key: subject abbreviation used for the value column name,
            the y-axis label and the chart title.

    Returns:
        The BoxPlot chart object.
    """
    grade_column = '{}_grade'.format(subject_key)
    y_axis_label = '{} grades'.format(subject_key)
    chart_title = '{} student grades (first enrollment) grouped by RPMTI grades'.format(subject_key)

    boxplot = BoxPlot(
        df,
        values=grade_column,
        label='grade_group',
        color='grade_group',
        xlabel='RPMTI grade groups',
        ylabel=y_axis_label,
        title=chart_title,
    )
    return boxplot

def RPMTI_dependent_subjects(list):
    """Build one RPMTI-grouped box plot for each subject abbreviation given.

    Parameters:
        list: iterable of subject abbreviations (values of 'disciplina_ID').
            The name shadows the builtin but is kept for compatibility.

    Returns:
        A Python list of box plots, in the same order as the input.
    """
    def _boxplot_for(subject):
        # Name of the column that will hold this subject's grades
        grade_column = '{}_grade'.format(subject)
        # Rows of this subject only
        enrolled = imd_student_blind_df[imd_student_blind_df['disciplina_ID'] == subject]
        # NOTE(review): obtain_values_from is a project helper; presumably it
        # pairs these grades with each student's RPMTI 'grade_group' -- confirm.
        grades = RPMTI_connections.obtain_values_from(enrolled, grade_column, ['grade_group'])
        return generate_RPMTI_boxplot(grades, subject)

    return [_boxplot_for(subject) for subject in list]
    

# One box plot per subject, each grouped by RPMTI grade band
boxplots = RPMTI_dependent_subjects(['ITP', 'FMC1', 'CDI1', 'VGA'])

# Lay the plots out two per row
grid = gridplot(boxplots, ncols=2, plot_width=460, plot_height=400)

output_notebook()
show(grid)

In [None]:
# Count the 'a_ID' entries in each RPMTI grade group.
# count() + reset_index(name=...) replaces agg({'amount': 'count'}): the
# {new_name: func} dict on a single column is the deprecated "nested renamer"
# form, which pandas >= 1.0 rejects.
# NOTE(review): assumes base_df is a pandas DataFrame exposed by SubjectConnections -- confirm.
RPMTI_connections.base_df.groupby('grade_group')['a_ID'].count().reset_index(name='amount')

In [None]:
# All rows of the ITP subject
ITP_students_df = imd_student_blind_df[imd_student_blind_df['disciplina_ID'] == 'ITP']

In [None]:
# Total ITP grade records per student.
# count() + reset_index(name=...) replaces agg({'total': 'count'}): the
# {new_name: func} dict on a single column is the deprecated "nested renamer"
# form, which pandas >= 1.0 rejects.
# NOTE(review): count() skips rows whose 'nota' is missing, so an enrollment
# without a recorded grade is not counted -- confirm this is intended.
ITP_enrollments = ITP_students_df.groupby('a_ID')['nota'].count().reset_index(name='total')

# Number of students for each total
ITP_enrollments.groupby('total', as_index = False)['a_ID'].count()