In [951]:
# Data Manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation

# Sklearn
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.svm import SVR # for building SVR model
from sklearn.preprocessing import MinMaxScaler

# Visualizations
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization

# Other utilities
import sys
import os

import locale
import dask.dataframe as dd
from utils import load_parquets, info_sum_isna

from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

from subprocess import check_output
from random import choices

# evaluate multinomial logistic regression model
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
# Assign main directory to a variable
# main_dir=os.path.dirname(sys.path[0])
#print(main_dir)

In [952]:
year = 2020
path = f'../data/integrated_datas_{year}.parquet.gzip'
df = dd.read_parquet(path, ignore_metadata_file=True)
df.head()

Unnamed: 0,NU_ANO,NO_MUNICIPIO_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,TP_ESTADO_CIVIL,TP_COR_RACA,...,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025,TP_SES_INCOME,TP_SES_POINTS
0,2020,Cerro Largo,RS,3,F,1,1,1,1,3,...,A,B,B,A,D,A,B,B,E,DE
1,2020,João Pessoa,PB,5,F,1,3,1,2,3,...,A,B,A,A,C,A,A,A,E,DE
2,2020,Eunápolis,BA,7,M,1,0,1,1,1,...,A,B,A,A,C,A,B,B,E,DE
3,2020,Maceió,AL,7,F,1,3,1,1,3,...,A,B,B,A,C,A,B,B,E,DE
4,2020,Belém,PA,13,M,1,0,1,1,2,...,A,B,B,A,C,A,A,A,E,DE


In [953]:
df = df[['NU_NOTA_MT', 'NU_ANO', 'SG_UF_RESIDENCIA', 'NU_IDADE',
       'TP_SEXO', 'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA',
       'TP_ESTADO_CIVIL', 'TP_COR_RACA', 'NU_NOTA_CN', 'NU_NOTA_CH',
       'NU_NOTA_LC', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2',
       'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO',
       'Q001', 'Q002', 'Q003', 'Q004', 'TP_SES_INCOME',
       'TP_SES_POINTS']]

In [954]:
df = df.categorize().compute()

In [955]:
df_transformed = dd.get_dummies(df, columns = ['NU_ANO', 'SG_UF_RESIDENCIA', 'NU_IDADE',
'TP_SEXO', 'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA',
'TP_ESTADO_CIVIL', 'TP_COR_RACA'])

In [956]:
def tranform_data(data):
    mapping_dict_q001_q002 = {
        'H':0, 'A':1,'B':2,'C':3,'D':4,
        'E':5,'F':6,'G':7
        }

    data['Q001'] = data.Q001.map(mapping_dict_q001_q002)
    data['Q002'] = data.Q002.map(mapping_dict_q001_q002)

    mapping_dict_q003_q004 = {
        'G':0, 'A':1,'B':2,'C':3,'D':4,
        'E':5,'F':6
        }

    data['Q003'] = data.Q003.map(mapping_dict_q003_q004)
    data['Q004'] = data.Q004.map(mapping_dict_q003_q004)

    mapping_dict_TP_SES_INCOME = {
        'A':0,'B':1,'C':2,'D':3,'E':4
        }

    data['TP_SES_INCOME'] = data.TP_SES_INCOME.map(mapping_dict_TP_SES_INCOME)

    mapping_dict_TP_SES_POINTS = {
        'A':0,'B1':1,'B2':2,'C1':3,'C2':4, 'DE':5
        }

    data['TP_SES_POINTS'] = data.TP_SES_POINTS.map(mapping_dict_TP_SES_POINTS)

    return data

In [957]:
df = tranform_data(df_transformed)

In [958]:
def transform_grade(score):
    if score >= 570: return 2
    elif score >= 450: return 1
    else : return 0

In [959]:
# df['NU_NOTA_MT'].head(30) 

In [960]:
columns_grades = ['NU_NOTA_MT', 'NU_NOTA_CN', 'NU_NOTA_CH',
       'NU_NOTA_LC', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2',
       'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO']

In [961]:
for column in columns_grades:
    df[column] = df[column].apply(transform_grade)

In [962]:
df.head()

Unnamed: 0,NU_NOTA_MT,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO,...,TP_ESTADO_CIVIL_1,TP_ESTADO_CIVIL_2,TP_ESTADO_CIVIL_3,TP_ESTADO_CIVIL_4,TP_COR_RACA_0,TP_COR_RACA_1,TP_COR_RACA_2,TP_COR_RACA_3,TP_COR_RACA_4,TP_COR_RACA_5
0,1,1,2,1,0,0,0,0,0,2,...,1,0,0,0,0,0,0,1,0,0
1,0,0,0,1,0,0,0,0,0,2,...,0,1,0,0,0,0,0,1,0,0
2,1,1,2,2,0,0,0,0,0,2,...,1,0,0,0,0,1,0,0,0,0
3,0,0,1,1,0,0,0,0,0,2,...,1,0,0,0,0,0,0,1,0,0
4,0,1,1,0,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0


In [963]:
df.NU_NOTA_MT.value_counts()

1    861932
0    850324
2    849048
Name: NU_NOTA_MT, dtype: int64

In [964]:
# removed 'Q005', 'Q006', 'Q007', 'Q008', 'Q009',
    #    'Q010', 'Q011', 'Q012', 'Q013', 'Q014', 'Q015', 'Q016', 'Q017', 'Q018',
    #    'Q019', 'Q020', 'Q021', 'Q022', 'Q023', 'Q024', 'Q025', 

In [965]:
num_samples=100000
ids_sample= sorted(choices(range(len(df)), k=num_samples))

In [966]:
df = df.iloc[ids_sample]

In [967]:
#Preprocessing data to encode categorical values for the y-target column
le = preprocessing.LabelEncoder()

y = le.fit_transform(df.NU_NOTA_MT)

print (y)

[2 1 2 ... 1 1 1]


In [968]:
# Extract the training and test data
data = df.values
X = data[:, 1:]  # all rows, no label

In [969]:
X

array([[1, 2, 2, ..., 0, 0, 0],
       [1, 2, 2, ..., 0, 0, 0],
       [2, 2, 2, ..., 0, 0, 0],
       ...,
       [1, 2, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 1, 0, 0],
       [2, 1, 2, ..., 0, 0, 0]])

In [970]:
y

array([2, 1, 2, ..., 1, 1, 1])

In [971]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [977]:
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg=LogisticRegression(multi_class='multinomial', solver='lbfgs')
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

tuned hpyerparameters :(best parameters)  {'C': 0.01, 'penalty': 'l2'}
accuracy : 0.6185733333333333


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [979]:
logreg2=LogisticRegression(multi_class='multinomial', solver='lbfgs', C=0.01, penalty="l2")
logreg2.fit(X_train,y_train)

print("score",logreg2.score(X_test,y_test))

score 0.6232


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


2020

'NU_NOTA_MT', 'NU_ANO', 'SG_UF_RESIDENCIA', 'NU_IDADE',
       'TP_SEXO', 'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA',
       'TP_ESTADO_CIVIL', 'TP_COR_RACA', 'NU_NOTA_CN', 'NU_NOTA_CH',
       'NU_NOTA_LC', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2',
       'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO',
       'Q001', 'Q002', 'Q003', 'Q004', 'TP_SES_INCOME',
       'TP_SES_POINTS'

100000

0.61816


2020

'NU_NOTA_MT', 'NU_ANO', 'SG_UF_RESIDENCIA', 'NU_IDADE',
       'TP_SEXO', 'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA',
       'TP_ESTADO_CIVIL', 'TP_COR_RACA', 'NU_NOTA_CN', 'NU_NOTA_CH',
       'NU_NOTA_LC', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2',
       'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO',
       'Q001', 'Q002', 'Q003', 'Q004', 'TP_SES_INCOME',
       'TP_SES_POINTS'

200000

0.62022


2020

'NU_NOTA_MT', 'NU_ANO', 'SG_UF_RESIDENCIA', 'NU_IDADE',
       'TP_SEXO', 'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA',
       'TP_ESTADO_CIVIL', 'TP_COR_RACA', 'NU_NOTA_CN', 'NU_NOTA_CH',
       'NU_NOTA_LC', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2',
       'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO',
       'Q001', 'Q002', 'Q003', 'Q004', 'TP_SES_INCOME',
       'TP_SES_POINTS'

1000000

0.619228


2020

'NU_NOTA_MT', 'NU_ANO', 'SG_UF_RESIDENCIA', 'NU_IDADE',
       'TP_SEXO', 'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA',
       'TP_ESTADO_CIVIL', 'TP_COR_RACA', 'NU_NOTA_CN', 'NU_NOTA_CH',
       'NU_NOTA_LC', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2',
       'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO',
       'Q001', 'Q002', 'Q003', 'Q004', 'TP_SES_INCOME',
       'TP_SES_POINTS'

ALL_2020_dataset

0.6174760981125239


2020

'NU_NOTA_MT', 'NU_ANO', 'SG_UF_RESIDENCIA', 'NU_IDADE',
       'TP_SEXO', 'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA',
       'TP_ESTADO_CIVIL', 'TP_COR_RACA', 'NU_NOTA_CN', 'NU_NOTA_CH',
       'NU_NOTA_LC', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2',
       'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO',
       'Q001', 'Q002', 'Q003', 'Q004', 'TP_SES_INCOME',
       'TP_SES_POINTS'

ALL_2020_dataset

0.6183006780920969
