# Imports

In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd

from utils import load_data_year, load_parquets, info_sum_isna, add_ses_income, add_ses_points

# Loading the dataset

In [2]:
# Columns to load; this list is handed to load_data_year in the next cell.
# Repetitive name families are generated rather than typed out, and the
# resulting order is the same as the hand-written list it replaces.
cols_used = [
    'NU_INSCRICAO', 'NU_ANO', 'NO_MUNICIPIO_RESIDENCIA', 'SG_UF_RESIDENCIA',
    'NU_IDADE', 'TP_SEXO', 'TP_ESTADO_CIVIL', 'TP_COR_RACA',
    'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA',
    # One TP_PRESENCA_* and one NU_NOTA_* column per suffix CN, CH, LC, MT.
    *(f'TP_PRESENCA_{suffix}' for suffix in ('CN', 'CH', 'LC', 'MT')),
    *(f'NU_NOTA_{suffix}' for suffix in ('CN', 'CH', 'LC', 'MT')),
    # NU_NOTA_COMP1 .. NU_NOTA_COMP5, followed by NU_NOTA_REDACAO.
    *(f'NU_NOTA_COMP{number}' for number in range(1, 6)),
    'NU_NOTA_REDACAO',
    # Q001 .. Q025 (zero-padded to three digits).
    *(f'Q{number:03d}' for number in range(1, 26)),
]

In [3]:
# Load the data for year 2020, passing cols_used as the column selection.
# NOTE(review): later cells read df._meta and pass meta= to apply(), which
# suggests load_data_year returns a dask DataFrame — confirm in utils.
df = load_data_year(2020, cols_used)

In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,NU_INSCRICAO,NU_ANO,NO_MUNICIPIO_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,...,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025
0,200006300000.0,2020.0,Belém,PA,11.0,F,1.0,2.0,1.0,11.0,...,,,,,,,,,,
1,200001200000.0,2020.0,Natal,RN,11.0,M,2.0,3.0,1.0,11.0,...,,,,,,,,,,
2,200001900000.0,2020.0,Salvador,BA,4.0,F,2.0,3.0,2.0,0.0,...,B,A,A,B,A,A,A,A,A,A
3,200001900000.0,2020.0,Santana de Parnaíba,SP,2.0,M,1.0,3.0,2.0,0.0,...,,,,,,,,,,
4,200001600000.0,2020.0,Diamantina,MG,4.0,F,1.0,3.0,1.0,1.0,...,A,A,A,B,A,B,B,A,A,B


# Pre-processing

In [5]:
# Per-column summary before cleaning; the rendered table lists a dtype
# ("types") and a "missing" count for each column.
info_sum_isna(df)

Unnamed: 0,types,missing
NU_INSCRICAO,float64,0
NU_ANO,float64,0
NO_MUNICIPIO_RESIDENCIA,object,0
SG_UF_RESIDENCIA,object,0
NU_IDADE,float64,0
TP_SEXO,object,0
TP_ESTADO_CIVIL,float64,0
TP_COR_RACA,float64,0
TP_ST_CONCLUSAO,float64,0
TP_ANO_CONCLUIU,float64,0


In [6]:
# Keep only rows that have no missing value in any column (dropna() defaults).
# df is rebound here, so the unfiltered frame is no longer reachable by name;
# re-run the load cell to get it back.
df = df.dropna()

In [7]:
# Same per-column summary, now on the frame returned by dropna().
info_sum_isna(df)

Unnamed: 0,types,missing
NU_INSCRICAO,float64,0
NU_ANO,float64,0
NO_MUNICIPIO_RESIDENCIA,object,0
SG_UF_RESIDENCIA,object,0
NU_IDADE,float64,0
TP_SEXO,object,0
TP_ESTADO_CIVIL,float64,0
TP_COR_RACA,float64,0
TP_ST_CONCLUSAO,float64,0
TP_ANO_CONCLUIU,float64,0


# Socioeconomic Status (SES)

In [8]:
# Column name -> dtype mapping, passed as `meta=` to the row-wise apply()
# calls further down. Built from the public `dtypes` accessor instead of the
# private `_meta` attribute; the resulting dict is the same.
meta = df.dtypes.to_dict()

In [9]:
# Register the new column in the apply() metadata ('O' is the object dtype
# code) and add it to the frame as an empty-string placeholder.
# NOTE(review): presumably add_ses_income fills it in the next cell — confirm.
meta.update(TP_SES_INCOME='O')
df = df.assign(TP_SES_INCOME='')

In [10]:
# Row-wise pass (axis=1) that calls add_ses_income on every row; `meta`
# declares the column names and dtypes of the result.
# The function is passed directly: wrapping it in `lambda x: f(x)` added
# nothing.
# NOTE(review): the preview below shows TP_SES_INCOME populated afterwards,
# so add_ses_income presumably returns the row with that field set — confirm.
df = df.apply(add_ses_income, axis=1, meta=meta)

In [11]:
# Preview the frame after the add_ses_income pass (new TP_SES_INCOME column).
df.head()

Unnamed: 0,NU_INSCRICAO,NU_ANO,NO_MUNICIPIO_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,...,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025,TP_SES_INCOME
5,200003100000.0,2020.0,Cerro Largo,RS,3.0,F,1.0,3.0,1.0,1.0,...,A,A,B,B,A,D,A,B,B,E
8,200006800000.0,2020.0,João Pessoa,PB,5.0,F,2.0,3.0,1.0,3.0,...,A,A,B,A,A,C,A,A,A,E
11,200006100000.0,2020.0,Eunápolis,BA,7.0,M,1.0,1.0,1.0,0.0,...,B,A,B,A,A,C,A,B,B,E
18,200003300000.0,2020.0,Maceió,AL,7.0,F,1.0,3.0,1.0,3.0,...,A,A,B,B,A,C,A,B,B,E
19,200006000000.0,2020.0,Belém,PA,13.0,M,1.0,2.0,1.0,0.0,...,A,A,B,B,A,C,A,A,A,E


## Points System

In [12]:
# Register the new column in the apply() metadata ('O' is the object dtype
# code) and add it to the frame as an empty-string placeholder.
# NOTE(review): presumably add_ses_points fills it in the next cell — confirm.
meta.update(TP_SES_POINTS='O')
df = df.assign(TP_SES_POINTS='')

In [13]:
# Row-wise pass (axis=1) that calls add_ses_points on every row; `meta`
# declares the column names and dtypes of the result.
# The function is passed directly: wrapping it in `lambda x: f(x)` added
# nothing.
# NOTE(review): the preview below shows TP_SES_POINTS populated afterwards,
# so add_ses_points presumably returns the row with that field set — confirm.
df = df.apply(add_ses_points, axis=1, meta=meta)

In [14]:
# Preview the frame after the add_ses_points pass (new TP_SES_POINTS column).
df.head()

Unnamed: 0,NU_INSCRICAO,NU_ANO,NO_MUNICIPIO_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,...,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025,TP_SES_INCOME,TP_SES_POINTS
5,200003100000.0,2020.0,Cerro Largo,RS,3.0,F,1.0,3.0,1.0,1.0,...,A,B,B,A,D,A,B,B,E,C2
8,200006800000.0,2020.0,João Pessoa,PB,5.0,F,2.0,3.0,1.0,3.0,...,A,B,A,A,C,A,A,A,E,DE
11,200006100000.0,2020.0,Eunápolis,BA,7.0,M,1.0,1.0,1.0,0.0,...,A,B,A,A,C,A,B,B,E,C2
18,200003300000.0,2020.0,Maceió,AL,7.0,F,1.0,3.0,1.0,3.0,...,A,B,B,A,C,A,B,B,E,DE
19,200006000000.0,2020.0,Belém,PA,13.0,M,1.0,2.0,1.0,0.0,...,A,B,B,A,C,A,A,A,E,DE


# Save

In [None]:
#%%time --- Wall time: 12min 42s

# Writes the processed frame as gzip-compressed parquet. Left commented out
# because the write is slow (see the recorded wall time above); uncomment both
# lines to regenerate the files.
# The "Load parquet files" section reads part.0.parquet from this same
# directory, so this cell must have been run at least once beforehand.
# NOTE(review): the preview of the loaded parquet shows columns such as
# TP_FAIXA_ETARIA and NO_MUNICIPIO_PROVA that are not in cols_used, so the
# files on disk may come from an earlier column selection — confirm, and
# re-save if they are stale.
#df.to_parquet('../datasets/enem_2020.parquet.gzip', engine='fastparquet', compression='gzip', write_index=False)

# Load parquet files

In [18]:
# Path of the first part file (part.0) inside the saved parquet directory.
path = '../datasets/enem_2020.parquet.gzip/part.0.parquet'

# NOTE(review): the meaning of the second argument (32) is defined in
# utils.load_parquets — presumably the number of part files to read starting
# from `path`; confirm and consider giving it a named constant.
df_enem_parquet = load_parquets(path, 32)

In [19]:
# Preview the first five rows of the frame read back from parquet.
df_enem_parquet.head(5)

Unnamed: 0,NU_INSCRICAO,NU_ANO,TP_FAIXA_ETARIA,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,NO_MUNICIPIO_PROVA,...,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025,TP_SES_INCOME,TP_SES_POINTS
0,200003100000.0,2020.0,3.0,F,1.0,3.0,1.0,1.0,1.0,Cerro Largo,...,A,B,B,A,D,A,B,B,E,C2
1,200006800000.0,2020.0,5.0,F,2.0,3.0,1.0,3.0,1.0,João Pessoa,...,A,B,A,A,C,A,A,A,E,DE
2,200006100000.0,2020.0,7.0,M,1.0,1.0,1.0,0.0,1.0,Eunápolis,...,A,B,A,A,C,A,B,B,E,C2
3,200003300000.0,2020.0,7.0,F,1.0,3.0,1.0,3.0,1.0,Maceió,...,A,B,B,A,C,A,B,B,E,DE
4,200006000000.0,2020.0,13.0,M,1.0,2.0,1.0,0.0,1.0,Belém,...,A,B,B,A,C,A,A,A,E,DE
