In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import zipfile
import json
import dask.dataframe as dd

In [2]:
cols_used = ['NU_INSCRICAO', 'NU_ANO', 'NO_MUNICIPIO_PROVA', 'SG_UF_PROVA',
             'TP_FAIXA_ETARIA', 'TP_SEXO', 'TP_ESTADO_CIVIL', 'TP_COR_RACA',
             'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA',
             'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC',
             'TP_PRESENCA_MT', 'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC',
             'NU_NOTA_MT', 'NU_NOTA_COMP1',
             'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5',
             'NU_NOTA_REDACAO', 'Q001', 'Q002', 'Q003', 'Q004', 'Q005', 'Q006',
             'Q007', 'Q008', 'Q009', 'Q010', 'Q011', 'Q012', 'Q013', 'Q014',
             'Q015', 'Q016', 'Q017', 'Q018', 'Q019', 'Q020', 'Q021', 'Q022',
             'Q023', 'Q024', 'Q025']

In [3]:
file = '../../../microdados_anos/MICRODADOS_ENEM_2020.csv'
df = dd.read_csv(file, encoding='cp1252', sep=';', usecols=cols_used, assume_missing=True)

In [4]:
type(df)

dask.dataframe.core.DataFrame

In [5]:
pd.DataFrame({'types': df.dtypes, 'missing': df.isna().compute().sum()})

Unnamed: 0,types,missing
NU_INSCRICAO,float64,0
NU_ANO,float64,0
TP_FAIXA_ETARIA,float64,0
TP_SEXO,object,0
TP_ESTADO_CIVIL,float64,0
TP_COR_RACA,float64,0
TP_ST_CONCLUSAO,float64,0
TP_ANO_CONCLUIU,float64,0
TP_ESCOLA,float64,0
NO_MUNICIPIO_PROVA,object,0


In [6]:
df = df.dropna()

In [7]:
pd.DataFrame({'types': df.dtypes, 'missing': df.isna().sum()})

Unnamed: 0,types,missing
NU_INSCRICAO,float64,0
NU_ANO,float64,0
TP_FAIXA_ETARIA,float64,0
TP_SEXO,object,0
TP_ESTADO_CIVIL,float64,0
TP_COR_RACA,float64,0
TP_ST_CONCLUSAO,float64,0
TP_ANO_CONCLUIU,float64,0
TP_ESCOLA,float64,0
NO_MUNICIPIO_PROVA,object,0


In [8]:
def add_class(x):
    income_dict = {'A': 0, 'B': 1, 'C': 1.5, 'D': 2, 'E': 2.5, 'F': 3, 'G': 4, 'H': 5, 'I': 6, 
                   'J': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 12, 'O': 15, 'P': 20}

    if(x['Q006'] == 'Q'): x['TP_SES_INCOME'] = 'A'
    else: 
        per_capita = income_dict[x['Q006']]/x['Q005']
        
        if (per_capita <= 1): x['TP_SES_INCOME'] = 'E'
        elif (per_capita <= 4):  x['TP_SES_INCOME'] = 'D'
        elif (per_capita <= 10): x['TP_SES_INCOME'] = 'C'
        else: x['TP_SES_INCOME'] = 'B'
        
    return x

In [11]:
meta = dict(zip(df.columns, df._meta.dtypes))

In [12]:
meta['TP_SES_INCOME'] = 'O'
df['TP_SES_INCOME'] = ''

In [13]:
df = df.apply(lambda x: add_class(x), axis=1, meta=meta)

In [None]:
df.head()

In [14]:
with open("../datasets/sistema_pontos.json", encoding='utf-8') as json_:
    sistema_pontos = json.load(json_)

In [15]:
def add_class_point(x):
    points = 0 
    for index in x.index[25:]:
        if(index in sistema_pontos):
            if(index in ['Q001', 'Q002']):
                pm = sistema_pontos['Q001'][x['Q001']]
                pp = sistema_pontos['Q002'][x['Q002']]
                
                points += max(pm, pp)
            else:
                points += sistema_pontos[index][x[index]]
    
    if (points <= 16): x['TP_SES_POINTS'] = 'DE'
    elif (points <= 22):  x['TP_SES_POINTS'] = 'C2'
    elif (points <= 28): x['TP_SES_POINTS'] = 'C1'
    elif (points <= 37): x['TP_SES_POINTS'] = 'B2'
    elif (points <= 44): x['TP_SES_POINTS'] = 'B1'
    else: x['TP_SES_POINTS'] = 'A'
    
    return x

In [16]:
meta['TP_SES_POINTS'] = 'O'
df['TP_SES_POINTS'] = ''

In [17]:
df = df.apply(lambda x: add_class_point(x), axis=1, meta=meta)

In [18]:
df.head()

Unnamed: 0,NU_INSCRICAO,NU_ANO,TP_FAIXA_ETARIA,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,NO_MUNICIPIO_PROVA,...,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025,TP_SES_INCOME,TP_SES_POINTS
5,200003100000.0,2020.0,3.0,F,1.0,3.0,1.0,1.0,1.0,Cerro Largo,...,A,B,B,A,D,A,B,B,E,C2
8,200006800000.0,2020.0,5.0,F,2.0,3.0,1.0,3.0,1.0,João Pessoa,...,A,B,A,A,C,A,A,A,E,DE
11,200006100000.0,2020.0,7.0,M,1.0,1.0,1.0,0.0,1.0,Eunápolis,...,A,B,A,A,C,A,B,B,E,C2
18,200003300000.0,2020.0,7.0,F,1.0,3.0,1.0,3.0,1.0,Maceió,...,A,B,B,A,C,A,B,B,E,DE
19,200006000000.0,2020.0,13.0,M,1.0,2.0,1.0,0.0,1.0,Belém,...,A,B,B,A,C,A,A,A,E,DE


In [26]:
%%time

df.to_parquet('../datasets/enem_2020.parquet.gzip', engine='fastparquet', compression='gzip', write_index=False)

Wall time: 12min 42s


(None,)

In [36]:
df_parquet = dd.read_parquet(f'../datasets/enem_2020.parquet.gzip/part.0.parquet')

for p in range(1, 32):
    df_ = dd.read_parquet(f'../datasets/enem_2020.parquet.gzip/part.{p}.parquet')
    df_parquet = dd.concat([df_parquet, df_])

In [37]:
df_parquet.head()

Unnamed: 0,NU_INSCRICAO,NU_ANO,TP_FAIXA_ETARIA,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,NO_MUNICIPIO_PROVA,...,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025,TP_SES_INCOME,TP_SES_POINTS
0,200003100000.0,2020.0,3.0,F,1.0,3.0,1.0,1.0,1.0,Cerro Largo,...,A,B,B,A,D,A,B,B,E,C2
1,200006800000.0,2020.0,5.0,F,2.0,3.0,1.0,3.0,1.0,João Pessoa,...,A,B,A,A,C,A,A,A,E,DE
2,200006100000.0,2020.0,7.0,M,1.0,1.0,1.0,0.0,1.0,Eunápolis,...,A,B,A,A,C,A,B,B,E,C2
3,200003300000.0,2020.0,7.0,F,1.0,3.0,1.0,3.0,1.0,Maceió,...,A,B,B,A,C,A,B,B,E,DE
4,200006000000.0,2020.0,13.0,M,1.0,2.0,1.0,0.0,1.0,Belém,...,A,B,B,A,C,A,A,A,E,DE
