In [1]:
import pandas as pd
import numpy as np
import zipfile

In [2]:
def load_data_year(year, cols):
    file = f'../../../microdados_anos/MICRODADOS_ENEM_{year}.csv'
    df = pd.read_csv(file, encoding='cp1252', sep=';', usecols=cols, nrows=2000)
    
    return df

In [15]:
cols_used = ['NU_INSCRICAO', 'NU_ANO', 'NO_MUNICIPIO_RESIDENCIA', 'SG_UF_RESIDENCIA',
             'NU_IDADE', 'TP_SEXO', 'TP_ESTADO_CIVIL', 'TP_COR_RACA',
             'TP_ST_CONCLUSAO', 'TP_ANO_CONCLUIU', 'TP_ESCOLA',
             'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC',
             'TP_PRESENCA_MT', 'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC',
             'NU_NOTA_MT', 'NU_NOTA_COMP1',
             'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5',
             'NU_NOTA_REDACAO', 'Q001', 'Q002', 'Q003', 'Q004', 'Q005', 'Q006',
             'Q007', 'Q008', 'Q009', 'Q010', 'Q011', 'Q012', 'Q013', 'Q014',
             'Q015', 'Q016', 'Q017', 'Q018', 'Q019', 'Q020', 'Q021', 'Q022',
             'Q023', 'Q024', 'Q025']

In [4]:
def union_datas(years, cols):
    df_final = pd.DataFrame([], columns=cols)
    
    for y in years:
        df_final = pd.concat([df_final, load_data_year(y, cols)])
    
    return df_final

In [5]:
years = [2016, 2017, 2018, 2019]

In [16]:
%%time

df_union = union_datas(years, cols_used)

Wall time: 152 ms


In [7]:
df_union.head()

Unnamed: 0,NU_INSCRICAO,NU_ANO,NO_MUNICIPIO_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,...,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025
0,160000301384,2016,João Pessoa,PB,23,M,0.0,3,1,7,...,A,A,A,C,B,B,C,B,B,B
1,160000000001,2016,Vitorino,PR,20,M,0.0,1,1,3,...,B,A,A,B,B,A,C,B,B,B
2,160000000002,2016,Salvador,BA,21,M,0.0,3,1,1,...,A,A,A,B,A,A,C,B,B,B
3,160000000003,2016,Belém,PA,17,M,0.0,1,1,1,...,A,A,A,B,B,A,B,A,A,B
4,160000000004,2016,Brasília,DF,36,F,2.0,1,1,10,...,A,A,A,A,A,A,A,A,A,A


# Pre-processing

In [17]:
pd.DataFrame({'types': df_union.dtypes, 'missing': df_union.isna().sum()})

Unnamed: 0,types,missing
NU_INSCRICAO,object,0
NU_ANO,object,0
NO_MUNICIPIO_RESIDENCIA,object,0
SG_UF_RESIDENCIA,object,0
NU_IDADE,object,0
TP_SEXO,object,0
TP_ESTADO_CIVIL,float64,267
TP_COR_RACA,object,0
TP_ST_CONCLUSAO,object,0
TP_ANO_CONCLUIU,object,0


In [21]:
df_final = df_union.dropna(how='any')

In [22]:
df_final.head()

Unnamed: 0,NU_INSCRICAO,NU_ANO,NO_MUNICIPIO_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,...,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025
0,160000301384,2016,João Pessoa,PB,23,M,0.0,3,1,7,...,A,A,A,C,B,B,C,B,B,B
1,160000000001,2016,Vitorino,PR,20,M,0.0,1,1,3,...,B,A,A,B,B,A,C,B,B,B
2,160000000002,2016,Salvador,BA,21,M,0.0,3,1,1,...,A,A,A,B,A,A,C,B,B,B
3,160000000003,2016,Belém,PA,17,M,0.0,1,1,1,...,A,A,A,B,B,A,B,A,A,B
5,160000000005,2016,Fortaleza,CE,20,M,0.0,3,1,2,...,A,A,A,B,A,A,C,A,A,B


In [23]:
df_final.isna().sum()

NU_INSCRICAO               0
NU_ANO                     0
NO_MUNICIPIO_RESIDENCIA    0
SG_UF_RESIDENCIA           0
NU_IDADE                   0
TP_SEXO                    0
TP_ESTADO_CIVIL            0
TP_COR_RACA                0
TP_ST_CONCLUSAO            0
TP_ANO_CONCLUIU            0
TP_ESCOLA                  0
TP_PRESENCA_CN             0
TP_PRESENCA_CH             0
TP_PRESENCA_LC             0
TP_PRESENCA_MT             0
NU_NOTA_CN                 0
NU_NOTA_CH                 0
NU_NOTA_LC                 0
NU_NOTA_MT                 0
NU_NOTA_COMP1              0
NU_NOTA_COMP2              0
NU_NOTA_COMP3              0
NU_NOTA_COMP4              0
NU_NOTA_COMP5              0
NU_NOTA_REDACAO            0
Q001                       0
Q002                       0
Q003                       0
Q004                       0
Q005                       0
Q006                       0
Q007                       0
Q008                       0
Q009                       0
Q010          

In [None]:
%%time

# Save
#file = '../datasets/integrated_data.csv'
#df_final.to_csv(file, index=False)

#z = zipfile.ZipFile('../datasets/integrated_data.zip', 'w', zipfile.ZIP_DEFLATED)
#z.write(file)    
#z.close()