## **Criação da base de dados de treinamento e teste.**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


# Importando base de dados completa

In [2]:
df_completo = pd.read_csv("/content/drive/MyDrive/MICRODADOS_ENEM_2021.csv", header = 0, delimiter = ';', encoding='latin-1')

In [3]:
df_completo.sample(4)

Unnamed: 0,NU_INSCRICAO,NU_ANO,TP_FAIXA_ETARIA,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,...,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025
2470767,210053468425,2021,4,F,1,1,1,1,2,1,...,B,A,B,D,B,B,E,A,C,B
2527046,210053986629,2021,3,M,1,1,1,2,0,2,...,A,A,A,B,A,A,C,A,B,B
2660187,210053623550,2021,1,F,1,1,1,3,0,1,...,B,A,A,E,B,A,E,A,C,B
776525,210052193687,2021,2,F,1,3,1,2,0,2,...,A,A,A,B,A,A,D,A,A,B


# Criação de dataframe com os campos importantes para o estudo. Isto reduzirá o tamanho da base de dados.

In [4]:
df_enem_2021 = df_completo[["NU_INSCRICAO", "TP_FAIXA_ETARIA","TP_SEXO","Q006", "Q025", 
                        "NU_NOTA_REDACAO", "NU_INSCRICAO", "TP_DEPENDENCIA_ADM_ESC",
                         "CO_MUNICIPIO_PROVA"]].query("NU_NOTA_REDACAO > 0")

In [5]:
print(len(df_enem_2021))

2293797


In [6]:
df_enem_2021.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2293797 entries, 0 to 3389830
Data columns (total 9 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   NU_INSCRICAO            int64  
 1   TP_FAIXA_ETARIA         int64  
 2   TP_SEXO                 object 
 3   Q006                    object 
 4   Q025                    object 
 5   NU_NOTA_REDACAO         float64
 6   NU_INSCRICAO            int64  
 7   TP_DEPENDENCIA_ADM_ESC  float64
 8   CO_MUNICIPIO_PROVA      int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 175.0+ MB


In [7]:
df_enem_2021.isnull().mean()

NU_INSCRICAO              0.000000e+00
TP_FAIXA_ETARIA           0.000000e+00
TP_SEXO                   0.000000e+00
Q006                      4.359584e-07
Q025                      4.359584e-07
NU_NOTA_REDACAO           0.000000e+00
NU_INSCRICAO              0.000000e+00
TP_DEPENDENCIA_ADM_ESC    7.352547e-01
CO_MUNICIPIO_PROVA        0.000000e+00
dtype: float64

# O campo TP_DEPENDENCIA_ADM_ESC será retirado do estudo por possuir muitos dados faltantes, cerca de 74%.

In [8]:
df_enem_2021.drop(["TP_DEPENDENCIA_ADM_ESC"], axis = 1, inplace = True)

In [9]:
df_enem_2021.isna().sum()

NU_INSCRICAO          0
TP_FAIXA_ETARIA       0
TP_SEXO               0
Q006                  1
Q025                  1
NU_NOTA_REDACAO       0
NU_INSCRICAO          0
CO_MUNICIPIO_PROVA    0
dtype: int64

In [10]:
df_enem_2021.fillna(axis=0, method='ffill')

Unnamed: 0,NU_INSCRICAO,TP_FAIXA_ETARIA,TP_SEXO,Q006,Q025,NU_NOTA_REDACAO,NU_INSCRICAO.1,CO_MUNICIPIO_PROVA
0,210053865474,5,F,D,B,760.0,210053865474,3144805
1,210052384164,12,M,B,A,560.0,210052384164,2704302
3,210052128335,3,M,B,B,780.0,210052128335,2304202
4,210051353021,2,F,C,B,780.0,210051353021,2300150
6,210052615524,9,F,C,B,480.0,210052615524,4321600
...,...,...,...,...,...,...,...,...
3389793,210054306230,15,F,D,B,240.0,210054306230,3530706
3389807,210051254419,14,M,E,B,320.0,210051254419,3167202
3389814,210051121001,10,M,B,B,640.0,210051121001,3303203
3389815,210051173067,5,F,A,B,540.0,210051173067,1304237


In [11]:
df_enem_2021.describe()

Unnamed: 0,NU_INSCRICAO,TP_FAIXA_ETARIA,NU_NOTA_REDACAO,NU_INSCRICAO.1,CO_MUNICIPIO_PROVA
count,2293797.0,2293797.0,2293797.0,2293797.0,2293797.0
mean,210052700000.0,4.53246,636.1228,210052700000.0,3121965.0
std,996879.4,3.54592,152.717,996879.4,989632.8
min,210051000000.0,1.0,40.0,210051000000.0,1100015.0
25%,210051900000.0,2.0,540.0,210051900000.0,2409407.0
50%,210052700000.0,3.0,620.0,210052700000.0,3132404.0
75%,210053600000.0,6.0,740.0,210053600000.0,3550308.0
max,210054500000.0,20.0,1000.0,210054500000.0,5300108.0


# Criação da base de treinamento e teste

In [12]:
msk = np.random.rand(len(df_enem_2021)) < 0.8 # cria uma mascara com as posicoes
train = df_enem_2021[msk]
test = df_enem_2021[~msk]

In [13]:
train.sample(4)

Unnamed: 0,NU_INSCRICAO,TP_FAIXA_ETARIA,TP_SEXO,Q006,Q025,NU_NOTA_REDACAO,NU_INSCRICAO.1,CO_MUNICIPIO_PROVA
1100392,210052920633,2,M,H,B,900.0,210052920633,3106200
1252666,210052453219,2,F,B,A,460.0,210052453219,2109106
468845,210051905443,2,F,D,B,760.0,210051905443,3304557
2974998,210052539772,1,M,F,B,560.0,210052539772,2307809


In [14]:
test.sample(4)

Unnamed: 0,NU_INSCRICAO,TP_FAIXA_ETARIA,TP_SEXO,Q006,Q025,NU_NOTA_REDACAO,NU_INSCRICAO.1,CO_MUNICIPIO_PROVA
773666,210052329217,5,F,D,B,920.0,210052329217,2929305
1700482,210051196377,3,F,B,B,580.0,210051196377,2609501
3365306,210052960575,13,F,D,B,560.0,210052960575,3550308
929081,210053982024,2,M,B,B,560.0,210053982024,2112704


In [15]:
print(len(train))
print(len(test))

1834835
458962


In [16]:
train.to_csv("/content/drive/MyDrive/ENEM_2021_TRAIN.csv")
test.to_csv("/content/drive/MyDrive/ENEM_2021_TEST.csv")

# Separando variáveis de Entrada (x) e Saída (y)

In [17]:
y_train = train[["NU_NOTA_REDACAO"]]
y_test = test[["NU_NOTA_REDACAO"]]
x_train = train
x_train.drop(["NU_NOTA_REDACAO"], axis=1, inplace=True)
x_test = test
x_test.drop(["NU_NOTA_REDACAO"], axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [18]:
x_train.sample(2)

Unnamed: 0,NU_INSCRICAO,TP_FAIXA_ETARIA,TP_SEXO,Q006,Q025,NU_INSCRICAO.1,CO_MUNICIPIO_PROVA
491021,210052402414,3,M,C,B,210052402414,2304400
2049685,210051697127,3,M,P,B,210051697127,3549805
