In [1]:
import os
import joblib
import numpy as np
import pandas as pd

In [2]:
# Folder that holds the CSV inputs; later cells reuse `data_folder`.
data_folder = './data'

# Read the preprocessed test set and display its (rows, columns) shape.
prep_test_path = os.path.join(data_folder, 'prep_test.csv')
df = pd.read_csv(prep_test_path)
df.shape

(1783345, 54)

In [3]:
# Load the persisted estimator and display its repr.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code -- only load model files from a trusted source.
model_path = './models/2_neural_net.pkl'
regressor = joblib.load(model_path)
regressor



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpregressor',
                 MLPRegressor(hidden_layer_sizes=(40, 20, 10),
                              random_state=7))])

In [21]:
# Run the loaded pipeline over the whole preprocessed frame, then display the
# shape of the resulting prediction array.
# NOTE(review): every column of `df` is passed as a feature -- confirm that
# prep_test.csv contains no id/index column the model was not trained on.
y_pred = regressor.predict(df)
y_pred.shape

(1783345, 5)

In [22]:
# Lower-case names of the five score targets. They are upper-cased later and
# used, in this order, as the column labels of the prediction frame.
target_columns = [
    'nu_nota_lc', 'nu_nota_ch', 'nu_nota_cn', 'nu_nota_mt', 'nu_nota_redacao',
]

In [17]:
# Read only the registration id plus the presence / essay-status flags from
# the raw test file, then preview the first rows.
test_usecols = [
    'NU_INSCRICAO',
    'TP_PRESENCA_CN',
    'TP_PRESENCA_CH',
    'TP_PRESENCA_LC',
    'TP_PRESENCA_MT',
    'TP_STATUS_REDACAO',
]
test_path = os.path.join(data_folder, 'test.csv')
df_test = pd.read_csv(test_path, usecols=test_usecols)
df_test.head()

Unnamed: 0,NU_INSCRICAO,TP_PRESENCA_CN,TP_PRESENCA_CH,TP_PRESENCA_LC,TP_PRESENCA_MT,TP_STATUS_REDACAO
0,5d5b362b-7388-4ac6-81b3-23573e4e2d3a,0,0,0,0,
1,52356efd-3239-4cd2-a444-416625dfc560,1,1,1,1,1.0
2,1ba42e9a-dd61-4405-9b08-79b728ad23c9,1,1,1,1,1.0
3,16fa0f84-a88f-43e6-bcbb-8d5ea41e5f03,1,1,1,1,1.0
4,fa663d11-5052-4ab2-b771-3a3de3bdec55,1,1,1,1,1.0


In [18]:
# Count the missing values in each column of the loaded test columns.
df_test.isna().sum()

NU_INSCRICAO              0
TP_PRESENCA_CN            0
TP_PRESENCA_CH            0
TP_PRESENCA_LC            0
TP_PRESENCA_MT            0
TP_STATUS_REDACAO    409802
dtype: int64

In [19]:
# Replace missing essay-status values with 0; the dict form of fillna only
# touches the named column and leaves every other column unchanged.
df_test = df_test.fillna({'TP_STATUS_REDACAO': 0})

In [23]:
# Wrap the raw prediction array in a frame labelled with the upper-cased
# target names, then summarise each score column.
pred_column_names = list(map(str.upper, target_columns))
df_pred = pd.DataFrame(y_pred, columns=pred_column_names)
df_pred.describe()

Unnamed: 0,NU_NOTA_LC,NU_NOTA_CH,NU_NOTA_CN,NU_NOTA_MT,NU_NOTA_REDACAO
count,1783345.0,1783345.0,1783345.0,1783345.0,1783345.0
mean,521.7763,509.6157,478.2514,520.0521,573.9425
std,34.95157,42.55279,41.70356,62.52117,84.38451
min,408.055,400.9019,379.9966,381.1247,229.45
25%,496.5091,478.2292,446.9828,473.123,521.6369
50%,517.7017,501.6768,468.3003,504.572,562.1603
75%,543.5147,533.4538,499.751,552.5057,616.1572
max,643.5661,684.1307,634.8929,775.9007,881.398


In [24]:
# Each predicted score column paired with the test-file flag that must equal 1
# for the prediction to be kept; any other flag value sets the score to 0.
score_to_flag = {
    'NU_NOTA_LC': 'TP_PRESENCA_LC',
    'NU_NOTA_CH': 'TP_PRESENCA_CH',
    'NU_NOTA_CN': 'TP_PRESENCA_CN',
    'NU_NOTA_MT': 'TP_PRESENCA_MT',
    'NU_NOTA_REDACAO': 'TP_STATUS_REDACAO',
}

# The boolean masks are computed on df_test but applied to df_pred, and pandas
# aligns them by index label. Fail loudly if the two frames do not share the
# same index instead of silently zeroing the wrong rows.
assert df_pred.index.equals(df_test.index), (
    'df_pred and df_test must share the same index to be masked row by row'
)

for score_column, flag_column in score_to_flag.items():
    df_pred.loc[df_test[flag_column] != 1, score_column] = 0

df_pred.describe()

Unnamed: 0,NU_NOTA_LC,NU_NOTA_CH,NU_NOTA_CN,NU_NOTA_MT,NU_NOTA_REDACAO
count,1783345.0,1783345.0,1783345.0,1783345.0,1783345.0
mean,403.0806,393.7197,350.1709,382.366,432.3372
std,222.3219,218.3365,216.8229,239.6582,265.2915
min,0.0,0.0,0.0,0.0,0.0
25%,464.964,448.7671,0.0,0.0,0.0
50%,506.5524,488.7179,453.5224,483.9319,540.9527
75%,536.5451,524.2522,488.7262,536.9073,602.0769
max,640.52,684.1307,634.8929,775.9007,881.398


In [25]:
# Keep only the registration id from the test columns and attach the
# predicted scores to it by row index, then preview the result.
id_frame = df_test[['NU_INSCRICAO']]
df_submission = id_frame.join(df_pred)
df_submission.head()

Unnamed: 0,NU_INSCRICAO,NU_NOTA_LC,NU_NOTA_CH,NU_NOTA_CN,NU_NOTA_MT,NU_NOTA_REDACAO
0,5d5b362b-7388-4ac6-81b3-23573e4e2d3a,0.0,0.0,0.0,0.0,0.0
1,52356efd-3239-4cd2-a444-416625dfc560,573.815418,565.555627,532.116832,590.616825,722.941374
2,1ba42e9a-dd61-4405-9b08-79b728ad23c9,540.854777,528.494109,500.01808,548.42004,628.782482
3,16fa0f84-a88f-43e6-bcbb-8d5ea41e5f03,556.094278,554.426727,520.631035,595.904716,613.951444
4,fa663d11-5052-4ab2-b771-3a3de3bdec55,558.077786,558.761156,530.753124,606.569177,659.912803


In [26]:
# Write the submission file without the positional index. The output folder is
# created first so the cell also works on a fresh checkout where it is absent.
submission_path = 'submissions/3_pred.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
df_submission.to_csv(submission_path, index=False)