In [2]:
import joblib
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Activation, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [3]:
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to the smallest dtype that fits.

    Non-numeric columns (strings, datetimes, ...) are left untouched. For every
    numeric column:

    * missing values are replaced by the sentinel ``column minimum - 1`` and the
      column name is recorded in the returned list;
    * if every value is (within a small tolerance) a whole number, the column
      is cast to the narrowest unsigned/signed integer type that contains
      ``[min, max]``;
    * otherwise the column is cast to ``float32``.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).

    Returns
    -------
    tuple
        ``(props, NAlist)`` where ``props`` is the same frame object and
        ``NAlist`` lists the columns that contained non-finite values.
    """
    NAlist = []  # Keeps track of columns that had missing values filled in.
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in props.columns:
        # Only numeric (including bool) columns can be downcast.
        if not pd.api.types.is_numeric_dtype(props[col]):
            continue

        values = props[col]

        # Integer dtypes cannot hold NA, so NA is replaced by a sentinel that
        # lies just below the observed minimum.
        if not np.isfinite(values).all():
            NAlist.append(col)
            # Reassign instead of chained ``inplace=True`` so the fill is
            # guaranteed to take effect (also under pandas copy-on-write).
            values = values.fillna(values.min() - 1)

        # Still non-finite (+/-inf, or an all-NaN column): an integer cast is
        # impossible, so keep the column as a float.
        if not np.isfinite(values).all():
            props[col] = values.astype(np.float32)
            continue

        # Measure the range AFTER filling so the sentinel is included;
        # otherwise a column with minimum 0 would be cast to an unsigned type
        # while containing -1.
        mn = values.min()
        mx = values.max()

        # Whole-number test: use absolute differences so that fractional parts
        # of opposite sign (e.g. 1.5 and -1.5) cannot cancel each other out.
        is_int = np.abs(values - values.astype(np.int64)).sum() < 0.01

        if is_int:
            candidates = unsigned_types if mn >= 0 else signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= mn and mx <= limits.max:
                    props[col] = values.astype(candidate)
                    break
            else:
                # No integer type fits: keep the (possibly filled) values as is.
                props[col] = values
        else:
            # Genuinely fractional data: 32-bit floats halve the footprint.
            props[col] = values.astype(np.float32)

    return props, NAlist

In [4]:
# Read each split from disk and immediately downcast its numeric columns.
# `nalist` ends up holding the NA-filled columns of the test split only,
# because the second call overwrites the first result.
df_train, nalist = reduce_mem_usage(pd.read_csv('dataset/df_train__reduce.csv'))
df_test, nalist = reduce_mem_usage(pd.read_csv('dataset/df_test__reduce.csv'))

In [5]:
# Remove the free-text location columns from the training frame.
# `errors='ignore'` makes the cell safe to re-run: `df_train` is overwritten
# with its own result, so a second execution would otherwise raise KeyError
# because the columns are already gone.
# NOTE(review): `df_test` is not given the same treatment here — confirm that
# the test CSV does not contain these columns.
df_train = df_train.drop(
    columns=['NO_MUNICIPIO_RESIDENCIA', 'SG_UF_RESIDENCIA', 'NO_MUNICIPIO_PROVA', 'SG_UF_PROVA'],
    errors='ignore',
)

In [6]:
# Columns that are not model inputs: the five score targets and the row id.
_non_feature_columns = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT', 'NU_NOTA_REDACAO', 'NU_INSCRICAO']

X_train = df_train.drop(columns=_non_feature_columns)
X_test = df_test.drop(columns=_non_feature_columns)

# One single-column DataFrame per target, in the order of the model outputs.
y_train_CN, y_train_CH, y_train_LC, y_train_MT, y_train_REDACAO = [
    df_train[[target]] for target in _non_feature_columns[:5]
]

y_test_CN, y_test_CH, y_test_LC, y_test_MT, y_test_REDACAO = [
    df_test[[target]] for target in _non_feature_columns[:5]
]

In [7]:
from sklearn.preprocessing import PowerTransformer

# NOTE(review): `scaler` is not used by the training cells below; only the
# StandardScaler `sc` is fitted and applied. Kept in case it is needed later.
scaler = PowerTransformer()

# Standardise the features: statistics are learned on the training split only
# and then reused unchanged for the test split.
sc = StandardScaler()
X_train_scaler = sc.fit_transform(X_train)
X_test_scaler = sc.transform(X_test)

# Persist the fitted scaler. The context manager guarantees the file handle is
# flushed and closed (the previous bare `open(...)` was never closed).
with open('scaler.pkl', 'wb') as scaler_file:
    joblib.dump(sc, scaler_file)

# General model (one network predicting all five scores)

In [8]:
# input_layer = Input(shape=(X_train.shape[1],))
# x = Dense(200, activation='relu')(input_layer)
# x = Dense(200, activation='relu')(x)
# x = Dense(200, activation='relu')(x)

# x = Dense(200, activation='relu')(x)
# x = Dense(200, activation='relu')(x)
# x = Dense(200, activation='relu')(x)

# Multi-output regression network: a shared stack of dense layers followed by
# one linear output unit per exam score.
#
# The initializer is given by name so that every layer builds its OWN
# GlorotNormal instance. Sharing a single unseeded initializer object across
# layers makes newer Keras versions return identical values on each call, which
# would start the three 1024x1024 kernels from exactly the same weights.
input_layer = Input(shape=(X_train.shape[1],))
x = Dense(1024, activation='relu', kernel_initializer='glorot_normal')(input_layer)
x = Dense(1024, activation='relu', kernel_initializer='glorot_normal')(x)
x = Dense(1024, activation='relu', kernel_initializer='glorot_normal')(x)
x = Dropout(0.6)(x)
x = Dense(1024, activation='relu', kernel_initializer='glorot_normal')(x)
x = Dropout(0.6)(x)

# One regression head per score; the layer names become the prefixes of the
# per-output metric keys read by the MCRMSE callback.
output_CN = Dense(1, name='CN')(x)
output_CH = Dense(1, name='CH')(x)
output_LC = Dense(1, name='LC')(x)
output_MT = Dense(1, name='MT')(x)
output_REDACAO = Dense(1, name='REDACAO')(x)

model = Model(inputs=input_layer, outputs=[output_CN, output_CH, output_LC, output_MT, output_REDACAO])
model.compile(loss="mean_squared_error" , optimizer=Adam(learning_rate=0.001), metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [9]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 95)]         0           []                               
                                                                                                  
 dense (Dense)                  (None, 1024)         98304       ['input_1[0][0]']                
                                                                                                  
 dense_1 (Dense)                (None, 1024)         1049600     ['dense[0][0]']                  
                                                                                                  
 dense_2 (Dense)                (None, 1024)         1049600     ['dense_1[0][0]']                
                                                                                              

In [10]:
class MCRMSE(tf.keras.callbacks.Callback):
    """Add the mean column-wise RMSE over the five outputs to the epoch logs.

    At the end of each epoch the per-output ``*_root_mean_squared_error``
    values are averaged and stored under ``logs['mcrmse']``. When validation
    metrics are present, their average is stored under ``logs['val_mcrmse']``.
    Callbacks that run after this one in the ``callbacks`` list can then
    monitor those keys.
    """

    OUTPUT_NAMES = ("CN", "CH", "LC", "MT", "REDACAO")
    METRIC_SUFFIX = "root_mean_squared_error"

    def on_epoch_end(self, epoch, logs=None):
        """Compute ``mcrmse`` (and ``val_mcrmse`` when available) in ``logs``.

        Parameters
        ----------
        epoch : int
            Index of the epoch that just finished (unused).
        logs : dict or None
            Metric dictionary for the epoch; it is updated in place. Nothing
            is done when it is ``None``.

        Raises
        ------
        KeyError
            If a training RMSE metric is missing from ``logs``.
        """
        # `None` default instead of a shared mutable `{}` default argument.
        if logs is None:
            return

        train_keys = [f"{name}_{self.METRIC_SUFFIX}" for name in self.OUTPUT_NAMES]
        logs['mcrmse'] = sum(logs[key] for key in train_keys) / len(train_keys)

        # Validation metrics only exist when validation data was given to
        # `fit`; skip them instead of failing with KeyError.
        val_keys = [f"val_{key}" for key in train_keys]
        if all(key in logs for key in val_keys):
            logs['val_mcrmse'] = sum(logs[key] for key in val_keys) / len(val_keys)

In [11]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Callback that publishes 'mcrmse' / 'val_mcrmse' in the epoch logs; the two
# callbacks below monitor 'val_mcrmse'.
mcrmse = MCRMSE()

# Keep only the weights of the epoch with the lowest validation MCRMSE.
checkpoint_filepath = 'dataset/dados_8/'
model_check_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_mcrmse',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Stop once validation MCRMSE has not improved for 20 consecutive epochs.
early_stopping_callback = EarlyStopping(
    monitor='val_mcrmse',
    mode='min',
    patience=20,
)

In [12]:
# NU_NOTA_CN - RMSE:54.93000030517578
# NU_NOTA_CH - RMSE:62.029998779296875
# NU_NOTA_LC - RMSE:47.7599983215332
# NU_NOTA_MT - RMSE:76.41000366210938
# NU_NOTA_REDACAO - RMSE:145.60000610351562
# Média RMSE:77.3499984741211

# 75.74781

# val_CN_root_mean_squared_error: 53.6484 
# val_CH_root_mean_squared_error: 60.3808 
# val_LC_root_mean_squared_error: 46.2767 
# val_MT_root_mean_squared_error: 74.3013 
# val_REDACAO_root_mean_squared_error: 114.2221 
# mcrmse: 69.5857 - val_mcrmse: 69.7659
# Epoch 37/100

In [13]:
# history = model.fit(X_train_scaler, [y_train_CN, y_train_CH, y_train_LC, y_train_MT , y_train_REDACAO], 
#                     batch_size=500, epochs=100, verbose=1, validation_data=(X_test_scaler, [y_test_CN, y_test_CH, y_test_LC, y_test_MT, y_test_REDACAO]), 
#                     callbacks=[mcrmse, model_check_callback, early_stopping_callback])

# Train on the scaled training split and validate on the scaled test split.
# `mcrmse` comes first in the callback list so that 'val_mcrmse' is already in
# the logs when the checkpoint and early-stopping callbacks read it.
history = model.fit(
    x=X_train_scaler,
    y=[y_train_CN, y_train_CH, y_train_LC, y_train_MT, y_train_REDACAO],
    batch_size=1024,
    epochs=100,
    verbose=1,
    validation_data=(
        X_test_scaler,
        [y_test_CN, y_test_CH, y_test_LC, y_test_MT, y_test_REDACAO],
    ),
    callbacks=[mcrmse, model_check_callback, early_stopping_callback],
)

Epoch 1/100

KeyboardInterrupt: 

In [None]:
#import gc
import matplotlib.pyplot as plt

#-----------------------------------------------------------
# Retrieve the per-epoch training and validation curves.
# The training curve is stored as `train_mcrmse` so that the `mcrmse` callback
# object used by the fit cell is not overwritten with a list.
#-----------------------------------------------------------
train_mcrmse = history.history['mcrmse']
val_mcrmse = history.history['val_mcrmse']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(train_mcrmse))  # One point per completed epoch

#------------------------------------------------
# Plot training and validation MCRMSE per epoch
#------------------------------------------------
plt.plot(epochs, train_mcrmse, label='train')
plt.plot(epochs, val_mcrmse, label='validation')
plt.title('Training and validation MCRMSE')
plt.xlabel('Epoch')
plt.ylabel('MCRMSE')
plt.legend()
plt.figure()  # Start a second figure for the loss curves

#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plt.plot(epochs, loss, label='train')
plt.plot(epochs, val_loss, label='validation')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

#del model
#del history
#gc.collect()

In [None]:
# Load the weights stored at `checkpoint_filepath`, the path the ModelCheckpoint
# callback was configured to write its best-epoch weights to.
model.load_weights(checkpoint_filepath)

# Create Submissions

In [None]:
# test = pd.read_csv('drive/MyDrive/ML Olympiad - QUALITY EDUCATION - Kaggle/test.csv')
# test, nalist = reduce_mem_usage(test)

# dict_value = {'NU_NOTA_MT':0, 'NU_NOTA_CN':0, 'NU_NOTA_REDACAO':0, 'NU_NOTA_LC':0, 'NU_NOTA_CH':0}
# test.fillna(value=dict_value, inplace=True)

# test.drop(columns=['CO_MUNICIPIO_ESC', 'TP_SIT_FUNC_ESC', 'TP_LOCALIZACAO_ESC', 
#                     'TP_DEPENDENCIA_ADM_ESC', 'SG_UF_ESC', 'CO_UF_ESC', 'NO_MUNICIPIO_ESC',
#                     'CO_ESCOLA', 'TP_ENSINO', 'TP_STATUS_REDACAO', 'CO_MUNICIPIO_NASCIMENTO', 
#                     'SG_UF_NASCIMENTO', 'CO_UF_NASCIMENTO', 'NO_MUNICIPIO_NASCIMENTO'], inplace=True)

# (test.isnull().sum()*100/len(test)).sort_values(ascending=False)[0:50]

# Load the competition test set (presumably already cleaned, going by the file
# name) and preview its first rows.
# NOTE(review): the path is hard-coded relative to a mounted Drive folder —
# confirm it exists in the environment where this notebook runs.
test_clear = pd.read_csv('drive/MyDrive/ML Olympiad - QUALITY EDUCATION - Kaggle/datas/dados_5/test_clear.csv')
test_clear.head()

Unnamed: 0,NU_INSCRICAO,CO_MUNICIPIO_RESIDENCIA,NO_MUNICIPIO_RESIDENCIA,CO_UF_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,TP_NACIONALIDADE,...,Q016,Q017,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025
0,5d5b362b-7388-4ac6-81b3-23573e4e2d3a,2305506,Iguatu,23,CE,22.0,F,1,3,1,...,A,A,A,B,B,A,B,A,B,B
1,52356efd-3239-4cd2-a444-416625dfc560,3548500,Santos,35,SP,19.0,F,1,1,1,...,B,A,A,B,A,B,E,B,B,B
2,1ba42e9a-dd61-4405-9b08-79b728ad23c9,3552106,Socorro,35,SP,16.0,F,1,1,1,...,B,A,B,C,A,A,E,A,B,B
3,16fa0f84-a88f-43e6-bcbb-8d5ea41e5f03,3541000,Praia Grande,35,SP,22.0,M,1,1,1,...,B,A,A,C,A,B,C,B,B,B
4,fa663d11-5052-4ab2-b771-3a3de3bdec55,4106902,Curitiba,41,PR,17.0,M,1,1,1,...,B,A,B,B,A,B,B,B,C,B


In [None]:
# columns_categorical = ['NO_MUNICIPIO_RESIDENCIA', 'SG_UF_RESIDENCIA', 'TP_SEXO', 
#                        'NO_MUNICIPIO_PROVA', 'SG_UF_PROVA', 'Q001',	'Q002',	'Q003',	
#                        'Q004',	'Q005',	'Q006',	'Q007',	'Q008',	'Q009',	'Q010',	'Q011',	
#                        'Q012',	'Q013',	'Q014',	'Q015',	'Q016',	'Q017',	'Q018',	'Q019',	
#                        'Q020',	'Q021',	'Q022',	'Q023',	'Q024',	'Q025']

# dict_labels = joblib.load('drive/MyDrive/ML Olympiad - QUALITY EDUCATION - Kaggle/datas/dados_2/dict_encoder.pkl', 'r')
# for column in columns_categorical:
#   enc = dict_labels[column]
#   test[column] = enc.transform(test[column].values)

# Free-text location columns that are dropped before scaling.
columns_remove = ['NO_MUNICIPIO_RESIDENCIA', 'SG_UF_RESIDENCIA', 'NO_MUNICIPIO_PROVA', 'SG_UF_PROVA']
# Categorical columns to label-encode: sex plus questionnaire items Q001..Q025.
columns_categorical = ['TP_SEXO'] + [f'Q{number:03d}' for number in range(1, 26)]

# Mapping column name -> fitted encoder.
# The second positional argument of `joblib.load` is `mmap_mode`, not a file
# mode, so the former 'r' asked for read-only memory-mapped arrays; it is
# dropped to load ordinary in-memory objects.
# NOTE(review): joblib files are pickles — only load files from a trusted source.
dict_labels = joblib.load('drive/MyDrive/ML Olympiad - QUALITY EDUCATION - Kaggle/datas/dados_5/dict_encoder.pkl')
X_test = test_clear.copy()

for column in columns_categorical:
  lenc = dict_labels[column]
  X_test[column] = lenc.transform(X_test[column].values)

In [None]:
# Drop the unused text columns, then downcast what remains; `nalist` receives
# the columns whose missing values were filled during the downcast.
X_test, nalist = reduce_mem_usage(X_test.drop(columns=columns_remove))

In [None]:
# scaler = joblib.load('drive/MyDrive/ML Olympiad - QUALITY EDUCATION - Kaggle/datas/dados_5/scaler.pkl', 'r')

# X_test = scaler.transform(test.drop(columns=['NU_INSCRICAO']))

# Load the fitted scaler. The second positional argument of `joblib.load` is
# `mmap_mode`, not a file mode, so the former 'r' is dropped to get ordinary
# in-memory arrays.
# NOTE(review): joblib files are pickles — only load files from a trusted source.
# NOTE(review): the scaler comes from `dados_6` while the encoders come from
# `dados_5` — confirm both belong to the same preprocessing run.
scaler = joblib.load('drive/MyDrive/ML Olympiad - QUALITY EDUCATION - Kaggle/datas/dados_6/scaler.pkl')

# The row id is not a feature. After this line `X_test` is the scaled array
# returned by `transform`, so this cell cannot be re-run on its own.
X_test = scaler.transform(X_test.drop(columns=['NU_INSCRICAO']))

In [None]:
# input_layer = Input(shape=(X_test.shape[1],))
# x = Dense(200, activation='relu')(input_layer)
# x = Dense(200, activation='relu')(x)
# x = Dense(200, activation='relu')(x)

# output_CN = Dense(1, name='CN')(x)
# output_CH = Dense(1, name='CH')(x)
# output_LC = Dense(1, name='LC')(x)
# output_MT = Dense(1, name='MT')(x)
# output_REDACAO = Dense(1, name='REDACAO')(x)

# model = Model(inputs=input_layer, outputs=[output_CN, output_CH, output_LC, output_MT, output_REDACAO])
# model.compile(loss="mean_squared_error" , optimizer="adam", metrics=[tf.keras.metrics.RootMeanSquaredError()])

#model.load_weights('drive/MyDrive/ML Olympiad - QUALITY EDUCATION - Kaggle/datas/dados_3/')
#model.load_weights('drive/MyDrive/ML Olympiad - QUALITY EDUCATION - Kaggle/datas/dados_5/')


# Rebuild the network whose weights are stored under `dados_7`: three hidden
# layers of 128 ReLU units followed by one linear head per score.
initializer = tf.keras.initializers.GlorotNormal()

input_layer = Input(shape=(X_test.shape[1],))
x = input_layer
for _ in range(3):
    x = Dense(128, activation='relu', kernel_initializer=initializer)(x)

# Heads are created in the same order in which they are listed as outputs.
output_CN, output_CH, output_LC, output_MT, output_REDACAO = [
    Dense(1, name=head_name)(x)
    for head_name in ('CN', 'CH', 'LC', 'MT', 'REDACAO')
]

model = Model(
    inputs=input_layer,
    outputs=[output_CN, output_CH, output_LC, output_MT, output_REDACAO],
)
model.compile(
    loss="mean_squared_error",
    optimizer=Adam(learning_rate=0.01),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)
model.load_weights('drive/MyDrive/ML Olympiad - QUALITY EDUCATION - Kaggle/datas/dados_7/')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f10ff101c90>

In [None]:
# Run inference on the scaled features. The result is indexed 0..4 further
# down, one entry per model output (CN, CH, LC, MT, REDACAO).
predicts = model.predict(X_test)

In [None]:
# Score columns in the same order as the model's output heads.
prediction_columns = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT', 'NU_NOTA_REDACAO']
assert len(predicts) == len(prediction_columns), "expected one prediction array per score"

# Take the first column of each head's prediction array with a single slice
# instead of a per-row Python loop. Values are stored as float64 so the later
# rounding is done in double precision.
df_predict = pd.DataFrame({
    column: np.asarray(head_predictions, dtype=np.float64)[:, 0]
    for column, head_predictions in zip(prediction_columns, predicts)
})
df_predict
# for list_pred in predicts:
#   list_values
#   df_predict = pd.DataFrame(data=list_pred, columns=['NU_NOTA_CN', 'NU_NOTA_CH',	'NU_NOTA_LC', 'NU_NOTA_MT',	'NU_NOTA_REDACAO'])
#   list_df_predicts.append(df_predict)

# df_predict_full = pd.concat(list_df_predicts, ignore_index=True)

Unnamed: 0,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,NU_NOTA_REDACAO
0,0.199046,0.100500,0.307284,0.121560,0.089793
1,522.124451,563.309326,572.789978,575.788330,697.169800
2,486.739929,521.701660,533.843201,537.648865,594.140259
3,520.642151,563.035461,565.651245,599.006775,610.705688
4,536.949951,569.480286,567.868103,622.431030,653.754089
...,...,...,...,...,...
1783340,410.187012,433.475189,455.670624,425.421356,451.094788
1783341,469.316101,492.796936,502.332611,512.676331,501.590759
1783342,463.182037,495.754608,512.477600,488.293030,593.607117
1783343,437.615692,445.492035,455.337189,459.668579,487.263916


In [None]:
# Put the predictions next to the original rows (aligned on the index), then
# keep only the row id and the five predicted scores.
test_2 = pd.concat([test_clear.copy(), df_predict], axis=1)
test_2 = test_2.loc[:, ['NU_INSCRICAO', 'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT', 'NU_NOTA_REDACAO']]

In [None]:
def round_notas(value):
  """Clamp a negative score to 0 and round any other score to two decimals.

  Parameters
  ----------
  value : number
      Predicted score.

  Returns
  -------
  number
      ``0`` when ``value`` is negative, otherwise ``value`` rounded to two
      decimal places with ``np.round``.
  """
  return 0 if value < 0 else np.round(value, 2)

# Clean every predicted score column with `round_notas`.
for nota_column in ('NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT', 'NU_NOTA_REDACAO'):
  test_2[nota_column] = test_2[nota_column].apply(round_notas)
#test_2.clip(lower=0)

In [None]:
#test_2 = test_2[['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT', 'NU_NOTA_REDACAO']].clip(lower=0, upper=1000)
# Display the submission frame (row id plus the five rounded scores).
test_2

Unnamed: 0,NU_INSCRICAO,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_MT,NU_NOTA_REDACAO
0,5d5b362b-7388-4ac6-81b3-23573e4e2d3a,0.20,0.10,0.31,0.12,0.09
1,52356efd-3239-4cd2-a444-416625dfc560,522.12,563.31,572.79,575.79,697.17
2,1ba42e9a-dd61-4405-9b08-79b728ad23c9,486.74,521.70,533.84,537.65,594.14
3,16fa0f84-a88f-43e6-bcbb-8d5ea41e5f03,520.64,563.04,565.65,599.01,610.71
4,fa663d11-5052-4ab2-b771-3a3de3bdec55,536.95,569.48,567.87,622.43,653.75
...,...,...,...,...,...,...
1783340,0bb71e78-20f8-4ba4-b790-4dcb70b06890,410.19,433.48,455.67,425.42,451.09
1783341,82be8f80-00e0-4aee-8b88-57ae6ca67d4d,469.32,492.80,502.33,512.68,501.59
1783342,dc3dd2ce-a828-4f53-a153-3c5a4ca3e4fc,463.18,495.75,512.48,488.29,593.61
1783343,1240df12-5a97-4aee-8065-c25e5e33d473,437.62,445.49,455.34,459.67,487.26


In [None]:
# Write the submission file; `index=False` leaves the row index out of the CSV.
test_2.to_csv('drive/MyDrive/ML Olympiad - QUALITY EDUCATION - Kaggle/submissions/submission_deep_7.csv', index=False)