In [5]:
import pandas as pd
import numpy as np
from dataPreprocessing import preprocess_dataset
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
# Single seed reused by the train/test split and by every MLP candidate in the
# parameter grid, so that re-running the notebook reproduces the same results.
RANDOM_STATE = 41

In [6]:
# Raw input columns handed to the project's preprocessing helper.
columns = [
    "Pclass",
    "Age",
    "FamilySize",
    "Fare",
    "CabinsAmount",
    "Sex",
    "NamePrefix",
    "TicketPrefix",
    "CabinDeck",
    "Embarked",
]

# The helper returns the training features, the features of the set to predict
# for the submission, the training labels and the names of the resulting columns.
# NOTE(review): the meaning of dropColumn=None / degree=1 lives in
# dataPreprocessing.preprocess_dataset — confirm there.
X_train, X_final_test, y, columnNames = preprocess_dataset(
    columns,
    dropColumn=None,
    degree=1,
)

# Hold out part of the labelled data (scikit-learn's default proportions) so the
# tuned model can be scored on rows it was not fitted on.
X_tmp_train, X_tmp_test, y_tmp_train, y_tmp_test = train_test_split(
    X_train,
    y,
    random_state=RANDOM_STATE,
)

print("Left columns: ", columnNames)

Left columns:  ['1', 'x0', 'x1', 'x2', 'x3', 'x4', 'x0_female', 'x0_male', 'x1_Capt', 'x1_Col', 'x1_Don', 'x1_Dona', 'x1_Dr', 'x1_Jonkheer', 'x1_Lady', 'x1_Major', 'x1_Master', 'x1_Miss', 'x1_Mlle', 'x1_Mme', 'x1_Mr', 'x1_Mrs', 'x1_Ms', 'x1_Rev', 'x1_Sir', 'x1_th', 'x2_A/4', 'x2_A/5', 'x2_A/S', 'x2_A2', 'x2_A4', 'x2_A5', 'x2_AQ/3', 'x2_AQ/4', 'x2_C', 'x2_CA', 'x2_CA/SOTON', 'x2_FC', 'x2_FCC', 'x2_Fa', 'x2_LP', 'x2_N/A', 'x2_P/PP', 'x2_PC', 'x2_PP', 'x2_SC', 'x2_SC/A3', 'x2_SC/A4', 'x2_SC/AH', 'x2_SC/AHBasle', 'x2_SC/PARIS', 'x2_SC/Paris', 'x2_SCO/W', 'x2_SO/C', 'x2_SO/PP', 'x2_SOC', 'x2_SOP', 'x2_SOTON/O2', 'x2_SOTON/OQ', 'x2_SP', 'x2_STON/O2', 'x2_STON/OQ', 'x2_SW/PP', 'x2_W/C', 'x2_WE/P', 'x2_WEP', 'x3_A', 'x3_B', 'x3_C', 'x3_D', 'x3_E', 'x3_F', 'x3_G', 'x3_N/A', 'x3_T', 'x4_C', 'x4_N/A', 'x4_Q', 'x4_S']


In [7]:
# Width of the first hidden layer: one more than the size of X_train's second
# axis (presumably the number of input features — confirm against the
# preprocessing output).
layer_size = X_train.shape[1] + 1

# Search space for the MLP grid search: solver, hidden-layer layout, iteration
# cap and alpha are varied; the learning-rate schedule is left at its default
# and the seed is pinned to RANDOM_STATE.
# NOTE(review): scikit-learn reads hidden_layer_sizes as "neurons per hidden
# layer", so (layer_size, k) is a layer_size-wide layer followed by a layer of
# only k neurons — confirm this is intended rather than "k hidden layers".
parameters = dict(
    solver=["sgd", "lbfgs"],
    hidden_layer_sizes=[(layer_size, second_width) for second_width in (1, 2, 3)],
    max_iter=[1000, 2000],
    alpha=[1e-4, 1e-5, 1e-6],
    random_state=[RANDOM_STATE],
)

In [8]:
# Exhaustive 3-fold cross-validated search over `parameters`; n_jobs=-1 lets
# joblib use all available workers and verbose=2 logs progress.
optimizer = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
optimizer.fit(X_tmp_train, y_tmp_train)

# Score the search result on the rows it was tuned on and on the held-out rows,
# then report both, their average and the winning parameter combination.
scoreTrain = optimizer.score(X_tmp_train, y_tmp_train)
scoreTest = optimizer.score(X_tmp_test, y_tmp_test)
print(
    "Train: ", round(scoreTrain, 4),
    ", Test: ", round(scoreTest, 4),
    ", Mean: ", round((scoreTest + scoreTrain) / 2, 4),
    ", Parameters: ", optimizer.best_params_,
)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   31.2s finished
Train:  0.8563 , Test:  0.8655 , Mean:  0.8609 , Parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (79, 1), 'max_iter': 1000, 'random_state': 41, 'solver': 'sgd'}


In [9]:
## Write answer
# Passenger ids come straight from the raw test file; the predictions come from
# the tuned model applied to the preprocessed submission features.
# NOTE(review): this relies on X_final_test having the same row order as
# test.csv — confirm in the preprocessing helper.
dfTest = pd.read_csv("./test.csv")
answer = pd.DataFrame(
    {
        "PassengerId": dfTest["PassengerId"],
        "Survived": optimizer.predict(X_final_test),
    }
)
answer.to_csv("answer.csv", index=False)