In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# data processing
import numpy as np
import pandas as pd 

# machine learning
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 

# utils
import time
from datetime import timedelta

# plot
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Load the data

In [3]:
# Training data, indexed by passenger id.
train_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/titanic/train.csv",
    index_col='PassengerId',
)
#test_df = pd.read_csv("/kaggle/input/titanic/test.csv", index_col='PassengerId') #validation data
#sub_df = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

In [4]:
# Preview the first five rows of the raw training frame.
train_df.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Column names of the training frame as a NumPy array.
train_df.columns.to_numpy()

array(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [6]:
# (rows, columns) of the raw training frame.
train_df.shape

(891, 11)

# Data preprocessing

In [7]:
# Number of missing values per column before any cleaning.
train_df.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [8]:
def prep_data(df):
    """Clean a raw Titanic passenger frame and encode it numerically.

    Steps, in order:

    1. Drop the ``Name``, ``Ticket`` and ``Cabin`` columns.
    2. Fill missing ``Age`` and ``Fare`` with the column mean and missing
       ``Embarked`` with its most frequent value. The statistics come from
       ``df`` itself, so a test frame is imputed with its own means.
    3. Encode ``Sex`` as 1 (female) / 0 (male).
    4. Replace ``Embarked`` with the one-hot columns ``Embarked_C``,
       ``Embarked_Q`` and ``Embarked_S``.

    The one-hot step always produces all three columns in that order, even
    when a port does not occur in ``df``. This keeps the feature layout the
    same for every frame passed through this function (e.g. train and test).
    An ``Embarked`` value other than C/Q/S yields zeros in all three columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``Name``, ``Ticket``, ``Cabin``,
        ``Age``, ``Fare``, ``Embarked`` and ``Sex``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned and encoded columns.
    """
    # Drop unwanted features (drop returns a new frame, so the caller's
    # frame is left untouched by the assignments below)
    df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

    # Fill missing data: Age and Fare with the mean, Embarked with most frequent value
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].value_counts().idxmax())

    # Convert categorical features into numeric
    df['Sex'] = df['Sex'].map({'female': 1, 'male': 0}).astype(int)

    # Convert Embarked to one-hot over a fixed set of ports so that the
    # resulting columns do not depend on which ports happen to be present
    embarked_dtype = pd.CategoricalDtype(categories=['C', 'Q', 'S'])
    embarked_one_hot = pd.get_dummies(df['Embarked'].astype(embarked_dtype), prefix='Embarked')
    df = df.drop('Embarked', axis=1)
    df = df.join(embarked_one_hot)

    return df

In [9]:
# Replace the raw training frame with its cleaned/encoded version, then confirm
# that no missing values remain.
# NOTE: this cell is not idempotent - running it again on the already prepared
# frame fails, because prep_data drops columns that are no longer present.
train_df = prep_data(train_df)
train_df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

# Data normalization

In [10]:
# Target vector: the 'Survived' column.
Y = train_df['Survived'].values

# Feature matrix: every column except 'Survived', cast to float and then
# standardised. Scaling the inputs is almost always a good idea for neural
# networks because it influences how the optimizer updates the weights.
# The fitted scaler is kept in `scale` so the test data can be transformed
# with the same parameters later.
scale = StandardScaler()
X = scale.fit_transform(train_df.drop(columns=['Survived']).values.astype(float))

# Neural network model topology

In [11]:
def create_model(optimizer='adam', init='uniform'):
    """Build and compile the Keras network used for survival prediction.

    The topology is a stack of fully connected layers: 16 -> 8 -> 4 ReLU units
    followed by a single sigmoid output unit. The input width is taken from
    the module-level feature matrix ``X`` (``X.shape[1]``), so ``X`` must
    exist before this function is called.

    Parameters
    ----------
    optimizer : str
        Optimizer passed to ``model.compile``.
    init : str
        Kernel initializer used for every layer.

    Returns
    -------
    A compiled ``Sequential`` model with binary cross-entropy loss and an
    accuracy metric.
    """
    n_features = X.shape[1]

    # Keras Sequential: the neural network topology is a plain stack of layers
    net = Sequential()
    net.add(Dense(16, input_dim=n_features, kernel_initializer=init, activation='relu'))
    for units in (8, 4):
        net.add(Dense(units, kernel_initializer=init, activation='relu'))
    # The output layer is a sigmoid so the prediction lies between 0 and 1
    net.add(Dense(1, kernel_initializer=init, activation='sigmoid'))

    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net

# Hyperparameter tuning

In [12]:
# Hyperparameters used to build and train the Keras classifier. The values are
# hard-coded here; the search that produced them is not part of this notebook.
best_epochs, best_batch_size = 200, 5
best_init, best_optimizer = 'glorot_uniform', 'rmsprop'

# Models Training

## Keras Classifier and MLPClassifier

In [13]:
# Keras training callbacks (`tf` is imported in the imports cell at the top).
# NOTE(review): the `callback` list built here is not passed to any fit() call
# in this notebook, so neither callback currently affects training. Both use
# their default monitored metric - confirm that metric is available (e.g. via a
# validation split) before wiring them in.

# Halve the learning rate after 3 epochs without improvement, down to 1e-4.
anhealer = tf.keras.callbacks.ReduceLROnPlateau(patience=3, factor=0.5, min_lr=0.0001)
# Stop training after 3 epochs without improvement.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=3)
callback = [anhealer, early_stopping]

In [14]:
# Keras network wrapped as a scikit-learn style estimator, configured with the
# tuned hyperparameters and trained on the full scaled training set.
model_pred1 = KerasClassifier(
    build_fn=create_model,
    optimizer=best_optimizer,
    init=best_init,
    epochs=best_epochs,
    batch_size=best_batch_size,
)
model_pred1.fit(X, Y)

# Scikit-learn MLP with a single hidden layer of 3 tanh units, trained on the
# full scaled training set. MLPClassifier and train_test_split are already
# imported in the imports cell, so they are not re-imported here.
# No validation split is held out in this cell; all training rows are used to
# fit the model. A hold-out split would look like:
#X_train, X_val, y_train, y_val = train_test_split(X,Y,random_state=1, test_size=0.2)

model_pred2 = MLPClassifier(hidden_layer_sizes=(3,), activation="tanh", random_state=1).fit(X, Y)


# Read test data
test_df = pd.read_csv("/kaggle/input/titanic/test.csv", index_col='PassengerId')
# Prep and clean data
test_df = prep_data(test_df)
# Give the test frame exactly the training feature columns, in the same order.
# A one-hot column that is absent from the test data is added as all zeros;
# otherwise the scaler and the models would receive misaligned features.
test_df = test_df.reindex(columns=train_df.columns.drop('Survived'), fill_value=0)
# Create X_test
X_test = test_df.values.astype(float)
# Scaling: reuse the scaler fitted on the training features
X_test = scale.transform(X_test)

# Predict 'Survived' of both models
prediction1 = model_pred1.predict(X_test)
prediction2 = model_pred2.predict(X_test)


2023-02-04 19:45:58.897435: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-04 19:45:58.907610: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-04 19:45:58.908354: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-04 19:45:58.909666: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78



## MLPClassifier and GridSearchCV: cross-validation

### More info
https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

In [15]:
# Search space for the MLPClassifier grid search (GridSearchCV is imported in
# the imports cell at the top).
# The grid has 4 * 3 * 9 * 5 * 10 = 5400 parameter combinations.
# Earlier, now disabled, experiments also varied single-layer widths from 1 to
# 21 and max_iter from 1000 to 2000.
param_grid = [
    {
        # Activation function for the hidden layer.
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        # The solver for weight optimization.
        'solver': ['lbfgs', 'sgd', 'adam'],
        # Strength of the L2 regularization term: 1e-1, 1e-2, ..., 1e-9.
        'alpha': 10.0 ** -np.arange(1, 10),
        # Each candidate is a network with ONE hidden layer of 10..14 neurons.
        'hidden_layer_sizes': [(n_units,) for n_units in range(10, 15)],
        # Seed for weight and bias initialization; searched over as well,
        # which multiplies the grid size by 10.
        'random_state': list(range(10)),
    }
]

In [None]:
# Exhaustive 3-fold cross-validated search over `param_grid`, scored by accuracy.
# max_iter is raised above the MLPClassifier default because the lbfgs solver
# stopped with "TOTAL NO. of ITERATIONS REACHED LIMIT" before converging.
# n_jobs=-1 evaluates candidates in parallel on all available cores.
model_pred3 = GridSearchCV(
    MLPClassifier(max_iter=1000),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
model_pred3.fit(X, Y)


print("Best parameters set found on development set:")
print(model_pred3.best_params_)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [None]:
# Test-set predictions from the fitted grid search object.
prediction3 = model_pred3.predict(X_test)

# Prediction

In [None]:
# Predictions of the three fitted models on the training features themselves
# (1: KerasClassifier, 2: MLPClassifier, 3: MLPClassifier + grid search).
prediction_train1, prediction_train2, prediction_train3 = (
    fitted.predict(X) for fitted in (model_pred1, model_pred2, model_pred3)
)

# Data visualization

In [None]:
# Confusion matrix of each model's training-set predictions against the labels.
cm1, cm2, cm3 = (
    confusion_matrix(Y, predicted)
    for predicted in (prediction_train1, prediction_train2, prediction_train3)
)

In [None]:
# One display object per confusion matrix, plotted in the cells below.
disp1, disp2, disp3 = (
    ConfusionMatrixDisplay(confusion_matrix=matrix) for matrix in (cm1, cm2, cm3)
)

In [None]:
# Confusion matrix of the KerasClassifier. The title makes the figure
# self-explanatory; the matrix is computed on the training data.
disp1.plot(cmap=plt.cm.Blues)
disp1.ax_.set_title("KerasClassifier (training data)")
plt.show()

In [None]:
# Confusion matrix of the MLPClassifier. The title makes the figure
# self-explanatory; the matrix is computed on the training data.
disp2.plot(cmap=plt.cm.Blues)
disp2.ax_.set_title("MLPClassifier (training data)")
plt.show()

In [None]:
# Confusion matrix of the grid-searched MLPClassifier. The title makes the
# figure self-explanatory; the matrix is computed on the training data.
disp3.plot(cmap=plt.cm.Blues)
disp3.ax_.set_title("MLPClassifier with GridSearch (training data)")
plt.show()

In [None]:
# Display the target column of the training frame.
train_df["Survived"]

In [None]:
# Non-null value counts of every column, per 'Survived' class.
# count must be called: without the parentheses the cell only displays the
# bound method object instead of the counts.
train_df.groupby("Survived").count()

# Submission

In [None]:
# Build the submission file from one model's test-set predictions.
# Candidates for the 'Survived' column:
#   KerasClassifier              -> prediction1[:,0]
#   MLPClassifier                -> prediction2
#   MLPClassifier + grid search  -> prediction3   (the one used here)
submission = pd.DataFrame(
    {'PassengerId': test_df.index, 'Survived': prediction3}
).sort_values('PassengerId')

submission.to_csv('submission-simple-cleansing.csv', index=False)

In [None]:
# Display the submission frame that was written to disk.
submission