In [149]:
import os
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.regularizers import l2

TITANIC_PATH = "datasets"

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path, index_col='PassengerId')

In [150]:
train_data, test_data = load_titanic_data("train.csv"), load_titanic_data("test.csv")
train_data.drop(columns=['Cabin', 'Name', 'Ticket'], inplace=True)
test_data.drop(columns=['Cabin', 'Name', 'Ticket'], inplace=True)
# catboost nie potrzebuje zmiany kategorycznych na inty. Można dać kategoryczne kolumny do Pool
cat_columns = train_data.select_dtypes(include=pd.Categorical).columns
num_columns = train_data.select_dtypes(exclude=pd.Categorical).columns

train_data[cat_columns] = SimpleImputer(strategy='most_frequent').fit_transform(train_data[cat_columns])
test_data[cat_columns] = SimpleImputer(strategy='most_frequent').fit_transform(test_data[cat_columns])

train_data[num_columns] = KNNImputer().fit_transform(train_data[num_columns])
test_data[num_columns.drop('Survived')] = KNNImputer().fit_transform(test_data[num_columns.drop('Survived')])

cat_encoder = OrdinalEncoder()
train_data[cat_columns] = cat_encoder.fit_transform(train_data[cat_columns])
test_data[cat_columns] = cat_encoder.transform(test_data[cat_columns])

scaler = StandardScaler()
scaler.fit(train_data.append(test_data)[num_columns.drop('Survived')])
train_data[num_columns.drop('Survived')] = scaler.transform(train_data[num_columns.drop('Survived')])

test_data[num_columns.drop('Survived')] = scaler.transform(test_data[num_columns.drop('Survived')])



X_train, y_train = train_data[[c for c in train_data.columns if c != 'Survived']], train_data['Survived']
X_test = test_data[[c for c in train_data.columns if c != 'Survived']]

In [151]:
# X_train, y_train = train_data[[c for c in train_data.columns if c != 'Survived']], train_data['Survived']
# X_test = test_data[[c for c in train_data.columns if c != 'Survived']]

In [152]:
y_train.head()

PassengerId
1    0.0
2    1.0
3    1.0
4    1.0
5    0.0
Name: Survived, dtype: float64

In [153]:
X_test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,0.841916,1.0,0.327174,-0.479087,-0.445,-0.491999,1.0
893,0.841916,0.0,1.261126,0.481288,-0.445,-0.508031,2.0
894,-0.352091,1.0,2.381869,-0.479087,-0.445,-0.456071,1.0
895,0.841916,1.0,-0.233198,-0.479087,-0.445,-0.475888,2.0
896,0.841916,0.0,-0.606779,0.481288,0.710763,-0.405804,2.0


In [154]:
# from sklearn.preprocessing import StandardScaler

# to_be_scaled = ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp']

# scaler = StandardScaler()

# for var in to_be_scaled:
#     X_train[var] = X_train[var].astype('float64')
#     X_train[var] = scaler.fit_transform(X_train[var].values.reshape(-1, 1))

In [155]:
X_train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.841916,1.0,-0.606779,0.481288,-0.445,-0.503197,2.0
2,-1.546098,0.0,0.58868,0.481288,-0.445,0.734798,0.0
3,0.841916,0.0,-0.307914,-0.479087,-0.445,-0.490147,2.0
4,-1.546098,0.0,0.364532,0.481288,-0.445,0.383249,2.0
5,0.841916,1.0,0.364532,-0.479087,-0.445,-0.48773,2.0


In [156]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization
from numpy.random import seed
import tensorflow as tf

def create_model(lyrs=[8], act='relu', opt='Adam', dr=0.0):
    # set random seed for reproducibility
    seed(42)
    tf.random.set_seed(42)

    model = Sequential()

    # create first hidden layer
    model.add(Dense(lyrs[0], input_dim=X_train.shape[1], activation=act))
    # hidden layers
    model.add(Dense(40, activation=act, kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
    model.add(Dense(20, activation=act, kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
    model.add(Dense(10, activation=act, kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))

    # output layer
    model.add(Dense(1, activation='sigmoid'))  

    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

    return model

In [157]:
model = create_model()
print(model.summary())

Model: "sequential_29"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_113 (Dense)            (None, 8)                 64        
_________________________________________________________________
dense_114 (Dense)            (None, 40)                360       
_________________________________________________________________
dense_115 (Dense)            (None, 20)                820       
_________________________________________________________________
dense_116 (Dense)            (None, 10)                210       
_________________________________________________________________
dense_117 (Dense)            (None, 1)                 11        
Total params: 1,465
Trainable params: 1,465
Non-trainable params: 0
_________________________________________________________________
None


In [158]:
import numpy as np
training = model.fit(X_train, y_train, epochs=300, batch_size=32, validation_split=0.2, verbose=0)
val_acc = np.mean(training.history['val_accuracy'])
print("\n%s: %.2f%%" % ('val_accuracy', val_acc*100))


val_accuracy: 84.83%


In [159]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

# create model
model = KerasClassifier(build_fn=create_model, verbose=0)

# define the grid search parameters
batch_size = [8, 16, 32, 64]
epochs = [50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)

# search the grid
grid = GridSearchCV(estimator=model, 
                    param_grid=param_grid,
                    cv=3,
                    verbose=2,
                    n_jobs=-1)

grid_result = grid.fit(train_data[[c for c in train_data.columns if c != 'Survived']], train_data['Survived'])
print("Best: %f using %s" % (grid_result.best_score_*100, grid_result.best_params_))

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best: 82.154882 using {'batch_size': 8, 'epochs': 50}


In [160]:
model = create_model()
model.fit(X_train, y_train, epochs=300, batch_size=8, 
                     validation_split=0.2, verbose=0)

# evaluate the model
scores = model.evaluate(X_train, y_train)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


accuracy: 85.19%


In [161]:
preds = model.predict(X_test)
preds = pd.DataFrame(preds)
preds.columns = ['Survived']
preds['Survived'] = (preds['Survived'] > .5).astype(int)
preds.index = test_data.index
print(preds.head())
preds.to_csv("gender_submission.csv", index=True)

             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 0
