In [None]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import SGDClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training and test CSV files, in that order.
data_train, data_test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [None]:
# Print a concise summary of the training frame.
data_train.info()

In [None]:
# Descriptive statistics of the training frame.
data_train.describe()

In [None]:
# Print a concise summary of the test frame.
data_test.info()

In [None]:
# Descriptive statistics of the test frame (the training frame is already
# described in an earlier cell; this cell pairs with `data_test.info()`).
data_test.describe()

In [None]:
def titanic_preprocessing(dataset):
    """Drop unused columns and impute missing Age, Embarked and Fare values.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing at least the ``Cabin``, ``PassengerId``, ``Ticket``,
        ``Age``, ``Embarked`` and ``Fare`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Cabin``, ``PassengerId`` and ``Ticket``.
        The frame passed in is left untouched.

    Notes
    -----
    The fill values (median / mode) are computed from ``dataset`` itself, so
    calling this separately on two frames imputes each with its own statistics.
    """
    # Cabin, PassengerId and Ticket are not used as model features.
    dataset = dataset.drop(['Cabin', 'PassengerId', 'Ticket'], axis=1)

    # Assign the filled column back instead of `dataset[col].fillna(inplace=True)`:
    # the in-place form operates on a column selection (chained assignment) and
    # does not update the frame when pandas Copy-on-Write is active.

    # Fill Age with the median
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    # Fill Embarked with the most frequent value (skipped when every value is
    # missing, in which case there is no mode to fill with)
    embarked_mode = dataset['Embarked'].mode()
    if not embarked_mode.empty:
        dataset['Embarked'] = dataset['Embarked'].fillna(embarked_mode[0])

    # Fill Fare with the median
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

    return dataset

In [None]:
def titanic_feature_engineering(dataset):
    """Derive model features from a preprocessed passenger frame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing at least ``Name``, ``Sex``, ``Embarked``, ``Fare``,
        ``SibSp`` and ``Parch``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``Name``, ``Sex`` and ``Embarked`` are replaced by:
        dummy columns for ``Embarked``, ``Title`` and the ``Fare`` decile bin,
        ``Sex_dummy`` (male=0, female=1), ``Family_size`` (SibSp + Parch) and
        the flags ``Marriage``, ``S_family``, ``B_family`` and ``IsAlone``.
        The frame passed in is not modified.

    Notes
    -----
    The four most frequent titles and the fare decile edges are computed from
    ``dataset`` itself, so two frames processed separately can end up with
    different bin edges (and therefore differently named ``Fare_cut_*`` columns).
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    dataset = dataset.copy()

    # The title is the text between the comma after the surname and the next
    # period, e.g. "Braund, Mr. Owen Harris" -> "Mr". A regex capture is used
    # because `str.split(". ")` treats a multi-character pattern as a regular
    # expression, where "." matches any character (so "the Countess" -> "th").
    dataset['Title'] = (
        dataset['Name'].str.extract(r',\s*([^.]+)\.', expand=False).str.strip()
    )
    # Keep the four most frequent titles; everything else is grouped as 'Rare'.
    common_titles = dataset['Title'].value_counts().head(4).index.tolist()
    dataset.loc[~dataset['Title'].isin(common_titles), 'Title'] = 'Rare'

    # Binning: split Fare into deciles
    dataset['Fare_cut'] = pd.qcut(dataset['Fare'], q=10)

    # Create dummy variables
    columns = ['Embarked', 'Title', 'Fare_cut']
    dummy = pd.get_dummies(dataset[columns], dtype=np.int32)
    dataset = pd.concat([dataset, dummy], axis=1)
    dataset['Sex_dummy'] = dataset['Sex'].map({'male': 0, 'female': 1})

    # Size of the family travelling with the passenger
    family_size = dataset['SibSp'] + dataset['Parch']
    dataset['Family_size'] = family_size

    # Family-size indicator flags (0/1)
    dataset['Marriage'] = (family_size == 1).astype('int64')
    dataset['S_family'] = ((family_size <= 4) & (family_size >= 2)).astype('int64')
    dataset['B_family'] = (family_size >= 5).astype('int64')
    dataset['IsAlone'] = (family_size == 0).astype('int64')

    # Drop the source columns that have been replaced by encoded features
    dataset = dataset.drop(['Name', 'Sex', 'Embarked', 'Fare_cut', 'Title'], axis=1)

    return dataset

In [None]:
def model_evaluation(model):
    """Fit ``model`` and report its F1-score and confusion matrix.

    The model is trained on the notebook-level ``X_train`` / ``y_train`` and
    scored on ``X_test`` / ``y_test``. The F1-score is printed, and the
    confusion matrix is drawn as a seaborn heatmap whose four cells are
    annotated with a name, the raw count and the share of all samples.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``

    Returns
    -------
    The F1-score computed on the held-out split.
    """
    # Fit on the training split and predict the held-out split
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # F1-score
    score = f1_score(y_test, predictions)
    print('F1-score: {}'.format(score))

    # Confusion matrix
    print('Confusion Matrix: ')
    cm = confusion_matrix(y_test, predictions)
    cell_counts = cm.flatten()
    cell_shares = cell_counts / np.sum(cm)
    cell_names = ['True Negative', 'False Positive', 'False Negative', 'True Positive']

    # One "name / count / percentage" annotation per cell, in flattened order
    annotations = []
    for cell_name, count, share in zip(cell_names, cell_counts, cell_shares):
        count_text = '{0:0.0f}'.format(count)
        share_text = '{0:.2%}'.format(share)
        annotations.append(f"{cell_name}\n{count_text}\n{share_text}")
    annotations = np.asarray(annotations).reshape(2, 2)

    sns.heatmap(cm, annot=annotations, fmt='', cmap='rocket_r')

    return score

In [None]:
def dataset_preparation(dataset):
    """Run ``titanic_preprocessing`` and then ``titanic_feature_engineering`` on ``dataset``.

    Returns the frame produced by the feature-engineering step.
    """
    return titanic_feature_engineering(titanic_preprocessing(dataset))

In [None]:
# Prepare the training and test frames with the same pipeline; each frame is
# processed on its own.
data_train_prep, data_test_prep = (
    dataset_preparation(frame) for frame in (data_train, data_test)
)

In [None]:
# Display the prepared training frame.
data_train_prep

In [None]:
# Display the prepared test frame.
data_test_prep

In [None]:
feature_frame = data_train_prep.drop(['Survived'], axis=1)
y = data_train_prep['Survived']

# The scaled arrays are matched to each other by column position, so both
# frames must at least have the same number of feature columns.
if data_test_prep.shape[1] != feature_frame.shape[1]:
    raise ValueError(
        'Train and test feature frames have different column counts: '
        '{} vs {}'.format(feature_frame.shape[1], data_test_prep.shape[1])
    )

# Scaling
# Fit the scaler on the training features only and reuse the fitted min/max for
# the test set. Refitting on the test set (`fit_transform`) would scale it
# differently from the data the models are trained on.
# Plain arrays are passed because the dummy column names are derived per frame
# and may differ between train and test; the columns are aligned by position.
scaler = MinMaxScaler()
X = scaler.fit_transform(feature_frame.to_numpy())
data_test_scaled = scaler.transform(data_test_prep.to_numpy())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Seed shared by every estimator that uses randomness, so that re-running the
# notebook reproduces the same scores (same value as the train/test split).
RANDOM_STATE = 42

# K-nearest neighbors
KNN_model = KNeighborsClassifier()

# SVC
SVC_model = SVC()

# Logistic regression
LR_model = LogisticRegression(max_iter=1000)

# Decision tree
DT_model = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Random Forest
RF_model = RandomForestClassifier(random_state=RANDOM_STATE)

# Stochastic gradient descent
SGD_model = SGDClassifier(random_state=RANDOM_STATE)

# XGBoost
XGB_model = xgb.XGBClassifier(eta=0.01, random_state=RANDOM_STATE)

# LightGBM
LGBM_model = LGBMClassifier(random_state=RANDOM_STATE)

In [None]:
# Fit and evaluate the K-nearest-neighbors model; store the returned F1-score.
f1_for_KNN = model_evaluation(KNN_model)

In [None]:
# Fit and evaluate the SVC model; store the returned F1-score.
f1_for_SVC = model_evaluation(SVC_model)

In [None]:
# Fit and evaluate the logistic-regression model; store the returned F1-score.
f1_for_LR = model_evaluation(LR_model)

In [None]:
# Fit and evaluate the decision-tree model; store the returned F1-score.
f1_for_DT = model_evaluation(DT_model)

In [None]:
# Fit and evaluate the random-forest model; store the returned F1-score.
f1_for_RF = model_evaluation(RF_model)

In [None]:
# Fit and evaluate the SGD classifier; store the returned F1-score.
f1_for_SGD = model_evaluation(SGD_model)

In [None]:
# Fit and evaluate the XGBoost model; store the returned F1-score.
f1_for_XGB = model_evaluation(XGB_model)

In [None]:
# Fit and evaluate the LightGBM model; store the returned F1-score.
f1_for_LGBM = model_evaluation(LGBM_model)

In [None]:
# Tuned XGBoost model.
# - `min_child_weight` was previously misspelled as `main_child_weight`, which
#   is not an XGBoost parameter, so the intended value was never applied.
# - `eval_metric` and `early_stopping_rounds` are set on the estimator: passing
#   them to `fit()` is deprecated since XGBoost 1.6 and rejected from 2.0 on.
XGB_model = xgb.XGBClassifier(
    reg_alpha=0.1,
    learning_rate=0.01,
    n_estimators=1000,
    gamma=0,
    reg_lambda=0.5,
    max_depth=4,
    min_child_weight=0.5,
    sampling_method='uniform',
    subsample=1,
    eval_metric=["error", "logloss"],
    early_stopping_rounds=10,
)

# Both splits are tracked during training. NOTE(review): early stopping is
# presumably driven by the last metric on the last evaluation set (the held-out
# split), so scores reported on that same split may be optimistic.
eval_set = [(X_train, y_train), (X_test, y_test)]
model = XGB_model.fit(X_train, y_train, eval_set=eval_set, verbose=True)

Best Score for XGB: 0.7853876288287601 
With params: {'alpha': 0.5, 'eta': 0.05, 'gamma': 0, 'lambda': 0.5, 'max_depth': 4, 'min_child_weight': 0.5, 'sampling_method': 'uniform', 'subsample': 1}

In [None]:
# Candidate `reg_alpha` values for the grid search below (currently disabled).
parameters = dict(reg_alpha=[1e-5, 1e-2, 0.1, 1, 100])

# search = GridSearchCV(XGB_model, parameters, verbose=3, scoring='f1')
# search.fit(X_train, y_train)

In [None]:
# print('Best Score for XGB: {} \nWith params: {}'.format(search.best_score_, search.best_params_))

In [None]:
# Per-iteration metrics recorded for each entry of the evaluation set
results = XGB_model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)

# One figure per recorded metric: log loss first, then classification error.
# 'validation_0' is plotted as Train and 'validation_1' as Test.
for metric, y_label, title in (
    ('logloss', 'Log Loss', 'XGBoost Log Loss'),
    ('error', 'Classification Error', 'XGBoost Classification Error'),
):
    fig, ax = plt.subplots()
    ax.plot(x_axis, results['validation_0'][metric], label='Train')
    ax.plot(x_axis, results['validation_1'][metric], label='Test')
    ax.legend()
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()

In [None]:
# F1-score of the tuned XGBoost model on the held-out split.
y_hat = XGB_model.predict(X_test)
f1_score(y_test, y_hat)

In [None]:
# Predict on the scaled test features.
y_pred = XGB_model.predict(data_test_scaled)

In [None]:
# Pair each test PassengerId with its prediction and write the result to CSV
# without the index column.
submission_columns = {'PassengerId': data_test['PassengerId'], 'Survived': y_pred}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission_v8.csv', index=False)