<a href="https://www.kaggle.com/code/carlosdrebollar/titanic-project?scriptVersionId=157182475" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt # data visualization
from sklearn.model_selection import train_test_split # train/validation splits
from sklearn.impute import KNNImputer, SimpleImputer # imputers 

# Load the Kaggle Titanic training data
train_df = pd.read_csv("/kaggle/input/titanic-machine-learning-from-disaster/train.csv")

# Preview the first rows (only rendered when it is the last expression of a cell)
train_df.head()

# Count missing values per column
train_df.isna().sum(axis = 0) # Age: 177, Cabin: 687, Embarked: 2

# Remove the 2 observations with missing "Embarked".
# `subset` is given as a list: a bare string label is not accepted by
# every pandas release, whereas a list-like always is.
train_df = train_df.dropna(subset = ["Embarked"])
train_df.shape # 889 observations, 12 variables

# Missing-value indicators for "Cabin" and "Age"
train_df["Cabin_NA"] = train_df["Cabin"].isna()
train_df["Age_NA"] = train_df["Age"].isna()

# "Alone" indicator: no siblings/spouses and no parents/children recorded
train_df["Alone"] = train_df[["SibSp", "Parch"]].sum(axis = 1) == 0

# "Adult_male" indicator. A missing Age compares as False, so males with an
# unknown age are NOT flagged as adult males here.
train_df["Adult_male"] = (train_df["Age"] >= 18) & (train_df["Sex"] == "male")

# Regular expression splitting a ticket into a (lazy) prefix and its trailing digits
regex_pattern = r'(?P<ticket_prefix>.*?)(?P<ticket_number>\d+)$'

# Attach the extracted columns; a ticket that does not end in digits gets NaN in both
train_df = pd.merge(train_df, train_df['Ticket'].str.extract(regex_pattern), left_index = True, right_index = True)

# Collapse the prefix to its first character after stripping dots and spaces.
# `regex = False` makes the literal replacement explicit: with a regex, "."
# would mean "any character", and the default of `regex` differs between
# pandas versions.
train_df['ticket_prefix'] = train_df['ticket_prefix'].replace("", 'NA').str.replace(".", "", regex = False)
train_df['ticket_prefix'] = train_df['ticket_prefix'].str.replace(" ", "", regex = False).str[:1]

# Extract the title: first word after the comma in "Surname, Title. Given names"
train_df["Name_title"] = train_df["Name"].str.split(',').str[1].str.strip().str.split(' ').str[0]

# List of valid titles
valid_titles = ['Mrs.', 'Mr.', 'Master.', 'Miss.']

# Replace titles not in the valid list (including missing ones) with 'Other'
train_df["Name_title"] = train_df["Name_title"].where(train_df["Name_title"].isin(valid_titles), 'Other')

# Median age of each title group, broadcast back to every row
median_age_by_title = train_df.groupby('Name_title')['Age'].transform('median')

# Fill missing values in 'Age' with the corresponding median for each title
train_df["Age_IMP"] = train_df['Age'].fillna(median_age_by_title)

# Define initial data sets: drop the target, identifiers, raw text columns
# and the raw "Age" (its imputed version "Age_IMP" is kept)
X = train_df.drop(["Survived", "PassengerId", "Name", "Cabin", "Ticket", "ticket_number", "Age"], axis = 1)
y = train_df["Survived"]

# Split into training (85%) and validation (15%) sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = .15, random_state = 42)

# One-hot encode. The two splits are encoded independently, so a category
# level that is absent from one of them would otherwise give the frames
# different columns; the validation frame is therefore aligned to the
# training columns (missing dummies become 0, unseen ones are discarded).
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid).reindex(columns = X_train.columns, fill_value = 0)

print(X_train.columns)

# Initialize KNN imputer (fitted here, but not used by the code that follows)
knn_imputer = KNNImputer()
knn_imputer.fit(X_train)

# Initialize simple median imputer, fitted on the training split only
median_imputer = SimpleImputer(strategy = 'median')
median_imputer.fit(X_train)

# Apply the median imputer to both splits. The transformed frames get a
# fresh 0..n-1 index, so they line up with y_train / y_valid by position only.
X_train_imp = pd.DataFrame(median_imputer.transform(X_train), columns = X_train.columns.tolist())
X_valid_imp = pd.DataFrame(median_imputer.transform(X_valid), columns = X_valid.columns.tolist())

# Logistic Regression
## Validation Accuracy = 85.82%
### Data preparation

In [None]:
# Passenger classes used as fixed category levels.
_PCLASS_LEVELS = ["1", "2", "3"]


def _capped_levels(cap):
    """Return the labels "0" .. str(cap - 1) followed by "<cap> or greater"."""
    return [str(count) for count in range(cap)] + [f"{cap} or greater"]


def _capped_label(value, cap):
    """Label a count, collapsing every value >= cap into a single level."""
    if pd.isna(value):
        return value
    if value >= cap:
        return f"{cap} or greater"
    # int() so that 1 and 1.0 (e.g. after imputation) share the label "1"
    return str(int(value))


def LogPrep(input_dat):
    """Prepare a one-hot encoded design matrix for the linear/scaled models.

    "Pclass", "Parch" (capped at 3) and "SibSp" (capped at 5) are turned into
    categoricals with FIXED levels and dummy-encoded with the first level
    dropped; the reference dummies "Sex_female" and "Embarked_C" are removed.

    Using fixed levels means every data set (train, validation, test) gets
    the same columns and the same dropped baseline, whichever values happen
    to be present in it, and the labels no longer depend on whether the
    counts arrive as ints or floats.

    Parameters
    ----------
    input_dat : pandas.DataFrame
        Must contain "Pclass", "Parch", "SibSp", "Sex_female" and
        "Embarked_C". It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the columns above replaced by dummies.
        Missing or out-of-range values get all-zero dummies.
    """
    dat = input_dat.copy()

    dat["Pclass"] = pd.Categorical(
        dat["Pclass"].map(lambda v: v if pd.isna(v) else str(int(v))),
        categories = _PCLASS_LEVELS,
    )

    # Only 15 with "Parch" >= 3. Change to categorical
    dat["Parch"] = pd.Categorical(
        dat["Parch"].map(lambda v: _capped_label(v, 3)),
        categories = _capped_levels(3),
    )

    # Only 12 with "SibSp" >= 5. Change to categorical
    dat["SibSp"] = pd.Categorical(
        dat["SibSp"].map(lambda v: _capped_label(v, 5)),
        categories = _capped_levels(5),
    )

    # Drop one reference level of the already-encoded Sex / Embarked dummies
    dat = dat.drop(["Sex_female", "Embarked_C"], axis = 1)

    # Categorical columns expand to one dummy per declared level (minus the first)
    dat = pd.get_dummies(dat, drop_first = True)

    return dat

### Model Fitting

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Prepare training and validation data
X_train_imp_lr = LogPrep(X_train_imp)
X_valid_imp_lr = LogPrep(X_valid_imp)

# Create a Logistic Regression model
logreg_model = LogisticRegression(max_iter = 2000, random_state = 42)

# Inverse regularization strengths to try
logreg_C_values = [0.001, 0.01, 0.1, 1, 10, 100]

# Hyperparameter grid, one sub-grid per solver. 'lbfgs' does not support the
# 'l1' penalty, so crossing both penalties with both solvers only produces
# fits that fail and are scored as NaN; listing the valid combinations
# searches the same usable candidates without those failures.
param_grid = [
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': logreg_C_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': logreg_C_values},
]

# Create the GridSearchCV object (5-fold CV, candidates ranked by ROC AUC)
grid_search = GridSearchCV(logreg_model, param_grid, cv = 5, scoring = 'roc_auc')

# Fit the model to the training data
grid_search.fit(X_train_imp_lr, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Average AUC Score:", round(100 * grid_search.best_score_, 2))

# Store the best model (refit on the full training split by GridSearchCV)
best_logreg_model = grid_search.best_estimator_

# Predicted probabilities on the training split (second column of predict_proba)
train_proba_matrix = best_logreg_model.predict_proba(X_train_imp_lr)
y_pred_proba = train_proba_matrix[:, 1]

# AUC on training: 86.57%
auc_score = roc_auc_score(y_train, y_pred_proba)
print("Training AUC:", round(auc_score * 100, 2))

# Predicted probabilities on the validation split
valid_proba_matrix = best_logreg_model.predict_proba(X_valid_imp_lr)
y_pred_proba_valid = valid_proba_matrix[:, 1]

# AUC on validation: 88.26%
auc_score_valid = roc_auc_score(y_valid, y_pred_proba_valid)
print("Validation AUC:", round(auc_score_valid * 100, 2))

### Determine the cutoff that maximizes accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Predicted training probabilities from the tuned logistic regression
lr_results = pd.DataFrame()
lr_results['p_hat'] = best_logreg_model.predict_proba(X_train_imp_lr)[:, 1]

# Candidate cutoffs 0.00, 0.01, ..., 0.99 expressed in percent
cutoff_grid = range(100)

# Training accuracy at each cutoff. ">=" is used because that is how the
# selected cutoff is applied to the validation probabilities afterwards.
accuracy = np.array([
    accuracy_score(y_train, (lr_results['p_hat'] >= pct / 100).astype(int))
    for pct in cutoff_grid
])

data = {'Accuracy': accuracy, 'Cut-off': cutoff_grid}
acc_s = pd.DataFrame(data)

# Retrieve cutoff that maximizes accuracy. The column is selected by label
# (integer position lookups on a label-indexed row are deprecated) and ties
# are resolved deterministically in favour of the smallest cutoff.
optimal_cutoff = acc_s.loc[acc_s['Accuracy'].idxmax(), 'Cut-off'] / 100
print(optimal_cutoff)

### Determine accuracy on validation set

In [None]:
from sklearn.metrics import accuracy_score

# Validation probabilities, thresholded at the cutoff chosen on the training split
valid_proba_lr = best_logreg_model.predict_proba(X_valid_imp_lr)
y_prob_valid_lr = valid_proba_lr[:,1]
y_pred_valid_lr = optimal_cutoff <= y_prob_valid_lr

# Share of validation passengers classified correctly
accuracy_lr = accuracy_score(y_valid, y_pred_valid_lr)
print("Accuracy:", round(accuracy_lr * 100, 2)) # Accuracy of 85.82% on validation set.

# Tree-Based Methods
## Decision Tree 
### Cross-validated accuracy on training = 82.65%
### Validation Accuracy = 81.34%

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Base estimator with a fixed seed
dt_classifier = DecisionTreeClassifier(random_state = 42)

# Candidate hyperparameters for the exhaustive search
param_grid = dict(
    criterion = ['gini', 'entropy'],
    max_depth = [None, 3, 4, 5],
    min_samples_split = [2, 3, 4],
    min_samples_leaf = list(range(5, 13)),
)

# 5-fold cross-validated grid search, ranked by accuracy
grid_search = GridSearchCV(
    estimator = dt_classifier,
    param_grid = param_grid,
    scoring = 'accuracy',
    cv = 5,
)
grid_search.fit(X_train_imp, y_train)

# Report the winning configuration and its mean cross-validated accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Best cross-validated training accuracy:", round(grid_search.best_score_ * 100, 2))

# Keep the refitted best tree
best_dt_model = grid_search.best_estimator_

# Hard class predictions for the validation split
y_pred_valid_dt = best_dt_model.predict(X_valid_imp)

# Validation accuracy, in percent
accuracy_dt = 100 * best_dt_model.score(X_valid_imp, y_valid)
print("Accuracy:", round(accuracy_dt, 2))

## Random Forest
### Mean training accuracy in cross-validation = 83.18%
### Validation accuracy = 84.33%

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Base estimator with a fixed seed
rf_classifier = RandomForestClassifier(random_state = 42)

# Candidate hyperparameters for the exhaustive search
param_grid = dict(
    n_estimators = [100],  # Number of trees in the forest
    criterion = ['gini', 'entropy'],
    max_depth = [3, 9, 10, 11],
    min_samples_split = [9, 10, 11],
    min_samples_leaf = [2, 3, 4],
)

# 5-fold cross-validated grid search, ranked by accuracy
grid_search = GridSearchCV(
    estimator = rf_classifier,
    param_grid = param_grid,
    scoring = 'accuracy',
    cv = 5,
)
grid_search.fit(X_train_imp, y_train)

# Report the winning configuration and its mean cross-validated accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Best cross-validated training accuracy:", round(grid_search.best_score_ * 100, 2))

# Keep the refitted best forest
best_rf_model = grid_search.best_estimator_

# Validation outputs: hard labels and the probability of the second class
y_pred_valid_rf = best_rf_model.predict(X_valid_imp)
valid_proba_rf = best_rf_model.predict_proba(X_valid_imp)
y_prob_valid_rf = valid_proba_rf[:,1]

# Validation accuracy, in percent
accuracy_rf = 100 * best_rf_model.score(X_valid_imp, y_valid)
print("Accuracy:", round(accuracy_rf, 2))

## XGBoost Classifier 
### Mean training accuracy in cross-validation = 83.71%
### Validation accuracy = 79.85%

In [None]:
from xgboost import XGBClassifier

# Base estimator with a fixed seed
xgb_classifier = XGBClassifier(random_state = 42)

# Candidate hyperparameters for the exhaustive search
param_grid = dict(
    n_estimators = [13, 14, 15],  # Number of boosting rounds
    learning_rate = [.1, .2, .3],
    max_depth = [9, 10, 11],
    subsample = [.5, .6, .7],
    colsample_bytree = [0.8, 1.0],
    gamma = [0, 1, 5],
    min_child_weight = [1, 3],
)

# 5-fold cross-validated grid search, ranked by accuracy
grid_search = GridSearchCV(
    estimator = xgb_classifier,
    param_grid = param_grid,
    scoring = 'accuracy',
    cv = 5,
)
grid_search.fit(X_train_imp, y_train)

# Report the winning configuration and its mean cross-validated accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Best cross-validated training accuracy:", round(grid_search.best_score_ * 100, 2))

# Keep the refitted best booster
best_xgb_model = grid_search.best_estimator_

# Validation outputs: hard labels and the probability of the second class
y_pred_valid_xgb = best_xgb_model.predict(X_valid_imp)
valid_proba_xgb = best_xgb_model.predict_proba(X_valid_imp)
y_prob_valid_xgb = valid_proba_xgb[:,1]

# Validation accuracy, in percent
accuracy_xgb = 100 * best_xgb_model.score(X_valid_imp, y_valid)
print("Accuracy:", round(accuracy_xgb, 2))

# Naive Bayes
### Mean training accuracy in cross-validation = 80.26%
### Validation accuracy = 81.34%

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Gaussian naive Bayes base estimator
gnb_classifier = GaussianNB()

# Only the variance-smoothing term is tuned
param_grid = dict(var_smoothing = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5])

# 5-fold cross-validated grid search, ranked by accuracy
grid_search = GridSearchCV(gnb_classifier, param_grid, scoring = 'accuracy', cv = 5)
grid_search.fit(X_train_imp, y_train)

# Report the winning setting and its mean cross-validated accuracy
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

# Keep the refitted best model
best_gnb_model = grid_search.best_estimator_

# Validation outputs: hard labels and the probability of the second class
y_pred_valid_gnb = best_gnb_model.predict(X_valid_imp)
valid_proba_gnb = best_gnb_model.predict_proba(X_valid_imp)
y_prob_valid_gnb = valid_proba_gnb[:,1]

# Validation accuracy from the hard labels
accuracy_nb = accuracy_score(y_valid, y_pred_valid_gnb)
print("Validation accuracy:", round(accuracy_nb * 100, 2))

# KNN Classifier
### Mean training accuracy in cross-validation = 82.38%
### Validation accuracy = 82.84%

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns

# Standardize the features; the scaler is fitted on the training split only
scaler_cont = StandardScaler().fit(X_train_imp)
X_train_knn = scaler_cont.transform(X_train_imp)
X_valid_knn = scaler_cont.transform(X_valid_imp)

# Mean 6-fold cross-validation score for every k from 1 to 30
k_values = list(range(1, 31))
scores = [
    np.mean(cross_val_score(KNeighborsClassifier(n_neighbors = k), X_train_knn, y_train, cv = 6))
    for k in k_values
]

knn_dat = pd.DataFrame({"k_values": k_values, "scores": scores})

# Plot the score as a function of k
sns.lineplot(x = k_values, y = knn_dat["scores"], marker = 'o')
plt.xlabel("K Values")
plt.ylabel("Accuracy Score")

# Rank the candidates once and take the top row as the best k
knn_ranked = knn_dat.sort_values("scores", ascending = False)
print(knn_ranked.head(1))
best_k = knn_ranked.iloc[0]['k_values']

# Refit on the whole training split with the selected k
knn = KNeighborsClassifier(n_neighbors = int(best_k))
knn.fit(X_train_knn, y_train)

# Validation outputs: hard labels and the probability of the second class
y_pred_valid_knn = knn.predict(X_valid_knn)
valid_proba_knn = knn.predict_proba(X_valid_knn)
y_prob_valid_knn = valid_proba_knn[:,1]

# Validation accuracy from the hard labels
accuracy_knn = accuracy_score(y_valid, y_pred_valid_knn)
print("Validation accuracy:", round(accuracy_knn * 100, 2))

# Support Vector Machine
### Mean training accuracy in cross-validation = 82.65%
### Validation accuracy = 84.33%

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Standardize the LogPrep design matrix; the scaler is fitted on the training split only
scaler_cat = StandardScaler().fit(X_train_imp_lr)
X_train_svm = scaler_cat.transform(X_train_imp_lr)
X_valid_svm = scaler_cat.transform(X_valid_imp_lr)

# Support vector classifier with probability estimates enabled
svm_classifier = SVC(probability = True, random_state = 42)

# Candidate hyperparameters (RBF kernel only)
param_grid = dict(
    C = [.5, 1, 10, 100],
    gamma = ['scale', 1, .1, .01, .001, .0001],
    kernel = ['rbf'],
)

# 5-fold cross-validated grid search, ranked by accuracy
grid_search = GridSearchCV(
    estimator = svm_classifier,
    param_grid = param_grid,
    scoring = 'accuracy',
    cv = 5,
)
grid_search.fit(X_train_svm, y_train)

# Report the winning configuration and its mean cross-validated accuracy
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

# Keep the refitted best model
best_svm_model = grid_search.best_estimator_

# Validation outputs: hard labels and the probability of the second class
y_pred_valid_svm = best_svm_model.predict(X_valid_svm)
valid_proba_svm = best_svm_model.predict_proba(X_valid_svm)
y_prob_valid_svm = valid_proba_svm[:,1]

# Validation accuracy from the hard labels
accuracy_svm = accuracy_score(y_valid, y_pred_valid_svm)
print("Validation accuracy:", round(accuracy_svm * 100, 2))

# Neural Nets
## Sequential Model (Validation accuracy = 84.33%)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Seed TensorFlow's global generator so weight initialisation is repeatable
# (best effort; the rest of the notebook also uses seed 42)
tf.random.set_seed(42)

# Build the neural network model: one hidden ReLU layer and a sigmoid output.
# The input width comes from X_train_svm, the matrix the model is trained on
# below (the previously referenced X_train_nn is never defined).
model = Sequential()
model.add(Dense(units = 64, activation = 'relu', input_dim = X_train_svm.shape[1]))
model.add(Dense(units = 1, activation = 'sigmoid'))

# Compile the model
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Train the model, monitoring the validation split after every epoch
model.fit(X_train_svm, y_train, epochs = 15, batch_size = 32, validation_data = (X_valid_svm, y_valid))

# Store validation predictions. predict() returns an (n, 1) array for the
# single output unit; it is flattened so it behaves like the 1-D probability
# vectors produced by the other models.
y_prob_valid_seq = model.predict(X_valid_svm).ravel()
y_pred_valid_seq = y_prob_valid_seq > .5

# Evaluate the model on the validation set (the data passed here is the
# validation split, so the output is labelled accordingly)
loss, accuracy = model.evaluate(X_valid_svm, y_valid)
print(f'Validation Loss: {loss:.4f}, Validation Accuracy: {accuracy:.4f}')

## MLP Classifier
### Mean training accuracy in cross-validation = 82.25%
### Validation accuracy = 82.84%

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: a single hidden layer of 2 to 6 units, three
# L2 penalties, and the L-BFGS solver
param_grid = dict(
    hidden_layer_sizes = list(range(2, 7)),
    alpha = [0.00005, 0.0005, .005],
    solver = ['lbfgs'],
)

# Base estimator with a fixed seed and a generous iteration budget
mlp_model = MLPClassifier(max_iter = 5000, random_state = 42)

# 5-fold cross-validated grid search using the estimator's default score
grid_search = GridSearchCV(estimator = mlp_model, param_grid = param_grid, cv = 5)
grid_search.fit(X_train_svm, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

# Keep the refitted best model
best_mlp_model = grid_search.best_estimator_

# Validation outputs: hard labels and the probability of the second class
y_pred_valid_mlp = best_mlp_model.predict(X_valid_svm)
valid_proba_mlp = best_mlp_model.predict_proba(X_valid_svm)
y_prob_valid_mlp = valid_proba_mlp[:,1]

# Validation accuracy from the hard labels
accuracy_mlp = accuracy_score(y_valid, y_pred_valid_mlp)
print("Validation accuracy:", round(accuracy_mlp * 100, 2))

# Ensemble Predictions (Validation Accuracy = 84.33%)

In [None]:
# Start from the validation labels; reset_index keeps the original row labels in "index"
ensemble = y_valid.to_frame().reset_index()

# Output of every model on the validation split, in column order.
# The decision tree and naive Bayes contribute hard labels, the rest probabilities.
ensemble_members = {
    'logistic_reg': y_prob_valid_lr,
    'decision_tree': y_pred_valid_dt,
    'random_forest': y_prob_valid_rf,
    'xgboost': y_prob_valid_xgb,
    'naive_bayes': y_pred_valid_gnb,
    'knn': y_prob_valid_knn,
    'svm': y_prob_valid_svm,
    'sequential_nn': y_prob_valid_seq,
    'mlp_classifier': y_prob_valid_mlp,
}
for member_name, member_output in ensemble_members.items():
    ensemble[member_name] = member_output

# Unweighted average over the model columns, thresholded at 0.5
ensemble["ens_probs"] = ensemble[list(ensemble_members)].mean(axis = 1)
ensemble["preds"] = ensemble["ens_probs"] >= .5

# Validation accuracy of the averaged prediction
accuracy_ensemble = accuracy_score(y_valid, ensemble["preds"])
print("Accuracy:", round(accuracy_ensemble * 100, 2)) # Accuracy of 84.33% on validation set.

# Create final predictions on test data
## Prepare test data

In [None]:
# Load the Kaggle Titanic test data
test_df = pd.read_csv("/kaggle/input/titanic-machine-learning-from-disaster/test.csv")

# Missing-value indicators for "Cabin" and "Age"
test_df["Cabin_NA"] = test_df["Cabin"].isna()
test_df["Age_NA"] = test_df["Age"].isna()

# "Alone" indicator: no siblings/spouses and no parents/children recorded
test_df["Alone"] = test_df[["SibSp", "Parch"]].sum(axis = 1) == 0

# "Adult_male" indicator. A missing Age compares as False, so males with an
# unknown age are NOT flagged as adult males here.
test_df["Adult_male"] = (test_df["Age"] >= 18) & (test_df["Sex"] == "male")

# Attach ticket prefix / number using the pattern defined for the training data;
# a ticket that does not end in digits gets NaN in both columns
test_df = pd.merge(test_df, test_df['Ticket'].str.extract(regex_pattern), left_index = True, right_index = True)

# Collapse the prefix to its first character after stripping dots and spaces.
# `regex = False` makes the literal replacement explicit: with a regex, "."
# would mean "any character", and the default of `regex` differs between
# pandas versions.
test_df['ticket_prefix'] = test_df['ticket_prefix'].replace("", 'NA').str.replace(".", "", regex = False)
test_df['ticket_prefix'] = test_df['ticket_prefix'].str.replace(" ", "", regex = False).str[:1]

# Extract the title: first word after the comma in "Surname, Title. Given names"
test_df["Name_title"] = test_df["Name"].str.split(',').str[1].str.strip().str.split(' ').str[0]

# Replace titles not in the valid list (including missing ones) with 'Other'
test_df["Name_title"] = test_df["Name_title"].where(test_df["Name_title"].isin(valid_titles), 'Other')

# Median age of each title group, computed on the test data itself.
# NOTE: this rebinds `median_age_by_title`, which previously held the
# training-set medians.
median_age_by_title = test_df.groupby('Name_title')['Age'].transform('median')

# Fill missing values in 'Age' with the corresponding median for each title
test_df["Age_IMP"] = test_df['Age'].fillna(median_age_by_title)

# Define the test feature frame: drop identifiers, raw text columns and raw "Age"
X_test = test_df.drop(["PassengerId", "Name", "Cabin", "Ticket", "ticket_number", "Age"], axis = 1)

# One-hot encode, then force exactly the training columns in the training
# order. Dummies that only occur in the test data (such as ticket_prefix_L)
# are discarded and any training dummy absent from the test data is added
# as all zeros, so the fitted imputer, scalers and models always receive
# the layout they were trained on.
X_test = pd.get_dummies(X_test).reindex(columns = X_train.columns, fill_value = 0)

# Apply the median imputer fitted on the training split
X_test_imp = pd.DataFrame(median_imputer.transform(X_test), columns = X_test.columns.tolist())

# Prepare test for logistic regression
X_test_imp_lr = LogPrep(X_test_imp)

# Prepare test for KNN classifier (scaler fitted on the training split)
X_test_knn = scaler_cont.transform(X_test_imp)

# Prepare test for SVM, Sequential NN and MLP classifier (scaler fitted on the training split)
X_test_svm = scaler_cat.transform(X_test_imp_lr)

# Create test set predictions

In [None]:
# Every model contributes to the test ensemble the same kind of output it
# contributed to the validation ensemble (the only combination whose
# accuracy was measured): hard labels for the decision tree and naive
# Bayes, probabilities of the second class for all other models.

# Logistic Regression (probability)
y_pred_test_lr = best_logreg_model.predict_proba(X_test_imp_lr)[:,1]

# Decision Tree (hard label)
y_pred_test_dt = best_dt_model.predict(X_test_imp)

# Random Forest (probability)
y_pred_test_rf = best_rf_model.predict_proba(X_test_imp)[:,1]

# XGBoost (probability)
y_pred_test_xgb = best_xgb_model.predict_proba(X_test_imp)[:,1]

# Naive Bayes (hard label)
y_pred_test_gnb = best_gnb_model.predict(X_test_imp)

# KNN (probability)
y_pred_test_knn = knn.predict_proba(X_test_knn)[:,1]

# SVM: hard labels are kept under their existing name; the probabilities
# are stored as y_prob_test_svm, the name the test ensemble reads
y_pred_test_svm = best_svm_model.predict(X_test_svm)
y_prob_test_svm = best_svm_model.predict_proba(X_test_svm)[:,1]

# Sequential NN (probability, flattened from the (n, 1) network output)
y_pred_test_seq = model.predict(X_test_svm).ravel()

# MLP Classifier (probability)
y_pred_test_mlp = best_mlp_model.predict_proba(X_test_svm)[:,1]

In [None]:
# Collect every model's test-set output, one column per model
ensemble_test = pd.DataFrame()
ensemble_test['logistic_reg'] = y_pred_test_lr
ensemble_test['decision_tree'] = y_pred_test_dt
ensemble_test['random_forest'] = y_pred_test_rf
ensemble_test['xgboost'] = y_pred_test_xgb
ensemble_test['naive_bayes'] = y_pred_test_gnb
ensemble_test['knn'] = y_pred_test_knn
# SVM probabilities are computed here: the variable previously referenced
# (y_prob_test_svm) is not defined by the prediction cell, and the
# validation ensemble averages SVM probabilities rather than hard labels.
ensemble_test['svm'] = best_svm_model.predict_proba(X_test_svm)[:, 1]
# The network output has shape (n, 1); flatten it to a plain column
ensemble_test['sequential_nn'] = np.ravel(y_pred_test_seq)
ensemble_test['mlp_classifier'] = y_pred_test_mlp

# Obtain ensemble predictions for the test set: unweighted mean over the models
ensemble_test["mean_proba"] = ensemble_test.mean(axis = 1)

# Same 0.5 cutoff as the validation ensemble
ensemble_test["preds"] = ensemble_test["mean_proba"] >= .5

# Submission frame: passenger id plus the 0/1 survival prediction
test_preds = pd.DataFrame({"PassengerId": test_df["PassengerId"],
                          "Survived": ensemble_test["preds"].astype(int)})

In [None]:
# Write the predictions without the row index, then confirm on stdout
test_preds.to_csv(path_or_buf = 'submission.csv', index = False)
print("Your submission was successfully saved!")