## Titanic ML from Disaster

### 1. Setup

In [107]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys

### 2. Import Data

In [108]:
# Both CSV files are expected in an 'input' folder under the current working directory.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# here because it is a module-level variable other cells may read.
dir = os.getcwd()
input_dir = os.path.join(dir, 'input')
train_data = pd.read_csv(os.path.join(input_dir, 'train.csv'))
test_data = pd.read_csv(os.path.join(input_dir, 'test.csv'))

### 3. Exploratory Data Analysis (EDA)

In [109]:
def basic_eda_info (df, name='DataFrame'):
    """
    Print a quick exploratory overview of a DataFrame.

    Shows, in order: the shape, the first rows, the numeric summary
    (floats rendered with two decimals and thousands separators) and the
    number of missing values per column.

    Parameters:
        df (DataFrame): Data to summarise.
        name (str): Label used in the first header line.
    """
    rule = '=' * 60

    print(f'=== {name}: Basic Information ===')
    print(f'Shape: {df.shape}')

    print(rule)
    print('=== Head of DataFrame ===')
    # `display` is not imported in this file; presumably the notebook builtin.
    display(df.head())

    print('=== Numeric Describe ===')
    # The float format only applies while the summary table is rendered.
    with pd.option_context('display.float_format', '{:,.2f}'.format):
        display(df.describe())

    print(rule)
    print('=== Missing Values ===')
    missing_per_column = df.isna().sum()
    print(missing_per_column)
    print(rule)

In [110]:
from scipy.stats import pointbiserialr, chi2_contingency

def correlation_analysis_type(df, task="classification", target_col=None, numeric_cols=None):
    """
    Plot a heatmap of correlations between numeric features and a target.

    For ``task="classification"`` the point-biserial correlation between the
    binary target and each numeric feature is computed and drawn as a
    one-column heatmap sorted from highest to lowest. For
    ``task="regression"`` the pairwise correlation matrix of the numeric
    features plus the target is drawn.

    Parameters:
        df (DataFrame): Data holding the features and the target.
        task (str): "classification" or "regression".
        target_col (str): Name of the dependent variable column (required).
        numeric_cols (list, optional): Feature columns to analyse. Defaults
            to every numeric column of ``df``. The list is never modified.

    Raises:
        ValueError: If ``target_col`` is not given, ``task`` is not one of the
            two supported values, or the classification target has more than
            two distinct values.
    """
    if target_col is None:
        raise ValueError("Please specify the target_col (dependent variable).")

    if task not in ("classification", "regression"):
        raise ValueError("Invalid task. Please choose 'classification' or 'regression'.")

    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()

    # Build a new list instead of calling .remove() so that a list passed in
    # by the caller is left untouched; the target is not a feature.
    feature_cols = [col for col in numeric_cols if col != target_col]

    if task == "classification":
        if len(df[target_col].unique()) > 2:
            raise ValueError("For classification, the target_col must be binary (e.g., 0 or 1).")

        correlations = {}
        for col in feature_cols:
            # A single missing value would turn the whole coefficient into
            # NaN, so only rows where target and feature are both present
            # take part in the computation.
            pair = df[[target_col, col]].dropna()
            if len(pair) < 2:
                correlations[col] = np.nan
                continue
            corr, _ = pointbiserialr(pair[target_col], pair[col])
            correlations[col] = corr

        # Convert to a one-column DataFrame for visualization
        corr_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['Correlation'])
        corr_df = corr_df.sort_values(by='Correlation', ascending=False)

        plt.figure(figsize=(6, 4))
        sns.heatmap(corr_df, annot=True, cmap='coolwarm', cbar=False, fmt='.2f')
        plt.title('Point-Biserial Correlation Heatmap (Classification)')
        plt.show()

    else:  # task == "regression"
        corr_df = df[feature_cols + [target_col]].corr()
        plt.figure(figsize=(6, 4))
        sns.heatmap(corr_df, annot=True, cmap='coolwarm', fmt='.2f')
        plt.title('Correlation Heatmap (Regression)')
        plt.show()

In [111]:
# Print shape, first rows, numeric summary and missing-value counts of the training set.
basic_eda_info(train_data)

=== DataFrame: Basic Information ===
Shape: (891, 12)
=== Head of DataFrame ===


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


=== Numeric Describe ===


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.7,0.52,0.38,32.2
std,257.35,0.49,0.84,14.53,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.12,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


=== Missing Values ===
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


### 4. Data Cleaning

#### 4.1 Missing data

In [112]:
# Count the missing values in every column of the training set.
print("Missing in train_data:")
print(train_data.isnull().sum())

# Dotted line closing the report.
print("."*50)

Missing in train_data:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
..................................................


#### 4.2 Missing data on Age

In [113]:
def replace_missing_age (df):
    """
    Fill missing ages with the median age of passengers sharing the same title.

    The title (Mr, Miss, ...) is extracted from ``Name``. Missing ``Age``
    values are replaced by the median age of that title, falling back to the
    overall median age when the passenger has no recognisable title or the
    title has no known age at all. Afterwards every title other than
    Mr / Miss / Mrs / Master is collapsed into 'Mr' or 'Mrs' based on ``Sex``.

    Parameters:
        df (DataFrame): Must contain 'PassengerId', 'Name', 'Sex' and 'Age'.

    Returns:
        DataFrame: A new frame indexed by 'PassengerId' with 'Age' filled and
        a 'Title' column added. The previous index is kept as a regular
        column (named 'index' for an unnamed index). The input is not
        modified.
    """
    # reset_index() (without drop) keeps the old index as a column before
    # PassengerId takes over as the index.
    df = df.reset_index()
    df = df.set_index('PassengerId')

    # Extract titles: the word directly followed by a period
    df['Title'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Standardize equivalent titles
    mapping = {'Mlle':'Miss', 'Ms':'Miss', 'Mme':'Mrs'}
    df['Title'] = df['Title'].replace(mapping)

    # Medians are taken per *original* title (Dr, Rev, ... included), i.e.
    # before the rare titles are collapsed further down.
    overall_median = df['Age'].median()
    title_medians = df.groupby('Title')['Age'].median()

    # Vectorized fill. Mapping a missing title yields NaN instead of raising,
    # and the overall median covers whatever is still missing afterwards.
    ages_by_title = df['Title'].map(title_medians)
    df['Age'] = df['Age'].fillna(ages_by_title).fillna(overall_median)

    # Collapse every remaining (rare or missing) title using the passenger's sex
    title_std = ~df['Title'].isin(['Mr', 'Miss', 'Mrs', 'Master'])
    df.loc[title_std, 'Title'] = df.loc[title_std, 'Sex'].map({'male': 'Mr', 'female': 'Mrs'})

    return df

#### 4.3 Missing data on Cabin

In [114]:
def replace_cabin (df):
    """
    Fill missing cabins with a placeholder and derive the deck letter.

    A missing cabin may itself carry signal, so it is not dropped but
    replaced by the placeholder 'Z00 ', which ends up as deck 'Z'.

    Parameters:
        df (DataFrame): Must contain a 'Cabin' column; modified in place.

    Returns:
        DataFrame: The same frame with 'Cabin' filled and 'Deck' added.
    """
    filled_cabin = df.Cabin.fillna('Z00 ')
    df['Cabin'] = filled_cabin

    # Deck = first character of the first cabin listed for the passenger
    first_cabin = df['Cabin'].str.split(' ').str.get(0)
    df['Deck'] = first_cabin.str.get(0)

    return df

#### 4.4 Missing data on Embarked & Fare

In [115]:
def replace_missing_embarked_fare (df):
    """
    Fill missing 'Embarked' with its most frequent value and missing 'Fare'
    with its median.

    Parameters:
        df (DataFrame): Must contain 'Embarked' and 'Fare'; modified in place.

    Returns:
        DataFrame: The same frame with both columns filled.
    """
    most_common_port = df['Embarked'].mode()[0]
    df['Embarked'] = df['Embarked'].fillna(most_common_port)

    median_fare = df['Fare'].median()
    df['Fare'] = df['Fare'].fillna(median_fare)

    return df

### 5. Feature Engineering

#### 5.1 Merging supplementary data

In [116]:
def add_features_columns (df):
    """
    Add the engineered feature columns used by the models.

    New columns: 'Family_Size' (siblings/spouses + parents/children + the
    passenger), 'IsAlone' (1 when Family_Size is 1), 'Fare_Per_Person',
    'Age*Class' and 'Age*Fare'.

    Parameters:
        df (DataFrame): Must contain 'SibSp', 'Parch', 'Fare', 'Age' and
            'Pclass'; modified in place.

    Returns:
        DataFrame: The same frame with the five columns added.
    """
    # Family size counts the passenger too, hence the extra 1
    family_size = df['SibSp'].add(df['Parch']).add(1)
    df['Family_Size'] = family_size
    df['IsAlone'] = df['Family_Size'].eq(1).astype(int)

    # Fare split evenly over the family members
    df['Fare_Per_Person'] = df['Fare'].div(df['Family_Size'])

    # Interaction terms between age and class / fare
    passenger_age = df['Age']
    df['Age*Class'] = passenger_age.mul(df['Pclass'])
    df['Age*Fare'] = passenger_age.mul(df['Fare'])

    return df

In [117]:
# Kept only as a backup: add_factorize_or_encode covers both one-hot encoding and factorizing.

def add_dummies (df):
    """
    One-hot encode Sex, Pclass, Embarked, Title and Deck.

    Every level gets its own 0/1 integer column (no level is dropped). The
    source columns and the raw text / count columns are removed afterwards.

    Parameters:
        df (DataFrame): Must contain the five encoded columns plus 'Name',
            'SibSp', 'Parch', 'Ticket' and 'Cabin'.

    Returns:
        DataFrame: A new frame with the indicator columns appended.
    """
    # (source column, prefix of the generated indicator columns)
    encoding_plan = [
        ('Sex', 'sex'),
        ('Pclass', 'class'),
        ('Embarked', 'Embarked'),
        ('Title', 'Title'),
        ('Deck', 'Deck'),
    ]
    indicator_frames = [
        pd.get_dummies(df[source], prefix=prefix, drop_first=False, dtype=int)
        for source, prefix in encoding_plan
    ]

    widened = pd.concat([df, *indicator_frames], axis=1)

    obsolete = ['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Embarked', 'Deck', 'Title', 'Cabin']
    return widened.drop(columns=obsolete)

In [118]:
# Kept only as a backup: add_factorize_or_encode covers both one-hot encoding and factorizing.

def add_encoding (df):
    """
    Replace Sex, Pclass, Embarked, Title and Deck by integer codes.

    Codes are assigned in order of first appearance within the column. The
    raw 'Name', 'SibSp', 'Parch', 'Ticket' and 'Cabin' columns are removed.

    Parameters:
        df (DataFrame): Must contain all ten columns named above. The five
            encoded columns are overwritten in place.

    Returns:
        DataFrame: A new frame without the five removed columns.
    """
    for column in ('Sex', 'Pclass', 'Embarked', 'Title', 'Deck'):
        # factorize also returns the unique values; only the codes are needed
        codes, _ = pd.factorize(df[column])
        df[column] = codes

    return df.drop(columns=['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'])

In [119]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

def add_factorize_or_encode(df, task = "classification"):
    """
    Turn every object / category column of ``df`` into numeric columns.

    Parameters:
        df (DataFrame): Data to encode.
        task (str): "classification" replaces each categorical column by
            integer codes (order of first appearance, missing values become
            -1) and modifies ``df`` in place. "regression" one-hot encodes
            the categorical columns, dropping the first level of each, and
            returns a new frame.

    Returns:
        DataFrame: The encoded data, with the index of ``df`` preserved.

    Raises:
        ValueError: If ``task`` is neither "classification" nor "regression".
    """
    # Preprocessing Step: Automatically handle categorical features
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    if task == "classification":
        # Factorize categorical columns for tree-based models and naive Bayes
        if len(categorical_cols) > 0:
            print(f"Factorizing categorical columns: {list(categorical_cols)}")
            for col in categorical_cols:
                df[col] = pd.factorize(df[col])[0]  # Factorize categories into numeric codes

    elif task == "regression":
        # One-hot encode categorical columns for regression models
        if len(categorical_cols) > 0:
            print(f"One-hot encoding categorical columns: {list(categorical_cols)}")
            # get_dummies works on `df` itself, so the encoded columns stay
            # aligned with its index. The non-categorical columns come first,
            # followed by the '<column>_<level>' indicator columns as floats.
            # Missing values get no indicator column of their own.
            df = pd.get_dummies(df, columns=list(categorical_cols), drop_first=True, dtype=float)
    else:
        raise ValueError("Invalid task. Please choose 'classification' or 'regression'.")

    return df

#### 5.2 Standardizing the data

In [159]:
def discretise_numeric(train, test, numeric_cols, no_bins=10):
    """
    Bin the given numeric columns into quantile-based buckets.

    Train and test are stacked so that both share the same bin edges, binned,
    and split apart again. Columns not listed in ``numeric_cols`` keep their
    values. Because the two frames are concatenated, a column present in only
    one of them appears in both outputs (filled with NaN where it was absent).

    Parameters:
        train (DataFrame): Training data.
        test (DataFrame): Test data.
        numeric_cols (list): Columns to replace by their bin number.
        no_bins (int): Requested number of quantile bins; duplicate bin
            edges are dropped, so fewer bins may result.

    Returns:
        tuple: (train, test) with a fresh 0..n-1 index. The previous index of
        each input is kept as a regular column.
    """
    # reset_index() without drop turns the old index into a column
    train_part = train.reset_index()
    test_part = test.reset_index()

    n_train = len(train_part)

    # Shift the test labels past the train labels so the stacked frame has
    # no duplicated index values
    test_part.index = test_part.index + n_train

    combined = pd.concat([train_part, test_part], axis=0)

    for name in numeric_cols:
        if name not in combined.columns:
            print(f"Warning: Column '{name}' not found in the dataset.")
            continue
        # labels=False returns the integer bin number rather than an interval
        combined[name] = pd.qcut(combined[name], no_bins, labels=False, duplicates='drop')

    # The first n_train rows are the training rows, the rest the test rows
    binned_train = combined.iloc[:n_train].reset_index(drop=True)
    binned_test = combined.iloc[n_train:].reset_index(drop=True)

    return binned_train, binned_test

In [121]:
from sklearn.preprocessing import StandardScaler

def standarize_values(train, test, numeric_cols):
    """
    Standardize the given numeric columns of train and test.

    The scaler is fitted on the training data only and then applied to both
    frames; every other column is left unchanged.

    Parameters:
        train (DataFrame): Training data.
        test (DataFrame): Test data.
        numeric_cols (list): Columns to standardize.

    Returns:
        tuple: (train, test) copies with a fresh 0..n-1 index and the listed
        columns replaced by their scaled values.
    """
    # reset_index returns new frames, so the caller's data is not touched
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    # Fit on train only; the same fitted scaler is reused for test
    scaler = StandardScaler()
    scaler.fit(train[numeric_cols])

    for frame in (train, test):
        frame[numeric_cols] = pd.DataFrame(
            scaler.transform(frame[numeric_cols]),
            columns=numeric_cols,
            index=frame.index,
        )

    return train, test

In [122]:
def equal_dataframes (train, test):
    """
    Restrict train to the columns of test, in test's column order.

    Parameters:
        train (DataFrame): Training data; must contain every test column.
        test (DataFrame): Test data; returned unchanged.

    Returns:
        tuple: (train limited to the test columns, test).
    """
    # No column is added here: the selection is exactly the test columns.
    # 'PassengerId' is left as a regular column, not set as the index.
    wanted_columns = [column for column in test.columns]

    return train[wanted_columns], test

#### 5.3 Final features & cleaning selection

In [160]:
def clean_df (df):
    """
    Run the whole cleaning and feature-engineering pipeline on one data set.

    Steps, in order: fill ages, fill embarked / fare, fill cabin and derive
    the deck, add the engineered features, factorize the categorical columns,
    and finally remove the raw columns that are no longer needed.

    Parameters:
        df (DataFrame): Raw data as loaded from the CSV file.

    Returns:
        DataFrame: The cleaned, fully numeric data.
    """
    pipeline = (
        replace_missing_age,
        replace_missing_embarked_fare,
        replace_cabin,
        add_features_columns,
    )
    for step in pipeline:
        df = step(df)

    df = add_factorize_or_encode(df, "classification")

    # Raw text / count columns that the models do not use
    return df.drop(columns=['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'])

### 6. Data set analysis & premodeling work

In [161]:
# Run the full cleaning / feature-engineering pipeline on both data sets.
train = clean_df(train_data)
test = clean_df(test_data)

# Only discretise the continuous columns (Age, Fare), not the binary ones.
tr, te = discretise_numeric(train, test, numeric_cols=['Age', 'Fare'])

# Only standardize the derived continuous columns; the binned and binary columns are left as they are.
tr, te = standarize_values(tr, te, numeric_cols=['Fare_Per_Person', 'Age*Class', 'Age*Fare'])
# Keep in train exactly the columns that test has, in the same order.
train_final, test_final = equal_dataframes(tr, te)


Factorizing categorical columns: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title', 'Deck']
Factorizing categorical columns: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title', 'Deck']


In [None]:
# correlation_analysis_type(tr, task="classification", target_col="Survived")

In [None]:
# correlation_analysis_type(tr, task="classification", target_col="Survived")

### 7. Modeling

#### 7.1 Short modeling option

In [104]:
# Quick baseline: fit a random forest on the full training set, predict the
# test set and write the submission file.

from sklearn.ensemble import RandomForestClassifier

y = train_final['Survived']

# 'Survived' is the target and 'PassengerId' only an identifier, so neither is a feature.
X = train_final.drop(['Survived', 'PassengerId'], axis=1)
X_test = test_final.drop(['Survived', 'PassengerId'], axis=1)

model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1)
model.fit(X,y)

predictions = model.predict(X_test).astype('int')

# The id column of the submission must be spelled 'PassengerId' (as in the
# input data and the Kaggle submission format), not 'PassengerID'.
output = pd.DataFrame({'PassengerId': test_final.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print('Done')

Done


#### 7.2 Modeling function

In [175]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier, MLPRegressor

def evaluate_models(X, y, task="classification"):
    """
    Evaluates multiple machine learning models for a given task, and calculates relevant metrics.

    Every model is fitted on the same 80% of the data and scored on the
    remaining 20% (fixed random_state, so the split is reproducible).

    Parameters:
        X (DataFrame): Features for training.
        y (Series): Target variable for training.
        task (str): "classification" or "regression".

    Returns:
        DataFrame: One row per model. Columns are 'Model' plus Accuracy,
        F1 Score and ROC-AUC for classification, or MSE, MAE, R² and RMSE
        for regression. ROC-AUC is only filled for models that expose
        ``predict_proba``.

    Raises:
        ValueError: If ``task`` is neither "classification" nor "regression".
    """
    # List of models to evaluate based on the task
    if task == "classification":
        models = {
            'Random Forest Classifier': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1),
            'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=1),
            'Logistic Regression': LogisticRegression(max_iter=2000, random_state=1),
            'Support Vector Machine (SVC)': SVC(kernel='rbf', probability=True, random_state=1),
            'K-Nearest Neighbors (KNN) Classifier': KNeighborsClassifier(n_neighbors=5),
            'Naive Bayes': GaussianNB(),
            'Neural Network Classifier': MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=1)
        }
    elif task == "regression":
        # A regression task needs the regressor; the classifier variant would
        # treat every distinct target value as a class. Imported locally so
        # the block does not depend on an extra file-level import.
        from sklearn.ensemble import GradientBoostingRegressor

        models = {
            'Random Forest Regressor': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=1),
            'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=1),
            'Linear Regression': LinearRegression(),
            'Ridge Regression': Ridge(random_state=1),
            'Support Vector Machine (SVR)': SVR(kernel='rbf'),
            'K-Nearest Neighbors (KNN) Regressor': KNeighborsRegressor(n_neighbors=5),
            'Neural Network Regressor': MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=1)
        }
    else:
        raise ValueError("Invalid task. Please choose 'classification' or 'regression'.")

    # The split uses a fixed seed and is therefore the same for every model,
    # so it is computed once instead of once per loop iteration.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

    # Dictionary to store results: {model_name: metrics}
    results = {}

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_val_pred = model.predict(X_val)

        metrics = {}

        if task == "classification":
            metrics['Accuracy'] = accuracy_score(y_val, y_val_pred)
            metrics['F1 Score'] = f1_score(y_val, y_val_pred, average='weighted')
            if hasattr(model, "predict_proba"):  # Check if model supports predict_proba
                # Column 1 holds the probability of the second class
                y_val_proba = model.predict_proba(X_val)[:, 1]
                metrics['ROC-AUC'] = roc_auc_score(y_val, y_val_proba)
        else:  # task == "regression"
            metrics['MSE'] = mean_squared_error(y_val, y_val_pred)
            metrics['MAE'] = mean_absolute_error(y_val, y_val_pred)
            metrics['R²'] = r2_score(y_val, y_val_pred)
            metrics['RMSE'] = np.sqrt(metrics['MSE'])

        results[model_name] = metrics

    # Transpose to have models as rows and metrics as columns
    results_df = pd.DataFrame(results).T

    # Reset the index to add the model names as a column
    results_df = results_df.reset_index().rename(columns={'index': 'Model'})

    return results_df


In [165]:
def get_param_grid(model_name):
    """
    Look up the hyperparameter search grid of a classifier.

    Parameters:
        model_name (str): Display name of the model, e.g.
            'Random Forest Classifier'.

    Returns:
        dict: Hyperparameter name -> list of candidate values.

    Raises:
        ValueError: If no grid is defined for ``model_name``.
    """
    grids_by_model = {
        'Random Forest Classifier': {
            'n_estimators': [100, 200, 300],
            'max_depth': [5, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        },
        'Gradient Boosting Classifier': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.8, 1.0],
        },
        'Logistic Regression': {
            'penalty': ['l1', 'l2', 'elasticnet', None],
            'C': [0.01, 0.1, 1, 10],
            'solver': ['lbfgs', 'liblinear', 'saga'],
        },
        'Support Vector Machine (SVC)': {
            'C': [0.1, 1, 10, 100],
            'kernel': ['linear', 'rbf', 'poly'],
            'gamma': ['scale', 'auto'],
        },
        'K-Nearest Neighbors (KNN) Classifier': {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan', 'minkowski'],
        },
        # Naive Bayes has very little to tune
        'Naive Bayes': {
            'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6],
        },
        'Neural Network Classifier': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'activation': ['relu', 'tanh', 'logistic'],
            'solver': ['sgd', 'adam'],
            'alpha': [0.0001, 0.001, 0.01],
            'learning_rate': ['constant', 'adaptive'],
        },
    }

    # Guard clause: unknown names are rejected before anything is returned
    if model_name not in grids_by_model:
        raise ValueError(f"Parameter grid for '{model_name}' is not defined.")

    return grids_by_model[model_name]

In [166]:
from sklearn.model_selection import GridSearchCV

def search_best_params(X, y, model, param_grid):
    """
    Run an exhaustive grid search with 5-fold cross-validation.

    Parameters:
        X (DataFrame): Features for training.
        y (Series): Target variable for training.
        model: Unfitted estimator to tune.
        param_grid (dict): Hyperparameter name -> list of values to try.

    Returns:
        dict: 'best_params' (the winning combination) and 'best_score'
        (its mean cross-validated accuracy).
    """
    search_settings = {
        'scoring': 'accuracy',  # change the metric here if the task requires it
        'cv': 5,                # 5-fold cross-validation
        'verbose': 1,           # report progress
        'n_jobs': -1,           # use all available processors
    }
    searcher = GridSearchCV(estimator=model, param_grid=param_grid, **search_settings)
    searcher.fit(X, y)

    return dict(
        best_params=searcher.best_params_,
        best_score=searcher.best_score_,
    )

#### 7.3 Data set modeling work

In [180]:
# Target vector.
y = train_final['Survived']

# Feature matrices: the target and the passenger identifier are not features.
X = train_final.drop(['Survived', 'PassengerId'], axis=1) # NOTE: the 'index' column is still among the features; check whether to keep it, sometimes it predicts better with it
X_test = test_final.drop(['Survived', 'PassengerId'], axis=1)

# Score every candidate classifier on a hold-out split; the bare `results`
# expression renders the comparison table as the cell output.
results = evaluate_models(X, y, task="classification")
results

Unnamed: 0,Model,Accuracy,F1 Score,ROC-AUC
0,Random Forest Classifier,0.798883,0.792611,0.848669
1,Gradient Boosting Classifier,0.793296,0.787348,0.85526
2,Logistic Regression,0.77095,0.766373,0.82153
3,Support Vector Machine (SVC),0.592179,0.440498,0.736366
4,K-Nearest Neighbors (KNN) Classifier,0.564246,0.524568,0.625678
5,Naive Bayes,0.715084,0.711527,0.808478
6,Neural Network Classifier,0.75419,0.745276,0.827475


In [45]:
# Why Gradient Boosting Classifier?
# NOTE: the figures quoted below come from an earlier run and no longer match the latest results table,
# where Random Forest is marginally ahead on Accuracy and F1 Score while Gradient Boosting leads on ROC-AUC.

# Highest Accuracy:
# It achieved an Accuracy of 0.815642, the highest among all the models. Accuracy measures the proportion of correctly classified samples.

# Strong F1 Score:
# Its F1 Score is 0.811958, also the highest. F1 Score is particularly important for imbalanced datasets as it balances precision
# (minimizing false positives) and recall (minimizing false negatives).

# Best ROC-AUC:
# It also has the highest ROC-AUC (0.848798), which evaluates the model's ability to distinguish between classes.
# Higher values indicate better overall performance for classification problems, especially when probabilities are important.

In [177]:
# Tune the chosen model: fetch its search grid by display name and start from
# a fresh estimator with the same random_state used during model comparison.
chosen_model_name = 'Gradient Boosting Classifier'
model = GradientBoostingClassifier(random_state=1)
param_grid = get_param_grid(chosen_model_name)

# Call the function to find the best parameters
best_params_result = search_best_params(X, y, model, param_grid)
# Kept in its own variable so later cells can rebuild the model from it.
best_params = best_params_result['best_params']

# Display the results
print("Best Parameters:", best_params_result['best_params'])
print("Best Cross-Validation Score:", best_params_result['best_score'])


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
Best Cross-Validation Score: 0.8417487916640513


### 8. Prediction and Submission

#### 8.1 Get Predictions & Submissions

In [178]:
# Initialize the model with the best parameters found by the grid search.
# best_params maps hyperparameter names to values, so it can be unpacked
# directly instead of repeating every keyword by hand.
best_model = GradientBoostingClassifier(
    **best_params,
    random_state=1  # Keep the random_state consistent
)

# Fit the model on the full training dataset
best_model.fit(X, y)

# Predict on the test data
predictions = best_model.predict(X_test).astype('int')

# Export to .csv file. The id column must be spelled 'PassengerId' (as in the
# input data and the Kaggle submission format), not 'PassengerID'.
output = pd.DataFrame({'PassengerId': test_final.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print('Done')

Done
