In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_val_predict
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error
import xgboost as xgb
import numpy as np
import lightgbm as lgb


In [2]:


# load trainging and test(for validation) csv's
train_file_path = '../data/processed/train.csv'
test_file_path = '../data/processed/test.csv'
train_data = pd.read_csv(train_file_path)
validation_data = pd.read_csv(test_file_path)

# split data into features and target values (x,y)
X = train_data['Survived']
y = train_data.drop(columns=['Survived'])



In [3]:
# split training data into train and test splits
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.60, test_size=0.30, random_state=42)

Family feature engineering

In [4]:
# feature engineering for parch and sibsp columns

# combine parch and sibsp into 'Family_size'
train_data['Family_Size'] = train_data['Parch'] + train_data['SibSp']
validation_data['Family_Size'] = validation_data['Parch'] + validation_data['SibSp']

#categorize 'Family_size' into 'family_category'
def categorize_family_size(size):
    if size ==0:
        return 'no family'
    elif size <= 3:
        return 'small family'
    else:
        return 'large family'
    
train_data['Family_category'] = train_data['Family_Size'].apply(categorize_family_size)
validation_data['Family_category'] = validation_data['Family_Size'].apply(categorize_family_size)

# encode family_category with numbers
family_encoder = LabelEncoder()
train_data['Family_category'] = family_encoder.fit_transform(train_data['Family_category'])
validation_data['Family_category'] = family_encoder.transform(validation_data['Family_category'])



Age feature engineering


In [5]:
# define bins for age categories and corresponding labels
bins = [-float('inf'), 2, 4, 12, 18, 30, 45, 60, float('inf')]
labels = ['baby', 'infant', 'child', 'teenager', 'youngadult', 'adult', 'oldadult','elder']

#replace 'Age' with 'Age_band'
train_data['Age_band'] = pd.cut(train_data['Age'], bins=bins, labels=labels)
validation_data['Age_band'] = pd.cut(validation_data['Age'], bins=bins, labels=labels)

# encode Age_band with numbers
age_band_encoder = LabelEncoder()
train_data['Age_band'] = age_band_encoder.fit_transform(train_data['Age_band'])
validation_data['Age_band'] = age_band_encoder.transform(validation_data['Age_band'])


In [6]:
train_data.head

<bound method NDFrame.head of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                     

preprocessing pipeline

In [7]:
preprocessor = ColumnTransformer(transformers=[
    # Pipeline for numerical features
    ('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Impute missing values for numerical features
        ('scaler', StandardScaler())
    ]), ['Fare']),
    
    # Pipeline for categorical features
    ('cat', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values for categorical features
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
    ]), ['Pclass', 'Age_band', 'Family_category', 'Sex', 'Embarked'])
], remainder='passthrough')

In [11]:
#print(train_data.columns)
na_count = train_data.isna().sum()
print(na_count)

PassengerId          0
Survived             0
Pclass               0
Name                 0
Sex                  0
Age                177
SibSp                0
Parch                0
Ticket               0
Fare                 0
Cabin              687
Embarked             2
Family_Size          0
Family_category      0
Age_band             0
dtype: int64


Imputations

In [12]:
# Handle missing values by imputing 'Age_Band' with KNN for both training and testing
features_for_imputation = ['Pclass', 'Family_category', 'Fare', 'Age_band']
train_impute = train_data[features_for_imputation]
validation_impute = validation_data[features_for_imputation]
knn_imputer = KNNImputer(n_neighbors=5)
train_imputed = knn_imputer.fit_transform(train_impute)
validation_imputed = knn_imputer.transform(validation_impute)

#convert imputed arrays back to df and retain original feature name
train_imputed_df = pd.DataFrame(train_imputed,columns=features_for_imputation)
validation_imputed_df = pd.DataFrame(validation_imputed,columns=features_for_imputation)

#replace age_band in original train and test df with imputed values
train_data['Age_band'] = train_imputed_df['Age_band']
validation_data['Age_band'] = validation_imputed_df['Age_band']


In [13]:
na_count = train_data.isna().sum()
print(na_count)

PassengerId          0
Survived             0
Pclass               0
Name                 0
Sex                  0
Age                177
SibSp                0
Parch                0
Ticket               0
Fare                 0
Cabin              687
Embarked             2
Family_Size          0
Family_category      0
Age_band             0
dtype: int64


define features and target 

In [8]:
X = train_data[['Pclass', 'Sex', 'Age_band', 'Family_category', 'Fare', 'Embarked']]
y = train_data['Survived']

# split training data into train and test splits
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.80, random_state=42)

ML models

In [9]:
# Models to train
models = {
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbours": KNeighborsClassifier(),
    "Support Vector Machine": SVC(probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "XGBoost": xgb.XGBClassifier(n_estimators=100, random_state=0),
    #"Bagging Decision Tree": BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42),
    #"Boosted Decision Tree": AdaBoostClassifier(n_estimators=50, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Voting Classifier": VotingClassifier(estimators=[
        ('lr', LogisticRegression(max_iter=1000)),
        ('knn', KNeighborsClassifier()),
        ('svc', SVC(probability=True)),
        #("XGBoost", xgb.XGBClassifier(n_estimators=100, random_state=0)),
        #("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42))
        #('xgb', xgb.XGBClassifier(n_estimators=100, random_state=0))
    ], voting='soft'),
    "Neural Network": MLPClassifier(max_iter=1000),
    "LightGBM": lgb.LGBMClassifier(
        n_estimators=150,  # Number of boosting rounds (trees)
        learning_rate=0.05,  # Step size at each iteration
        max_depth=-1,  # Maximum depth of a tree (-1 means no limit)
        random_state=0,  # For reproducibility
        boosting_type='gbdt',  # Gradient Boosting Decision Tree
        subsample=0.8,  # Fraction of data used to build each tree
        colsample_bytree=0.8  # Fraction of features used in building each tree
    )
}

Training


In [None]:
results = {}
training_accuracies = {}
training_metrics = []

for name, model in models.items():
    #create pipeline with preprocessor and classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    
    #train the model
    pipeline.fit(X_train, y_train)
    results[name] = pipeline
    
    # Evaluate accuracy on training set
    y_train_pred = pipeline.predict(X_train)
    accuracy = accuracy_score(y_train, y_train_pred)
    precision = precision_score(y_train, y_train_pred, average='binary')
    recall = recall_score(y_train, y_train_pred, average='binary')
    f1 = f1_score(y_train, y_train_pred, average='binary')
    #training_accuracies[name] = accuracy
    #print(f"{name} Training Accuracy: {accuracy:.4f}")
    
     # Evaluate AUC of ROC
    try:
        y_train_proba = pipeline.predict_proba(X_train)[:,1] # get prob of positive class
        roc_auc = roc_auc_score(y_train, y_train_proba)
    except AttributeError:
        # Some models dont have predict_proba (SVM)
        y_train_decision = pipeline.decision_function(X_train)
        roc_auc = roc_auc_score(y_train, y_train_decision)
    
    #store metrics
    metrics = f"{name}:\n"
    metrics += f"Accuracy: {accuracy:.4f}\n"
    metrics += f"Precision: {precision:.4f}\n"
    metrics += f"Recall: {recall:.4f}\n"
    metrics += f"F1 Score: {f1:.4f}\n"
    metrics += f"AUC of ROC: {roc_auc:.4f}\n"
    print(metrics)
        
    # append metrics to the list
    training_metrics.append(metrics)

metrics_file_path = '../reports/performance-metrics/model_performance_metrics.txt'

with open(metrics_file_path, 'w') as f:
    for metric in training_metrics:
        f.write(metric + "\n")

print(f"Performance metrics saved to: {metrics_file_path}")    



this one

In [11]:
# Initialize a variable to store results, average metrics, and best model tracking
results = {}
training_metrics = []
best_model_name = None
best_model_auc = 0

# Use 5-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Loop through each model in your models dictionary
for name, model in models.items():
    # Create a pipeline with preprocessor and classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

    # Perform cross-validation predictions
    y_train_pred = cross_val_predict(pipeline, X_train, y_train, cv=kfold, method='predict')
    
    # Calculate metrics using cross-validation predictions
    accuracy = accuracy_score(y_train, y_train_pred)
    precision = precision_score(y_train, y_train_pred, average='binary')
    recall = recall_score(y_train, y_train_pred, average='binary')
    f1 = f1_score(y_train, y_train_pred, average='binary')
    """To decide between MSE and MAE, it’s crucial to assess the nature of your data. 
    If your dataset includes outliers — data points that don’t conform to the general pattern — it’s advisable to opt for MAE. 
    By treating all errors equally, MAE provides better resilience against the distortions introduced by outliers. 
    Conversely, if your data is relatively clean and without significant outliers, MSE’s faster convergence might offer an advantage."""
    mse = mean_squared_error(y_train, y_train_pred)
    mae = mean_absolute_error(y_train, y_train_pred)

    # Calculate AUC of ROC
    try:
        y_train_proba = cross_val_predict(pipeline, X_train, y_train, cv=kfold, method='predict_proba')[:, 1]
        roc_auc = roc_auc_score(y_train, y_train_proba)
    except AttributeError:
        # Some models don't have predict_proba (e.g., SVM without probability=True)
        y_train_decision = cross_val_predict(pipeline, X_train, y_train, cv=kfold, method='decision_function')
        roc_auc = roc_auc_score(y_train, y_train_decision)

    # Store the average metrics
    metrics = f"{name} (Averaged over 5-Fold Cross Validation):\n"
    metrics += f"Accuracy: {accuracy:.4f}\n"
    metrics += f"Precision: {precision:.4f}\n"
    metrics += f"Recall: {recall:.4f}\n"
    metrics += f"F1 Score: {f1:.4f}\n"
    metrics += f"AUC of ROC: {roc_auc:.4f}\n"
    metrics += f"mean squared error: {mse:.4f}\n"
   # metrics += f"mean absolute error: {mae:.4f}\n"
    print(metrics)

    # Append metrics to list
    training_metrics.append(metrics)

    # Update best model based on AUC of ROC
    if roc_auc > best_model_auc:
        best_model_auc = roc_auc
        best_model_name = name
        # Store the best performing pipeline for validation
        results[name] = pipeline

# Print the model with the highest AUC of ROC
print(f"\nModel with the highest AUC of ROC: {best_model_name} ({best_model_auc:.4f})")

metrics_file_path = '../reports/performance-metrics/model_performance_metrics.txt'

with open(metrics_file_path, 'w') as f:
    for metric in training_metrics:
        f.write(metric + "\n")

print(f"Performance metrics saved to: {metrics_file_path}")    



Naive Bayes (Averaged over 5-Fold Cross Validation):
Accuracy: 0.7865
Precision: 0.7248
Recall: 0.6978
F1 Score: 0.7110
AUC of ROC: 0.8145
mean squared error: 0.2135

Logistic Regression (Averaged over 5-Fold Cross Validation):
Accuracy: 0.8132
Precision: 0.7755
Recall: 0.7090
F1 Score: 0.7407
AUC of ROC: 0.8504
mean squared error: 0.1868

K-Nearest Neighbours (Averaged over 5-Fold Cross Validation):
Accuracy: 0.8020
Precision: 0.7773
Recall: 0.6642
F1 Score: 0.7163
AUC of ROC: 0.8323
mean squared error: 0.1980

Support Vector Machine (Averaged over 5-Fold Cross Validation):
Accuracy: 0.8230
Precision: 0.8257
Recall: 0.6716
F1 Score: 0.7407
AUC of ROC: 0.8304
mean squared error: 0.1770

Decision Tree (Averaged over 5-Fold Cross Validation):
Accuracy: 0.8034
Precision: 0.7581
Recall: 0.7015
F1 Score: 0.7287
AUC of ROC: 0.7801
mean squared error: 0.1966

XGBoost (Averaged over 5-Fold Cross Validation):
Accuracy: 0.8188
Precision: 0.7860
Recall: 0.7127
F1 Score: 0.7476
AUC of ROC: 0.8494


In [None]:
results = {}
training_metrics = []

# Initialize a variable to keep track of the best model based on AUC of ROC
best_model_name = None
best_model_auc = 0

# Loop through each model in your model dictionary
for name, model in models.items():
    # Initialize accumulators for each metric to calculate the average
    total_accuracy = 0
    total_precision = 0
    total_recall = 0
    total_f1 = 0
    total_roc_auc = 0
    
    # Run each model 10 times
    for i in range(100):
        # Shuffle the training data by re-splitting using train_test_split
        X_train_split, _, y_train_split, _ = train_test_split(X_train, y_train, train_size=0.7, random_state=i)
        
        # Create a pipeline with the preprocessor and classifier
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
        
        # Train the model
        pipeline.fit(X_train_split, y_train_split)
        results[name] = pipeline
        
        # Predict on the training set to evaluate performance
        y_train_pred = pipeline.predict(X_train_split)
        
        # Calculate metrics
        accuracy = accuracy_score(y_train_split, y_train_pred)
        precision = precision_score(y_train_split, y_train_pred, average='binary')
        recall = recall_score(y_train_split, y_train_pred, average='binary')
        f1 = f1_score(y_train_split, y_train_pred, average='binary')
        
        # Calculate AUC of ROC
        try:
            y_train_proba = pipeline.predict_proba(X_train_split)[:, 1]  # Get probability of positive class
            roc_auc = roc_auc_score(y_train_split, y_train_proba)
        except AttributeError:
            # Some models may not have predict_proba (e.g., SVM without probability=True)
            y_train_decision = pipeline.decision_function(X_train_split)
            roc_auc = roc_auc_score(y_train_split, y_train_decision)
        
        # Accumulate metrics
        total_accuracy += accuracy
        total_precision += precision
        total_recall += recall
        total_f1 += f1
        total_roc_auc += roc_auc
    
    # Calculate average metrics over 10 runs
    avg_accuracy = total_accuracy / 10
    avg_precision = total_precision / 10
    avg_recall = total_recall / 10
    avg_f1 = total_f1 / 10
    avg_roc_auc = total_roc_auc / 10

    # Store the average metrics
    metrics = f"{name} (Averaged over 10 runs):\n"
    metrics += f"Accuracy: {avg_accuracy:.4f}\n"
    metrics += f"Precision: {avg_precision:.4f}\n"
    metrics += f"Recall: {avg_recall:.4f}\n"
    metrics += f"F1 Score: {avg_f1:.4f}\n"
    metrics += f"AUC of ROC: {avg_roc_auc:.4f}\n"
    print(metrics)
    
    # Append metrics to the list for saving to the file
    training_metrics.append(metrics)

    # Update the best model based on AUC of ROC
    if avg_roc_auc > best_model_auc:
        best_model_auc = avg_roc_auc
        best_model_name = name
    
    
        
# Print the model with the highest AUC of ROC
print(f"\nModel with the highest AUC of ROC: {best_model_name} ({best_model_auc:.4f})")

metrics_file_path = '../reports/performance-metrics/model_performance_metrics.txt'

with open(metrics_file_path, 'w') as f:
    for metric in training_metrics:
        f.write(metric + "\n")

print(f"Performance metrics saved to: {metrics_file_path}")    


test on validation data

In [None]:
chosen_model = results[best_model_name]
chosen_model.fit(X_train, y_train)

# Extract features from validation data
X_validation = validation_data[['Pclass', 'Sex', 'Age_band', 'Family_category', 'Fare', 'Embarked']]

# Transform the validation features using the pre-trained preprocessor
#X_validation_transformed = preprocessor.transform(X_validation)

# Make predictions using the best performing model
y_validation_pred = chosen_model.predict(X_validation)

# Prepare the submission DataFrame
submission = pd.DataFrame({
    'PassengerId': validation_data['PassengerId'],
    'Survived': y_validation_pred
})


# Save the submission to a CSV file
submission_file_path = '../reports/submission.csv'
submission.to_csv(submission_file_path, index=False)

print(f"Submission file saved to: {submission_file_path}")

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000349 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 149
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
Submission file saved to: ../reports/lightGBM_classifier.csv
