In [94]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb


In [95]:
# load trainging and test(for validation) csv's
train_file_path = '../data/processed/train.csv'
test_file_path = '../data/processed/test.csv'
train_data = pd.read_csv(train_file_path)
validation_data = pd.read_csv(test_file_path)

# split data into features and target values (x,y)
#X = train_data['Survived']
#y = train_data.drop(columns=['Survived'])



In [96]:
# split training data into train and test splits
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.60, test_size=0.30, random_state=42)

Family feature engineering

In [97]:
# feature engineering for parch and sibsp columns

# combine parch and sibsp into 'Family_size'
train_data['Family_Size'] = train_data['Parch'] + train_data['SibSp']
validation_data['Family_Size'] = validation_data['Parch'] + validation_data['SibSp']

#categorize 'Family_size' into 'family_category'
def categorize_family_size(size):
    if size ==0:
        return 'no family'
    elif size <= 3:
        return 'small family'
    else:
        return 'large family'
    
train_data['Family_category'] = train_data['Family_Size'].apply(categorize_family_size)
validation_data['Family_category'] = validation_data['Family_Size'].apply(categorize_family_size)

# encode family_category with numbers
family_encoder = LabelEncoder()
train_data['Family_category'] = family_encoder.fit_transform(train_data['Family_category'])
validation_data['Family_category'] = family_encoder.transform(validation_data['Family_category'])



Age feature engineering


In [98]:
# define bins for age categories and corresponding labels
bins = [-float('inf'), 2, 4, 12, 18, 30, 45, 60, float('inf')]
labels = ['baby', 'infant', 'child', 'teenager', 'youngadult', 'adult', 'oldadult','elder']

#replace 'Age' with 'Age_band'
train_data['Age_band'] = pd.cut(train_data['Age'], bins=bins, labels=labels)
validation_data['Age_band'] = pd.cut(validation_data['Age'], bins=bins, labels=labels)

# encode Age_band with numbers
age_band_encoder = LabelEncoder()
train_data['Age_band'] = age_band_encoder.fit_transform(train_data['Age_band'])
validation_data['Age_band'] = age_band_encoder.transform(validation_data['Age_band'])


In [99]:
train_data.head

<bound method NDFrame.head of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                     

preprocessing pipeline

In [100]:
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), ['Fare']),
    ('cat', OneHotEncoder(sparse_output=False), ['Pclass', 'Age_band', 'Family_category', 'Sex', 'Embarked'])
], remainder='passthrough')

In [101]:
print(train_data.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Family_Size',
       'Family_category', 'Age_band'],
      dtype='object')


Imputations

In [102]:
# Handle missing values by imputing 'Age_Band' with KNN for both training and testing
features_for_imputation = ['Pclass', 'Family_category', 'Fare', 'Age_band']
train_impute = train_data[features_for_imputation]
validation_impute = validation_data[features_for_imputation]
knn_imputer = KNNImputer(n_neighbors=5)
train_imputed = knn_imputer.fit_transform(train_impute)
validation_imputed = knn_imputer.transform(validation_impute)

#convert imputed arrays back to df and retain original feature name
train_imputed_df = pd.DataFrame(train_imputed,columns=features_for_imputation)
validation_imputed_df = pd.DataFrame(validation_imputed,columns=features_for_imputation)

#replace age_band in original train and test df with imputed values
train_data['Age_band'] = train_imputed_df['Age_band']
validation_data['Age_band'] = validation_imputed_df['Age_band']


define features and target 

In [103]:
X = train_data[['Pclass', 'Sex', 'Age_band', 'Family_category', 'Fare', 'Embarked']]
y = train_data['Survived']

# split training data into train and test splits
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.70, random_state=42)

ML models

In [104]:
# Models to train
models = {
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbours": KNeighborsClassifier(),
    "Support Vector Machine": SVC(probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "XGBoost": xgb.XGBClassifier(n_estimators=100, random_state=0),
    #"Bagging Decision Tree": BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42),
    #"Boosted Decision Tree": AdaBoostClassifier(n_estimators=50, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Voting Classifier": VotingClassifier(estimators=[
        ('lr', LogisticRegression(max_iter=1000)),
        ('knn', KNeighborsClassifier()),
        ('svc', SVC(probability=True)),
        #('xgb', xgb.XGBClassifier(n_estimators=100, random_state=0))
    ], voting='soft'),
    "Neural Network": MLPClassifier(max_iter=1000)
}

Training


In [105]:
results = {}
training_accuracies = {}
training_metrics = []

for name, model in models.items():
    #create pipeline with preprocessor and classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    
    #train the model
    pipeline.fit(X_train, y_train)
    results[name] = pipeline
    
    # Evaluate accuracy on training set
    y_train_pred = pipeline.predict(X_train)
    accuracy = accuracy_score(y_train, y_train_pred)
    precision = precision_score(y_train, y_train_pred, average='binary')
    recall = recall_score(y_train, y_train_pred, average='binary')
    f1 = f1_score(y_train, y_train_pred, average='binary')
    #training_accuracies[name] = accuracy
    #print(f"{name} Training Accuracy: {accuracy:.4f}")
    
     # Evaluate AUC of ROC
    try:
        y_train_proba = pipeline.predict_proba(X_train)[:,1] # get prob of positive class
        roc_auc = roc_auc_score(y_train, y_train_proba)
    except AttributeError:
        # Some models dont have predict_proba (SVM)
        y_train_decision = pipeline.decision_function(X_train)
        roc_auc = roc_auc_score(y_train, y_train_decision)
    
    #store metrics
    metrics = f"{name}:\n"
    metrics += f"Accuracy: {accuracy:.4f}\n"
    metrics += f"Precision: {precision:.4f}\n"
    metrics += f"Recall: {recall:.4f}\n"
    metrics += f"F1 Score: {f1:.4f}\n"
    metrics += f"AUC of ROC: {roc_auc:.4f}\n"
    print(metrics)
        
    # append metrics to the list
    training_metrics.append(metrics)

metrics_file_path = '../reports/performance-metrics/model_performance_metrics.txt'

with open(metrics_file_path, 'w') as f:
    for metric in training_metrics:
        f.write(metric + "\n")

print(f"Performance metrics saved to: {metrics_file_path}")    



Naive Bayes:
Accuracy: 0.7271
Precision: 0.8961
Recall: 0.2987
F1 Score: 0.4481
AUC of ROC: 0.8397

Logistic Regression:
Accuracy: 0.8202
Precision: 0.7902
Recall: 0.7013
F1 Score: 0.7431
AUC of ROC: 0.8662

K-Nearest Neighbours:
Accuracy: 0.8459
Precision: 0.8325
Recall: 0.7316
F1 Score: 0.7788
AUC of ROC: 0.9233

Support Vector Machine:
Accuracy: 0.8523
Precision: 0.8883
Recall: 0.6883
F1 Score: 0.7756
AUC of ROC: 0.8967

Decision Tree:
Accuracy: 0.9583
Precision: 0.9858
Recall: 0.9004
F1 Score: 0.9412
AUC of ROC: 0.9954

XGBoost:
Accuracy: 0.9502
Precision: 0.9673
Recall: 0.8961
F1 Score: 0.9303
AUC of ROC: 0.9888

Random Forest:
Accuracy: 0.9583
Precision: 0.9767
Recall: 0.9091
F1 Score: 0.9417
AUC of ROC: 0.9911

Voting Classifier:
Accuracy: 0.8539
Precision: 0.8723
Recall: 0.7100
F1 Score: 0.7828
AUC of ROC: 0.9229

Neural Network:
Accuracy: 0.8716
Precision: 0.9037
Recall: 0.7316
F1 Score: 0.8086
AUC of ROC: 0.9307

Performance metrics saved to: ../reports/performance-metrics/mo

In [109]:
results = {}
training_metrics = []

# Initialize a variable to keep track of the best model based on AUC of ROC
best_model_name = None
best_model_auc = 0

# Loop through each model in your model dictionary
for name, model in models.items():
    # Initialize accumulators for each metric to calculate the average
    total_accuracy = 0
    total_precision = 0
    total_recall = 0
    total_f1 = 0
    total_roc_auc = 0
    
    # Run each model 10 times
    for i in range(10):
        # Shuffle the training data by re-splitting using train_test_split
        X_train_split, _, y_train_split, _ = train_test_split(X_train, y_train, train_size=0.7, random_state=i)
        
        # Create a pipeline with the preprocessor and classifier
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
        
        # Train the model
        pipeline.fit(X_train_split, y_train_split)
        results[name] = pipeline
        
        # Predict on the training set to evaluate performance
        y_train_pred = pipeline.predict(X_train_split)
        
        # Calculate metrics
        accuracy = accuracy_score(y_train_split, y_train_pred)
        precision = precision_score(y_train_split, y_train_pred, average='binary')
        recall = recall_score(y_train_split, y_train_pred, average='binary')
        f1 = f1_score(y_train_split, y_train_pred, average='binary')
        
        # Calculate AUC of ROC
        try:
            y_train_proba = pipeline.predict_proba(X_train_split)[:, 1]  # Get probability of positive class
            roc_auc = roc_auc_score(y_train_split, y_train_proba)
        except AttributeError:
            # Some models may not have predict_proba (e.g., SVM without probability=True)
            y_train_decision = pipeline.decision_function(X_train_split)
            roc_auc = roc_auc_score(y_train_split, y_train_decision)
        
        # Accumulate metrics
        total_accuracy += accuracy
        total_precision += precision
        total_recall += recall
        total_f1 += f1
        total_roc_auc += roc_auc
    
    # Calculate average metrics over 10 runs
    avg_accuracy = total_accuracy / 10
    avg_precision = total_precision / 10
    avg_recall = total_recall / 10
    avg_f1 = total_f1 / 10
    avg_roc_auc = total_roc_auc / 10

    # Store the average metrics
    metrics = f"{name} (Averaged over 10 runs):\n"
    metrics += f"Accuracy: {avg_accuracy:.4f}\n"
    metrics += f"Precision: {avg_precision:.4f}\n"
    metrics += f"Recall: {avg_recall:.4f}\n"
    metrics += f"F1 Score: {avg_f1:.4f}\n"
    metrics += f"AUC of ROC: {avg_roc_auc:.4f}\n"
    print(metrics)
    
    # Append metrics to the list for saving to the file
    training_metrics.append(metrics)

    # Update the best model based on AUC of ROC
    if avg_roc_auc > best_model_auc:
        best_model_auc = avg_roc_auc
        best_model_name = name
    
    
        
# Print the model with the highest AUC of ROC
print(f"\nModel with the highest AUC of ROC: {best_model_name} ({best_model_auc:.4f})")

metrics_file_path = '../reports/performance-metrics/model_performance_metrics.txt'

with open(metrics_file_path, 'w') as f:
    for metric in training_metrics:
        f.write(metric + "\n")

print(f"Performance metrics saved to: {metrics_file_path}")    


Naive Bayes (Averaged over 10 runs):
Accuracy: 0.7326
Precision: 0.7825
Recall: 0.5359
F1 Score: 0.5750
AUC of ROC: 0.8385

Logistic Regression (Averaged over 10 runs):
Accuracy: 0.8193
Precision: 0.7865
Recall: 0.7067
F1 Score: 0.7443
AUC of ROC: 0.8647

K-Nearest Neighbours (Averaged over 10 runs):
Accuracy: 0.8450
Precision: 0.8408
Recall: 0.7207
F1 Score: 0.7759
AUC of ROC: 0.9196

Support Vector Machine (Averaged over 10 runs):
Accuracy: 0.8472
Precision: 0.8798
Recall: 0.6861
F1 Score: 0.7699
AUC of ROC: 0.8947

Decision Tree (Averaged over 10 runs):
Accuracy: 0.9651
Precision: 0.9863
Recall: 0.9195
F1 Score: 0.9516
AUC of ROC: 0.9966

XGBoost (Averaged over 10 runs):
Accuracy: 0.9583
Precision: 0.9678
Recall: 0.9188
F1 Score: 0.9425
AUC of ROC: 0.9915

Random Forest (Averaged over 10 runs):
Accuracy: 0.9651
Precision: 0.9702
Recall: 0.9355
F1 Score: 0.9524
AUC of ROC: 0.9926

Voting Classifier (Averaged over 10 runs):
Accuracy: 0.8486
Precision: 0.8650
Recall: 0.7047
F1 Score: 0

test on validation data

In [112]:
chosen_model = results[best_model_name]

# Extract features from validation data
X_validation = validation_data[['Pclass', 'Sex', 'Age_band', 'Family_category', 'Fare', 'Embarked']]

# Transform the validation features using the pre-trained preprocessor
X_validation_transformed = preprocessor.transform(X_validation)

# Make predictions using the best performing model
y_validation_pred = chosen_model.named_steps['classifier'].predict(X_validation_transformed)

# Prepare the submission DataFrame
submission = pd.DataFrame({
    'PassengerId': validation_data['PassengerId'],
    'Survived': y_validation_pred
})


# Save the submission to a CSV file
submission_file_path = '../reports/decision_tree_classifier.csv'
submission.to_csv(submission_file_path, index=False)

print(f"Submission file saved to: {submission_file_path}")

Submission file saved to: ../reports/decision_tree_classifier.csv
