In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

# TODO combine Parch and SibSp into a family / no-family feature; look into categorizing as no family, small family and large family
# TODO turn age into categorical data (age bands): baby, child, teenager, young adult, adult, old adult, elder
# TODO look into categorizing fare

# Load the processed training and test datasets from disk.
train_file_path = '../data/processed/train.csv'
test_file_path = '../data/processed/test.csv'
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

# X holds the feature columns and y the 'Survived' target. (The two were
# previously swapped, so X contained the target and y the features.)
X = train_data.drop(columns=['Survived'])
y = train_data['Survived']

# Add 'Family_Size' (Parch + SibSp) to both the training and the test frame
for frame in (train_data, test_data):
    frame['Family_Size'] = frame['Parch'] + frame['SibSp']

#categorize "family_size" into 'family_category'
def categorize_family_size(size):
    """Return the family category label for a family size.

    A size equal to 0 gives 'no family', any other size of at most 3 gives
    'small family', and everything else gives 'large family'.
    """
    if size == 0:
        return 'no family'
    return 'small family' if size <= 3 else 'large family'

# Derive the textual family category from 'Family_Size' for both datasets.
train_data['Family_Category'] = train_data['Family_Size'].apply(categorize_family_size)
test_data['Family_Category'] = test_data['Family_Size'].apply(categorize_family_size)

# Step 3: Convert 'Family_Category' to numerical codes using an explicit ordinal
# mapping (0: no family, 1: small family, 2: large family). An explicit dict is
# used because LabelEncoder assigns codes in alphabetical order, which would not
# match this ordering; the codes are later used as a numeric feature for KNN
# imputation, where the order matters.
family_category_codes = {'no family': 0, 'small family': 1, 'large family': 2}
train_data['Family_Category'] = train_data['Family_Category'].map(family_category_codes)
test_data['Family_Category'] = test_data['Family_Category'].map(family_category_codes)

# Drop the original 'Parch' and 'SibSp' columns
#train_data.drop(columns=['Parch', 'SibSp'], inplace=True)
#test_data.drop(columns=['Parch', 'SibSp'], inplace=True)

# Impute missing 'Fare' values in the test set with the median fare learned from
# the training set, so no statistic is estimated from the test data itself.
imputer = SimpleImputer(strategy='median')
imputer.fit(train_data[['Fare']])
# transform() returns an (n, 1) array; flatten it before assigning to the column
test_data['Fare'] = imputer.transform(test_data[['Fare']]).ravel()

'''
# Define bins for Fare categories
#fare_bins = [-float('inf'), 7.91, 14.45, 31.0, float('inf')]
fare_labels = ['Low', 'Medium', 'High', 'Very High']

# Apply categorization to Fare
train_data['Fare_Band'] = pd.cut(train_data['Fare'], bins=fare_bins, labels=fare_labels)
test_data['Fare_Band'] = pd.cut(test_data['Fare'], bins=fare_bins, labels=fare_labels)

# Encode 'Fare_Band' into numerical form
fare_band_encoder = LabelEncoder()
train_data['Fare_Band'] = fare_band_encoder.fit_transform(train_data['Fare_Band'])
test_data['Fare_Band'] = fare_band_encoder.transform(test_data['Fare_Band'])
'''

# Define the bins for age categories and corresponding labels (youngest to oldest)
bins = [-float('inf'), 4, 12, 18, 30, 45, 60, float('inf')]
labels = ['baby', 'child', 'teenager', 'youngadult', 'adult', 'oldadult', 'elder']


def _age_band_codes(age):
    """Bin an 'Age' series into ordinal band codes, keeping missing ages as NaN.

    Codes follow the order of `labels` (0 = 'baby' ... 6 = 'elder'). Missing ages
    stay NaN (instead of becoming a category of their own) so that the KNN
    imputer below actually has values to fill in.
    """
    codes = pd.cut(age, bins=bins, labels=labels).cat.codes.astype('float64')
    # pandas marks "no category" with code -1; turn that back into NaN
    return codes.where(codes >= 0)


# Cut out 'Age' column and add the ordinal 'Age_Band' to train and test
train_data['Age_Band'] = _age_band_codes(train_data['Age'])
test_data['Age_Band'] = _age_band_codes(test_data['Age'])


# Preprocessing pipeline: one-hot encode categorical variables and standardize numeric ones
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), ['Fare']),
    ('cat', OneHotEncoder(sparse_output=False), ['Pclass', 'Age_Band', 'Family_Category', 'Sex', 'Embarked'])
], remainder='passthrough')

# Handle missing values by imputing 'Age_Band' with KNN for both training and testing.
# The imputer is fitted on the training rows only and then applied to the test rows,
# so test passengers are imputed from training neighbours.
features_for_imputation = ['Pclass', 'Family_Category', 'Fare', 'Age_Band']
train_impute = train_data[features_for_imputation]
test_impute = test_data[features_for_imputation]
knn_imputer = KNNImputer(n_neighbors=5)
train_imputed = knn_imputer.fit_transform(train_impute)
test_imputed = knn_imputer.transform(test_impute)

# Convert imputed arrays back to DataFrames, retaining the original feature names
# and row index so that the assignment below aligns row by row
train_imputed_df = pd.DataFrame(train_imputed, columns=features_for_imputation, index=train_data.index)
test_imputed_df = pd.DataFrame(test_imputed, columns=features_for_imputation, index=test_data.index)

# Replace 'Age_Band' in the original frames with the imputed values. KNN imputation
# averages the neighbours' codes, so round back to the nearest valid integer band
# code; otherwise fractional values would become spurious one-hot categories.
train_data['Age_Band'] = train_imputed_df['Age_Band'].round().astype(int)
test_data['Age_Band'] = test_imputed_df['Age_Band'].round().astype(int)

# Remove the 'Cabin' column in place from both the training and the test frame
for frame in (train_data, test_data):
    frame.drop(columns=['Cabin'], inplace=True)

# Name of the target variable and the columns used as model inputs
target_column = 'Survived'
feature_columns = ['Pclass', 'Sex', 'Age_Band', 'Family_Category', 'Fare', 'Embarked']

# Select the model inputs and the training labels
X_train = train_data.loc[:, feature_columns]
y_train = train_data.loc[:, target_column]
X_test = test_data.loc[:, feature_columns]



# Models to train, keyed by the display name used in the metrics report.
# Every estimator with a stochastic component is given a fixed random_state so
# that repeated runs produce the same metrics and the same submission file.
models = {
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbours": KNeighborsClassifier(),
    # probability=True enables predict_proba, needed for ROC AUC and soft voting
    "Support Vector Machine": SVC(probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "XGBoost": xgb.XGBClassifier(n_estimators=100, random_state=0),
    #"Bagging Decision Tree": BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42),
    #"Boosted Decision Tree": AdaBoostClassifier(n_estimators=50, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # Soft voting averages the predicted class probabilities of its members
    "Voting Classifier": VotingClassifier(estimators=[
        ('lr', LogisticRegression(max_iter=1000)),
        ('knn', KNeighborsClassifier()),
        ('svc', SVC(probability=True, random_state=42)),
        #('xgb', xgb.XGBClassifier(n_estimators=100, random_state=0))
    ], voting='soft'),
    "Neural Network": MLPClassifier(max_iter=1000, random_state=42)
}

# Fit every model inside a preprocessing pipeline and score it on the training set.
# NOTE: all metrics below are computed on the same rows the model was fitted on.
results = {}
training_accuracies = {}
training_metrics = []

for name, model in models.items():
    # Chain the shared preprocessor with the current classifier and fit it
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    pipeline.fit(X_train, y_train)
    results[name] = pipeline

    # Classification metrics on the training predictions
    y_train_pred = pipeline.predict(X_train)
    accuracy = accuracy_score(y_train, y_train_pred)
    precision = precision_score(y_train, y_train_pred, average='binary')
    recall = recall_score(y_train, y_train_pred, average='binary')
    f1 = f1_score(y_train, y_train_pred, average='binary')

    # ROC AUC from the positive-class probability; fall back to the decision
    # function when the pipeline exposes no predict_proba
    try:
        positive_proba = pipeline.predict_proba(X_train)[:, 1]
        roc_auc = roc_auc_score(y_train, positive_proba)
    except AttributeError:
        roc_auc = roc_auc_score(y_train, pipeline.decision_function(X_train))

    # Assemble the report: a header line followed by one line per metric
    scored = (
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1 Score", f1),
        ("AUC of ROC", roc_auc),
    )
    metrics = f"{name}:\n" + "".join(f"{label}: {value:.4f}\n" for label, value in scored)
    print(metrics)

    # Keep the report so it can be written to disk afterwards
    training_metrics.append(metrics)

# Write all collected reports to the metrics file, each followed by a newline
metrics_file_path = '../reports/performance-metrics/model_performance_metrics.txt'

with open(metrics_file_path, 'w') as f:
    f.writelines(metric + "\n" for metric in training_metrics)

print(f"Performance metrics saved to: {metrics_file_path}")

# Choose a model to print out to csv
chosen_model = results['Voting Classifier']

# Generate predictions with the complete fitted pipeline, so the test rows pass
# through exactly the preprocessing step the chosen classifier was trained with
# (no separate re-fit of the preprocessor is needed).
y_test_pred = chosen_model.predict(X_test)

# Prepare the submission DataFrame: one row per test passenger
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': y_test_pred
})

# Save the submission to a CSV file
submission_file_path = '../reports/titanic_submission_vote_classifier.csv'
submission.to_csv(submission_file_path, index=False)

print(f"Submission file saved to: {submission_file_path}")

Naive Bayes:
Accuracy: 0.7284
Precision: 0.9098
Recall: 0.3246
F1 Score: 0.4784
AUC of ROC: 0.8442

Logistic Regression:
Accuracy: 0.8126
Precision: 0.7760
Recall: 0.7193
F1 Score: 0.7466
AUC of ROC: 0.8722

K-Nearest Neighbours:
Accuracy: 0.8575
Precision: 0.8349
Recall: 0.7836
F1 Score: 0.8084
AUC of ROC: 0.9307

Support Vector Machine:
Accuracy: 0.8395
Precision: 0.8396
Recall: 0.7193
F1 Score: 0.7748
AUC of ROC: 0.8932

Decision Tree:
Accuracy: 0.9495
Precision: 0.9598
Recall: 0.9064
F1 Score: 0.9323
AUC of ROC: 0.9928

XGBoost:
Accuracy: 0.9371
Precision: 0.9414
Recall: 0.8918
F1 Score: 0.9159
AUC of ROC: 0.9813

Random Forest:
Accuracy: 0.9495
Precision: 0.9486
Recall: 0.9181
F1 Score: 0.9331
AUC of ROC: 0.9879

Voting Classifier:
Accuracy: 0.8474
Precision: 0.8503
Recall: 0.7310
F1 Score: 0.7862
AUC of ROC: 0.9257

Neural Network:
Accuracy: 0.8709
Precision: 0.8697
Recall: 0.7807
F1 Score: 0.8228
AUC of ROC: 0.9340

Performance metrics saved to: ../reports/performance-metrics/mo