In [1]:
import pandas as pd
import numpy as np


In [2]:
training_data = pd.read_csv('data/final_training_data.csv')
print(training_data.shape)

(891, 21)


In [4]:
# initialize the final dataframe
final_df = training_data['PassengerId']

In [3]:
# split training data on target variable

X = training_data.drop(['Survived', 'PassengerId'], axis=1)
y = training_data['Survived']
passenger_id = training_data['PassengerId']

In [5]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold

from sklearn.base import clone

In [6]:
# Build logistic regression repeated KFold model

from sklearn.linear_model import LogisticRegression


# Configuration
n_splits = 5
n_repeats = 20
random_state = 42

# Create the cross-validation object
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

# Create the model
model = LogisticRegression()

# initialize a dictionary to store the accuracy scores
correct_predictions = {pid: 0 for pid in passenger_id}
# initialize a dictionary to store the number of times a passenger appears in a validation set
validation_counts = {pid: 0 for pid in passenger_id}


for train_index, val_index in rkf.split(X, y):
    # Split the data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    val_passenger_id = passenger_id.iloc[val_index]

    # Clone the model to ensure a fresh model is used for each fold
    clf = clone(model)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_val)

    # Update the correct_predictions dictionary
    for pid, pred, actual in zip(val_passenger_id, predictions, y_val):
        validation_counts[pid] += 1
        if pred == actual:
            correct_predictions[pid] += 1

# Calculate the proportion of correct predictions for each PassengerID
proportion_correct = {pid: correct_predictions[pid] / validation_counts[pid] for pid in passenger_id}


In [36]:
# Use this to reset LOGIT Data on the final_df

# Remove the logistic Regression Dataline on the final_df
final_df = final_df.drop('ProportionCorrectLogisticRegression', axis=1)

In [7]:
# Convert proportion_correct dictionary to DataFrame
proportion_correct_df = pd.DataFrame(list(proportion_correct.items()), columns=['PassengerId', 'ProportionCorrectLogisticRegression'])

# Merge the final_df with the proportion_correct_df on PassengerId
final_df = pd.merge(final_df, proportion_correct_df, on='PassengerId', how='left')




In [9]:
print(final_df.head(10))



   PassengerId  ProportionCorrectLogisticRegression
0            1                                  1.0
1            2                                  1.0
2            3                                  1.0
3            4                                  1.0
4            5                                  1.0
5            6                                  1.0
6            7                                  1.0
7            8                                  1.0
8            9                                  1.0
9           10                                  1.0


In [10]:
# build random forest RKF model

from sklearn.ensemble import RandomForestClassifier


# Configuration
n_splits = 5
n_repeats = 20
random_state = 42

# Create the cross-validation object
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

# Create the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# initialize a dictionary to store the accuracy scores
correct_predictions = {pid: 0 for pid in passenger_id}
# initialize a dictionary to store the number of times a passenger appears in a validation set
validation_counts = {pid: 0 for pid in passenger_id}


for train_index, val_index in rkf.split(X, y):
    # Split the data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    val_passenger_id = passenger_id.iloc[val_index]

    # Clone the model to ensure a fresh model is used for each fold
    clf = clone(model)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_val)

    # Update the correct_predictions dictionary
    for pid, pred, actual in zip(val_passenger_id, predictions, y_val):
        validation_counts[pid] += 1
        if pred == actual:
            correct_predictions[pid] += 1


# Calculate the proportion of correct predictions for each PassengerID
proportion_correct = {pid: correct_predictions[pid] / validation_counts[pid] for pid in passenger_id}


In [26]:
# Use this to reset Random Forest Data on the final_df

# Remove the RF Dataline on the final_df
final_df = final_df.drop('ProportionCorrectRandomForest', axis=1)

KeyError: "['ProportionCorrectRandomForest'] not found in axis"

In [12]:
# Convert proportion_correct dictionary to DataFrame
proportion_correct_df = pd.DataFrame(list(proportion_correct.items()), columns=['PassengerId', 'ProportionCorrectRandomForest'])

# Merge the final_df with the proportion_correct_df on PassengerId
final_df = pd.merge(final_df, proportion_correct_df, on='PassengerId', how='left')

In [13]:
print(final_df.head(10))

   PassengerId  ProportionCorrectLogisticRegression  \
0            1                                  1.0   
1            2                                  1.0   
2            3                                  1.0   
3            4                                  1.0   
4            5                                  1.0   
5            6                                  1.0   
6            7                                  1.0   
7            8                                  1.0   
8            9                                  1.0   
9           10                                  1.0   

   ProportionCorrectRandomForest  
0                           1.00  
1                           1.00  
2                           0.75  
3                           1.00  
4                           1.00  
5                           1.00  
6                           1.00  
7                           1.00  
8                           0.75  
9                           1.00  


In [14]:
# Basic implementation of XGBoost

from xgboost import XGBClassifier

# Configuration
n_splits = 5
n_repeats = 20
random_state = 42

# Create the cross-validation object
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

# Create the model
model = XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')

# initialize a dictionary to store the accuracy scores
correct_predictions = {pid: 0 for pid in passenger_id}

# initialize a dictionary to store the number of times a passenger appears in a validation set
validation_counts = {pid: 0 for pid in passenger_id}


for train_index, val_index in rkf.split(X, y):
    # Split the data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    val_passenger_id = passenger_id.iloc[val_index]

    # Clone the model to ensure a fresh model is used for each fold
    clf = clone(model)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_val)

    # Update the correct_predictions dictionary
    for pid, pred, actual in zip(val_passenger_id, predictions, y_val):
        validation_counts[pid] += 1
        if pred == actual:
            correct_predictions[pid] += 1

# Calculate the proportion of correct predictions for each PassengerID
proportion_correct = {pid: correct_predictions[pid] / validation_counts[pid] for pid in passenger_id}


In [10]:
"""
Do NOT USE THIS CODE
Still under construction and in need of debugging
"""


# build an XGBoost RKF model

# XGBoost uses Early Stopping, which requires an additional evaluation set beyond the k folds
# We will use the first 20% of the data as the evaluation set, and run an extra k-fold iteration on the early stopping eval data so we still get every training sample evaluated many times

from xgboost import XGBClassifier


debug_file_path = 'debug_log.txt'

with open(debug_file_path, 'w') as f:
    f.write("Starting debugging... \n")

print("Total number of passengers:", len(passenger_id))
# Configuration
n_splits = 5
n_repeats = 20
random_state = 42

# Initialize KFold for ES validation data selection
es_kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

# Initialize RepeatedKFold for training/validation
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

# Create the model
model = XGBClassifier(n_estimators=100, random_state=42, early_stopping_rounds = 10, eval_metric='logloss')

# initialize a dictionary to store the accuracy scores
correct_predictions = {pid: 0 for pid in passenger_id}
# initialize a dictionary to store the number of times a passenger appears in a validation set
validation_counts = {pid: 0 for pid in passenger_id}

for es_train_index, es_val_index in es_kf.split(X, y):
   
    
    # Split the data for ES validation
    X_es_train, X_es_val = X.iloc[es_train_index], X.iloc[es_val_index]
    y_es_train, y_es_val = y.iloc[es_train_index], y.iloc[es_val_index]

    


    for train_index, val_index in rkf.split(X_es_train, y_es_train):
        # split the data for training/validation
        X_train, X_val = X_es_train.iloc[train_index], X_es_train.iloc[val_index]
        y_train, y_val = y_es_train.iloc[train_index], y_es_train.iloc[val_index]
        val_passenger_id = passenger_id.iloc[val_index]

        # Clone the model to ensure a fresh model is used for each fold
        clf = clone(model)
        # Fit the model with early stopping using the ES validation set
        clf.fit(X_train, y_train, eval_set=[(X_es_val, y_es_val)], verbose=False)
        predictions = clf.predict(X_val)

        # Update the correct_predictions dictionary
        for pid, pred, actual in zip(val_passenger_id, predictions, y_val):
            validation_counts[pid] += 1
            if pred == actual:
                correct_predictions[pid] += 1
        # check the validation count for passenger 99
        if 99 in val_passenger_id:
            print(validation_counts[99])
        else:
            print("Passenger 99 not in validation set")

# After the loops
zero_validation_counts = [pid for pid, count in validation_counts.items() if count == 0]
if zero_validation_counts:
    print("Total number of passengers not included in validation:", len(zero_validation_counts))
    print("Passenger IDs never included in validation:", zero_validation_counts)
else:
    print("All samples included in validation sets as expected.")

# Calculate the proportion of correct predictions for each PassengerID
proportion_correct = {pid: correct_predictions[pid] / validation_counts[pid] for pid in passenger_id}


Total number of passengers: 712
Passenger 93 not in validation set
Passenger 93 not in validation set
Passenger 93 not in validation set
Passenger 93 not in validation set
0
Passenger 93 not in validation set
0
Passenger 93 not in validation set
Passenger 93 not in validation set
Passenger 93 not in validation set
0
Passenger 93 not in validation set
Passenger 93 not in validation set
Passenger 93 not in validation set
Passenger 93 not in validation set
Passenger 93 not in validation set
Passenger 93 not in validation set
0
Passenger 93 not in validation set
Passenger 93 not in validation set
0
Passenger 93 not in validation set
Passenger 93 not in validation set
Passenger 93 not in validation set
Passenger 93 not in validation set
Passenger 93 not in validation set
Passenger 93 not in validation set
0
Passenger 93 not in validation set
Passenger 93 not in validation set
Passenger 93 not in validation set
Passenger 93 not in validation set
0
Passenger 93 not in validation set
Passenger

ZeroDivisionError: division by zero

In [13]:
print(validation_counts)

{332: 20, 734: 20, 383: 20, 705: 20, 814: 20, 119: 20, 537: 20, 362: 20, 30: 20, 56: 20, 866: 20, 596: 20, 240: 20, 722: 20, 82: 20, 260: 20, 487: 20, 717: 20, 801: 20, 782: 20, 543: 20, 327: 20, 535: 20, 536: 20, 484: 20, 763: 20, 534: 20, 714: 20, 391: 20, 496: 20, 483: 20, 378: 20, 366: 20, 413: 20, 469: 20, 410: 20, 8: 20, 581: 20, 156: 20, 881: 20, 426: 20, 102: 20, 200: 20, 425: 20, 299: 20, 885: 20, 248: 20, 479: 20, 306: 20, 317: 20, 3: 20, 406: 20, 743: 20, 261: 20, 368: 20, 159: 20, 555: 20, 830: 20, 678: 20, 381: 20, 627: 20, 303: 20, 433: 20, 523: 20, 405: 20, 844: 20, 356: 20, 465: 20, 754: 20, 213: 20, 450: 20, 80: 20, 149: 20, 429: 20, 61: 20, 661: 20, 805: 20, 589: 20, 766: 20, 266: 20, 134: 20, 709: 20, 545: 20, 347: 20, 822: 20, 634: 20, 457: 20, 501: 20, 431: 20, 446: 20, 651: 20, 173: 20, 451: 20, 315: 20, 333: 20, 802: 20, 91: 20, 835: 20, 182: 20, 582: 20, 796: 20, 70: 20, 132: 20, 335: 20, 598: 20, 136: 20, 165: 20, 29: 20, 784: 20, 194: 20, 870: 20, 716: 20, 526

In [32]:
# Use this to reset XGBoost Data on the final_df

# Remove the XGBoost Dataline on the final_df
final_df = final_df.drop('ProportionCorrectXGBoost', axis=1)


In [15]:
# Convert proportion_correct dictionary to DataFrame
proportion_correct_df = pd.DataFrame(list(proportion_correct.items()), columns=['PassengerId', 'ProportionCorrectXGBoost'])

# Merge the final_df with the proportion_correct_df on PassengerId
final_df = pd.merge(final_df, proportion_correct_df, on='PassengerId', how='left')

In [16]:
print(final_df.head(10))

   PassengerId  ProportionCorrectLogisticRegression  \
0            1                                  1.0   
1            2                                  1.0   
2            3                                  1.0   
3            4                                  1.0   
4            5                                  1.0   
5            6                                  1.0   
6            7                                  1.0   
7            8                                  1.0   
8            9                                  1.0   
9           10                                  1.0   

   ProportionCorrectRandomForest  ProportionCorrectXGBoost  
0                           1.00                      1.00  
1                           1.00                      1.00  
2                           0.75                      0.05  
3                           1.00                      1.00  
4                           1.00                      1.00  
5                           

In [17]:
# build a Lightboost RKF model

from lightgbm import LGBMClassifier

# Configuration
n_splits = 5
n_repeats = 20
random_state = 42

# Create the cross-validation object
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

# Create the model
model = LGBMClassifier(n_estimators=100, random_state=random_state, objective= 'binary', metric='binary_logloss')

# initialize a dictionary to store the accuracy scores
correct_predictions = {pid: 0 for pid in passenger_id}
# initialize a dictionary to store the number of times a passenger appears in a validation set
validation_counts = {pid: 0 for pid in passenger_id}


for train_index, val_index in rkf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    val_passenger_id = passenger_id.iloc[val_index]

    # Clone the model to ensure a fresh model is used for each fold
    clf = clone(model)
    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='logloss')

    for pid, pred, actual in zip(val_passenger_id, predictions, y_val):
        validation_counts[pid] += 1
        if pred == actual:
            correct_predictions[pid] += 1

# Calculate the proportion of correct predictions for each PassengerID
proportion_correct = {pid: correct_predictions[pid] / validation_counts[pid] for pid in passenger_id}
    

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000188 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 229
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
[LightGBM] [Info] Number of positive: 275, number of negative: 438
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 18
[LightGBM] [Info] [binary:BoostF

In [None]:
# Use this to reset LightBoost Data on the final_df

# Remove the XGBoost Dataline on the final_df
final_df = final_df.drop('ProportionCorrectLightBoost', axis=1)


In [18]:
# Convert proportion_correct dictionary to DataFrame
proportion_correct_df = pd.DataFrame(list(proportion_correct.items()), columns=['PassengerId', 'ProportionCorrectLightBoost'])

# Merge the final_df with the proportion_correct_df on PassengerId
final_df = pd.merge(final_df, proportion_correct_df, on='PassengerId', how='left')

In [19]:
print(final_df.head(10))

   PassengerId  ProportionCorrectLogisticRegression  \
0            1                                  1.0   
1            2                                  1.0   
2            3                                  1.0   
3            4                                  1.0   
4            5                                  1.0   
5            6                                  1.0   
6            7                                  1.0   
7            8                                  1.0   
8            9                                  1.0   
9           10                                  1.0   

   ProportionCorrectRandomForest  ProportionCorrectXGBoost  \
0                           1.00                      1.00   
1                           1.00                      1.00   
2                           0.75                      0.05   
3                           1.00                      1.00   
4                           1.00                      1.00   
5                     

In [20]:
# Catboost Model

from catboost import CatBoostClassifier, Pool

# Configuration
n_splits = 5
n_repeats = 20
random_state = 42

# Create the cross-validation object
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

# Create the model
model = CatBoostClassifier(n_estimators=100, 
                            random_state=random_state, 
                            learning_rate=0.1,
                            depth=6,
                            loss_function='Logloss', 
                            eval_metric='Accuracy'
                            )

# initialize a dictionary to store the accuracy scores
correct_predictions = {pid: 0 for pid in passenger_id}
# initialize a dictionary to store the number of times a passenger appears in a validation set
validation_counts = {pid: 0 for pid in passenger_id}


for train_index, val_index in rkf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    val_passenger_id = passenger_id.iloc[val_index]

    # create Catboost data containers
    train_pool = Pool(X_train, y_train)
    val_pool = Pool(X_val, y_val)

    # Clone the model to ensure a fresh model is used for each fold
    clf = clone(model)
    clf.fit(X_train, y_train, eval_set= val_pool, early_stopping_rounds = 10)

    predictions = clf.predict(X_val)
    
    for pid, pred, actual in zip(val_passenger_id, predictions, y_val):
        validation_counts[pid] += 1
        if pred == actual:
            correct_predictions[pid] += 1

# Calculate the proportion of correct predictions for each PassengerID
proportion_correct = {pid: correct_predictions[pid] / validation_counts[pid] for pid in passenger_id}

    

0:	learn: 0.8328652	test: 0.8268156	best: 0.8268156 (0)	total: 146ms	remaining: 14.4s
1:	learn: 0.8300562	test: 0.8212291	best: 0.8268156 (0)	total: 148ms	remaining: 7.27s
2:	learn: 0.8328652	test: 0.8100559	best: 0.8268156 (0)	total: 151ms	remaining: 4.88s
3:	learn: 0.8328652	test: 0.8100559	best: 0.8268156 (0)	total: 153ms	remaining: 3.67s
4:	learn: 0.8412921	test: 0.8100559	best: 0.8268156 (0)	total: 155ms	remaining: 2.94s
5:	learn: 0.8384831	test: 0.8156425	best: 0.8268156 (0)	total: 157ms	remaining: 2.46s
6:	learn: 0.8384831	test: 0.8156425	best: 0.8268156 (0)	total: 159ms	remaining: 2.11s
7:	learn: 0.8398876	test: 0.8156425	best: 0.8268156 (0)	total: 161ms	remaining: 1.85s
8:	learn: 0.8398876	test: 0.8156425	best: 0.8268156 (0)	total: 162ms	remaining: 1.64s
9:	learn: 0.8441011	test: 0.8156425	best: 0.8268156 (0)	total: 163ms	remaining: 1.47s
10:	learn: 0.8412921	test: 0.8044693	best: 0.8268156 (0)	total: 165ms	remaining: 1.33s
Stopped by overfitting detector  (10 iterations wait)

In [None]:
# Use this to reset LightBoost Data on the final_df

# Remove the XGBoost Dataline on the final_df
final_df = final_df.drop('ProportionCorrectCatBoost', axis=1)


In [21]:
# Convert proportion_correct dictionary to DataFrame
proportion_correct_df = pd.DataFrame(list(proportion_correct.items()), columns=['PassengerId', 'ProportionCorrectCatBoost'])

# Merge the final_df with the proportion_correct_df on PassengerId
final_df = pd.merge(final_df, proportion_correct_df, on='PassengerId', how='left')

In [22]:
print(final_df.head(10))

   PassengerId  ProportionCorrectLogisticRegression  \
0            1                                  1.0   
1            2                                  1.0   
2            3                                  1.0   
3            4                                  1.0   
4            5                                  1.0   
5            6                                  1.0   
6            7                                  1.0   
7            8                                  1.0   
8            9                                  1.0   
9           10                                  1.0   

   ProportionCorrectRandomForest  ProportionCorrectXGBoost  \
0                           1.00                      1.00   
1                           1.00                      1.00   
2                           0.75                      0.05   
3                           1.00                      1.00   
4                           1.00                      1.00   
5                     

In [23]:
# Export the final_df to a csv file

final_df.to_csv('data/model_accuracy_final_training_results.csv', index=False)

In [4]:
# import models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [25]:
# use the training data to build all five models, then save them to the models folder
import joblib


# Logistic Regression
model_LR = LogisticRegression()
model_LR.fit(X, y)

joblib.dump(model_LR, 'final_models/logistic_regression_model.pkl')

# Random Forest
model_RF = RandomForestClassifier(n_estimators=100, random_state=42)
model_RF.fit(X, y)
joblib.dump(model_RF, 'final_models/random_forest_model.pkl')

# XGBoost
model_XGB = XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss', enable_categorical=True)
model_XGB.fit(X, y)
joblib.dump(model_XGB, 'final_models/xgboost_model.pkl')

# LightBoost
model_LB = LGBMClassifier(n_estimators=100, random_state=42, objective= 'binary', metric='binary_logloss')
model_LB.fit(X, y)
joblib.dump(model_LB, 'final_models/lightboost_model.pkl')

# CatBoost
model_CB = CatBoostClassifier(n_estimators=100, 
                            random_state=42, 
                            learning_rate=0.1,
                            depth=6,
                            loss_function='Logloss', 
                            eval_metric='Accuracy'
                            )
model_CB.fit(X, y)
joblib.dump(model_CB, 'final_models/catboost_model.pkl')

print("Models saved to models folder")

[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000086 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 258
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288
0:	learn: 0.8181818	total: 1.8ms	remaining: 178ms
1:	learn: 0.8237935	total: 3.27ms	remaining: 160ms
2:	learn: 0.8170595	total: 4.12ms	remaining: 133ms
3:	learn: 0.8294052	total: 5.63ms	remaining: 135ms
4:	learn: 0.8294052	total: 7.55ms	remaining: 144ms
5:	learn: 0.8361392	total: 9.13ms	remaining: 143ms
6:	learn: 0.8316498	total: 10.1ms	remaining: 134ms
7:	learn: 0.8327722	total: 12.2ms	remaining: 140ms
8:	learn: 0.8305275	total: 14.3ms	remainin

In [None]:
new_model_LR = joblib.load('models/logistic_regression_model.pkl')