Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
# from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV

Load dataset

In [None]:
# Load the two CSV files used by this notebook. `train_df` is the frame the
# features and target are taken from; `test_df` is loaded alongside it.
train_csv, test_csv = 'dataset/final_df.csv', 'dataset/test.csv'
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [None]:
# Preview the first rows of the test frame as a quick check that the CSV loaded.
test_df.head()

In [None]:
# Separate the target column from the rest of the training frame:
# `y` is the 'Survived' column, `X` is every other column.
y = train_df['Survived']
X = train_df.drop(columns=['Survived'])

In [None]:
# Show (rows, columns) of the feature matrix after dropping the target column.
X.shape

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Display the shapes of the four partitions as the cell output.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
def evaluate_model(true, predicted, average='weighted'):
    """Score a set of predictions with accuracy, precision, recall and F1.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predicted : array-like
        Predicted labels, aligned element-for-element with ``true``.
    average : str, default 'weighted'
        Averaging strategy forwarded unchanged to ``precision_score``,
        ``recall_score`` and ``f1_score``. The default reproduces the
        previously hard-coded behaviour, so existing two-argument calls
        return exactly what they did before.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` in that order.
    """
    accuracy = accuracy_score(true, predicted)
    # The three averaged metrics share one call signature, so compute them
    # in a single pass over the scorer functions.
    precision, recall, f1 = (
        scorer(true, predicted, average=average)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1

In [None]:
# Candidate classifiers, keyed by the label printed in the report below.
# Estimators with internal randomness are given the same fixed seed (42) used
# for the train/test split so that re-running the notebook reproduces the same
# scores and the same ranking.
# NOTE(review): the XGBoost arguments are kept as they were; 'mlogloss' is a
# multiclass metric name and `use_label_encoder` is version-dependent --
# confirm both against the installed xgboost version.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Ridge Classifier": RidgeClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost Classifier": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    # "CatBoost Classifier": CatBoostClassifier(verbose=0),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=42),
    "SVM": SVC(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
}

# Parallel lists consumed by the leaderboard cell: model label and its
# accuracy on the held-out split.
model_list = []
accuracy_list = []

# Display names for the tuple returned by evaluate_model, in the same order.
metric_labels = ("Accuracy", "Precision", "Recall", "F1 Score")

for name, model in models.items():
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Each result is (accuracy, precision, recall, f1).
    train_scores = evaluate_model(y_train, y_train_pred)
    test_scores = evaluate_model(y_test, y_test_pred)
    test_accuracy = test_scores[0]

    print(f"{name}")
    model_list.append(name)

    # Print the training block, then the testing block, in identical format.
    for heading, scores in (("Training Metrics:", train_scores),
                            ("Testing Metrics:", test_scores)):
        print(heading)
        for label, value in zip(metric_labels, scores):
            print(f"- {label}: {value:.4f}")

    accuracy_list.append(test_accuracy)
    print("=" * 40)

In [None]:
# Leaderboard: one row per fitted model with its held-out accuracy,
# best-scoring model first.
leaderboard = pd.DataFrame({'Model Name': model_list, 'Accuracy': accuracy_list})
leaderboard.sort_values(by='Accuracy', ascending=False)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Fit the model that the remaining cells evaluate and save. The choice of
# gradient boosting is hard-coded here rather than read from the leaderboard.
# A fixed random_state (the same 42 used for the train/test split) makes the
# confusion matrix, the downstream metrics and the saved artefact repeatable
# across runs.
best_model = GradientBoostingClassifier(random_state=42)
best_model.fit(X_train, y_train)

# Held-out predictions, reused by the classification report cell.
y_pred = best_model.predict(X_test)

# Confusion matrix on the held-out split.
ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test)
plt.show()

In [None]:
# Text summary of per-class metrics for the held-out predictions of best_model.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
from sklearn.metrics import precision_recall_curve

# Predicted probability of the positive class (column 1 of predict_proba)
# for every held-out row.
probabilities = best_model.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, probabilities)

# Plot precision and recall against the decision threshold to pick a balance
# point. `precisions` and `recalls` carry one trailing element that has no
# matching threshold, hence the [:-1] slices.
plt.plot(thresholds, precisions[:-1], label='Precision')
plt.plot(thresholds, recalls[:-1], label='Recall')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision and Recall vs. Decision Threshold')
plt.legend()
plt.show()

In [None]:
# Positive-class probabilities on the held-out split and the resulting AUC.
y_probs = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_probs)
print("ROC-AUC Score:", roc_auc)

# False positive rate and true positive rate at each candidate threshold.
# Note: this rebinds `thresholds`, which the precision/recall cell also sets.
fpr, tpr, thresholds = roc_curve(y_test, y_probs)

# Draw the ROC curve on an explicit Axes, with the chance diagonal for reference.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='blue', label=f"ROC Curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')  # random-guess baseline
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Receiver Operating Characteristic (ROC) Curve")
ax.legend(loc="lower right")
plt.show()

In [None]:
# DecisionTreeClassifier and RandomForestClassifier are already imported in the
# notebook's first cell; only GridSearchCV is new here.
from sklearn.model_selection import GridSearchCV

# Tree-shape hyper-parameters searched for both estimators.
tree_shape_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# The decision tree searches the shared grid; the forest additionally varies
# the number of trees.
param_grid_dt = dict(tree_shape_grid)
param_grid_rf = {'n_estimators': [50, 100, 200], **tree_shape_grid}

# Settings common to both searches: 5-fold CV scored by accuracy, verbose
# progress output, and all available cores.
search_options = dict(scoring='accuracy', cv=5, verbose=2, n_jobs=-1)

# --- Decision tree -----------------------------------------------------------
dt_model = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(estimator=dt_model, param_grid=param_grid_dt, **search_options)
grid_search_dt.fit(X_train, y_train)
print("Best Parameters for DecisionTreeClassifier:", grid_search_dt.best_params_)
print("Best CV Score for DecisionTreeClassifier:", grid_search_dt.best_score_)

# --- Random forest -----------------------------------------------------------
rf_model = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, **search_options)
grid_search_rf.fit(X_train, y_train)
print("Best Parameters for RandomForestClassifier:", grid_search_rf.best_params_)
print("Best CV Score for RandomForestClassifier:", grid_search_rf.best_score_)

Save the model

In [None]:
# Persist the fitted `best_model` to disk with joblib.
# NOTE(review): `best_model` is created as a GradientBoostingClassifier in this
# notebook, while the file name says "decision_tree". The name is left as-is
# because other code may load this exact path -- confirm and rename if not.
model_path = 'decision_tree_model.pkl'
joblib.dump(best_model, model_path)