In [None]:
# Attach Google Drive to the Colab runtime at the configured mount point
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


Imports and Setup

In [None]:
# Import utility for nicely displaying DataFrames in notebooks
from IPython.display import display

# Core data manipulation libraries
import pandas as pd           # For data manipulation and analysis
import numpy as np            # For numerical operations

# Visualization libraries
import seaborn as sns         # For creating attractive statistical plots
import matplotlib.pyplot as plt  # For plotting graphs

# Model training and evaluation tools
from sklearn.model_selection import train_test_split, RandomizedSearchCV  # For splitting data and randomized hyperparameter search
from sklearn.preprocessing import LabelEncoder, StandardScaler            # For encoding categorical variables and feature scaling
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix  # For evaluating model performance

# Machine learning models
from sklearn.ensemble import RandomForestClassifier       # Random Forest model
from sklearn.linear_model import LogisticRegression       # Logistic Regression model
import xgboost as xgb                                      # XGBoost classifier
from catboost import CatBoostClassifier                   # CatBoost classifier (handles categorical features well)

# SHAP for Explainable AI
import shap                         # For model interpretability and feature importance explanation

# File handling for saving submissions
from google.colab import files     # To download files from Colab environment

# Suppress warnings for cleaner notebook output.
# NOTE(review): 'ignore' with no category/module filter silences every warning
# raised through Python's warnings machinery from this point on, including
# deprecation notices from the ML libraries — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')


Load Data

In [None]:
# URLs to the raw CSV files stored in the project's GitHub repository
url_train = 'https://raw.githubusercontent.com/Assignment3StarshipTitanic/Assignment_3_StarshipTitanic/main/data/train.csv'
url_test = 'https://raw.githubusercontent.com/Assignment3StarshipTitanic/Assignment_3_StarshipTitanic/main/data/test.csv'

# Optional sample-submission CSV. No URL is configured in this notebook, so it
# defaults to None; set it to a raw CSV URL to have the file loaded as well.
url_sample = None

# Load training and test datasets from the URLs into Pandas DataFrames
train_df = pd.read_csv(url_train)
test_df = pd.read_csv(url_test)

# Read the sample submission only when a URL was supplied. The unconditional
# read referenced an undefined name and raised NameError on a fresh kernel.
sample_submission_df = pd.read_csv(url_sample) if url_sample else None

# Display the first few rows of the training dataset
print("Train Dataset")
display(train_df.head())

# Display the first few rows of the test dataset
print("\nTest Dataset")
display(test_df.head())


Train Dataset


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True



 Test Dataset


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


Train Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression settings:
#   max_iter     - iteration cap given to the solver
#   random_state - fixed seed for reproducibility
lr_settings = {"max_iter": 500, "random_state": 42}
lr = LogisticRegression(**lr_settings)

# Train on the scaled training features; the fitted estimator is the cell output
lr.fit(X_train_scaled, y_train)


Evaluate Logistic Regression Model

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Score the trained Logistic Regression model on the scaled validation features
pred_lr = lr.predict(X_val_scaled)
acc_lr = accuracy_score(y_val, pred_lr)

# Headline accuracy, then the per-class precision / recall / f1 table
print(f"Logistic Regression Accuracy: {acc_lr * 100:.2f}%")
print(classification_report(y_val, pred_lr))

# Confusion matrix rendered as an annotated heatmap (rows: actual, columns: predicted)
cm_lr = confusion_matrix(y_val, pred_lr)
fig_lr, ax_lr = plt.subplots(figsize=(5, 4))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues', ax=ax_lr)
ax_lr.set_title("Logistic Regression Confusion Matrix")
ax_lr.set_xlabel("Predicted")
ax_lr.set_ylabel("Actual")
fig_lr.tight_layout()
plt.show()


Train XGBoost with Hyperparameter Tuning

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for tuning XGBoost (2 values per parameter -> 32 combinations)
params = {
    "n_estimators": [100, 300],
    "max_depth": [4, 6],
    "learning_rate": [0.03, 0.1],
    "subsample": [0.8, 1],
    "colsample_bytree": [0.8, 1]
}

# Base classifier with a fixed random seed and log-loss evaluation metric.
# It is bound to `xgb_clf`, not `xgb`, so the `import xgboost as xgb` module
# alias from the setup cell is not overwritten by an estimator instance.
# `use_label_encoder` is omitted: it is deprecated and no longer has any effect
# in current XGBoost releases.
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=42)

# Exhaustive grid search scored by accuracy with 3-fold cross-validation
grid = GridSearchCV(xgb_clf, params, scoring='accuracy', cv=3, verbose=1)

# Run the search on the scaled training data
grid.fit(X_train_scaled, y_train)

# Keep the best estimator found by the search for later evaluation
best_xgb = grid.best_estimator_


Evaluate XGBoost Model

In [None]:
# Score the tuned XGBoost model on the scaled validation features
pred_xgb = best_xgb.predict(X_val_scaled)
acc_xgb = accuracy_score(y_val, pred_xgb)

# Headline accuracy, then the per-class precision / recall / f1 table
print(f"XGBoost Accuracy: {acc_xgb * 100:.2f}%")
print(classification_report(y_val, pred_xgb))

# Confusion matrix rendered as an annotated heatmap (rows: actual, columns: predicted)
cm_xgb = confusion_matrix(y_val, pred_xgb)
fig_xgb, ax_xgb = plt.subplots(figsize=(5, 4))
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Greens', ax=ax_xgb)
ax_xgb.set_title("XGBoost Confusion Matrix")
ax_xgb.set_xlabel("Predicted")
ax_xgb.set_ylabel("Actual")
fig_xgb.tight_layout()
plt.show()


Visualize Accuracy Comparison

In [None]:
# Define model names and corresponding accuracies (in percentage)
model_names = ['Logistic Regression', 'XGBoost']
accuracies = [acc_lr * 100, acc_xgb * 100]

# Lower y-axis limit: keep the zoomed-in 75% floor while every model clears it;
# otherwise start 5 points below the weakest model (never below 0) so no bar is
# hidden beneath the axis.
lowest_acc = min(accuracies)
y_floor = 75 if lowest_acc > 75 else max(0.0, lowest_acc - 5)

# Plot bar chart to compare model accuracies
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=model_names, y=accuracies, palette='pastel', ax=ax)

# Annotate each bar with its accuracy value, slightly above the bar top
for bar_idx, bar_acc in enumerate(accuracies):
    ax.text(bar_idx, bar_acc + 0.5, f"{bar_acc:.2f}%", ha='center', va='bottom', fontweight='bold')

# Set chart title and labels
ax.set_title("Model Accuracy Comparison")
ax.set_ylabel("Accuracy (%)")

# Limit y-axis for better visualization
ax.set_ylim(y_floor, 100)
fig.tight_layout()
plt.show()
