In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  
from xgboost import XGBRegressor
from sklearn.metrics import (
    accuracy_score,      # Measures how many predictions were correct
    precision_score,     # Of the ones predicted '1' (survived), how many were actually '1'
    recall_score,        # Of all the actual '1' (survived), how many did we correctly predict
    f1_score,            # Harmonic mean of precision and recall (balances both)
    confusion_matrix,    # Gives a 2x2 matrix showing TP, FP, FN, TN
    classification_report, # Gives a nice summary of precision, recall, f1 for each class
    roc_auc_score         # Measures model’s ability to separate classes (0–1 range)
)

In [36]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [37]:
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


In [39]:
features = ['Sex', 'Age', 'SibSp', 'Parch', 'Pclass', 'Fare']

x_train = train[features].copy()
y_train = train['Survived']
x_test = test[features].copy()


# ONE-HOT ENCODING (for categorical variables)
x_train = pd.get_dummies(x_train, drop_first=True)
x_test = pd.get_dummies(x_test, drop_first=True)

# Align columns between train and test (important!)
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)


scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# model training using XGBoost
model = XGBClassifier(
    n_estimators=300,        # Number of trees
    learning_rate=0.05,      # How much model learns in each iteration
    max_depth=8,             # Tree depth
    subsample=0.8,           # Fraction of samples used for each tree
    colsample_bytree=0.8,    # Fraction of features used per tree
    random_state=42,
    n_jobs=-1
)

model.fit(x_train, y_train)

# making predictions
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)


print("📊 MODEL PERFORMANCE METRICS (Training Data)")
print("------------------------------------------------")
print(f"✅ Accuracy  : {accuracy_score(y_train, y_train_pred):.4f}  # Overall correctness")
print(f"🎯 Precision : {precision_score(y_train, y_train_pred):.4f}  # How many predicted '1' were right")
print(f"🔍 Recall    : {recall_score(y_train, y_train_pred):.4f}  # How many actual '1' were caught")
print(f"⚖️  F1-Score  : {f1_score(y_train, y_train_pred):.4f}  # Balance between precision & recall")
print(f"🚀 ROC-AUC   : {roc_auc_score(y_train, y_train_pred):.4f}  # Overall class separability")


print("\n📦 Confusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))

print("\n🧾 Classification Report:")
print(classification_report(y_train, y_train_pred))


📊 MODEL PERFORMANCE METRICS (Training Data)
------------------------------------------------
✅ Accuracy  : 0.9529  # Overall correctness
🎯 Precision : 0.9630  # How many predicted '1' were right
🔍 Recall    : 0.9123  # How many actual '1' were caught
⚖️  F1-Score  : 0.9369  # Balance between precision & recall
🚀 ROC-AUC   : 0.9452  # Overall class separability

📦 Confusion Matrix:
[[537  12]
 [ 30 312]]

🧾 Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       549
           1       0.96      0.91      0.94       342

    accuracy                           0.95       891
   macro avg       0.96      0.95      0.95       891
weighted avg       0.95      0.95      0.95       891

