In [3]:
# Scenario Question: Predicting Titanic Survival
# Researchers are studying the Titanic disaster and want to build models that predict whether a
# passenger survived, based on the information recorded about them.
# - Features used:
#   - Passenger class (pclass)
#   - Gender (sex)
#   - Age (age)
#   - Number of siblings/spouses aboard (sibsp)
#   - Number of parents/children aboard (parch)
#   - Ticket fare (fare)
# - Label (survived):
#   - 1 = Survived
#   - 0 = Died
# The researchers train three different models:
# - Logistic Regression
# - K-Nearest Neighbors (KNN) with k=5
# - Decision Tree with max depth = 4
# They then evaluate each model using a classification report (precision, recall, F1-score, accuracy).
# Questions for Learners
# - Which model performs best at predicting survival, and why?
# - How does Logistic Regression differ from Decision Tree in terms of interpretability?
# - Why is scaling applied before training Logistic Regression and KNN, but not strictly needed
#   for Decision Trees?
# - Looking at the classification report, what do precision and recall mean in the context of survival
#  predictions?
# - Precision → Of those predicted to survive, how many actually survived?
# - Recall → Of all who truly survived, how many were correctly predicted?
# - If you were a historian, which model would you trust more to explain survival patterns, and why?

# ==============================
# Titanic Survival Prediction
# ==============================

import seaborn as sns
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Load the Titanic data and keep only the modelling columns. The explicit
# .copy() makes `df` an independent frame, so the column assignment below
# never targets a selection of the raw dataset.
df = sns.load_dataset('titanic')
df = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'survived']].copy()

# 'age' and 'fare' are imputed AFTER the train/test split (see below), so
# only rows missing one of the remaining columns are dropped here.
IMPUTED_COLS = ['age', 'fare']
df = df.dropna(subset=[col for col in df.columns if col not in IMPUTED_COLS])

# Encode gender numerically: male -> 0, female -> 1.
df['sex'] = df['sex'].map({'male': 0, 'female': 1})

X = df.drop('survived', axis=1)
y = df['survived']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Impute missing values with medians learned from the TRAINING rows only.
# Computing the medians on the full dataset before splitting would let
# information from the test rows leak into preprocessing.
train_medians = X_train[IMPUTED_COLS].median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Logistic Regression and KNN are trained on standardized features; the
# scaler is fitted on the training rows and reused unchanged on the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_model = LogisticRegression(max_iter=1000)
knn = KNeighborsClassifier(n_neighbors=5)
tree = DecisionTreeClassifier(max_depth=4, random_state=42)

# (report title, estimator, training features, test features).
# The decision tree is deliberately given the unscaled features.
experiments = [
    ("Logistic Regression", log_model, X_train_scaled, X_test_scaled),
    ("KNN (k=5)", knn, X_train_scaled, X_test_scaled),
    ("Decision Tree (max_depth=4)", tree, X_train, X_test),
]

# Fit each model and print its classification report on the held-out rows.
for title, model, train_features, test_features in experiments:
    model.fit(train_features, y_train)
    print(f"===== {title} =====")
    print(classification_report(y_test, model.predict(test_features)))

===== Logistic Regression =====
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

===== KNN (k=5) =====
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       105
           1       0.75      0.73      0.74        74

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

===== Decision Tree (max_depth=4) =====
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       105
           1       0.80      0.69      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78 

In [4]:
# ==========================================
# Student Success & Career Path Prediction
# ==========================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, r2_score, classification_report
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier

# ------------------------------------------
# 1️⃣ Create Sample Dataset
# ------------------------------------------

# Seed NumPy's global generator so the synthetic students are reproducible.
np.random.seed(42)
n = 300

# (column name, low, high) for each integer feature; `high` is exclusive,
# following np.random.randint. The columns are drawn in exactly this order.
feature_ranges = [
    ("study_hours", 1, 10),
    ("attendance", 50, 100),
    ("assignments_completion", 40, 100),
    ("sleep_hours", 4, 9),
    ("stress_level", 1, 10),
]
data = {name: np.random.randint(low, high, n) for name, low, high in feature_ranges}

df = pd.DataFrame(data)

# Exam score is a weighted combination of the features plus Gaussian noise
# (mean 0, standard deviation 5). Terms are accumulated one at a time, in
# the order listed, and the noise is drawn last.
exam_score = df["study_hours"] * 5
exam_score = exam_score + df["attendance"] * 0.3
exam_score = exam_score + df["assignments_completion"] * 0.2
exam_score = exam_score - df["stress_level"] * 2
exam_score = exam_score + df["sleep_hours"] * 2
exam_score = exam_score + np.random.normal(0, 5, n)
df["exam_score"] = exam_score

# ------------------------------------------
# 2️⃣ Regression – Predict Exam Score
# ------------------------------------------

# Target is the exam score; every other column of `df` is a feature.
y_reg = df["exam_score"]
X_reg = df.drop(columns="exam_score")

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting chain.
reg_model = LinearRegression().fit(X_train_r, y_train_r)
y_pred_r = reg_model.predict(X_test_r)

# Score the predictions on the held-out rows.
mae = mean_absolute_error(y_test_r, y_pred_r)
r2 = r2_score(y_test_r, y_pred_r)

print("===== Regression Results =====")
print("MAE:", mae)
print("R2 Score:", r2)

# ------------------------------------------
# 3️⃣ Classification – At Risk vs On Track
# ------------------------------------------

# Binary target derived from the exam score: 0 when the score is below 40,
# 1 otherwise.
df["risk_label"] = np.where(df["exam_score"] < 40, 0, 1)

# The score itself is excluded from the features because the label is
# computed directly from it.
y_clf = df["risk_label"]
X_clf = df.drop(columns=["exam_score", "risk_label"])

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_clf,
    y_clf,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: statistics are fitted on the training rows and
# reused unchanged on the test rows.
scaler = StandardScaler()
X_train_c_scaled = scaler.fit_transform(X_train_c)
X_test_c_scaled = scaler.transform(X_test_c)

# fit() returns the estimator itself, so construction and fitting chain.
clf_model = LogisticRegression().fit(X_train_c_scaled, y_train_c)
y_pred_c = clf_model.predict(X_test_c_scaled)

print("\n===== Classification Results =====")
print(classification_report(y_test_c, y_pred_c))

# ------------------------------------------
# 4️⃣ Clustering – Study Habit Groups
# ------------------------------------------

# Cluster on three habit-related columns only.
cluster_features = ["study_hours", "attendance", "stress_level"]
X_cluster = df[cluster_features]

# Standardize the selected columns before clustering; a dedicated scaler is
# used so the classification scaler is left untouched.
scaler_cluster = StandardScaler()
X_cluster_scaled = scaler_cluster.fit_transform(X_cluster)

# Fit k-means with three clusters and store each row's cluster id on `df`.
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_ids = kmeans.fit_predict(X_cluster_scaled)
df["cluster"] = cluster_ids

print("\n===== Cluster Distribution =====")
cluster_sizes = df["cluster"].value_counts()
print(cluster_sizes)

# ------------------------------------------
# 5️⃣ Recommendations – Suggested Interventions
# ------------------------------------------

def recommend_intervention(row):
    """Choose one intervention for a single student record.

    Rules are checked in priority order and the first match wins:

    1. at risk (``risk_label == 0``) and attendance below 70
       -> "Extra Tutoring"
    2. stress level above 7 -> "Counseling"
    3. assignment completion below 60 -> "Time Management Workshop"
    4. at risk but none of the above -> "Extra Tutoring"
    5. otherwise -> "On Track – Encourage Advanced Learning"

    Parameters
    ----------
    row : mapping
        Any object indexable by the keys "risk_label", "attendance",
        "stress_level" and "assignments_completion" (for example a
        DataFrame row passed by ``df.apply(..., axis=1)``, or a dict).

    Returns
    -------
    str
        The recommended intervention.
    """
    at_risk = row["risk_label"] == 0

    if at_risk and row["attendance"] < 70:
        return "Extra Tutoring"
    if row["stress_level"] > 7:
        return "Counseling"
    if row["assignments_completion"] < 60:
        return "Time Management Workshop"
    if at_risk:
        # No specific trigger fired, but the student is still flagged at
        # risk, so they must not be reported as "On Track".
        return "Extra Tutoring"
    return "On Track – Encourage Advanced Learning"

# Compute one recommendation per student by applying the rule function to
# every row of the frame.
df["intervention"] = df.apply(recommend_intervention, axis=1)

# Label the preview with a section header, as every other section of this
# cell does, so the table is identifiable in the printed output.
print("\n===== Sample Recommendations =====")
print(df[["exam_score", "risk_label", "cluster", "intervention"]].head())

===== Regression Results =====
MAE: 3.2949460373238546
R2 Score: 0.9183232069977294

===== Classification Results =====
              precision    recall  f1-score   support

           0       0.50      0.33      0.40         6
           1       0.93      0.96      0.95        54

    accuracy                           0.90        60
   macro avg       0.71      0.65      0.67        60
weighted avg       0.89      0.90      0.89        60


===== Cluster Distribution =====
cluster
2    105
0    101
1     94
Name: count, dtype: int64

===== Sample Recommendations =====
   exam_score  risk_label  cluster                            intervention
0   65.647218           1        1                Time Management Workshop
1   60.723676           1        1                Time Management Workshop
2   79.357265           1        2  On Track – Encourage Advanced Learning
3   64.513298           1        0  On Track – Encourage Advanced Learning
4   72.620504           1        0  On Track – 