<img src="../assets/ittc_logo_full.png" alt="ITTC logo" height="150">

# Lecture 8: Logistic Regression

## In this Practical

In this practical you will:

1. Execute code chunks to explore the financial default data, fit a logistic regression model to it, and assess the fitted model on a held-out test set

## 1. Import the dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc

# Read the default dataset from its CSV file (the path is relative to the
# notebook's working directory)
default_csv = "data/Default.csv"
df = pd.read_csv(default_csv)

# Leave the preview as the cell's last expression so the notebook renders it
df.head()


## 2. Exploratory Data Analysis (EDA)
We examine the relationship between balance, income, student status, and default.

In [None]:
# Convert categorical variables to proper types in a single astype call
df = df.astype({"default": "category", "student": "category"})

sns.set(style="whitegrid")


def plot_balance_vs_income(data, facet_col, hue_col):
    """Scatter balance against income, one panel per level of ``facet_col``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``income``, ``balance`` and the two grouping columns.
    facet_col : str
        Column whose levels each get their own panel; its capitalized name
        is also used as the prefix of every panel title.
    hue_col : str
        Column whose levels determine the point colour.

    Returns
    -------
    seaborn.FacetGrid
        The grid that was drawn (already shown via ``plt.show()``).
    """
    grid = sns.FacetGrid(data, col=facet_col, hue=hue_col, height=5, aspect=1.2)
    grid.map_dataframe(sns.scatterplot, x="income", y="balance", alpha=0.7)
    grid.add_legend()
    grid.set_axis_labels("Income", "Balance")
    # The doubled braces keep a literal "{col_name}" in the template, which
    # seaborn fills in with each panel's level.
    grid.set_titles(f"{facet_col.capitalize()}: {{col_name}}")
    plt.show()
    return grid


# Plot 1: Balance vs Income, faceted by student, colored by default
g = plot_balance_vs_income(df, facet_col="student", hue_col="default")

# Plot 2: Balance vs Income, faceted by default, colored by student
g2 = plot_balance_vs_income(df, facet_col="default", hue_col="student")


## 3. Split the data into training and test sets (stratified by default)

In [None]:
# Hold out 20% of the rows as a test set. Stratifying on the `default` column
# is what the section heading asks for, and the fixed random_state makes the
# partition repeatable across runs.
default_labels = df["default"]
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=default_labels, random_state=42
)


## 4. Fit a logistic regression model
We use `student` and `balance` as predictors for `default`.

In [None]:
# Training design matrix: dummy-coded student status (first level dropped),
# then balance, with a constant column added for the intercept. Everything is
# cast to float so statsmodels receives a purely numeric matrix.
student_dummies = pd.get_dummies(train_df[['student']], drop_first=True)
X_train = sm.add_constant(
    student_dummies.assign(balance=train_df['balance'])
).astype(float)

# Binary response: 1 where default is 'Yes', 0 otherwise
is_default = train_df['default'] == 'Yes'
y_train = is_default.astype(int)

# Specify the logit model, fit it, and show the summary as the cell output
logit_spec = sm.Logit(y_train, X_train)
model = logit_spec.fit()
model.summary()


## 5. Assess model fit on test set
We produce a confusion matrix normalized by the total number of test observations (so its four cells sum to 1) and an ROC curve.

In [None]:
# Prepare test set: same recipe as the training design matrix
X_test = pd.get_dummies(test_df[['student']], drop_first=True)
X_test['balance'] = test_df['balance']
# has_constant="add" always inserts the intercept column. The default ("skip")
# silently omits it when some test column happens to be a non-zero constant.
X_test = sm.add_constant(X_test, has_constant="add")
# Align the columns with the names the fitted model recorded for its design
# matrix, so every coefficient meets the column it was estimated on. A dummy
# column that is missing from the test set is filled with 0.
X_test = X_test.reindex(columns=model.model.exog_names, fill_value=0).astype(float)
y_test = (test_df['default'] == 'Yes').astype(int)

# Predict probabilities and labels (a probability of 0.5 or more counts as a default)
y_pred_probs = model.predict(X_test)
y_pred_labels = (y_pred_probs >= 0.5).astype(int)

# Confusion matrix. labels=[0, 1] pins the layout to 2x2 in that order, so it
# always matches the two tick labels used below.
cm = confusion_matrix(y_test, y_pred_labels, labels=[0, 1])
# Normalized by the grand total: each cell is a share of all test rows
cm_normalized = cm.astype(float) / cm.sum()

# Plot confusion matrix (left panel)
class_names = ["No Default", "Default"]
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
sns.heatmap(cm_normalized, annot=True, fmt=".2f", cmap="Blues", ax=ax[0],
            xticklabels=class_names, yticklabels=class_names)
ax[0].set_title("Normalized Confusion Matrix")
ax[0].set_xlabel("Predicted")
ax[0].set_ylabel("Actual")

# ROC curve (right panel)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)
roc_auc = auc(fpr, tpr)

ax[1].plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}", linewidth=2)
# Dashed diagonal as the reference line from (0, 0) to (1, 1)
ax[1].plot([0, 1], [0, 1], 'k--', linewidth=1)
ax[1].set_xlim([0.0, 1.0])
ax[1].set_ylim([0.0, 1.05])
ax[1].set_xlabel("False Positive Rate")
ax[1].set_ylabel("True Positive Rate")
ax[1].set_title("ROC Curve")
ax[1].legend(loc="lower right")

plt.tight_layout()
plt.show()
