# L02: Logistic Regression

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Digital-AI-Finance/methods-algorithms/blob/master/notebooks/L02_logistic_regression.ipynb)

**Course**: Methods and Algorithms - MSc Data Science

---

## Learning Objectives

By the end of this notebook, you will be able to:

1. Understand the logistic function and maximum likelihood estimation
2. Implement logistic regression from scratch using NumPy
3. Evaluate classifiers using confusion matrix, ROC, and PR curves
4. Handle class imbalance in credit scoring applications

## Setup

In [None]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, roc_curve, auc,
                             precision_recall_curve, classification_report)
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global generator so the synthetic data is identical on every run
np.random.seed(42)

# Notebook-wide plotting defaults: figure size (inches) and base font size
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

print('Setup complete!')

## 1. Generate Synthetic Credit Data

We create a synthetic credit scoring dataset for hands-on practice.

In [None]:
# Build a synthetic credit-scoring dataset (one row per borrower)
n_samples = 1000

# Feature draws -- the order of these calls fixes the random stream, keep it
income = np.random.normal(50000, 15000, n_samples)      # annual income
debt_ratio = np.random.uniform(0.1, 0.8, n_samples)     # debt-to-income ratio
credit_history = np.random.randint(1, 10, n_samples)    # credit history in years (1..9)
num_accounts = np.random.randint(1, 10, n_samples)      # number of credit accounts (1..9)

# Ground-truth log-odds of default: linear in the features
z = (
    -3
    + (-0.00003 * income)
    + (4 * debt_ratio)
    + (-0.1 * credit_history)
    + (0.05 * num_accounts)
)

# Squash the log-odds to a probability, then sample one Bernoulli outcome per row
prob_default = 1 / (1 + np.exp(-z))
uniform_draws = np.random.random(n_samples)
default = (uniform_draws < prob_default).astype(int)

# Collect everything in a DataFrame
columns = {
    'income': income,
    'debt_ratio': debt_ratio,
    'credit_history_years': credit_history,
    'num_accounts': num_accounts,
    'default': default,
}
df = pd.DataFrame(columns)

print(f'Dataset shape: {df.shape}')
print(f'\nDefault rate: {df["default"].mean():.1%}')
df.head()

In [None]:
# Exploratory plots on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
ax_scatter, ax_income, ax_debt, ax_history = axes.ravel()

# One colour per borrower: green = no default, red = default
colors = ['red' if flag != 0 else 'green' for flag in df['default']]

# Split the rows once so both histograms reuse the same two groups
repaid = df[df['default'] == 0]
defaulted = df[df['default'] == 1]
group_labels = ['No Default', 'Default']
group_colors = ['green', 'red']

# Top-left: income against debt ratio
ax_scatter.scatter(df['income'] / 1000, df['debt_ratio'], c=colors, alpha=0.5)
ax_scatter.set_xlabel('Income (thousands)')
ax_scatter.set_ylabel('Debt Ratio')
ax_scatter.set_title('Income vs Debt Ratio')

# Top-right: income distribution per outcome
ax_income.hist([repaid['income'] / 1000, defaulted['income'] / 1000],
               bins=20, label=group_labels, color=group_colors, alpha=0.7)
ax_income.set_xlabel('Income (thousands)')
ax_income.set_ylabel('Count')
ax_income.set_title('Income Distribution by Default Status')
ax_income.legend()

# Bottom-left: debt-ratio distribution per outcome
ax_debt.hist([repaid['debt_ratio'], defaulted['debt_ratio']],
             bins=20, label=group_labels, color=group_colors, alpha=0.7)
ax_debt.set_xlabel('Debt Ratio')
ax_debt.set_ylabel('Count')
ax_debt.set_title('Debt Ratio Distribution by Default Status')
ax_debt.legend()

# Bottom-right: observed default rate for each length of credit history
default_rate_by_history = df.groupby('credit_history_years')['default'].mean()
ax_history.bar(default_rate_by_history.index, default_rate_by_history.values, color='steelblue')
ax_history.set_xlabel('Credit History (years)')
ax_history.set_ylabel('Default Rate')
ax_history.set_title('Default Rate by Credit History')

plt.tight_layout()
plt.show()

## 2. Theory: The Sigmoid Function

The logistic (sigmoid) function maps any real number to the (0, 1) interval:

$$\sigma(z) = \frac{1}{1 + e^{-z}}$$

In [None]:
def sigmoid(z):
    """Compute the logistic sigmoid 1 / (1 + exp(-z)) in a numerically stable way.

    Parameters
    ----------
    z : scalar or array-like
        Real-valued input(s).

    Returns
    -------
    NumPy float scalar for scalar input, otherwise an ndarray with the same
    shape as ``z``, with values in [0, 1].
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) lies in (0, 1], so it can never overflow; the naive
    # exp(-z) overflows (RuntimeWarning) once z drops below about -709.
    decay = np.exp(-np.abs(z))
    # For z >= 0: 1 / (1 + e^-z).  For z < 0 the algebraically equal form
    # e^z / (1 + e^z) is used, where e^z == exp(-|z|).
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # arrays with one or more dimensions as arrays.
    return result[()]

# Plot the sigmoid over a symmetric range around zero
z = np.linspace(-8, 8, 100)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(z, sigmoid(z), linewidth=3, color='purple')

# Dashed guides through the midpoint: sigmoid(0) = 0.5
ax.axhline(y=0.5, color='gray', linestyle='--', alpha=0.7)
ax.axvline(x=0, color='gray', linestyle='--', alpha=0.7)

ax.set_xlabel('z')
ax.set_ylabel('sigmoid(z)')
ax.set_title('Sigmoid Function')
ax.grid(True, alpha=0.3)
plt.show()

## 3. Implementation from Scratch

In [None]:
# Prepare data
X = df[['income', 'debt_ratio', 'credit_history_years', 'num_accounts']].values
y = df['default'].values

# Train-test split FIRST, on the raw features. Splitting before scaling keeps
# the test rows out of the scaler's mean/std estimates (no data leakage).
# Same test_size and random_state as before, so the same rows land in each set.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features: fit on the training rows only, then apply the same
# transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full dataset on the training scale, kept so any code that still refers to
# X_scaled continues to work.
X_scaled = scaler.transform(X)

print(f'Training set: {X_train.shape[0]} samples')
print(f'Test set: {X_test.shape[0]} samples')

In [None]:
def cross_entropy_loss(y_true, y_pred):
    """Return the mean binary cross-entropy between labels and probabilities.

    Predicted probabilities are clipped to [1e-15, 1 - 1e-15] before taking
    logarithms so that log(0) never occurs.
    """
    tiny = 1e-15
    probs = np.clip(y_pred, tiny, 1 - tiny)
    positive_term = y_true * np.log(probs)            # contribution of y = 1 rows
    negative_term = (1 - y_true) * np.log(1 - probs)  # contribution of y = 0 rows
    return -np.mean(positive_term + negative_term)

def fit_logistic_gd(X, y, learning_rate=0.1, n_iterations=1000, tol=1e-6):
    """Estimate logistic-regression weights with batch gradient descent.

    A column of ones is prepended to ``X`` for the intercept and all weights
    start at zero. Each iteration records the cross-entropy loss; the loop
    stops early (printing the iteration number) once the absolute change in
    loss between two consecutive iterations falls below ``tol``.

    Returns
    -------
    w : ndarray of length ``X.shape[1] + 1`` (intercept first)
    losses : list with one loss value per executed iteration
    """
    n_rows, n_features = X.shape

    # Design matrix: intercept column followed by the features
    design = np.column_stack([np.ones(n_rows), X])

    w = np.zeros(n_features + 1)
    losses = []

    for step in range(n_iterations):
        # Predicted probabilities under the current weights
        probs = sigmoid(design @ w)

        # Track the loss before updating the weights
        losses.append(cross_entropy_loss(y, probs))

        # Convergence check needs two losses, hence step > 0
        if step > 0 and abs(losses[-2] - losses[-1]) < tol:
            print(f'Converged at iteration {step}')
            break

        # Gradient of the mean cross-entropy with respect to the weights
        residual = probs - y
        gradient = (1 / n_rows) * design.T @ residual

        # Gradient-descent step
        w = w - learning_rate * gradient

    return w, losses

# Train the from-scratch model on the training split
w_gd, losses = fit_logistic_gd(X_train, y_train, learning_rate=0.5, n_iterations=500, tol=1e-6)

final_loss = losses[-1]
print(f'Final loss: {final_loss:.4f}')
print(f'Iterations: {len(losses)}')

# w_gd[0] is the intercept; the remaining entries follow the feature order
print(f'\nLearned weights:')
print(f'  Intercept: {w_gd[0]:.4f}')
gd_feature_labels = ['income', 'debt_ratio', 'credit_history', 'num_accounts']
for name, weight in zip(gd_feature_labels, w_gd[1:]):
    print(f'  {name}: {weight:.4f}')

In [None]:
# Loss per gradient-descent iteration
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(losses)
ax.set_xlabel('Iteration')
ax.set_ylabel('Cross-Entropy Loss')
ax.set_title('Gradient Descent Convergence')
ax.grid(True, alpha=0.3)
plt.show()

## 4. Using scikit-learn

In [None]:
# Train scikit-learn's logistic regression on the same training split
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)

# Hard class predictions and the predicted probability of class 1 (default)
y_pred = lr.predict(X_test)
test_probabilities = lr.predict_proba(X_test)
y_proba = test_probabilities[:, 1]

print('scikit-learn coefficients:')
print(f'  Intercept: {lr.intercept_[0]:.4f}')
sk_feature_labels = ['income', 'debt_ratio', 'credit_history', 'num_accounts']
for position, name in enumerate(sk_feature_labels):
    coef = lr.coef_[0][position]
    print(f'  {name}: {coef:.4f}')

## 5. Model Evaluation

In [None]:
# Per-class precision / recall / F1 on the test set
print('Classification Report:')
class_labels = ['No Default', 'Default']
report_text = classification_report(y_test, y_pred, target_names=class_labels)
print(report_text)

In [None]:
# Confusion matrix: rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.imshow(cm, cmap='Blues')
fig.colorbar(heatmap, ax=ax)

# Write the count into each of the four cells (x = column, y = row)
for cell in range(4):
    row, col = divmod(cell, 2)
    ax.text(col, row, str(cm[row, col]), ha='center', va='center', fontsize=20)

ax.set_xticks([0, 1])
ax.set_xticklabels(['Predicted: No Default', 'Predicted: Default'])
ax.set_yticks([0, 1])
ax.set_yticklabels(['Actual: No Default', 'Actual: Default'])
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# ROC curve: true-positive rate against false-positive rate over all thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color='blue', linewidth=2, label=f'ROC Curve (AUC = {roc_auc:.3f})')
# Diagonal = performance of random guessing
ax.plot([0, 1], [0, 1], color='gray', linestyle='--', label='Random Classifier')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Precision-recall curve over all probability thresholds
precision, recall, pr_thresholds = precision_recall_curve(y_test, y_proba)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(recall, precision, color='green', linewidth=2)
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.grid(True, alpha=0.3)
plt.show()

## 6. Coefficient Interpretation

In [None]:
# Odds ratio = exp(coefficient); features were standardized, so one unit = one std
feature_names = ['income', 'debt_ratio', 'credit_history', 'num_accounts']
odds_ratios = np.exp(lr.coef_[0])

print('Odds Ratios (per 1 std increase in feature):')
for idx, name in enumerate(feature_names):
    coef = lr.coef_[0][idx]
    or_val = odds_ratios[idx]
    if coef > 0:
        direction = 'increases'
    else:
        direction = 'decreases'
    # Percentage change in the odds relative to an odds ratio of 1
    change = abs(or_val - 1) * 100
    print(f'  {name}: OR = {or_val:.3f} ({direction} odds by {change:.1f}%)')

In [None]:
# Horizontal bar chart of the standardized coefficients
fig, ax = plt.subplots(figsize=(10, 5))
coefficients = lr.coef_[0]
# Green bars lower the default probability, red bars raise it
colors = np.where(coefficients < 0, 'green', 'red').tolist()
ax.barh(feature_names, coefficients, color=colors)
ax.set_xlabel('Coefficient Value')
ax.set_title('Logistic Regression Coefficients (Standardized)')
ax.axvline(x=0, color='black', linewidth=0.5)
plt.show()

for line in (
    '\nInterpretation:',
    '  - Positive coefficient = increases probability of default',
    '  - Negative coefficient = decreases probability of default',
):
    print(line)

## 7. Threshold Selection

In [None]:
# Try different decision thresholds on the predicted default probability
thresholds_to_try = [0.3, 0.4, 0.5, 0.6, 0.7]

print('Threshold | Accuracy | Precision | Recall | F1')
print('-' * 55)

for thresh in thresholds_to_try:
    # Predict "default" whenever the probability reaches the threshold
    y_pred_thresh = (y_proba >= thresh).astype(int)
    acc = accuracy_score(y_test, y_pred_thresh)
    # At high thresholds there may be no positive predictions at all, which
    # makes precision (and with it F1) undefined. zero_division=0 is passed to
    # every ratio metric so they all report 0 without emitting warnings.
    prec = precision_score(y_test, y_pred_thresh, zero_division=0)
    rec = recall_score(y_test, y_pred_thresh, zero_division=0)
    f1 = f1_score(y_test, y_pred_thresh, zero_division=0)
    print(f'   {thresh:.1f}    |  {acc:.3f}   |   {prec:.3f}   | {rec:.3f}  | {f1:.3f}')

## Exercises

### Exercise 1: Implement Accuracy from Scratch
Write a function that computes accuracy directly from the true and predicted labels, without using sklearn.

In [None]:
# Solution: Implement accuracy from scratch
def accuracy_from_scratch(y_true, y_pred):
    """Calculate accuracy (fraction of matching labels) without sklearn.

    Parameters
    ----------
    y_true : sized sequence of true labels
    y_pred : iterable of predicted labels, same length as ``y_true``

    Returns
    -------
    float in [0, 1]: number of positions where the labels agree divided by
    ``len(y_true)``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have different lengths.
    ZeroDivisionError
        If ``y_true`` is empty.
    """
    # Materialize so that generators are accepted and the length can be checked
    y_pred = list(y_pred)
    if len(y_true) != len(y_pred):
        # zip() would silently drop the surplus items and return a wrong score
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {len(y_true)} and {len(y_pred)})"
        )
    correct = sum(1 for t, p in zip(y_true, y_pred) if t == p)
    return correct / len(y_true)

# Both numbers should agree to four decimals
acc_manual = accuracy_from_scratch(y_test, y_pred)
acc_reference = accuracy_score(y_test, y_pred)
print(f"Manual accuracy: {acc_manual:.4f}")
print(f"sklearn accuracy: {acc_reference:.4f}")

### Exercise 2: Handle Class Imbalance
Use class_weight='balanced' and compare results.

In [None]:
# Solution: Handle class imbalance with class_weight='balanced'
clf_balanced = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
clf_balanced.fit(X_train, y_train)
y_pred_balanced = clf_balanced.predict(X_test)
print("With class_weight='balanced':")
# Same class names as the first classification report, so the two are comparable
print(classification_report(y_test, y_pred_balanced, target_names=['No Default', 'Default']))

# Plain accuracy of both models. The labels name the model, not the metric:
# "balanced accuracy" is a different metric (mean per-class recall) and is
# not what accuracy_score computes.
print(f"\nAccuracy (unweighted model):        {accuracy_score(y_test, y_pred):.4f}")
print(f"Accuracy (class_weight='balanced'): {accuracy_score(y_test, y_pred_balanced):.4f}")

## Summary

Key takeaways from this notebook:

1. Logistic regression models binary outcomes using the sigmoid function
2. Maximum likelihood estimation defines the objective (equivalently, minimizing cross-entropy loss); gradient descent is one way to find the optimal parameters
3. Evaluation requires multiple metrics: accuracy, precision, recall, F1, AUC
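4. Coefficients are interpretable as the change in log-odds per unit increase in a feature (here: per standard deviation); exponentiate them to get odds ratios for business communication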