<a href="https://colab.research.google.com/github/AxinLi1/CS436_quiz4/blob/main/ridge_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [4]:
# Synthetic binary-classification data: 1000 samples with 100 features, of
# which 10 are informative and 5 are redundant combinations of those.
X, y = make_classification(n_samples=1000, n_features=100, n_informative=10,
                           n_redundant=5, random_state=42)

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---- Model WITHOUT regularization (prone to overfitting) ----
# penalty=None disables the penalty term, so nothing constrains the size of
# the fitted coefficients.
vanilla_model = LogisticRegression(penalty=None, max_iter=1000, random_state=42)
vanilla_model.fit(X_train_scaled, y_train)

# ---- Model WITH L2 regularization (counteracts overfitting) ----
# L2-regularized ("ridge"-style) logistic regression.
#
# The L2 penalty counteracts overfitting by penalizing large coefficients,
# which pushes the optimizer toward smaller, more general weights. The
# regularized model gives up a little training accuracy in exchange for better
# test accuracy, whereas the unregularized model fits the training data more
# closely but generalizes worse. Constraining the weights keeps the model from
# overcomplicating itself to fit noise.
#
# C is the inverse of the regularization strength, so C=0.1 applies a stronger
# penalty than scikit-learn's default of C=1.0.
ridge_model = LogisticRegression(
    penalty='l2',
    C=0.1,
    max_iter=1000,
    random_state=42
)
ridge_model.fit(X_train_scaled, y_train)

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Report a fitted classifier's accuracy on the training and test splits.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_train: Feature matrix of the training split.
        X_test: Feature matrix of the test split.
        y_train: True labels for ``X_train``.
        y_test: True labels for ``X_test``.

    Returns:
        dict: ``{'Train Accuracy': ..., 'Test Accuracy': ...}`` where each
        value is the result of ``accuracy_score`` for that split.
    """
    # Predict on both splits first, then score them, training split first.
    predictions = [model.predict(features) for features in (X_train, X_test)]
    labels = ('Train Accuracy', 'Test Accuracy')
    targets = (y_train, y_test)
    return {
        label: accuracy_score(truth, guess)
        for label, truth, guess in zip(labels, targets, predictions)
    }

# Score both fitted classifiers on the same standardized splits so their
# train/test accuracies are directly comparable.
vanilla_results, ridge_results = (
    evaluate_model(clf, X_train_scaled, X_test_scaled, y_train, y_test)
    for clf in (vanilla_model, ridge_model)
)

# Print the unregularized model first, then the L2-regularized one.
for report_label, report_scores in (
    ("Vanilla Model (No Regularization):", vanilla_results),
    ("Ridge Model (L2 Regularization):", ridge_results),
):
    print(report_label, report_scores)

Vanilla Model (No Regularization): {'Train Accuracy': 0.8885714285714286, 'Test Accuracy': 0.8233333333333334}
Ridge Model (L2 Regularization): {'Train Accuracy': 0.8771428571428571, 'Test Accuracy': 0.8333333333333334}
