In [2]:
# Q1

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

np.random.seed(42)

n_samples = 300
base = np.random.rand(n_samples, 1)

X = np.hstack([
    base,
    base + 0.01 * np.random.randn(n_samples, 1),
    base + 0.02 * np.random.randn(n_samples, 1),
    base + 0.03 * np.random.randn(n_samples, 1),
    base + 0.04 * np.random.randn(n_samples, 1),
    base + 0.05 * np.random.randn(n_samples, 1),
    base + 0.06 * np.random.randn(n_samples, 1)
])

true_w = np.array([3, -2, 1.5, 0.5, 4, -1, 2])
y = X.dot(true_w) + 0.5 * np.random.randn(n_samples)

scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

y = y.reshape(-1, 1)
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y).flatten()

def ridge_gradient_descent(X, y, lr, lam, n_iters=1000):
    m, n = X.shape
    w = np.zeros(n)
    b = 0.0
    for _ in range(n_iters):
        y_pred = X.dot(w) + b
        error = y_pred - y
        dw = (1/m) * (X.T.dot(error)) + (lam/m) * w
        db = (1/m) * np.sum(error)
        w -= lr * dw
        b -= lr * db
        # Check for NaN values and break if found
        if np.isnan(w).any() or np.isnan(b) or np.isinf(w).any() or np.isinf(b):
            return np.full(n, np.nan), np.nan, np.inf, -np.inf # Return NaN for weights/bias, inf for cost, -inf for R2

    y_pred_final = X.dot(w) + b
    cost = (1/(2*m)) * np.sum((y_pred_final - y) ** 2) + (lam/(2*m)) * np.sum(w ** 2)
    r2 = r2_score(y, y_pred_final)
    return w, b, cost, r2

learning_rates = [0.0001, 0.001, 0.01, 0.1, 1, 10]
lambdas = [1e-15, 1e-10, 1e-5, 1e-3, 0, 1, 10, 20]

best_config = None
best_r2 = -np.inf
best_cost = np.inf

for lr in learning_rates:
    for lam in lambdas:
        w, b, cost, r2 = ridge_gradient_descent(X_scaled, y_scaled, lr, lam, n_iters=2000)
        # Only consider valid results for comparison
        if not np.isnan(r2) and ((r2 > best_r2) or (r2 == best_r2 and cost < best_cost)):
            best_r2 = r2
            best_cost = cost
            best_config = (lr, lam, w, b)

print("Best learning rate:", best_config[0])
print("Best lambda:", best_config[1])
print("Best cost:", best_cost)
print("Best R2 score:", best_r2)


Best learning rate: 0.1
Best lambda: 1e-15
Best cost: 0.020455433522743573
Best R2 score: 0.9590891329545128


In [7]:
# Q2

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error

# (a) Load + preprocess
url = "https://gist.githubusercontent.com/keeganhines/59974f1ebef97bbaa44fb19143f90bad/raw/Hitters.csv"
data = pd.read_csv(url)

data = data.dropna()
X = data.drop(columns=["Salary"])
X = pd.get_dummies(X, drop_first=True)
y = data["Salary"]

# (b) Split + scale
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# (c) Train & evaluate
lin = LinearRegression()
lin.fit(X_train_s, y_train)
y_pred_lin = lin.predict(X_test_s)

ridge = Ridge(alpha=1)
ridge.fit(X_train_s, y_train)
y_pred_ridge = ridge.predict(X_test_s)

lasso = Lasso(alpha=0.1)
lasso.fit(X_train_s, y_train)
y_pred_lasso = lasso.predict(X_test_s)

print("Linear R2:", r2_score(y_test, y_pred_lin))
print("Ridge R2:", r2_score(y_test, y_pred_ridge))
print("Lasso R2:", r2_score(y_test, y_pred_lasso))

print("Linear MSE:", mean_squared_error(y_test, y_pred_lin))
print("Ridge MSE:", mean_squared_error(y_test, y_pred_ridge))
print("Lasso MSE:", mean_squared_error(y_test, y_pred_lasso))


Linear R2: 0.16769360190025295
Ridge R2: 0.17104747376757878
Lasso R2: 0.11422403850131013
Linear MSE: 150540.93304991836
Ridge MSE: 149934.3114963777
Lasso MSE: 160212.08057711055


  model = cd_fast.enet_coordinate_descent(


In [10]:
# Q3

import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import r2_score, mean_squared_error

# Load the California housing dataset as an alternative to Boston
housing = fetch_california_housing()
X = housing.data
y = housing.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

ridge_alphas = [0.01, 0.1, 1, 10, 100]
ridge_cv = RidgeCV(alphas=ridge_alphas, cv=5, scoring="r2")
ridge_cv.fit(X_train_scaled, y_train)
y_pred_ridge_cv = ridge_cv.predict(X_test_scaled)

print("RidgeCV best alpha:", ridge_cv.alpha_)
print("RidgeCV R2:", r2_score(y_test, y_pred_ridge_cv))
print("RidgeCV MSE:", mean_squared_error(y_test, y_pred_ridge_cv))
print("-" * 30)

lasso_cv = LassoCV(alphas=None, cv=5, max_iter=10000, random_state=42)
lasso_cv.fit(X_train_scaled, y_train)
y_pred_lasso_cv = lasso_cv.predict(X_test_scaled)

print("LassoCV best alpha:", lasso_cv.alpha_)
print("LassoCV R2:", r2_score(y_test, y_pred_lasso_cv))
print("LassoCV MSE:", mean_squared_error(y_test, y_pred_lasso_cv))


RidgeCV best alpha: 0.01
RidgeCV R2: 0.5757879873121596
RidgeCV MSE: 0.5558912301037886
------------------------------
LassoCV best alpha: 0.000798519564426035
LassoCV R2: 0.5766495309609692
LassoCV MSE: 0.554762255571242


In [11]:
# Q4

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

classes = np.unique(y_train)
m, n = X_train_s.shape

W = np.zeros((len(classes), n))
b = np.zeros(len(classes))

lr = 0.1
iters = 2000

for i, c in enumerate(classes):
    y_c = (y_train == c).astype(int)
    w = np.zeros(n)
    bi = 0.0
    for _ in range(iters):
        z = X_train_s @ w + bi
        p = sigmoid(z)
        error = p - y_c
        dw = (1 / m) * (X_train_s.T @ error)
        db = (1 / m) * error.sum()
        w -= lr * dw
        bi -= lr * db
    W[i] = w
    b[i] = bi

z_test = X_test_s @ W.T + b
probs = sigmoid(z_test)
y_pred = np.argmax(probs, axis=1)

print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9
