# Assignment 5 - Regression Solutions
This notebook contains solutions for all questions (Q1-Q4) in Lab Assignment 5.

In [None]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Silence every warning category for the rest of the session to keep cell output
# uncluttered. Be aware that this also hides deprecation and convergence warnings.
warnings.simplefilter('ignore')

print("Libraries imported successfully!")

---
# Q1: Ridge Regression using Gradient Descent Optimization
Generate a dataset with at least 7 highly correlated columns and implement Ridge Regression using Gradient Descent.

## Step 1: Generate Dataset with Highly Correlated Features

In [None]:
np.random.seed(42)

def generate_correlated_data(n_samples=1000, n_features=7):
    """Build a synthetic regression problem with strongly correlated predictors.

    Every column is the same standard-normal base signal plus its own
    independent Gaussian jitter (std 0.3), so all columns are highly correlated
    with one another. The target is a random linear combination of the columns
    (weights drawn from a normal with std 2) plus Gaussian noise (std 0.5).

    All draws come from NumPy's global random state; call ``np.random.seed``
    beforehand for reproducible output.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    n_features : int
        Number of correlated columns to generate.

    Returns
    -------
    X : ndarray of shape (n_samples, n_features)
    y : ndarray of shape (n_samples,)
    """
    shared_signal = np.random.standard_normal((n_samples, 1))

    features = np.empty((n_samples, n_features))
    for col in range(n_features):
        jitter = 0.3 * np.random.standard_normal((n_samples, 1))
        features[:, col] = (shared_signal + jitter).ravel()

    weights = 2 * np.random.standard_normal((n_features, 1))
    target_noise = 0.5 * np.random.standard_normal((n_samples, 1))
    target = features @ weights + target_noise

    return features, target.ravel()

X, y = generate_correlated_data(n_samples=1000, n_features=7)

print(f"Dataset Shape: X = {X.shape}, y = {y.shape}")
print("\nCorrelation Matrix:")
# np.corrcoef treats rows as variables, hence the transpose of the (samples, features) matrix.
corr_matrix = np.corrcoef(X.T)
# Build the labels from the real column count so that changing n_features above
# cannot leave the labels out of step with the matrix shape.
feature_labels = [f'F{i+1}' for i in range(X.shape[1])]
print(pd.DataFrame(corr_matrix.round(3),
                   columns=feature_labels,
                   index=feature_labels))

## Step 2: Implement Ridge Regression Cost Function and Gradient Descent

In [None]:
def ridge_cost(X, y, theta, lambda_reg):
    """Ridge objective: halved mean squared error plus an L2 penalty.

    ``theta[0]` ` is treated as the intercept and is excluded from the penalty;
    only ``theta[1:]`` is shrunk.

    Parameters
    ----------
    X : array of shape (m, n) - design matrix (first column is the bias column).
    y : array of shape (m,) - targets.
    theta : array of shape (n,) - current parameters.
    lambda_reg : float - regularisation strength.

    Returns
    -------
    float
        ``(1/2m) * sum(residual^2) + (lambda/2m) * sum(theta[1:]^2)``.
    """
    m = len(y)
    residuals = X @ theta - y
    data_term = (1/(2*m)) * np.square(residuals).sum()
    shrinkage_term = (lambda_reg/(2*m)) * np.square(theta[1:]).sum()
    return data_term + shrinkage_term

def ridge_gradient_descent(X, y, learning_rate, lambda_reg, n_iterations=1000):
    """Fit ridge regression by batch gradient descent, starting from zeros.

    Parameters
    ----------
    X : array of shape (m, n) - design matrix whose first column is the bias column.
    y : array of shape (m,) - targets.
    learning_rate : float - step size applied to every update.
    lambda_reg : float - L2 regularisation strength (not applied to ``theta[0]``).
    n_iterations : int - number of full-batch updates to perform.

    Returns
    -------
    theta : ndarray of shape (n,) - parameters after the last update.
    cost_history : list of float - ridge cost evaluated after each update.
    """
    m, n = X.shape
    theta = np.zeros(n)
    cost_history = []

    for _ in range(n_iterations):
        residuals = X @ theta - y
        step = (1/m) * (X.T @ residuals)

        # The L2 shrinkage applies to every weight except the intercept at index 0.
        step[1:] += (lambda_reg/m) * theta[1:]

        theta = theta - learning_rate * step
        cost_history.append(ridge_cost(X, y, theta, lambda_reg))

    return theta, cost_history

def calculate_r2(y_true, y_pred):
    """Coefficient of determination, R^2 = 1 - SS_res / SS_tot.

    Parameters
    ----------
    y_true : array-like of shape (m,) - observed targets.
    y_pred : array-like of shape (m,) - predicted targets.

    Returns
    -------
    float
        The R^2 score. When ``y_true`` is constant (``SS_tot == 0``) the ratio
        is undefined; in that case 1.0 is returned for a perfect prediction and
        0.0 otherwise, instead of nan / -inf.
    """
    # Accept lists and other array-likes, not only ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    ss_res = np.sum((y_true - y_pred)**2)
    ss_tot = np.sum((y_true - np.mean(y_true))**2)

    if ss_tot == 0:
        # Constant target: avoid dividing by zero and return a finite score.
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)

print("Ridge Regression functions defined!")

## Step 3: Test Different Learning Rates and Regularization Parameters

In [None]:
# Hold out 20% of the rows for testing (fixed random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise using statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Prepend a column of ones so that theta[0] acts as the intercept
X_train_b = np.hstack([np.ones((X_train_scaled.shape[0], 1)), X_train_scaled])
X_test_b = np.hstack([np.ones((X_test_scaled.shape[0], 1)), X_test_scaled])

print(f"Training set: {X_train_b.shape}")
print(f"Test set: {X_test_b.shape}")

In [None]:
# Define hyperparameter grid
learning_rates = [0.0001, 0.001, 0.01, 0.1]
lambda_values = [1e-15, 1e-10, 1e-5, 1e-3, 0, 1, 10, 20]

results = []
skipped = []  # (learning_rate, lambda, reason) for every combination left out of `results`

print("Testing hyperparameter combinations...\n")

for lr in learning_rates:
    for lam in lambda_values:
        try:
            theta, cost_history = ridge_gradient_descent(X_train_b, y_train, lr, lam, n_iterations=1000)
        except (FloatingPointError, OverflowError) as exc:
            # Only numerical blow-ups are tolerated here. Anything else (shape errors,
            # KeyboardInterrupt, ...) is a real problem and must surface.
            skipped.append((lr, lam, f"{type(exc).__name__}: {exc}"))
            continue

        y_train_pred = X_train_b @ theta
        y_test_pred = X_test_b @ theta

        train_r2 = calculate_r2(y_train, y_train_pred)
        test_r2 = calculate_r2(y_test, y_test_pred)
        final_cost = cost_history[-1]

        # A diverged run ends with a nan/inf cost; keep it out of the comparison table.
        if not np.isfinite(final_cost):
            skipped.append((lr, lam, "non-finite final cost"))
            continue

        results.append({
            'learning_rate': lr,
            'lambda': lam,
            'final_cost': final_cost,
            'train_r2': train_r2,
            'test_r2': test_r2
        })

results_df = pd.DataFrame(results)
print(f"Tested {len(results)} valid combinations")
for skipped_lr, skipped_lam, reason in skipped:
    print(f"Skipped learning_rate={skipped_lr}, lambda={skipped_lam}: {reason}")

In [None]:
# Pick the winning rows: lowest final training cost and highest test R2.
# NOTE(review): selecting on test R2 uses the held-out split for model selection,
# so the reported test score is optimistic.
best_by_cost = results_df.loc[results_df['final_cost'].idxmin()]
best_by_r2 = results_df.loc[results_df['test_r2'].idxmax()]

banner = "="*60
print(banner)
print("BEST PARAMETERS")
print(banner)

# Same four-line report for each selection criterion
for heading, best in (("\nBy Minimum Cost:", best_by_cost),
                      ("\nBy Maximum R2 Score:", best_by_r2)):
    print(heading)
    print(f"  Learning Rate: {best['learning_rate']}")
    print(f"  Lambda: {best['lambda']}")
    print(f"  Final Cost: {best['final_cost']:.6f}")
    print(f"  Test R2: {best['test_r2']:.4f}")

In [None]:
# Show the ten hyperparameter combinations with the highest test R2
top_by_test_r2 = results_df.nlargest(n=10, columns='test_r2')
print("\nTop 10 Results by Test R2:")
print(top_by_test_r2.to_string(index=False))

---
# Q2: Hitters Dataset - Linear, Ridge, and LASSO Regression

## (a) Load and Preprocess the Data

In [None]:
# Load the Hitters dataset from the working directory. When the file is absent,
# fall back to synthetic Hitters-like data so the rest of the notebook still runs.
try:
    hitters = pd.read_csv('Hitters.csv')
except FileNotFoundError:
    # Only a missing file triggers the fallback; a malformed CSV or any other
    # error is raised so it cannot be silently mistaken for "no data".
    print("Hitters.csv not found - using synthetic Hitters-like data instead.")

    # Create synthetic Hitters-like data (seeded so every run yields the same frame)
    np.random.seed(42)
    n = 300
    hitters = pd.DataFrame({
        'AtBat': np.random.randint(100, 600, n),
        'Hits': np.random.randint(50, 200, n),
        'HmRun': np.random.randint(0, 40, n),
        'Runs': np.random.randint(20, 120, n),
        'RBI': np.random.randint(20, 120, n),
        'Walks': np.random.randint(10, 100, n),
        'Years': np.random.randint(1, 20, n),
        'CAtBat': np.random.randint(100, 10000, n),
        'CHits': np.random.randint(50, 3000, n),
        'CHmRun': np.random.randint(0, 400, n),
        'CRuns': np.random.randint(20, 1500, n),
        'CRBI': np.random.randint(20, 1500, n),
        'CWalks': np.random.randint(10, 1000, n),
        'League': np.random.choice(['A', 'N'], n),
        'Division': np.random.choice(['E', 'W'], n),
        'PutOuts': np.random.randint(50, 1000, n),
        'Assists': np.random.randint(0, 500, n),
        'Errors': np.random.randint(0, 30, n),
        'NewLeague': np.random.choice(['A', 'N'], n)
    })
    # Salary is a linear function of Hits, HmRun and Years plus Gaussian noise
    hitters['Salary'] = (hitters['Hits'] * 2 + hitters['HmRun'] * 10 +
                         hitters['Years'] * 50 + np.random.randn(n) * 100 + 200)
    # Blank out 20 distinct salaries so the null-handling step has work to do
    null_indices = np.random.choice(n, 20, replace=False)
    hitters.loc[null_indices, 'Salary'] = np.nan

null_counts = hitters.isnull().sum()
print(f"Dataset shape: {hitters.shape}")
print(f"\nNull values:\n{null_counts[null_counts > 0]}")
hitters.head()

In [None]:
# Handle null values: drop every row that has at least one missing entry.
# The explicit .copy() makes hitters_clean an independent frame, so the column
# assignments below are unambiguous writes and never touch (or warn about) `hitters`.
hitters_clean = hitters.dropna().copy()
print(f"Shape after removing nulls: {hitters_clean.shape}")

# Encode categorical variables. Each column gets its own fitted encoder, kept in
# label_encoders, so the integer codes can be mapped back via .classes_ /
# .inverse_transform later.
label_encoders = {}
for col in ['League', 'Division', 'NewLeague']:
    if col in hitters_clean.columns:
        le = LabelEncoder()
        hitters_clean[col] = le.fit_transform(hitters_clean[col])
        label_encoders[col] = le

print("\nPreprocessing complete!")
hitters_clean.head()

## (b) Separate Features and Scale

In [None]:
# Target is Salary; every other column is a predictor
y_hitters = hitters_clean['Salary']
X_hitters = hitters_clean.drop(columns='Salary')

# 80/20 train/test split with a fixed seed
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
    X_hitters, y_hitters, test_size=0.2, random_state=42
)

# Standardise with statistics learned from the training split only
scaler_h = StandardScaler().fit(X_train_h)
X_train_h_scaled = scaler_h.transform(X_train_h)
X_test_h_scaled = scaler_h.transform(X_test_h)

print(f"Training set: {X_train_h_scaled.shape}")
print(f"Test set: {X_test_h_scaled.shape}")

## (c) Fit Linear, Ridge, and LASSO Regression

In [None]:
# Ordinary least squares baseline (fit() returns the fitted estimator itself)
linear_reg = LinearRegression().fit(X_train_h_scaled, y_train_h)

# Ridge (L2 penalty) with the regularisation strength given in the assignment
ridge_reg = Ridge(alpha=0.5748).fit(X_train_h_scaled, y_train_h)

# LASSO (L1 penalty) with the same regularisation strength
lasso_reg = Lasso(alpha=0.5748).fit(X_train_h_scaled, y_train_h)

print("All models trained successfully!")

## (d) Evaluate Model Performance

In [None]:
# Test-set predictions from each fitted model
y_pred_linear = linear_reg.predict(X_test_h_scaled)
y_pred_ridge = ridge_reg.predict(X_test_h_scaled)
y_pred_lasso = lasso_reg.predict(X_test_h_scaled)

# Model names and their predictions, in matching order
models = ['Linear', 'Ridge', 'LASSO']
predictions = [y_pred_linear, y_pred_ridge, y_pred_lasso]

banner = "="*60
print(banner)
print("MODEL PERFORMANCE COMPARISON")
print(banner)

# One record per model: MSE, its square root, and R2 on the held-out split
comparison = []
for name, y_pred in zip(models, predictions):
    mse = mean_squared_error(y_test_h, y_pred)
    row = {'Model': name, 'MSE': mse, 'RMSE': np.sqrt(mse), 'R2': r2_score(y_test_h, y_pred)}
    comparison.append(row)
    print(f"\n{name} Regression:")
    print(f"  MSE:  {row['MSE']:.4f}")
    print(f"  RMSE: {row['RMSE']:.4f}")
    print(f"  R2:   {row['R2']:.4f}")

comparison_df = pd.DataFrame(comparison)
# The model with the highest test R2 is reported as best
best_model = comparison_df.loc[comparison_df['R2'].idxmax(), 'Model']
print(f"\nBest Model: {best_model}")

In [None]:
# Visualization: left panel compares error/fit metrics, right panel compares coefficients
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot comparison.
# RMSE and R2 live on very different scales, so R2 gets its own (right-hand)
# y-axis; on a shared axis the R2 bars would be invisible next to RMSE.
x = np.arange(len(models))
width = 0.35

ax_rmse = axes[0]
ax_r2 = ax_rmse.twinx()

# Offsets of +/- width/2 centre each pair of bars on its tick
rmse_bars = ax_rmse.bar(x - width / 2, comparison_df['RMSE'], width, label='RMSE', color='coral')
r2_bars = ax_r2.bar(x + width / 2, comparison_df['R2'], width, label='R2', color='steelblue')

ax_rmse.set_xticks(x)
ax_rmse.set_xticklabels(models)
ax_rmse.set_ylabel('RMSE')
ax_r2.set_ylabel('R2')
ax_rmse.set_title('Model Comparison')
# One combined legend for the bars drawn on both y-axes
ax_rmse.legend(handles=[rmse_bars, r2_bars])

# Coefficient comparison
coef_df = pd.DataFrame({
    'Feature': X_hitters.columns,
    'Linear': linear_reg.coef_,
    'Ridge': ridge_reg.coef_,
    'LASSO': lasso_reg.coef_
})

coef_df.set_index('Feature')[['Linear', 'Ridge', 'LASSO']].plot(kind='bar', ax=axes[1])
axes[1].set_title('Coefficient Comparison')
axes[1].set_ylabel('Coefficient (standardised features)')
axes[1].legend()
# Rotate labels on the coefficient panel explicitly instead of relying on
# pyplot's "current axes", which is the twin axis created above.
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("\nAnalysis:")
print("- Ridge shrinks coefficients but keeps all features")
print("- LASSO can set some coefficients to zero (feature selection)")
print("- Linear regression may overfit with many correlated features")

---
# Q3: Cross Validation for Ridge and Lasso (RidgeCV & LassoCV)

In [None]:
# California Housing is used because the Boston dataset is deprecated.
# fetch_california_housing is imported with the other dependencies in the first
# cell; it downloads the data if it is not already cached locally.
housing = fetch_california_housing()
X_housing = housing.data
y_housing = housing.target

print(f"Dataset Shape: {X_housing.shape}")
print(f"Features: {housing.feature_names}")

# 80/20 train/test split with a fixed seed
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    X_housing, y_housing, test_size=0.2, random_state=42
)

# Standardise with statistics learned from the training split only
scaler_cv = StandardScaler()
X_train_cv_scaled = scaler_cv.fit_transform(X_train_cv)
X_test_cv_scaled = scaler_cv.transform(X_test_cv)

In [None]:
# Candidate regularisation strengths: 100 values, log-spaced from 1e-6 to 1e6
alphas = np.logspace(-6, 6, 100)

# RidgeCV selects the alpha with the best 5-fold cross-validation score
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train_cv_scaled, y_train_cv)

ridge_train_r2 = ridge_cv.score(X_train_cv_scaled, y_train_cv)
ridge_test_r2 = ridge_cv.score(X_test_cv_scaled, y_test_cv)

banner = "="*60
print(banner)
print("RIDGE CROSS VALIDATION (RidgeCV)")
print(banner)
print(f"Best Alpha: {ridge_cv.alpha_:.6f}")
print(f"Train R2: {ridge_train_r2:.4f}")
print(f"Test R2: {ridge_test_r2:.4f}")

In [None]:
# LassoCV searches the same alpha grid with 5-fold cross-validation
lasso_cv = LassoCV(alphas=alphas, cv=5, max_iter=10000)
lasso_cv.fit(X_train_cv_scaled, y_train_cv)

lasso_train_r2 = lasso_cv.score(X_train_cv_scaled, y_train_cv)
lasso_test_r2 = lasso_cv.score(X_test_cv_scaled, y_test_cv)
# Coefficients that LASSO did not shrink all the way to zero
n_selected = np.count_nonzero(lasso_cv.coef_)

banner = "="*60
print(banner)
print("LASSO CROSS VALIDATION (LassoCV)")
print(banner)
print(f"Best Alpha: {lasso_cv.alpha_:.6f}")
print(f"Train R2: {lasso_train_r2:.4f}")
print(f"Test R2: {lasso_test_r2:.4f}")
print(f"\nNumber of non-zero coefficients: {n_selected}/{len(lasso_cv.coef_)}")

In [None]:
# Side-by-side summary of the two cross-validated models
print("\n" + "="*60)
print("COMPARISON: RidgeCV vs LassoCV")
print("="*60)

# One record per model; key order fixes the column order of the table
cv_rows = [
    {
        'Model': label,
        'Best Alpha': estimator.alpha_,
        'Train R2': estimator.score(X_train_cv_scaled, y_train_cv),
        'Test R2': estimator.score(X_test_cv_scaled, y_test_cv),
    }
    for label, estimator in (('RidgeCV', ridge_cv), ('LassoCV', lasso_cv))
]
cv_comparison = pd.DataFrame(cv_rows)
print(cv_comparison.to_string(index=False))

---
# Q4: Multiclass Logistic Regression (One-vs-Rest)

In [None]:
# Load the Iris measurements and their integer class labels
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

print(f"Dataset Shape: {X_iris.shape}")
print(f"Classes: {iris.target_names}")

# Stratified 80/20 split keeps the class proportions equal in both parts
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris, y_iris, test_size=0.2, random_state=42, stratify=y_iris
)

# Standardise with statistics learned from the training split only
scaler_iris = StandardScaler().fit(X_train_iris)
X_train_iris_scaled = scaler_iris.transform(X_train_iris)
X_test_iris_scaled = scaler_iris.transform(X_test_iris)

## Step-by-Step One-vs-Rest Implementation

In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise.

    The input is clipped to [-500, 500] before exponentiation so that
    ``np.exp`` cannot overflow for extreme values.
    """
    bounded = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-bounded))

def logistic_cost(X, y, theta):
    """Mean binary cross-entropy of the logistic model ``sigmoid(X @ theta)``.

    Parameters
    ----------
    X : array of shape (m, n) - design matrix.
    y : array of shape (m,) - binary labels (0 or 1).
    theta : array of shape (n,) - model parameters.

    Returns
    -------
    float
        Average negative log-likelihood over the m samples.
    """
    m = len(y)
    h = sigmoid(X @ theta)
    epsilon = 1e-15  # keeps log() away from an exact zero argument
    positive_term = y * np.log(h + epsilon)
    negative_term = (1 - y) * np.log(1 - h + epsilon)
    return -(1/m) * np.sum(positive_term + negative_term)

def logistic_gradient_descent(X, y, learning_rate=0.1, n_iterations=1000):
    """Fit a binary logistic regression by batch gradient descent.

    Parameters start at zero and receive ``n_iterations`` full-batch updates;
    no regularisation and no early stopping are applied.

    Parameters
    ----------
    X : array of shape (m, n) - design matrix.
    y : array of shape (m,) - binary labels (0 or 1).
    learning_rate : float - step size for every update.
    n_iterations : int - number of updates to perform.

    Returns
    -------
    ndarray of shape (n,)
        The parameters after the final update.
    """
    m, n = X.shape
    theta = np.zeros(n)

    for _ in range(n_iterations):
        prediction_error = sigmoid(X @ theta) - y
        gradient = (1/m) * (X.T @ prediction_error)
        theta -= learning_rate * gradient

    return theta

print("Logistic regression functions defined!")

In [None]:
# One-vs-Rest: fit one binary classifier per distinct class label
classes = np.unique(y_train_iris)
n_classes = len(classes)

# Prepend a column of ones so that theta[0] acts as the intercept.
# Note: these names are also assigned in the Q1 cells and are overwritten here.
X_train_b = np.hstack([np.ones((X_train_iris_scaled.shape[0], 1)), X_train_iris_scaled])
X_test_b = np.hstack([np.ones((X_test_iris_scaled.shape[0], 1)), X_test_iris_scaled])

# Maps class label -> fitted parameter vector
classifiers = {}

banner = "="*60
print(banner)
print("TRAINING ONE-VS-REST CLASSIFIERS")
print(banner)

for c in classes:
    # 1 for samples of class c, 0 for every other class
    y_binary = (y_train_iris == c).astype(int)

    # Fit the "class c vs rest" model and store its parameters
    theta = logistic_gradient_descent(X_train_b, y_binary, learning_rate=0.5, n_iterations=2000)
    classifiers[c] = theta

    # Binary training accuracy of this single classifier at a 0.5 threshold
    train_pred = (sigmoid(X_train_b @ theta) >= 0.5).astype(int)
    train_acc = (train_pred == y_binary).mean()
    print(f"Class {c} ({iris.target_names[c]}): Training Accuracy = {train_acc:.4f}")

In [None]:
# Make predictions using One-vs-Rest
def predict_ovr(X, classifiers):
    """Predict class labels with a One-vs-Rest ensemble.

    Each binary classifier scores every sample with ``sigmoid(X @ theta)``;
    the label of the highest-scoring classifier is returned for each sample.

    Parameters
    ----------
    X : array of shape (m, n) - design matrix (including the bias column).
    classifiers : dict - maps class label -> parameter vector of shape (n,).
        Labels may be any sortable values; they do not have to be the
        contiguous integers 0..k-1.

    Returns
    -------
    ndarray of shape (m,)
        Predicted class label for every row of ``X``. Ties are resolved in
        favour of the smallest label.

    Raises
    ------
    ValueError
        If ``classifiers`` is empty.
    """
    if not classifiers:
        raise ValueError("classifiers must contain at least one fitted model")

    # Sorted labels give a stable column order that does not depend on the labels
    # being valid array indices; for labels 0..k-1 column j is simply class j.
    labels = np.array(sorted(classifiers))
    probabilities = np.zeros((X.shape[0], len(labels)))

    for col, label in enumerate(sorted(classifiers)):
        probabilities[:, col] = sigmoid(X @ classifiers[label])

    return labels[np.argmax(probabilities, axis=1)]

# Multiclass predictions for both splits
y_train_pred = predict_ovr(X_train_b, classifiers)
y_test_pred = predict_ovr(X_test_b, classifiers)

ovr_train_accuracy = accuracy_score(y_train_iris, y_train_pred)
ovr_test_accuracy = accuracy_score(y_test_iris, y_test_pred)

print("\n" + "="*60)
print("FINAL RESULTS - ONE-VS-REST MULTICLASS LOGISTIC REGRESSION")
print("="*60)
print(f"\nTraining Accuracy: {ovr_train_accuracy:.4f}")
print(f"Test Accuracy: {ovr_test_accuracy:.4f}")

In [None]:
# Confusion matrix of the custom One-vs-Rest predictions on the test split
cm = confusion_matrix(y_test_iris, y_test_pred)

# Explicit figure/axes interface instead of pyplot's implicit current axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=iris.target_names, yticklabels=iris.target_names,
            ax=ax)
ax.set_title('Confusion Matrix - One-vs-Rest Logistic Regression')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()

In [None]:
# Compare with sklearn implementation
print("\n" + "="*60)
print("COMPARISON WITH SKLEARN IMPLEMENTATION")
print("="*60)

# LogisticRegression's multi_class='ovr' option is deprecated in scikit-learn;
# wrapping the estimator in OneVsRestClassifier is the supported way to get
# one binary logistic model per class.
sklearn_ovr = OneVsRestClassifier(LogisticRegression(max_iter=2000))
sklearn_ovr.fit(X_train_iris_scaled, y_train_iris)

print(f"\nSklearn OvR Accuracy: {sklearn_ovr.score(X_test_iris_scaled, y_test_iris):.4f}")
print(f"Custom OvR Accuracy: {accuracy_score(y_test_iris, y_test_pred):.4f}")

---
# Summary

- **Q1**: Implemented Ridge Regression from scratch using Gradient Descent with hyperparameter tuning
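- **Q2**: Compared Linear, Ridge, and LASSO regression on the Hitters dataset (a synthetic stand-in is generated when `Hitters.csv` is not available)
- **Q3**: Used RidgeCV and LassoCV for automatic alpha selection with cross-validation on the California Housing dataset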
- **Q4**: Implemented Multiclass Logistic Regression using One-vs-Rest strategy from scratch