Required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.datasets import load_breast_cancer



PART 1: REGRESSION TASK (California Housing)

In [2]:
#Task 1: Load & Split Dataset (80% Train, 20% Test)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer



In [3]:
# Load dataset
data = pd.read_csv("housing.csv")

# One-hot encode categorical column
data = pd.get_dummies(data, columns=["ocean_proximity"], drop_first=True)

# Separate features & target
X = data.drop("median_house_value", axis=1)
y = data["median_house_value"]

# Handle missing values using mean imputation
imputer = SimpleImputer(strategy="mean")
X = imputer.fit_transform(X)

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training shape:", X_train.shape)
print("Testing shape :", X_test.shape)



Training shape: (16512, 12)
Testing shape : (4128, 12)


In [4]:
#Task 2 – Step 1: Baseline Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lr = LinearRegression()
lr.fit(X_train, y_train)

train_pred = lr.predict(X_train)
test_pred = lr.predict(X_test)

print("Baseline Linear Regression")
print("Train MSE:", mean_squared_error(y_train, train_pred))
print("Test  MSE:", mean_squared_error(y_test, test_pred))


Baseline Linear Regression
Train MSE: 4683203783.504253
Test  MSE: 4904399775.949258


In [5]:
#Task 2 – Step 2: Ridge & Lasso
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV


In [6]:
#Ridge Regression (L2)
ridge = Ridge()
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100]}

ridge_cv = GridSearchCV(
    ridge,
    ridge_params,
    cv=5,
    scoring='neg_mean_squared_error'
)
ridge_cv.fit(X_train, y_train)

best_ridge = ridge_cv.best_estimator_
print("Best Ridge alpha:", ridge_cv.best_params_)


Best Ridge alpha: {'alpha': 1}


In [7]:
#Lasso Regression (L1)
lasso = Lasso(max_iter=10000)
lasso_params = {'alpha': [0.001, 0.01, 0.1, 1, 10]}

lasso_cv = GridSearchCV(
    lasso,
    lasso_params,
    cv=5,
    scoring='neg_mean_squared_error'
)
lasso_cv.fit(X_train, y_train)

best_lasso = lasso_cv.best_estimator_
print("Best Lasso alpha:", lasso_cv.best_params_)


Best Lasso alpha: {'alpha': 0.001}


In [8]:
#Task 2 – Step 3: Evaluation (L1 vs L2)
print("Ridge Regression")
print("Train MSE:", mean_squared_error(y_train, best_ridge.predict(X_train)))
print("Test  MSE:", mean_squared_error(y_test, best_ridge.predict(X_test)))

print("\nLasso Regression")
print("Train MSE:", mean_squared_error(y_train, best_lasso.predict(X_train)))
print("Test  MSE:", mean_squared_error(y_test, best_lasso.predict(X_test)))


Ridge Regression
Train MSE: 4683383574.687478
Test  MSE: 4905952780.849343

Lasso Regression
Train MSE: 4683203783.508414
Test  MSE: 4904399969.461572


PART 2: CLASSIFICATION TASK (Breast Cancer)

In [9]:
#Task 1: Load & Split Dataset (80% Train, 20% Test)
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


In [10]:
# Load dataset
X, y = load_breast_cancer(return_X_y=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training samples:", X_train.shape)
print("Testing samples :", X_test.shape)


Training samples: (455, 30)
Testing samples : (114, 30)


In [11]:
#Task 2 – Step 1: Baseline Logistic Regression (No Regularization)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [12]:
# Baseline Logistic Regression
baseline_log = LogisticRegression(max_iter=10000)
baseline_log.fit(X_train, y_train)

# Predictions
train_pred = baseline_log.predict(X_train)
test_pred = baseline_log.predict(X_test)

# Accuracy
train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)

print("Baseline Logistic Regression")
print("Train Accuracy:", train_acc)
print("Test  Accuracy:", test_acc)


Baseline Logistic Regression
Train Accuracy: 0.9582417582417583
Test  Accuracy: 0.956140350877193


In [13]:
#Task 2 – Step 2: Hyperparameter Tuning (L1 & L2)
from sklearn.model_selection import GridSearchCV


In [14]:
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

log_cv = GridSearchCV(
    LogisticRegression(solver='liblinear', max_iter=10000),
    param_grid,
    cv=5,
    scoring='accuracy'
)

log_cv.fit(X_train, y_train)

best_log = log_cv.best_estimator_

print("Best Parameters:", log_cv.best_params_)


Best Parameters: {'C': 100, 'penalty': 'l1'}


In [15]:
#Task 2 – Step 3: Regularization Comparison (Final Model)
# Final evaluation
final_train_acc = accuracy_score(
    y_train, best_log.predict(X_train)
)

final_test_acc = accuracy_score(
    y_test, best_log.predict(X_test)
)

print("Optimized Logistic Regression")
print("Train Accuracy:", final_train_acc)
print("Test  Accuracy:", final_test_acc)


Optimized Logistic Regression
Train Accuracy: 0.989010989010989
Test  Accuracy: 0.9824561403508771


In [16]:
print("Number of coefficients:", best_log.coef_.shape)
print("Sample coefficients:", best_log.coef_[0][:10])


Number of coefficients: (1, 30)
Sample coefficients: [ 7.27610214e-01 -1.08216950e-01  9.89249367e-02 -2.14990675e-03
  0.00000000e+00  4.73989938e+01 -1.15916915e+01 -1.36499813e+02
  1.99010367e+01  0.00000000e+00]
