
# Assignment - 3

Compare Linear Regression Models with Default vs. Custom Parameters for Earthquake Magnitude Prediction

## Objective

Train and analyze two linear regression models to predict earthquake Magnitude: one with default parameters and one with custom-tuned parameters. Compare their performance, interpret the findings, and finally provide a summary report.

In [1]:

import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Location of the preprocessed earthquake CSV. The original absolute path is kept
# as the fallback; set the EARTHQUAKE_DATA_PATH environment variable to point the
# notebook at the file on another machine without editing the code.
DATA_PATH = os.environ.get(
    "EARTHQUAKE_DATA_PATH",
    "C:/Users/hp/Python/Infosys_Springboard/Week2/preprocessed_earthquake_data.csv",
)
# Name of the column the regression models predict.
TARGET = "Magnitude"
# Seed passed to train_test_split so the train/test partition is reproducible.
RANDOM_STATE = 42


In [2]:
# Load the dataset, then discard rows without a target value and exact duplicates.
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=[TARGET])
df = df.drop_duplicates()

# Separate the features from the target (target is cast to float).
X = df.drop(columns=[TARGET])
y = df[TARGET].astype(float)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Numeric columns are standardised; every remaining column is one-hot encoded.
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

preproc = ColumnTransformer(
    [
        ("num", StandardScaler(), num_cols),
        # handle_unknown="ignore": categories unseen during fit do not raise at predict time.
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            cat_cols,
        ),
    ]
)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/hp/Python/Infosys_Springboard/Week2/preprocessed_earthquake_data.csv'

In [None]:
# Give each pipeline its own unfitted copy of the preprocessor. Passing the same
# `preproc` object to both pipelines would make them share a single transformer
# instance, so fitting one pipeline would re-fit the preprocessing step of the other.
default_lr = Pipeline([("prep", clone(preproc)), ("lr", LinearRegression())])
# "Custom" variant: identical preprocessing, but LinearRegression is created
# with positive=True instead of its default settings.
custom_lr = Pipeline([("prep", clone(preproc)), ("lr", LinearRegression(positive=True))])

# Fit both models on the same training split so their test metrics are comparable.
default_lr.fit(X_train, y_train)
custom_lr.fit(X_train, y_train)

def metrics(m, X=None, y=None):
    """Return MAE, RMSE and R2 for a fitted model on an evaluation set.

    Parameters
    ----------
    m : object
        Fitted estimator or pipeline exposing ``predict``.
    X : optional
        Features to evaluate on. Defaults to the notebook's ``X_test``.
    y : optional
        True target values matching ``X``. Defaults to the notebook's ``y_test``.

    Returns
    -------
    dict
        Mapping with the keys ``"MAE"``, ``"RMSE"`` and ``"R2"``.
    """
    # Fall back to the held-out test split so existing `metrics(model)` calls keep working.
    X = X_test if X is None else X
    y = y_test if y is None else y

    y_pred = m.predict(X)
    return {
        "MAE": mean_absolute_error(y, y_pred),
        # RMSE is the square root of the mean squared error.
        "RMSE": math.sqrt(mean_squared_error(y, y_pred)),
        "R2": r2_score(y, y_pred),
    }

# Comparison table: one row per model, one column per test metric.
res = pd.DataFrame(
    [metrics(default_lr), metrics(custom_lr)],
    index=["Default", "Custom"],
)

# Show the table rounded to four decimals (the stored values stay unrounded).
display(res.round(4))


In [None]:
# Grouped bar chart comparing the two models on each test metric.
metrics_order = ["MAE", "RMSE", "R2"]
x = np.arange(len(metrics_order))
width = 0.35
half = width / 2  # each model's bar is shifted half a bar-width from the tick

default_vals = res.loc["Default", metrics_order].values
custom_vals = res.loc["Custom", metrics_order].values

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(x - half, default_vals, width, label="Default")
ax.bar(x + half, custom_vals, width, label="Custom")

ax.set_xticks(x)
ax.set_xticklabels(metrics_order)
ax.set_ylabel("Score")
ax.set_title("Default vs Custom — Test Metrics")
ax.legend()

# Write each bar's value (three decimals) just above the bar.
for offset, vals in ((-half, default_vals), (half, custom_vals)):
    for xpos, v in zip(x + offset, vals):
        ax.text(xpos, v, f"{v:.3f}", ha="center", va="bottom")

plt.tight_layout()
plt.show()

## Summary

Default model: MAE = 0.7070, RMSE = 0.9715, R² = 0.0915

Custom model: MAE = 0.7078, RMSE = 0.9730, R² = 0.0887

The default model performed slightly better on all three metrics (lower MAE and RMSE, higher R²).

Both models show low explanatory power (R² ≈ 0.09), i.e. they explain only about 9% of the variance in Magnitude.