Q1: K-Fold Cross Validation for Multiple Linear Regression (Least Squares Fit)


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split

# --- Load data ---------------------------------------------------------------
data = pd.read_csv("USA_Housing.csv")

# Only numeric predictors can be standardized and used in a least-squares fit;
# any non-numeric column (e.g. free text, if the CSV contains one) is left out.
# When every predictor is already numeric this selects exactly the same columns.
X = data.drop(columns=['Price']).select_dtypes(include='number').values
y = data['Price'].values.reshape(-1, 1)


def add_intercept(features):
    """Return ``features`` with a leading column of ones (the bias term)."""
    return np.c_[np.ones((features.shape[0], 1)), features]


def fit_and_score(X_fit, y_fit, X_eval, y_eval):
    """Fit ordinary least squares on one split and score it on another.

    The scaler is fit on ``X_fit`` only and then applied to ``X_eval``, so the
    evaluation rows never influence the preprocessing statistics.

    Parameters
    ----------
    X_fit, y_fit : arrays used to estimate the scaler and the coefficients.
    X_eval, y_eval : held-out arrays used only for scoring.

    Returns
    -------
    (beta, r2) : coefficient column vector (intercept first) and the R2 score
        of the predictions on ``X_eval``.
    """
    split_scaler = StandardScaler().fit(X_fit)
    design_fit = add_intercept(split_scaler.transform(X_fit))
    design_eval = add_intercept(split_scaler.transform(X_eval))
    # lstsq solves the same least-squares problem as inv(X'X) X'y without
    # forming an explicit inverse, and still works if X'X is singular.
    beta, *_ = np.linalg.lstsq(design_fit, y_fit, rcond=None)
    return beta, r2_score(y_eval, design_eval @ beta)


# --- 5-fold cross validation -------------------------------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
best_beta = None
best_r2 = -np.inf
print("5-Fold Cross Validation Results\n")
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    beta, r2 = fit_and_score(X[train_index], y[train_index],
                             X[test_index], y[test_index])
    print(f"Fold {fold}:")
    print(f" Beta Shape: {beta.shape}")
    print(f"R2 Score : {r2:.4f}\n")
    # Keep the coefficients of the fold that scored highest on its own test part.
    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta
print("Best R2 Score from CV:", best_r2)
print("Best Beta Matrix:\n", best_beta)


# --- Final model on a 70/30 split --------------------------------------------
# Split the raw features first; scaling happens inside fit_and_score using the
# training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

beta_final, final_r2 = fit_and_score(X_train, y_train, X_test, y_test)

print("\nFinal Model (70/30 Split)")
print("Final Beta Matrix:\n", beta_final)
print("Final R2 Score on Test Data:", final_r2)

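Q2: Gradient Descent for Multiple Linear Regression (Train/Validation/Test Split with Learning-Rate Selection)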

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split


data = pd.read_csv("USA_Housing.csv")

# Only numeric predictors can be standardized; any non-numeric column (e.g.
# free text, if the CSV contains one) is left out. When every predictor is
# already numeric this selects exactly the same columns.
X = data.drop(columns=['Price']).select_dtypes(include='number').values
y = data['Price'].values.reshape(-1, 1)

# Split the RAW features first so that the scaler below can be fit on the
# training rows only. The same random_state values give the same row
# partition as splitting a pre-scaled matrix would.
X_train_val, X_test_raw, y_train_val, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Now split 70% into 56% (train) and 14% (val)
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.20, random_state=42
)  # 0.20 of 70% = 14%

# Standardization statistics come from the training rows alone; validation
# and test rows are only transformed, never used for fitting.
scaler = StandardScaler()
scaler.fit(X_train_raw)


def _scaled_with_bias(raw_features):
    """Standardize with the train-fitted scaler and prepend a column of ones."""
    scaled = scaler.transform(raw_features)
    return np.c_[np.ones((scaled.shape[0], 1)), scaled]


X_train = _scaled_with_bias(X_train_raw)
X_val = _scaled_with_bias(X_val_raw)
X_test = _scaled_with_bias(X_test_raw)

print("Shapes:")
print(" Train:", X_train.shape, y_train.shape)
print(" Validation:", X_val.shape, y_val.shape)
print(" Test:", X_test.shape, y_test.shape)


def gradient_descent(X, y, lr, iters):
    """Fit linear-regression coefficients by batch gradient descent.

    Minimizes the mean squared error of ``X @ beta`` against ``y`` starting
    from an all-zero coefficient vector. No intercept is added here; include
    a column of ones in ``X`` if one is wanted.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Design matrix.
    y : array-like with m elements
        Targets, either a column vector of shape (m, 1) or a flat vector of
        shape (m,). A flat vector is reshaped to a column so the residual
        keeps shape (m, 1) instead of broadcasting to (m, m).
    lr : float
        Step size applied to the gradient at every iteration.
    iters : int
        Number of gradient steps to take.

    Returns
    -------
    numpy.ndarray of shape (n, 1)
        Coefficients after ``iters`` updates.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, has no rows, or ``y`` does not hold exactly one
        target per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D, got array with shape {X.shape}")
    m, n = X.shape
    if m == 0:
        raise ValueError("X must contain at least one sample")
    if y.size != m:
        raise ValueError(
            f"y must hold one target per row of X: expected {m} values, got shape {y.shape}"
        )
    y = y.reshape(m, 1)

    beta = np.zeros((n, 1))

    for _ in range(iters):
        y_pred = X @ beta
        error = y_pred - y
        # Gradient of the (1/2m) * sum of squared errors cost.
        grad = (1/m) * (X.T @ error)
        beta -= lr * grad
    return beta


# Candidate step sizes and the fixed number of gradient steps per candidate.
learning_rates = [0.001, 0.01, 0.1, 1]
iters = 1000

# One (lr, beta, validation R2, test R2) tuple per learning rate that converged.
results = []

for lr in learning_rates:
    # Too large a step makes the iterates blow up to inf/NaN; silence the
    # resulting floating-point warnings and detect the failure explicitly.
    with np.errstate(over='ignore', invalid='ignore'):
        beta = gradient_descent(X_train, y_train, lr, iters)

    if not np.all(np.isfinite(beta)):
        # Scoring non-finite predictions would raise and abort the whole
        # sweep, so report the divergence and move on to the next candidate.
        print(f"Learning Rate: {lr}")
        print("  Diverged (non-finite coefficients) - skipped\n")
        continue

    y_val_pred = X_val @ beta
    r2_val = r2_score(y_val, y_val_pred)

    y_test_pred = X_test @ beta
    r2_test = r2_score(y_test, y_test_pred)

    results.append((lr, beta, r2_val, r2_test))

    print(f"Learning Rate: {lr}")
    print(f"  Validation R2: {r2_val:.4f}")
    print(f"  Test R2      : {r2_test:.4f}\n")

if not results:
    raise RuntimeError("Gradient descent diverged for every learning rate tried")

# Model selection uses the validation score only; the test score of the
# winner is reported afterwards and plays no part in the choice.
best_result = max(results, key=lambda x: x[2])

best_lr, best_beta, best_r2_val, best_r2_test = best_result

print("------ Best Model ------")
print("Best Learning Rate:", best_lr)
print("Best Coefficients (Beta):\n", best_beta)
print("Best Validation R2:", best_r2_val)
print("Corresponding Test R2:", best_r2_test)

Car Price Prediction using Linear Regression and PCA


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

# Column names supplied by hand and passed to read_csv via `names=`.
columns = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration", "num_doors",
           "body_style", "drive_wheels", "engine_location", "wheel_base", "length", "width",
           "height", "curb_weight", "engine_type", "num_cylinders", "engine_size",
           "fuel_system", "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# Treat "?" as missing while parsing. Columns that are numeric apart from the
# "?" placeholders are then read as numbers instead of strings, so they get
# mean-imputed below rather than mode-imputed as text.
data = pd.read_csv(url, names=columns, na_values="?")

# Drop rows without a target BEFORE imputing anything: a missing price must
# never be filled in with a made-up value and then used for training/scoring.
data = data.dropna(subset=['price']).copy()
data['price'] = data['price'].astype(float)

# Impute the remaining gaps in the predictors: mean for numeric columns,
# most frequent value for everything else.
# NOTE(review): these statistics are computed over all rows, i.e. before the
# train/test split made further down - confirm this is acceptable.
for col in data.columns:
    if col == 'price':
        continue
    if pd.api.types.is_numeric_dtype(data[col]):
        # Assign the result back instead of using inplace=True on a column
        # selection, which does not reliably modify the parent frame.
        data[col] = data[col].fillna(data[col].mean())
    else:
        data[col] = data[col].fillna(data[col].mode()[0])


# Lookup tables turning spelled-out counts into integers.
door_map = {"two": 2, "four": 4}
cyl_map = {"two": 2, "three": 3, "four": 4, "five": 5, "six": 6, "eight": 8, "twelve": 12}
for count_col, lookup in (('num_doors', door_map), ('num_cylinders', cyl_map)):
    data[count_col] = data[count_col].map(lookup)

# One-hot encode these two columns, dropping the first level of each.
data = pd.get_dummies(data, columns=["body_style", "drive_wheels"], drop_first=True)

# Integer-code the remaining categorical columns, one fresh encoder per column.
for nominal_col in ["make", "aspiration", "engine_location", "fuel_type"]:
    data[nominal_col] = LabelEncoder().fit_transform(data[nominal_col])

# Binary flags: 1 when the label contains the given substring, otherwise 0.
for flag_col, token in (('fuel_system', 'pfi'), ('engine_type', 'ohc')):
    data[flag_col] = data[flag_col].apply(lambda label, token=token: 1 if token in label else 0)


X = data.drop(columns=['price']).values
y = data['price'].values

# Split the raw features first. Both the scaler and PCA below are fit on the
# training rows only and merely applied to the test rows, so the reported
# test scores are not influenced by test-set statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full matrix scaled with the train-fitted scaler, kept under its previous name.
X_scaled = scaler.transform(X)

# --- Baseline: linear regression on all standardized features ---------------
reg = LinearRegression()
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)
r2_original = r2_score(y_test, y_pred)

print("Performance with original features:")
print(" R2 Score:", r2_original)


# --- PCA: keep enough components to explain 95% of the training variance ----
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
# Full matrix projected with the train-fitted PCA, kept under its previous name.
X_pca = pca.transform(X_scaled)

# Same rows as the baseline split, so the two scores are directly comparable.
y_train_pca, y_test_pca = y_train, y_test

reg_pca = LinearRegression()
reg_pca.fit(X_train_pca, y_train_pca)

y_pred_pca = reg_pca.predict(X_test_pca)
r2_pca = r2_score(y_test_pca, y_pred_pca)

print("\nPerformance with PCA-reduced features:")
print(" R2 Score:", r2_pca)


if r2_pca > r2_original:
    print("\n✅ PCA improved performance!")
else:
    print("\n❌ PCA did not improve performance.")
