In [1]:

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

file_path = "USA_Housing.csv"
df = pd.read_csv(file_path)

X = df.drop("Price", axis=1).values
y = df["Price"].values.reshape(-1, 1)


scaler = StandardScaler()
X = scaler.fit_transform(X)


kf = KFold(n_splits=5, shuffle=True, random_state=42)

r2_scores = []
betas = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]


    beta = np.linalg.inv(X_train_b.T @ X_train_b) @ (X_train_b.T @ y_train)


    y_pred = X_test_b @ beta

    r2 = r2_score(y_test, y_pred)

    r2_scores.append(r2)
    betas.append(beta)


print("R2 scores for each fold:", r2_scores)
print("Average R2 score:", np.mean(r2_scores))

best_index = np.argmax(r2_scores)
best_beta = betas[best_index]
print("\nBest Beta Matrix:\n", best_beta)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]

y_pred_final = X_test_b @ best_beta

final_r2 = r2_score(y_test, y_pred_final)
print("\nFinal R2 Score on 70-30 split:", final_r2)

R2 scores for each fold: [0.9179971706985147, 0.9145677884802819, 0.9116116385364478, 0.9193091764960817, 0.9243869413350317]
Average R2 score: 0.9175745431092714

Best Beta Matrix:
 [[1.23161736e+06]
 [2.30225051e+05]
 [1.63956839e+05]
 [1.21115120e+05]
 [7.83467170e+02]
 [1.50662447e+05]]

Final R2 Score on 70-30 split: 0.9147458156636434


In [2]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score


file_path = "USA_Housing.csv"
df = pd.read_csv(file_path)

X = df.drop("Price", axis=1).values
y = df["Price"].values.reshape(-1, 1)


scaler = StandardScaler()
X = scaler.fit_transform(X)


X = np.c_[np.ones((X.shape[0], 1)), X]


X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.20, random_state=42)


print("Shapes:")
print("Train:", X_train.shape, y_train.shape)
print("Validation:", X_val.shape, y_val.shape)
print("Test:", X_test.shape, y_test.shape)

def gradient_descent(X, y, lr, iterations=1000):
    m, n = X.shape
    beta = np.zeros((n, 1))

    for _ in range(iterations):
        y_pred = X @ beta
        error = y_pred - y
        gradient = (2/m) * (X.T @ error)
        beta -= lr * gradient
    return beta


learning_rates = [0.001, 0.01, 0.1, 1]
results = {}

for lr in learning_rates:
    beta = gradient_descent(X_train, y_train, lr, iterations=1000)


    y_val_pred = X_val @ beta
    y_test_pred = X_test @ beta


    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)

    results[lr] = {
        "beta": beta,
        "R2_val": r2_val,
        "R2_test": r2_test
    }


best_lr = max(results, key=lambda x: results[x]["R2_val"])
best_beta = results[best_lr]["beta"]

print("\nResults (Learning Rate -> R2_val, R2_test):")
for lr, res in results.items():
    print(f"LR={lr} -> Validation R2: {res['R2_val']:.4f}, Test R2: {res['R2_test']:.4f}")

print("\nBest Learning Rate:", best_lr)
print("Best Beta Matrix:\n", best_beta)
print("Final Test R2:", results[best_lr]["R2_test"])

Shapes:
Train: (2800, 6) (2800, 1)
Validation: (700, 6) (700, 1)
Test: (1500, 6) (1500, 1)

Results (Learning Rate -> R2_val, R2_test):
LR=0.001 -> Validation R2: 0.6820, Test R2: 0.6490
LR=0.01 -> Validation R2: 0.9098, Test R2: 0.9148
LR=0.1 -> Validation R2: 0.9098, Test R2: 0.9148
LR=1 -> Validation R2: -inf, Test R2: -inf

Best Learning Rate: 0.01
Best Beta Matrix:
 [[1232618.31836202]
 [ 230067.95333238]
 [ 163710.26584918]
 [ 121680.22876975]
 [   2833.37135223]
 [ 150657.57448494]]
Final Test R2: 0.9147569598865972


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


In [3]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA

cols = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
        "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base",
        "length", "width", "height", "curb_weight", "engine_type", "num_cylinders",
        "engine_size", "fuel_system", "bore", "stroke", "compression_ratio",
        "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(url, names=cols)

df = df.replace("?", np.nan)

numeric_cols = df.columns.drop(["make","fuel_type","aspiration","num_doors",
                                "body_style","drive_wheels","engine_location",
                                "engine_type","num_cylinders","fuel_system"])
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')


df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

df["num_doors"] = df["num_doors"].fillna(df["num_doors"].mode()[0])

df = df.dropna(subset=["price"])

word_to_num = {
    "two":2, "three":3, "four":4, "five":5,
    "six":6, "eight":8, "twelve":12
}
df["num_doors"] = df["num_doors"].map({"two":2,"four":4})
df["num_cylinders"] = df["num_cylinders"].map(word_to_num)

df = pd.get_dummies(df, columns=["body_style","drive_wheels"], drop_first=True)

for col in ["make","aspiration","engine_location","fuel_type"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

df["fuel_system"] = df["fuel_system"].apply(lambda x: 1 if "pfi" in str(x).lower() else 0)

df["engine_type"] = df["engine_type"].apply(lambda x: 1 if "ohc" in str(x).lower() else 0)


X = df.drop("price", axis=1).values
y = df["price"].astype(float).values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

r2_original = r2_score(y_test, y_pred)
print("R2 Score (Without PCA):", r2_original)

pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_scaled)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_reduced, y, test_size=0.3, random_state=42)

lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train_pca)
y_pred_pca = lr_pca.predict(X_test_pca)

r2_pca = r2_score(y_test_pca, y_pred_pca)
print("R2 Score (With PCA):", r2_pca)

if r2_pca > r2_original:
    print("\n PCA improved performance!")
else:
    print("\n PCA did not improve performance.")

R2 Score (Without PCA): 0.804442243576259
R2 Score (With PCA): 0.7500675882701553

 PCA did not improve performance.
