In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# LOAD DATASET
df = pd.read_csv('USA_Housing.csv')

# Print first few rows & column names to inspect
print("Columns in dataset:\n", df.columns)
print("\nSample data:\n", df.head())

# --- Detect the target column automatically ---
target_candidates = [c for c in df.columns if 'price' in c.lower()]
if len(target_candidates) == 0:
    raise ValueError("No column containing 'price' found. Please check dataset columns!")
target_col = target_candidates[0]

# Separate features and target
X = df.drop(target_col, axis=1).values
y = df[target_col].values.reshape(-1, 1)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Q1: 5-FOLD CROSS VALIDATION (LEAST SQUARE ERROR FIT)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores = []
betas = []

for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    X_train_aug = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
    X_test_aug = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

    beta = np.linalg.inv(X_train_aug.T @ X_train_aug) @ X_train_aug.T @ y_train
    y_pred = X_test_aug @ beta

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    betas.append(beta)

best_beta = betas[np.argmax(r2_scores)]
print("\nQ1 - R² scores for 5 folds:", r2_scores)
print("Q1 - Best Beta (coefficients):", best_beta.ravel())

# Retrain using best beta on 70%-30% split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train_aug = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test_aug = np.hstack([np.ones((X_test.shape[0], 1)), X_test])
beta_final = np.linalg.inv(X_train_aug.T @ X_train_aug) @ X_train_aug.T @ y_train
y_pred_final = X_test_aug @ beta_final
print("Q1 - Final Test R² score (70/30 split):", r2_score(y_test, y_pred_final))

# Q2: VALIDATION SET APPROACH (GRADIENT DESCENT)

X_temp, X_test, y_temp, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

X_train_aug = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_val_aug = np.hstack([np.ones((X_val.shape[0], 1)), X_val])
X_test_aug = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

def gradient_descent(X, y, lr, iterations=1000):
    m, n = X.shape
    beta = np.zeros((n, 1))
    for i in range(iterations):
        gradient = (1/m) * (X.T @ (X @ beta - y))
        beta -= lr * gradient
    return beta

learning_rates = [0.001, 0.01, 0.1, 1]
results = []

for lr in learning_rates:
    beta_gd = gradient_descent(X_train_aug, y_train, lr)
    y_val_pred = X_val_aug @ beta_gd
    y_test_pred = X_test_aug @ beta_gd
    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)
    results.append((lr, beta_gd, r2_val, r2_test))

best_lr, best_beta_gd, best_val_r2, best_test_r2 = max(results, key=lambda x: x[2])
print("\nQ2 - Gradient Descent Results:")
for lr, beta_gd, r2_val, r2_test in results:
    print(f"LR={lr} | Validation R²={r2_val:.4f} | Test R²={r2_test:.4f}")
print("\nQ2 - Best Learning Rate:", best_lr)
print("Q2 - Best Beta (coefficients):", best_beta_gd.ravel())
print("Q2 - Best Validation R²:", best_val_r2)
print("Q2 - Corresponding Test R²:", best_test_r2)

# Q3: Pre-processing and Multiple Linear Regression

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# 1. Load dataset
columns = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base",
           "length", "width", "height", "curb_weight", "engine_type", "num_cylinders",
           "engine_size", "fuel_system", "bore", "stroke", "compression_ratio",
           "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(url, names=columns)

# 2. Replace ? with NaN, impute missing values
df = df.replace("?", np.nan)

for col in df.columns:
    if df[col].dtype == "object":
        df[col] = df[col].fillna(df[col].mode()[0])  # categorical: mode
    else:
        df[col] = pd.to_numeric(df[col], errors="coerce")
        df[col] = df[col].fillna(df[col].median())   # numeric: median

df = df.dropna(subset=["price"])  # drop rows with missing price
df["price"] = pd.to_numeric(df["price"])

# 3. Encoding
# (i) num_doors & num_cylinders
word_to_num = {"two": 2, "three": 3, "four": 4, "five": 5, "six": 6,
               "eight": 8, "twelve": 12}
df["num_doors"] = df["num_doors"].replace(word_to_num)
df["num_cylinders"] = df["num_cylinders"].replace(word_to_num)

df["num_doors"] = pd.to_numeric(df["num_doors"], errors="coerce").astype(int)
df["num_cylinders"] = pd.to_numeric(df["num_cylinders"], errors="coerce").astype(int)

# (ii) body_style & drive_wheels → dummy encoding
df = pd.get_dummies(df, columns=["body_style", "drive_wheels"], drop_first=True)

# (iii) make, aspiration, engine_location, fuel_type → label encoding
label_cols = ["make", "aspiration", "engine_location", "fuel_type"]
le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])

# (iv) fuel_system → pfi = 1 else 0
df["fuel_system"] = df["fuel_system"].apply(lambda x: 1 if "pfi" in x else 0)

# (v) engine_type → ohc = 1 else 0
df["engine_type"] = df["engine_type"].apply(lambda x: 1 if "ohc" in x else 0)

# 4. Split features & target
X = df.drop("price", axis=1)
y = df["price"]

# Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5. Train-test split (70-30)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("Performance without PCA:")
print("  R2:", r2_score(y_test, y_pred))
print("  MSE:", mean_squared_error(y_test, y_pred))
print("  MAE:", mean_absolute_error(y_test, y_pred))

# 6. PCA + Linear Regression
pca = PCA(n_components=0.95)  # keep 95% variance
X_pca = pca.fit_transform(X_scaled)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.3, random_state=42)

lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train_pca)
y_pred_pca = lr_pca.predict(X_test_pca)

print("\nPerformance with PCA:")
print("  R2:", r2_score(y_test_pca, y_pred_pca))
print("  MSE:", mean_squared_error(y_test_pca, y_pred_pca))
print("  MAE:", mean_absolute_error(y_test_pca, y_pred_pca))



Columns in dataset:
 Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price'],
      dtype='object')

Sample data:
    Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
0       79545.45857             5.682861                   7.009188   
1       79248.64245             6.002900                   6.730821   
2       61287.06718             5.865890                   8.512727   
3       63345.24005             7.188236                   5.586729   
4       59982.19723             5.040555                   7.839388   

   Avg. Area Number of Bedrooms  Area Population         Price  
0                          4.09      23086.80050  1.059034e+06  
1                          3.09      40173.07217  1.505891e+06  
2                          5.13      36882.15940  1.058988e+06  
3                          3.26      34310.24283  1.260617e+06  
4                          4.23      26354.

  df["num_doors"] = df["num_doors"].replace(word_to_num)
  df["num_cylinders"] = df["num_cylinders"].replace(word_to_num)
