In [None]:
# Q1

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold


# Load the dataset; every column except "Price" is a feature.
url = "https://drive.google.com/uc?id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX"
data = pd.read_csv(url)

X = data.drop(columns="Price").to_numpy()
y = data["Price"].to_numpy()

# Standardise the features to zero mean / unit variance.
# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so held-out rows contribute to the scaling statistics — confirm this is intended.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Running record of the best fold: highest R2 seen so far and its coefficients.
best_r2, best_beta = float("-inf"), None


# K-fold cross-validation of an ordinary-least-squares fit. The coefficients of
# the fold with the highest held-out R2 are remembered in best_beta / best_r2.
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Prepend a column of ones so that beta[0] plays the role of the intercept.
    X_train_intercept = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
    X_test_intercept = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

    # Least-squares solution computed on the design matrix itself. This is the
    # same minimum-norm solution as pinv(X.T @ X) @ X.T @ y, but avoids forming
    # X.T @ X, which squares the condition number of the problem.
    beta, *_ = np.linalg.lstsq(X_train_intercept, y_train, rcond=None)

    y_pred = X_test_intercept @ beta

    r2 = r2_score(y_test, y_pred)
    print(f"Fold {fold}: R2 Score = {r2:.4f}")

    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

print("\nBest Beta (from CV):", best_beta)
print("Best R2 Score:", best_r2)

from sklearn.model_selection import train_test_split

# Hold-out evaluation: fit OLS on a 70% split and score it on the remaining 30%.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Prepend a column of ones so that beta_final[0] plays the role of the intercept.
X_train_intercept = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
X_test_intercept = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

# Least-squares solution computed on the design matrix itself. This is the same
# minimum-norm solution as pinv(X.T @ X) @ X.T @ y, but avoids forming X.T @ X,
# which squares the condition number of the problem.
beta_final, *_ = np.linalg.lstsq(X_train_intercept, y_train, rcond=None)

y_pred_final = X_test_intercept @ beta_final
final_r2 = r2_score(y_test, y_pred_final)

# NOTE(review): the label below says "with best beta", but the score is computed
# from beta_final (refit on this 70% split); best_beta from the CV loop is not
# used here. Confirm which of the two is intended before changing the label.
print("\nFinal Test R2 Score (with best beta):", final_r2)

Fold 1: R2 Score = 0.9180
Fold 2: R2 Score = 0.9146
Fold 3: R2 Score = 0.9116
Fold 4: R2 Score = 0.9193
Fold 5: R2 Score = 0.9244

Best Beta (from CV): [1.23161736e+06 2.30225051e+05 1.63956839e+05 1.21115120e+05
 7.83467170e+02 1.50662447e+05]
Best R2 Score: 0.9243869413350317

Final Test R2 Score (with best beta): 0.9146818498916267


In [None]:
# Q2.

import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are every column except "Price"; the target is "Price".
X = data.drop(columns="Price").to_numpy()
y = data["Price"].to_numpy()

# Standardise the features.
# NOTE(review): the scaler is fitted on all rows before splitting, so the
# validation/test rows contribute to the scaling statistics — confirm intended.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# First split off 30% as the test set, then carve 20% of the remainder out as
# the validation set; what is left is the training set.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_val, y_temp_train, y_val = train_test_split(X_temp, y_temp, test_size=0.20, random_state=42)

# Prepend a column of ones to each design matrix so the first coefficient is
# the intercept.
X_train = np.column_stack((np.ones(len(X_train)), X_train))
X_val = np.column_stack((np.ones(len(X_val)), X_val))
X_test = np.column_stack((np.ones(len(X_test)), X_test))

def gradient_descent(X, y, lr, iterations):
    """Fit linear-regression coefficients with batch gradient descent on the MSE.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Design matrix. No intercept column is added here; include a column of
        ones in ``X`` if an intercept is wanted.
    y : array-like with m elements
        Target values. A column vector of shape (m, 1) is accepted and is
        flattened to shape (m,).
    lr : float
        Step size applied to the gradient at every update.
    iterations : int
        Number of gradient updates to perform.

    Returns
    -------
    numpy.ndarray of shape (n,)
        Coefficients after ``iterations`` updates, starting from all zeros.
        Convergence is not checked: a step size that is too large can leave
        non-finite values in the result, which the caller must handle.
    """
    X = np.asarray(X, dtype=float)
    # Flatten so a (m, 1) target does not broadcast against the (m,) prediction
    # into an (m, m) residual matrix.
    y = np.asarray(y, dtype=float).ravel()

    m, n = X.shape
    beta = np.zeros(n)
    scale = 2 / m
    for _ in range(iterations):
        residual = y - X @ beta
        # Gradient of mean((y - X beta)^2) with respect to beta. The scalar is
        # applied to the n-vector rather than to the whole of X.T.
        gradient = -scale * (X.T @ residual)
        beta -= lr * gradient
    return beta

# Candidate step sizes; the one with the highest validation R2 is kept.
learning_rates = [0.001, 0.01, 0.1, 1]
best_r2 = -np.inf
best_beta = None

for lr in learning_rates:
    beta = gradient_descent(X_train, y_temp_train, lr, 1000)

    y_val_pred = X_val @ beta

    # Non-finite predictions cannot be scored; give such a run the worst
    # possible score so it is never selected.
    if np.all(np.isfinite(y_val_pred)):
        r2_val = r2_score(y_val, y_val_pred)
    else:
        r2_val = -np.inf

    print(f"LR={lr} | Val R2={r2_val:.4f}")

    if r2_val > best_r2:
        best_r2 = r2_val
        best_beta = beta

# If no learning rate beat -inf, best_beta is still None and the matrix product
# below would fail with an unhelpful TypeError; fail with a clear message instead.
if best_beta is None:
    raise RuntimeError(
        "No learning rate produced a finite validation R2; "
        "try smaller learning rates."
    )

y_test_pred = X_test @ best_beta
final_r2 = r2_score(y_test, y_test_pred)


print("\nBest Beta (from validation):", best_beta)
print("Final Test R2 Score (with best beta):", final_r2)

LR=0.001 | Val R2=0.6820
LR=0.01 | Val R2=0.9098
LR=0.1 | Val R2=0.9098
LR=1 | Val R2=-inf

Best Beta (from validation): [1232618.31836202  230067.95333238  163710.26584918  121680.22876975
    2833.37135223  150657.57448494]
Final Test R2 Score (with best beta): 0.9147569598865972


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


In [None]:
# Q3.


import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

# The raw file has no header row, so column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
cols = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
        "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base",
        "length", "width", "height", "curb_weight", "engine_type", "num_cylinders",
        "engine_size", "fuel_system", "bore", "stroke", "compression_ratio",
        "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

df = pd.read_csv(url, names=cols)
# Treat "?" cells as missing values.
df = df.replace("?", np.nan)

# Convert the target to numeric *before* dropping rows, so that any price that
# fails to parse (coerced to NaN) is removed together with the missing ones
# instead of surviving as a NaN target.
df["price"] = pd.to_numeric(df["price"], errors="coerce")
df = df.dropna(subset=["price"]).copy()

# Spelled-out counts mapped to their integer values.
num_map = {
    "two": 2, "three": 3, "four": 4, "five": 5, "six": 6,
    "eight": 8, "twelve": 12
}


def text_to_number(x):
    """Convert a spelled-out or numeric-looking value to a number.

    The value is stringified, stripped and lower-cased. If the result is a key
    of ``num_map`` the mapped integer is returned; otherwise it is parsed with
    ``float``.

    Returns ``np.nan`` when ``x`` is missing (per ``pd.isna``) or when the text
    is neither a known word nor parseable as a float.
    """
    if pd.isna(x):
        return np.nan
    s = str(x).strip().lower()
    if s in num_map:
        return num_map[s]
    try:
        return float(s)
    except ValueError:
        # Only a failed parse means "not a number"; anything else (for example
        # KeyboardInterrupt) must propagate rather than be silently swallowed.
        return np.nan


# Turn the spelled-out door / cylinder counts into numbers.
df["num_doors"] = df["num_doors"].apply(text_to_number)
df["num_cylinders"] = df["num_cylinders"].apply(text_to_number)


# Column groups, one per encoding strategy.
label_cols = ["make", "fuel_type", "aspiration", "engine_location"]
dummy_cols = ["body_style", "drive_wheels"]
special_cols = ["fuel_system", "engine_type"]

# Every column not listed above (and not the target) is handled as numeric.
excluded = set(label_cols + dummy_cols + special_cols + ["price"])
numeric_cols = [c for c in df.columns if c not in excluded]

for c in numeric_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")
    # Impute with the column median. Assign the result back instead of calling
    # fillna(..., inplace=True) on df[c]: that chained in-place form acts on an
    # intermediate object, triggers a pandas FutureWarning, and stops updating
    # df at all from pandas 3.0.
    df[c] = df[c].fillna(df[c].median())

def _fill_with_mode(series):
    """Return ``series`` with missing values replaced by its most frequent value.

    Falls back to the string "missing" when the series has no non-missing
    value to take a mode from. A series without missing values is returned
    unchanged.
    """
    if not series.isna().any():
        return series
    modes = series.mode(dropna=True)
    fill_value = modes.iloc[0] if not modes.empty else "missing"
    return series.fillna(fill_value)


# Label-encode these columns after mode imputation. Results are assigned back to
# df[c]: fillna(..., inplace=True) on df[c] acts on an intermediate object and
# stops updating df from pandas 3.0.
for c in label_cols:
    df[c] = LabelEncoder().fit_transform(_fill_with_mode(df[c]).astype(str))


# One-hot encode these columns after mode imputation, dropping the first level
# of each.
for c in dummy_cols:
    df[c] = _fill_with_mode(df[c])
df = pd.get_dummies(df, columns=dummy_cols, drop_first=True)

# Binary flag: 1 when the lower-cased fuel-system text contains "pfi", else 0.
# Missing values become the empty string first and therefore map to 0.
df["fuel_system"] = (
    df["fuel_system"]
    .fillna("")
    .astype(str)
    .str.lower()
    .map(lambda code: int("pfi" in code))
)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(med, inplace=True)
