In [1]:
# Download the car fuel-efficiency CSV into the working directory (IPython shell escape; needs wget and network access).
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-10-07 18:20:27--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-10-07 18:20:27 (11.6 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



In [5]:
import pandas as pd
# Keep only the columns this notebook works with, in this order.
selected_columns = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
df = pd.read_csv("car_fuel_efficiency.csv")[selected_columns]

In [7]:
# Q1: count of missing values in each selected column
df.isna().sum()

Unnamed: 0,0
engine_displacement,0
horsepower,708
vehicle_weight,0
model_year,0
fuel_efficiency_mpg,0


In [9]:
# Q2: median of the horsepower column
df.loc[:, 'horsepower'].median()

149.0

In [10]:
import numpy as np
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error


def split_60_20_20(df, seed):
    """Shuffle ``df`` and cut it into train / validation / test parts.

    The rows are shuffled with ``random_state=seed`` and re-indexed from 0.
    The first ``int(0.6 * n)`` rows form the training part, the following
    ``int(0.2 * n)`` rows the validation part, and every remaining row the
    test part.

    Returns a ``(df_train, df_val, df_test)`` tuple of independent copies.
    """
    shuffled = shuffle(df, random_state=seed).reset_index(drop=True)
    total = len(shuffled)
    train_end = int(0.6 * total)
    val_end = train_end + int(0.2 * total)
    return (
        shuffled.iloc[:train_end].copy(),
        shuffled.iloc[train_end:val_end].copy(),
        shuffled.iloc[val_end:].copy(),
    )

def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def make_Xy(train_df, val_df, fill_strategy="zero", mean_from_train=None):
    """Impute missing horsepower and split two frames into features and target.

    Parameters
    ----------
    train_df, val_df : pandas.DataFrame
        Frames holding a 'horsepower' column and the 'fuel_efficiency_mpg'
        target column. Neither frame is modified; imputation happens on
        derived frames, so callers do not need to pass copies.
    fill_strategy : {"zero", "mean"}
        "zero" replaces missing horsepower with 0; "mean" replaces it with
        ``mean_from_train`` in both frames.
    mean_from_train : float, optional
        Fill value for the "mean" strategy. Required when
        ``fill_strategy == "mean"``; ignored otherwise.

    Returns
    -------
    X_train, y_train, X_val, y_val
        Feature matrices (every column except the target, in the frame's
        column order) and target vectors, taken via ``.values``.

    Raises
    ------
    ValueError
        If ``fill_strategy`` is not recognised, or if "mean" is requested
        without ``mean_from_train``.
    """
    if fill_strategy == "zero":
        fill_value = 0
    elif fill_strategy == "mean":
        if mean_from_train is None:
            raise ValueError('mean_from_train is required when fill_strategy="mean"')
        fill_value = mean_from_train
    else:
        # An unknown strategy used to fall through silently and leave NaNs
        # in the feature matrix; fail loudly instead.
        raise ValueError(f"unknown fill_strategy: {fill_strategy!r}")

    target = 'fuel_efficiency_mpg'

    # assign() returns a new frame, so the caller's frames keep their NaNs.
    train_filled = train_df.assign(horsepower=train_df['horsepower'].fillna(fill_value))
    val_filled = val_df.assign(horsepower=val_df['horsepower'].fillna(fill_value))

    y_train = train_filled[target].values
    y_val = val_filled[target].values
    X_train = train_filled.drop(target, axis=1).values
    X_val = val_filled.drop(target, axis=1).values
    return X_train, y_train, X_val, y_val

# Q3
# Shuffle with seed 42 and split 60/20/20
df_train_42, df_val_42, df_test_42 = split_60_20_20(df, seed=42)

# Option A: missing horsepower replaced by 0
# (copies are passed so the split frames themselves are never modified)
Xa, ya, Xva, yva = make_Xy(
    df_train_42.copy(),
    df_val_42.copy(),
    fill_strategy="zero",
)
lin_a = LinearRegression()
lin_a.fit(Xa, ya)
rmse_zero = rmse(yva, lin_a.predict(Xva))
rmse_zero_round = round(rmse_zero, 2)

# Option B: missing horsepower replaced by the mean of the training part only
hp_mean_train = df_train_42['horsepower'].mean()
Xb, yb, Xvb, yvb = make_Xy(
    df_train_42.copy(),
    df_val_42.copy(),
    fill_strategy="mean",
    mean_from_train=hp_mean_train,
)
lin_b = LinearRegression()
lin_b.fit(Xb, yb)
rmse_mean = rmse(yvb, lin_b.predict(Xvb))
rmse_mean_round = round(rmse_mean, 2)

print("Q3 — RMSE with 0:   ", rmse_zero_round)
print("Q3 — RMSE with mean:", rmse_mean_round)

# The comparison is done on the 2-decimal scores; a tie means "equally good".
if rmse_zero_round < rmse_mean_round:
    q3_answer = "With 0"
elif rmse_mean_round < rmse_zero_round:
    q3_answer = "With mean"
else:
    q3_answer = "Both are equally good"
print("Q3 — Answer:", q3_answer)

Q3 — RMSE with 0:    0.52
Q3 — RMSE with mean: 0.46
Q3 — Answer: With mean


In [11]:

# Q4

# Fill NAs with 0 and try different regularization strengths r
r_list = [0, 0.01, 0.1, 1, 5, 10, 100]

# Prepare data (copies are passed so the seed-42 split frames stay untouched)
Xq4_tr, yq4_tr, Xq4_val, yq4_val = make_Xy(df_train_42.copy(), df_val_42.copy(), fill_strategy="zero")

# Validation RMSE (rounded to 2 decimals) keyed by r
rmse_by_r = {}
for r in r_list:
    # scikit-learn advises against Ridge(alpha=0) for numerical reasons;
    # r == 0 is plain least squares, so fit LinearRegression for that case.
    ridge = LinearRegression() if r == 0 else Ridge(alpha=r)
    ridge.fit(Xq4_tr, yq4_tr)
    pr = ridge.predict(Xq4_val)
    rmse_r = rmse(yq4_val, pr)
    rmse_by_r[r] = round(rmse_r, 2)

print("\nQ4 — RMSE by r:", rmse_by_r)
best_rmse_q4 = min(rmse_by_r.values())
# Several r values can share the best rounded score; break ties with the smallest r.
best_r_candidates = [r for r, sc in rmse_by_r.items() if sc == best_rmse_q4]
best_r_q4 = min(best_r_candidates)
print("Q4 — Answer: r =", best_r_q4)


Q4 — RMSE by r: {0: np.float64(0.52), 0.01: np.float64(0.52), 0.1: np.float64(0.52), 1: np.float64(0.52), 5: np.float64(0.52), 10: np.float64(0.52), 100: np.float64(0.52)}
Q4 — Answer: r = 0


In [12]:
# Q5
# For seeds 0..9: split, fill missing horsepower with 0, fit a plain
# LinearRegression and record the validation RMSE; then report the spread.
rmse_raw_per_seed = []

for seed in range(10):
    dtr, dval, _ = split_60_20_20(df, seed=seed)
    Xs_tr, ys_tr, Xs_val, ys_val = make_Xy(dtr.copy(), dval.copy(), fill_strategy="zero")
    lin = LinearRegression()
    lin.fit(Xs_tr, ys_tr)
    rmse_raw_per_seed.append(rmse(ys_val, lin.predict(Xs_val)))

# Rounded scores are for display only; the std is taken over the raw scores.
rmse_rounded_per_seed = [round(score, 2) for score in rmse_raw_per_seed]
std_rmse = round(np.std(rmse_raw_per_seed), 3)

print("\nQ5 — RMSE per seed (rounded):", rmse_rounded_per_seed)
print("Q5 — std of RMSE across seeds:", std_rmse)


Q5 — RMSE per seed (rounded): [np.float64(0.52), np.float64(0.52), np.float64(0.52), np.float64(0.52), np.float64(0.51), np.float64(0.53), np.float64(0.53), np.float64(0.51), np.float64(0.51), np.float64(0.51)]
Q5 — std of RMSE across seeds: 0.007


In [13]:
# Q6

# Seed 9: train on train+val with missing horsepower set to 0,
# fit Ridge with r=0.001 and report the RMSE on the test part.
dtr9, dval9, dte9 = split_60_20_20(df, seed=9)

# Stack train and validation rows into one frame with a fresh 0..n-1 index
dtrainval9 = pd.concat([dtr9, dval9], ignore_index=True)

# Zero-fill horsepower in both the combined training frame and the test frame
for frame in (dtrainval9, dte9):
    frame['horsepower'] = frame['horsepower'].fillna(0)

X_trval = dtrainval9.drop(columns='fuel_efficiency_mpg').values
y_trval = dtrainval9['fuel_efficiency_mpg'].values
X_test9 = dte9.drop(columns='fuel_efficiency_mpg').values
y_test9 = dte9['fuel_efficiency_mpg'].values

ridge_001 = Ridge(alpha=0.001)
ridge_001.fit(X_trval, y_trval)
test_rmse_q6 = rmse(y_test9, ridge_001.predict(X_test9))
print("\nQ6 — Test RMSE (r=0.001, seed=9):", round(test_rmse_q6, 3))



Q6 — Test RMSE (r=0.001, seed=9): 0.515
