In [45]:
import numpy as np
import pandas as pd

In [46]:
# Read the car fuel-efficiency CSV from GitHub and show which columns it has.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(DATA_URL)
print(list(df.columns))

['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight', 'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain', 'num_doors', 'fuel_efficiency_mpg']


In [51]:
# Column rename table (old name -> name used in the rest of the notebook);
# consumed by the df.rename(columns=FEATURE_MAP) cell.
# NOTE(review): the column list printed after loading already shows the target
# names, so this mapping appears to be a no-op for the current CSV — confirm.
FEATURE_MAP = {
    'displacement': 'engine_displacement',
    'weight': 'vehicle_weight',
    'model year': 'model_year',
}

In [50]:
# Rebind instead of mutating in place: the cell stays idempotent on re-run and
# no earlier reference to the frame is silently changed.
df = df.rename(columns=FEATURE_MAP)

In [39]:
# Force horsepower to a numeric dtype; anything unparsable becomes NaN.
df['horsepower'] = df['horsepower'].pipe(pd.to_numeric, errors='coerce')

In [40]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [52]:
# Count NaNs per model feature.
missing_counts = (
    df.loc[:, ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']]
    .isnull()
    .sum()
)

In [53]:
# Name of the column with the most NaNs, or the string 'None' when nothing is missing.
if missing_counts.max() > 0:
    q1_answer = missing_counts.idxmax()
else:
    q1_answer = 'None'

In [54]:
# Report the per-column NaN counts and the Q1 answer derived from them.
print("Missing counts:\n", missing_counts)
print("\n[Q1] Column with missing values:", q1_answer)

Missing counts:
 engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
dtype: int64

[Q1] Column with missing values: horsepower


In [55]:
# Median horsepower over the whole frame, as a plain Python float.
hp_median = float(df.loc[:, 'horsepower'].median())

In [56]:
# Candidate answers for Q2, stored as floats.
options_q2 = np.asarray([49.0, 99.0, 149.0, 199.0])

In [57]:
# Pick the candidate with the smallest absolute distance to the median.
q2_nearest = float(options_q2[np.abs(options_q2 - hp_median).argmin()])

In [58]:
# Report the median (rounded to a whole number for display) and the closest Q2 option.
print("horsepower median (raw):", round(hp_median, 0))
print("[Q2] Nearest option:", q2_nearest)

horsepower median (raw): 149.0
[Q2] Nearest option: 149.0


In [60]:
def train_val_test_split(df_in, seed, val_frac=0.2, test_frac=0.2):
    """Shuffle the rows of ``df_in`` and cut them into train / val / test frames.

    The validation and test sizes are ``int(len(df_in) * frac)``; the training
    part receives whatever rows remain. The shuffle is driven by
    ``np.random.default_rng(seed)``, so the same seed yields the same split.

    Returns:
        Tuple ``(df_train, df_val, df_test)``, each with a fresh 0..k-1 index.
    """
    total = len(df_in)
    val_size = int(total * val_frac)
    test_size = int(total * test_frac)
    train_size = total - val_size - test_size

    # Shuffle row positions in place with a seeded generator.
    order = np.arange(total)
    np.random.default_rng(seed).shuffle(order)

    val_end = train_size + val_size
    windows = (
        slice(None, train_size),
        slice(train_size, val_end),
        slice(val_end, None),
    )
    return tuple(
        df_in.iloc[order[window]].reset_index(drop=True) for window in windows
    )

In [61]:
def fill_missing(df_part, how='mean'):
    """Return a copy of ``df_part`` with NaNs in ``horsepower`` replaced.

    Args:
        df_part: frame containing a ``horsepower`` column; it is not modified.
        how: ``'mean'`` fills with the mean of this frame's own ``horsepower``
            column, ``'zero'`` fills with ``0.0``.

    Raises:
        ValueError: if ``how`` is neither ``'mean'`` nor ``'zero'``.
    """
    filled = df_part.copy()
    if how not in ('mean', 'zero'):
        raise ValueError("how must be 'mean' or 'zero'")

    replacement = filled['horsepower'].mean() if how == 'mean' else 0.0
    filled['horsepower'] = filled['horsepower'].fillna(replacement)
    return filled

In [62]:
def feature_matrix(df_part):
    """Build the float design matrix: a leading column of ones (bias) followed by
    engine_displacement, horsepower, vehicle_weight and model_year."""
    features = df_part[['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']]
    values = features.values.astype(float)
    bias = np.ones((len(values), 1))
    return np.concatenate([bias, values], axis=1)

In [63]:
def target_vector(df_part, target_col=None):
    """Return the regression target of ``df_part`` as a float NumPy array.

    Args:
        df_part: frame holding the target column.
        target_col: explicit column name. When omitted, ``'mpg'`` is used if it
            exists (the name this function originally hard-coded), otherwise
            ``'fuel_efficiency_mpg'`` — the name the loaded CSV actually has,
            so the function works on a freshly loaded frame.

    Raises:
        KeyError: if no usable target column is present.
    """
    if target_col is None:
        for candidate in ('mpg', 'fuel_efficiency_mpg'):
            if candidate in df_part.columns:
                target_col = candidate
                break
        else:
            raise KeyError(
                "no target column found; expected 'mpg' or 'fuel_efficiency_mpg'"
            )
    return df_part[target_col].values.astype(float)

In [64]:
def linear_regression_fit(X, y, r=0.0):
    """Fit linear-regression weights via the normal equations.

    Solves ``(X^T X + r I) w = X^T y`` for ``w``. The ridge term ``r I`` is
    added only when ``r`` is positive, and it is applied to every column of
    ``X`` (including a bias column, if ``X`` contains one).

    Args:
        X: 2-D design matrix.
        y: target values aligned with the rows of ``X``.
        r: ridge strength; ``0`` (or any non-positive value) disables it.

    Returns:
        The weight array ``w``.

    Raises:
        numpy.linalg.LinAlgError: if the system matrix is singular.
    """
    gram = X.T @ X
    if r and r > 0:
        gram = gram + r * np.eye(gram.shape[0])
    # Solve the linear system directly rather than forming an explicit inverse:
    # it is more accurate on ill-conditioned Gram matrices and avoids the
    # (features x samples) intermediate product.
    return np.linalg.solve(gram, X.T @ y)

In [65]:
def predict(X, w):
    """Apply weights ``w`` to design matrix ``X`` and return the product ``X @ w``."""
    y_hat = X @ w
    return y_hat

In [66]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between ``y_true`` and ``y_pred`` as a Python float."""
    residuals = y_true - y_pred
    mean_squared = np.mean(residuals ** 2)
    return float(np.sqrt(mean_squared))

In [67]:
def evaluate_split(df_in, seed, r, fill='mean'):
    """Train on the training split and return the RMSE on the validation split.

    Args:
        df_in: full data frame; split with ``train_val_test_split(df_in, seed)``.
        seed: seed forwarded to the split.
        r: ridge strength forwarded to ``linear_regression_fit``.
        fill: ``'mean'`` or ``'zero'`` — how NaNs in ``horsepower`` are imputed.

    Returns:
        Validation RMSE as a float.

    Raises:
        ValueError: propagated from ``fill_missing`` for an unknown ``fill``.
    """
    df_train, df_val, _ = train_val_test_split(df_in, seed)

    # The imputation statistic must come from the training rows only. Filling
    # validation rows with the validation set's own mean would leak information
    # from the evaluation data into preprocessing.
    train_hp_mean = df_train['horsepower'].mean()
    df_train = fill_missing(df_train, fill)  # also validates `fill`
    if fill == 'mean':
        df_val = df_val.copy()
        df_val['horsepower'] = df_val['horsepower'].fillna(train_hp_mean)
    else:
        df_val = fill_missing(df_val, fill)

    X_tr, y_tr = feature_matrix(df_train), target_vector(df_train)
    X_v, y_v = feature_matrix(df_val), target_vector(df_val)
    w = linear_regression_fit(X_tr, y_tr, r=r)
    return rmse(y_v, predict(X_v, w))

In [68]:
# Q3: same split and no regularisation, once per imputation strategy.
seed_for_q3 = 42
rmse_mean, rmse_zero = (
    evaluate_split(df, seed=seed_for_q3, r=0.0, fill=strategy)
    for strategy in ('mean', 'zero')
)

In [69]:
# Mean imputation wins ties; otherwise the lower validation RMSE decides.
if rmse_mean <= rmse_zero:
    better = 'With mean'
else:
    better = 'With 0'
print(f"Validation RMSE (mean-fill): {rmse_mean:.3f}")
print(f"Validation RMSE (zero-fill): {rmse_zero:.3f}")
print("[Q3] Better option:", better)

Validation RMSE (mean-fill): 0.467
Validation RMSE (zero-fill): 0.522
[Q3] Better option: With mean


In [70]:
# Ridge strengths tried in the Q4 grid search, and the split seed used for it.
R_ALPHAS = [0, 0.01, 1, 10, 100]
seed_for_q4 = 42

In [71]:
# One (r, validation RMSE) pair per candidate ridge strength, in R_ALPHAS order.
grid = [
    (r, evaluate_split(df, seed=seed_for_q4, r=r, fill='mean'))
    for r in R_ALPHAS
]

In [72]:
# Order the grid by validation RMSE (stable sort) and take the front entry.
grid_sorted = sorted(grid, key=lambda pair: pair[1])
best_r = grid_sorted[0][0]
best_val_rmse = grid_sorted[0][1]

In [73]:
# Print the grid in its original (unsorted) order, then the selected r.
print("Validation RMSE by r:")
for r, s in grid:
    print(f"  r={r:<6} rmse={s:.3f}")  # r is left-aligned in a 6-character field
print(f"\n[Q4] Best r: {best_r}")

Validation RMSE by r:
  r=0      rmse=0.467
  r=0.01   rmse=0.467
  r=1      rmse=0.473
  r=10     rmse=0.473
  r=100    rmse=0.474

[Q4] Best r: 0.01


In [74]:
# Q5: repeat the validation run for ten different split seeds using the chosen r.
seeds = list(range(10))
scores = [
    evaluate_split(df, seed=seed, r=best_r, fill='mean')
    for seed in seeds
]

In [75]:
# Population standard deviation of the per-seed RMSEs and its closest Q5 option.
std_rmse = float(np.asarray(scores).std())
options_q5 = np.asarray([0.001, 0.006, 0.060, 0.600])
nearest_q5 = float(options_q5[np.abs(options_q5 - std_rmse).argmin()])

In [76]:
# List the validation RMSE obtained for each split seed.
print("Seed -> RMSE:")
for s, sc in zip(seeds, scores):
    print(f"  {s}: {sc:.3f}")

Seed -> RMSE:
  0: 0.475
  1: 0.462
  2: 0.467
  3: 0.461
  4: 0.467
  5: 0.471
  6: 0.462
  7: 0.457
  8: 0.463
  9: 0.474


In [77]:
# Report the spread of the per-seed RMSEs (3 decimals) and the closest Q5 option.
print("\n[Q5] std(RMSE):", round(std_rmse, 3))
print("[Q5] Nearest option:", nearest_q5)


[Q5] std(RMSE): 0.006
[Q5] Nearest option: 0.006


In [78]:
def final_test_rmse(df_in, seed, r):
    """Train on train+validation rows and return the RMSE on the test split.

    Args:
        df_in: full data frame; split with ``train_val_test_split(df_in, seed)``.
        seed: seed forwarded to the split.
        r: ridge strength forwarded to ``linear_regression_fit``.

    Returns:
        Test RMSE as a float.
    """
    df_train, df_val, df_test = train_val_test_split(df_in, seed)
    df_train_full = pd.concat([df_train, df_val], ignore_index=True)

    # Impute the held-out rows with the mean learned from the training rows,
    # not with the test set's own mean, so that no test-set statistic enters
    # the preprocessing.
    train_hp_mean = df_train_full['horsepower'].mean()
    df_train_full = fill_missing(df_train_full, 'mean')
    df_test_f = df_test.copy()
    df_test_f['horsepower'] = df_test_f['horsepower'].fillna(train_hp_mean)

    X_trf, y_trf = feature_matrix(df_train_full), target_vector(df_train_full)
    X_te, y_te = feature_matrix(df_test_f), target_vector(df_test_f)

    w = linear_regression_fit(X_trf, y_trf, r=r)
    return rmse(y_te, predict(X_te, w))

In [79]:
# Q6: test RMSE for split seed 9 with the chosen r, and its closest answer option.
rmse_test = final_test_rmse(df, seed=9, r=best_r)
options_q6 = np.asarray([0.15, 0.515, 5.15, 51.5])
nearest_q6 = float(options_q6[np.abs(options_q6 - rmse_test).argmin()])

In [80]:
# Report the test RMSE (3 decimals) and the closest Q6 option.
print("[Q6] Test RMSE:", round(rmse_test, 3))
print("[Q6] Nearest option:", nearest_q6)

[Q6] Test RMSE: 0.448
[Q6] Nearest option: 0.515
