In [97]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [98]:
df.describe()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg
count,9704.0,9222.0,8996.0,9704.0,8774.0,9704.0,9202.0,9704.0
mean,199.708368,3.962481,149.657292,3001.280993,15.021928,2011.484027,-0.006412,14.985243
std,49.455319,1.999323,29.879555,497.89486,2.510339,6.659808,1.048162,2.556468
min,10.0,0.0,37.0,952.681761,6.0,2000.0,-4.0,6.200971
25%,170.0,3.0,130.0,2666.248985,13.3,2006.0,-1.0,13.267459
50%,200.0,4.0,149.0,2993.226296,15.0,2012.0,0.0,15.006037
75%,230.0,5.0,170.0,3334.957039,16.7,2017.0,1.0,16.707965
max,380.0,13.0,271.0,4739.077089,24.3,2023.0,4.0,25.967222


In [99]:
# df_selected_col = df[['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']]
# df_selected_col

In [100]:
selected_cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
df_selected = df[selected_cols]

In [101]:
# Question 1
# There's one column with missing values. What is it?

df_selected.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [102]:
# Question 2
# What's the median (50% percentile) for variable 'horsepower'?

median_horsepower = df_selected['horsepower'].median()
median_horsepower

np.float64(149.0)

In [103]:
n = len(df_selected) 

n_val = int( n * 0.2)
n_test = int( n * 0.2)
n_train = n - n_val - n_test

In [104]:
n

9704

In [105]:
n_val + n_test + n_train

9704

In [106]:
n_val, n_test, n_train

(1940, 1940, 5824)

In [108]:
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

In [109]:
df_train = df_selected.iloc[:n_train]
df_val = df_selected.iloc[n_train:n_train+n_val]
df_test = df_selected.iloc[n_train+n_val:]

In [110]:
df_train.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


In [111]:
len(df_train), len(df_val), len(df_test)

(5824, 1940, 1940)

In [112]:
df_train = df_train.reset_index(drop=True).copy()
df_val = df_val.reset_index(drop=True).copy()
df_test = df_test.reset_index(drop=True).copy()

In [113]:
# y_train = np.log1p(df_train.msrp.values)
# y_val = np.log1p(df_val.msrp.values)
# y_test = np.log1p(df_test.msrp.values)

In [114]:
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target = 'fuel_efficiency_mpg'

def prepare_X(df):
    return df[base].copy()

In [115]:
# Option 1 - Fill with 0

df_train_0 = prepare_X(df_train).fillna(0)
df_val_0   = prepare_X(df_val).fillna(0)

y_train = df_train[target].values
y_val   = df_val[target].values

model_0 = LinearRegression()
model_0.fit(df_train_0, y_train)
pred_0  = model_0.predict(df_val_0)

rmse_0  = mean_squared_error(y_val, pred_0)
print("RMSE (fill 0):", round(rmse_0, 2))

RMSE (fill 0): 0.27


In [116]:
# Option 2 - Fill with the mean of this variable

df_train_mean = prepare_X(df_train).copy()
df_val_mean   = prepare_X(df_val).copy()

means = df_train_mean.mean()
df_train_mean = df_train_mean.fillna(means)
df_val_mean   = df_val_mean.fillna(means)

model_mean = LinearRegression()
model_mean.fit(df_train_mean, y_train)
pred_mean = model_mean.predict(df_val_mean)

mse_mean = mean_squared_error(y_val, pred_mean)
rmse_mean = np.sqrt(mse_mean)
print("RMSE (fill mean):", round(rmse_mean, 2))

RMSE (fill mean): 0.46


In [117]:
# Which option give better RMSE?

if round(rmse_0, 2) < round(rmse_mean, 2):
    print("→ Filling with 0 gives better RMSE.")
elif round(rmse_0, 2) > round(rmse_mean, 2):
    print("→ Filling with mean gives better RMSE.")
else:
    print("→ Both are equally good.")

→ Filling with 0 gives better RMSE.


In [121]:
# Question 4

def prepare_X_zero(df: pd.DataFrame) -> pd.DataFrame:
    # 1) select features
    X = df.loc[:, base].copy()
    # 2) coerce to numeric (strings -> NaN)
    X = X.apply(pd.to_numeric, errors='coerce')
    # 3) fill NaNs with 0 as required
    X = X.fillna(0)
    return X

def prepare_y(df: pd.DataFrame) -> np.ndarray:
    y = pd.to_numeric(df[target], errors='coerce').to_numpy()
    return y

In [122]:
# Step 1 — Prepare train/val data (fill NA with 0)
X_train = prepare_X_zero(df_train)
X_val   = prepare_X_zero(df_val)
y_train = prepare_y(df_train)
y_val   = prepare_y(df_val)

In [123]:
# Step 2 — Train Ridge models with different r values
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = {}

for r in r_values:
    model = Ridge(alpha=r)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    mse = mean_squared_error(y_val, pred)
    rmse = np.sqrt(mse)
    rmse_scores[r] = round(rmse, 2)
    print(f"r={r}: RMSE={round(rmse, 2)}")

r=0: RMSE=0.52
r=0.01: RMSE=0.52
r=0.1: RMSE=0.52
r=1: RMSE=0.52
r=5: RMSE=0.52
r=10: RMSE=0.52
r=100: RMSE=0.52


In [124]:
# Step 3 — Find the best r (lowest RMSE; smallest if tie)
best_r = min(rmse_scores, key=lambda x: (rmse_scores[x], x))
print("\nBest r:", best_r)


Best r: 0


In [128]:
# Question 5

# Setup
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target = 'fuel_efficiency_mpg'

def prepare_X(df):
    X = df[base].copy()
    X = X.apply(pd.to_numeric, errors='coerce').fillna(0)
    return X

def prepare_y(df):
    return pd.to_numeric(df[target], errors='coerce').to_numpy()

In [129]:
# Try different random seeds
seeds = [0,1,2,3,4,5,6,7,8,9]
rmse_scores = []

for seed in seeds:
    # Shuffle
    df_shuffled = df_selected.sample(frac=1, random_state=seed).reset_index(drop=True)
    
    # Split 60/20/20
    n = len(df_shuffled)
    n_train = int(0.6 * n)
    n_val   = int(0.2 * n)
    n_test  = n - n_train - n_val
    
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val   = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test  = df_shuffled.iloc[n_train+n_val:].copy()
    
    # Prepare data
    X_train = prepare_X(df_train)
    X_val   = prepare_X(df_val)
    y_train = prepare_y(df_train)
    y_val   = prepare_y(df_val)
    
    # Train plain linear regression (no regularization)
    model = LinearRegression()
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    
    mse = mean_squared_error(y_val, pred)
    rmse = np.sqrt(mse)
    
    rmse_scores.append(round(rmse, 3))
    print(f"Seed {seed}: RMSE = {round(rmse, 3)}")

Seed 0: RMSE = 0.521
Seed 1: RMSE = 0.522
Seed 2: RMSE = 0.523
Seed 3: RMSE = 0.516
Seed 4: RMSE = 0.511
Seed 5: RMSE = 0.529
Seed 6: RMSE = 0.532
Seed 7: RMSE = 0.51
Seed 8: RMSE = 0.515
Seed 9: RMSE = 0.513


In [130]:
# Compute std of RMSE scores
std = np.std(rmse_scores)
print("\nStandard deviation of RMSE:", round(std, 3))


Standard deviation of RMSE: 0.007


In [131]:
# Question 6

idx = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)

In [132]:
# Step 1 — Split with seed = 9

df_train = df_selected.iloc[:n_train]
df_val = df_selected.iloc[n_train:n_train+n_val]
df_test = df_selected.iloc[n_train+n_val:]

In [133]:
# Step 2 — Combine train + val

df_full_train = pd.concat([df_train, df_val]).reset_index(drop=True)

In [134]:
# Step 3 — Prepare data (fill NAs with 0)

base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target = 'fuel_efficiency_mpg'

def prepare_X(df):
    X = df[base].copy()
    X = X.apply(pd.to_numeric, errors='coerce').fillna(0)
    return X

def prepare_y(df):
    return pd.to_numeric(df[target], errors='coerce').to_numpy()

X_full_train = prepare_X(df_full_train)
y_full_train = prepare_y(df_full_train)
X_test       = prepare_X(df_test)
y_test       = prepare_y(df_test)

In [135]:
# Step 4 — Train Ridge model (r = 0.001)

model = Ridge(alpha=0.001)
model.fit(X_full_train, y_full_train)
pred_test = model.predict(X_test)

In [139]:
# Step 5 — Evaluate RMSE

mse_test = mean_squared_error(y_test, pred_test)
rmse_test = np.sqrt(mse_test)
print("RMSE on test:", round(rmse_test, 3))

RMSE on test: 0.511
