# Machine Learning for Regression for Machine Learning Zoomcamp 2025

## Import libraries & Dataset


In [48]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')

## Q1. Missing values


In [50]:
df


Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


In [51]:
missing_values = df.isnull().sum()
missing_values

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

## Q2. Median for horse power

In [53]:
horsepower_median = df['horsepower'].median()
print(horsepower_median)

149.0


## Q3. Filling NAs


In [55]:
columns = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
df_filtered = df[columns].copy()

df_filtered = df_filtered.fillna(df_filtered.mean())
df_filtered

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.000000,3413.433759,2003,13.231729
1,130,97.000000,3149.664934,2007,13.688217
2,170,78.000000,3079.038997,2018,14.246341
3,220,149.657292,2542.392402,2009,16.912736
4,210,140.000000,3460.870990,2009,12.488369
...,...,...,...,...,...
9699,140,164.000000,2981.107371,2013,15.101802
9700,180,154.000000,2439.525729,2004,17.962326
9701,220,138.000000,2583.471318,2008,17.186587
9702,230,177.000000,2905.527390,2011,15.331551


## Q4. Best regularization

In [57]:
# Data Preparation
np.random.seed(42)
df_filtered = df_filtered.sample(frac=1, random_state=42).reset_index(drop=True)

n = len(df_filtered)
n_train = int(0.6 * n)
n_val = int(0.2 * n)

df_train = df_filtered[:n_train]
df_val = df_filtered[n_train:n_train + n_val]
df_test = df_filtered[n_train + n_val:]

In [58]:
features = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target = 'fuel_efficiency_mpg'

X_train = df_train[features].copy()
y_train = df_train[target].values

X_val = df_val[features].copy()
y_val = df_val[target].values

## Q5. RMSE Standard Deviation

In [60]:

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

X_train_0 = X_train.fillna(0)
X_val_0 = X_val.fillna(0)

# Train model
model_0 = LinearRegression()
model_0.fit(X_train_0, y_train)

# Predict
y_pred_0 = model_0.predict(X_val_0)
import numpy as np
rmse_0 = np.sqrt(mean_squared_error(y_val, y_pred_0))
print("RMSE with 0 is : ", round(rmse_0, 2))

RMSE with 0 is :  0.46


In [61]:
hp_mean = X_train['horsepower'].mean()

X_train_mean = X_train.fillna(hp_mean)
X_val_mean = X_val.fillna(hp_mean)

# Train model
model_mean = LinearRegression()
model_mean.fit(X_train_mean, y_train)

# Predict and evaluate
y_pred_mean = model_mean.predict(X_val_mean)
rmse_mean = mean_squared_error(y_val, y_pred_mean)
print("RMSE with mean is : ", round(rmse_mean, 2))

RMSE with mean is :  0.21


In [62]:
from sklearn.linear_model import Ridge

# Fill missing with 0
X_train_0 = X_train.fillna(0)
X_val_0 = X_val.fillna(0)

r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
results = {}

for r in r_values:
    model = Ridge(alpha=r)
    model.fit(X_train_0, y_train)
    y_pred = model.predict(X_val_0)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    results[r] = round(rmse, 2)

results

{0: 0.46, 0.01: 0.46, 0.1: 0.46, 1: 0.46, 5: 0.46, 10: 0.46, 100: 0.46}

In [63]:
from sklearn.model_selection import train_test_split

seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
scores = []

for s in seeds:
    df_shuffled = df[['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']]
    df_shuffled = df_shuffled.sample(frac=1, random_state=s)

    n = len(df_shuffled)
    n_train = int(0.6 * n)
    n_val = int(0.2 * n)
    n_test = n - n_train - n_val

    df_train = df_shuffled.iloc[:n_train]
    df_val = df_shuffled.iloc[n_train:n_train+n_val]
    df_test = df_shuffled.iloc[n_train+n_val:]

    X_train = df_train.drop('fuel_efficiency_mpg', axis=1).fillna(0)
    y_train = df_train['fuel_efficiency_mpg']
    X_val = df_val.drop('fuel_efficiency_mpg', axis=1).fillna(0)
    y_val = df_val['fuel_efficiency_mpg']

    # Train linear regression
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    scores.append(rmse)

# Compute standard deviation
std = np.std(scores)
print("Standard deviation of RMSE is : ", round(std, 3))

Standard deviation of RMSE is :  0.007


## Q6. Evaluation on test

In [65]:
df_shuffled = df[['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']]
df_shuffled = df_shuffled.sample(frac=1, random_state=9)

n = len(df_shuffled)
n_train = int(0.6 * n)
n_val = int(0.2 * n)
n_test = n - n_train - n_val

df_train = df_shuffled.iloc[:n_train]
df_val = df_shuffled.iloc[n_train:n_train+n_val]
df_test = df_shuffled.iloc[n_train+n_val:]

# Combine train and validation
df_full_train = pd.concat([df_train, df_val])

X_full_train = df_full_train.drop('fuel_efficiency_mpg', axis=1).fillna(0)
y_full_train = df_full_train['fuel_efficiency_mpg']

X_test = df_test.drop('fuel_efficiency_mpg', axis=1).fillna(0)
y_test = df_test['fuel_efficiency_mpg']

# Train model with r = 0.001
model = Ridge(alpha=0.001)
model.fit(X_full_train, y_full_train)

# Predict on test
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Test RMSE is : ", round(rmse, 3))

Test RMSE is :  0.515
