In [2]:
import pandas as pd

In [8]:
# Download the car fuel-efficiency CSV into the current working directory
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-10-13 06:04:13--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-10-13 06:04:14 (180 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



In [3]:
# Load the downloaded CSV into a DataFrame
df = pd.read_csv('car_fuel_efficiency.csv')

In [4]:
# Preview the first rows to check the column names and value types
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [5]:
# Columns kept for modelling; fuel_efficiency_mpg is the prediction target
columns = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]

# Work on an independent copy so the full frame `df` is never modified
df_subset = df.loc[:, columns].copy()

In [6]:
# Display the reduced frame (pandas truncates the middle rows in the output)
df_subset

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.870990,2009,12.488369
...,...,...,...,...,...
9699,140,164.0,2981.107371,2013,15.101802
9700,180,154.0,2439.525729,2004,17.962326
9701,220,138.0,2583.471318,2008,17.186587
9702,230,177.0,2905.527390,2011,15.331551


In [7]:
# Count the missing values in each column
df_subset.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [8]:
# Median of the horsepower column, computed over the whole subset
df_subset['horsepower'].median()

149.0

In [9]:
import numpy as np

In [11]:
# Seed NumPy's global RNG so the shuffle below produces the same split every run
np.random.seed(42)

# Row positions 0..n-1, shuffled in place
n = len(df_subset)
idx = np.arange(n)
np.random.shuffle(idx)

# 20% of the rows each for validation and test; whatever remains is training
n_val = n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Layout of the shuffled positions: [ validation | test | train ]
val_idx = idx[:n_val]
test_idx = idx[n_val:n_val + n_test]
train_idx = idx[n_val + n_test:]

# Pick the rows of each split by position
df_val = df_subset.iloc[val_idx]
df_test = df_subset.iloc[test_idx]
df_train = df_subset.iloc[train_idx]

In [21]:
# Training split: the target goes into a NumPy array, every other column
# stays in the feature frame
y_train = df_train['fuel_efficiency_mpg'].values
X_train = df_train.drop(columns='fuel_efficiency_mpg')

# Validation split, prepared the same way
y_val = df_val['fuel_efficiency_mpg'].values
X_val = df_val.drop(columns='fuel_efficiency_mpg')

# Function to train the model using the normal equation
def train_linear_regression(X, y):
    """Fit ordinary least squares with the normal equation.

    Parameters
    ----------
    X : 2-D array-like of features, one row per sample.
    y : 1-D array of target values, one per row of ``X``.

    Returns
    -------
    (w0, w) : the bias term and the array of per-feature weights.
    """
    # Prepend a column of ones so the first weight acts as the bias
    design = np.column_stack([np.ones(X.shape[0]), X])

    # Normal equation: w = (X^T X)^-1 X^T y
    gram = design.T.dot(design)
    weights = np.linalg.inv(gram).dot(design.T).dot(y)

    bias, coefs = weights[0], weights[1:]
    return bias, coefs

# Function to calculate RMSE
def rmse(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``.

    Both arguments must support element-wise subtraction with each other
    (e.g. NumPy arrays or a pandas Series and an array of equal length).
    """
    residual = y_pred - y
    # Mean of the squared residuals, then back to the target's units
    return np.sqrt((residual ** 2).mean())

In [23]:
# --- Option 1: replace missing values with 0 ---
# Apply the same imputation to the training and validation features
X_train_zero = X_train.fillna(0)
X_val_zero = X_val.fillna(0)

# Fit on the training split
w0_zero, w_zero = train_linear_regression(X_train_zero, y_train)

# Score the fitted model on the validation split
y_pred_zero = w0_zero + X_val_zero.dot(w_zero)
rmse_zero = rmse(y_val, y_pred_zero)
print(f"RMSE with zero-filling: {round(rmse_zero, 2)}")

RMSE with zero-filling: 0.53


In [24]:
# --- Option 2: replace missing horsepower with the training mean ---
# The mean is computed on the training split ONLY, so nothing about the
# validation rows leaks into the imputation value.
mean_hp = X_train['horsepower'].mean()

# Impute the horsepower column explicitly. A bare fillna(mean_hp) would write
# the horsepower mean into a gap in ANY column, which is only harmless while
# horsepower is the sole column with missing values.
X_train_mean = X_train.fillna({'horsepower': mean_hp})
w0_mean, w_mean = train_linear_regression(X_train_mean, y_train)

# Score on the validation split, imputed with the same training mean
X_val_mean = X_val.fillna({'horsepower': mean_hp})
y_pred_mean = w0_mean + X_val_mean.dot(w_mean)
rmse_mean = rmse(y_val, y_pred_mean)
print(f"RMSE with mean-filling: {round(rmse_mean, 2)}")

RMSE with mean-filling: 0.47


In [41]:
# A new function for training with regularization
def train_linear_regression_reg(X, y, r=0.0):
    """Fit linear regression with the regularized normal equation.

    Parameters
    ----------
    X : 2-D array-like of features, one row per sample.
    y : 1-D array of target values, one per row of ``X``.
    r : value added to every diagonal entry of X^T X (the bias entry
        included); ``0.0`` gives the unregularized solution.

    Returns
    -------
    (w0, w) : the bias term and the array of per-feature weights.
    """
    # Prepend a column of ones so the first weight acts as the bias
    design = np.column_stack([np.ones(X.shape[0]), X])

    # Regularized normal equation: w = (X^T X + r*I)^-1 X^T y
    gram = design.T.dot(design)
    gram = gram + r * np.eye(gram.shape[0])
    weights = np.linalg.inv(gram).dot(design.T).dot(y)

    return weights[0], weights[1:]

# Zero-fill the missing values in both feature frames
X_train_zero = X_train.fillna(0)
X_val_zero = X_val.fillna(0)

# Try each regularization strength in turn
for r in (0, 0.01, 0.1, 1, 5, 10, 100):
    # Fit on the training split with the current r
    w0, w = train_linear_regression_reg(X_train_zero, y_train, r=r)

    # Predict the validation targets
    y_pred = w0 + X_val_zero.dot(w)

    # Report the validation RMSE, rounded to 6 decimal places
    score = rmse(y_val, y_pred)
    print(f"r = {r}, RMSE = {round(score, 6)}")

r = 0, RMSE = 0.530479
r = 0.01, RMSE = 0.530955
r = 0.1, RMSE = 0.534768
r = 1, RMSE = 0.539329
r = 5, RMSE = 0.54011
r = 10, RMSE = 0.540215
r = 100, RMSE = 0.540312


In [42]:
# Validation RMSE collected for each seed
rmse_scores = []

# Seeds 0..9, one shuffle/split per seed
seeds = list(range(10))

# NOTE: this loop reassigns df_train, df_val, X_train, y_train, X_val and y_val,
# so after it runs they hold the zero-filled split of the last seed.
for seed in seeds:
    # Reseed so each split is reproducible
    np.random.seed(seed)

    # --- 60/20/20 split on shuffled row positions ---
    n = len(df_subset)
    idx = np.arange(n)
    np.random.shuffle(idx)
    n_val = n_test = int(n * 0.2)
    val_idx = idx[:n_val]
    train_idx = idx[n_val + n_test:]
    df_val = df_subset.iloc[val_idx]
    df_train = df_subset.iloc[train_idx]

    # --- Features (missing values -> 0) and target arrays ---
    y_train = df_train['fuel_efficiency_mpg'].values
    X_train = df_train.drop(columns='fuel_efficiency_mpg').fillna(0)
    y_val = df_val['fuel_efficiency_mpg'].values
    X_val = df_val.drop(columns='fuel_efficiency_mpg').fillna(0)

    # --- Fit the unregularized model and score it on validation ---
    w0, w = train_linear_regression(X_train, y_train)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    rmse_scores.append(score)

# Spread of the validation RMSE across the seeds
std_deviation = np.std(rmse_scores)

print(f"The standard deviation of the RMSE scores is: {round(std_deviation, 3)}")

The standard deviation of the RMSE scores is: 0.008


In [43]:
# Seed used for the final split
np.random.seed(9)

# --- 60/20/20 split on shuffled row positions ---
n = len(df_subset)
idx = np.arange(n)
np.random.shuffle(idx)
n_val = n_test = int(n * 0.2)
val_idx = idx[:n_val]
test_idx = idx[n_val:n_val + n_test]
train_idx = idx[n_val + n_test:]
df_val = df_subset.iloc[val_idx]
df_test = df_subset.iloc[test_idx]
df_train = df_subset.iloc[train_idx]

# --- Train + validation rows together form the final training set ---
# The index is renumbered from 0 after concatenating
df_full_train = pd.concat([df_train, df_val]).reset_index(drop=True)

# --- Features (missing values -> 0) and target arrays ---
y_full_train = df_full_train['fuel_efficiency_mpg'].values
X_full_train = df_full_train.drop(columns='fuel_efficiency_mpg').fillna(0)

y_test = df_test['fuel_efficiency_mpg'].values
X_test = df_test.drop(columns='fuel_efficiency_mpg').fillna(0)

# --- Fit the regularized model on train + validation ---
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

# --- Evaluate on the held-out test split ---
y_pred = w0 + X_test.dot(w)
final_rmse = rmse(y_test, y_pred)

print(f"The final RMSE on the test set is: {final_rmse}")

The final RMSE on the test set is: 0.5197377167065796
