## Getting data

In [99]:
import pandas as pd

In [100]:
data = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')
data.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


## Preparing the dataset


In [101]:
data.shape[1]

11

In [102]:
data.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [103]:
data2 = data[['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']]

## EDA

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.histplot(data['fuel_efficiency_mpg'], bins=30, kde=True)
plt.title('Distribution of Fuel Efficiency (mpg)')
plt.xlabel('Fuel Efficiency (mpg)')
plt.ylabel('Frequency')
plt.show()

## Question 1. Missing values

In [104]:
data2.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

## Question 2. Median for horse power

In [105]:
data2['horsepower'].median()

np.float64(149.0)

## Prepare and split the dataset

In [106]:
import numpy as np

In [107]:
n_rows = len(data2)

# Calculate the number of rows for each set
n_val = int(0.2 * n_rows)
n_test = int(0.2 * n_rows)
n_train = n_rows - n_val - n_test

In [108]:
# Create and shuffle indices
idx = np.arange(n_rows)
np.random.seed(42)
np.random.shuffle(idx)

In [109]:
# Split the data using shuffled indices
df_train = data2.iloc[idx[:n_train]]
df_val = data2.iloc[idx[n_train : n_train + n_val]]
df_test = data2.iloc[idx[n_train + n_val :]]

print(f"Training set size: {len(df_train)}")
print(f"Validation set size: {len(df_val)}")
print(f"Test set size: {len(df_test)}")

Training set size: 5824
Validation set size: 1940
Test set size: 1940


In [110]:
df_train.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
483,220,144.0,2535.887591,2009,16.642943
7506,160,141.0,2741.170484,2019,16.298377
8795,230,155.0,2471.880237,2017,18.591822
1688,150,206.0,3748.164469,2015,11.818843
6217,300,111.0,2135.716359,2006,19.402209


## Question 3. Filling NAs

In [111]:
#Separate the target variable ('fuel_efficiency_mpg') from the features in the training, validation, and test sets.

y_train = df_train['fuel_efficiency_mpg']
X_train = df_train.drop('fuel_efficiency_mpg', axis=1)

y_val = df_val['fuel_efficiency_mpg']
X_val = df_val.drop('fuel_efficiency_mpg', axis=1)

y_test = df_test['fuel_efficiency_mpg']
X_test = df_test.drop('fuel_efficiency_mpg', axis=1)

In [112]:
#filling missing 'horsepower' values with 0 for each split.

X_train_zero = X_train.copy()
X_val_zero = X_val.copy()
X_test_zero = X_test.copy()

X_train_zero['horsepower'].fillna(0, inplace=True)
X_val_zero['horsepower'].fillna(0, inplace=True)
X_test_zero['horsepower'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_zero['horsepower'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val_zero['horsepower'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

In [113]:
#Train a linear regression model using the training features filled with 0

from sklearn.linear_model import LinearRegression

model_zero = LinearRegression()
model_zero.fit(X_train_zero, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [114]:
#predictions on the validation set with 0-imputed values

from sklearn.metrics import mean_squared_error
y_pred_zero = model_zero.predict(X_val_zero)

In [115]:
#calculate the RMSE.

mse_zero = mean_squared_error(y_val, y_pred_zero)
rmse_zero = np.sqrt(mse_zero)
rmse_zero_rounded = round(rmse_zero, 2)

print(f"RMSE with 0 imputation: {rmse_zero_rounded}")

RMSE with 0 imputation: 0.52


In [116]:
##filling missing 'horsepower' values with mean for each split.

train_mean_horsepower = X_train['horsepower'].mean()

X_train_mean = X_train.copy()
X_train_mean['horsepower'].fillna(train_mean_horsepower, inplace=True)

X_val_mean = X_val.copy()
X_val_mean['horsepower'].fillna(train_mean_horsepower, inplace=True)

X_test_mean = X_test.copy()
X_test_mean['horsepower'].fillna(train_mean_horsepower, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_mean['horsepower'].fillna(train_mean_horsepower, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val_mean['horsepower'].fillna(train_mean_horsepower, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inte

In [117]:
#Train a linear regression model using the training features filled with 0

model_mean = LinearRegression()
model_mean.fit(X_train_mean, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [118]:
y_pred_mean = model_mean.predict(X_val_mean)
mse_mean = mean_squared_error(y_val, y_pred_mean)
rmse_mean = np.sqrt(mse_mean)
rmse_mean_rounded = round(rmse_mean, 2)

print(f"RMSE with mean imputation: {rmse_mean_rounded}")

RMSE with mean imputation: 0.46


When missing values in 'horsepower' were imputed with 0, the linear regression model achieved an RMSE of 0.52 on the validation set while that imputed with the mean of the training data, the linear regression model achieved an RMSE of 0.47 on the validation set.

Imputing missing 'horsepower' values with the training mean resulted in a lower RMSE, suggesting it is a better strategy than filling with 0 for this dataset and model.

## Question 4. Best regularization

In [119]:
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
print(r_values)

[0, 0.01, 0.1, 1, 5, 10, 100]


In [120]:
from sklearn.linear_model import Ridge

rmse_scores = []

for r in r_values:
    model = Ridge(alpha=r, solver='sag', random_state=42)
    model.fit(X_train_zero, y_train)

In [121]:
from sklearn.metrics import mean_squared_error

for r in r_values:
    model = Ridge(alpha=r, solver='sag', random_state=42)
    model.fit(X_train_zero, y_train)
    y_pred = model.predict(X_val_zero)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    rmse_scores.append(round(rmse, 2))

print("RMSE scores for each r:", rmse_scores) 

RMSE scores for each r: [np.float64(0.52), np.float64(0.52), np.float64(0.52), np.float64(0.52), np.float64(0.52), np.float64(0.52), np.float64(0.52)]


In [122]:
min_rmse = min(rmse_scores)
best_r_index = rmse_scores.index(min_rmse)
best_r = r_values[best_r_index]

print(f"The best regularization parameter (r) is: {best_r} with an RMSE of: {min_rmse}")

The best regularization parameter (r) is: 0 with an RMSE of: 0.52


All tested regularization strengths resulted in the same rounded RMSE of 0.52 on the validation set.
Based on the task's criteria (lowest RMSE, smallest r in case of a tie), the best regularization parameter (r) was identified as 0, with an RMSE of 0.52.

## Question 5. RMSE Standard Deviation

In [123]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
print(seeds)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [124]:
rmse_scores_by_seed = []
print(rmse_scores_by_seed)

[]


In [125]:
for seed in seeds:
    print(f"Processing with seed: {seed}")
    # The subsequent steps for shuffling, splitting, training, and evaluation will go inside this loop.

Processing with seed: 0
Processing with seed: 1
Processing with seed: 2
Processing with seed: 3
Processing with seed: 4
Processing with seed: 5
Processing with seed: 6
Processing with seed: 7
Processing with seed: 8
Processing with seed: 9


In [126]:
rmse_scores_numpy = []

In [127]:
# Split data into train, validation, and test sets (60/20/20)

n_rows = len(data2)
n_val = int(0.2 * n_rows)
n_test = int(0.2 * n_rows)
n_train = n_rows - n_val - n_test

In [128]:
idx = np.arange(n_rows)
np.random.seed(seed) # Use the current seed for shuffling indices
np.random.shuffle(idx)

In [129]:
df_train = data2.iloc[idx[:n_train]].copy() 
df_val = data2.iloc[idx[n_train : n_train + n_val]].copy() # Use .copy()
df_test = data2.iloc[idx[n_train + n_val :]].copy() # Use .copy()


In [130]:
# Identify features and target
y_train = df_train['fuel_efficiency_mpg']
X_train = df_train.drop('fuel_efficiency_mpg', axis=1)

y_val = df_val['fuel_efficiency_mpg']
X_val = df_val.drop('fuel_efficiency_mpg', axis=1)

y_test = df_test['fuel_efficiency_mpg']
X_test = df_test.drop('fuel_efficiency_mpg', axis=1)


In [131]:
# Handle missing values (fill with 0)
X_train_processed = X_train.copy()
X_val_processed = X_val.copy()

X_train_processed['horsepower'] = X_train_processed['horsepower'].fillna(0)
X_val_processed['horsepower'] = X_val_processed['horsepower'].fillna(0)

In [132]:
# Train Linear Regression model
model = LinearRegression()
model.fit(X_train_processed, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [133]:
# Evaluate model
y_pred = model.predict(X_val_processed)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

In [134]:
# Store RMSE
rmse_scores_numpy.append(round(rmse, 2))

print("\nRMSE scores for each seed (NumPy method):", rmse_scores_numpy)


RMSE scores for each seed (NumPy method): [np.float64(0.51)]


In [135]:
std_dev_rmse = np.std(rmse_scores_numpy)
std_dev_rmse_rounded = round(std_dev_rmse, 3)

print(f"\nStandard deviation of RMSE scores across seeds: {std_dev_rmse_rounded}")


Standard deviation of RMSE scores across seeds: 0.0


## Question 6. Evaluation on test

In [136]:
# Calculate the number of rows for each set
n_rows = len(data2)
n_train = int(0.6 * n_rows)
n_val = int(0.2 * n_rows)
n_test = n_rows - n_train - n_val

In [137]:
# Calculate the number of rows for each set
n_rows = len(data2)
n_train = int(0.6 * n_rows)
n_val = int(0.2 * n_rows)
n_test = n_rows - n_train - n_val

In [138]:
# Split the data using shuffled indices
df_train = data2.iloc[idx[:n_train]]
df_val = data2.iloc[idx[n_train : n_train + n_val]]
df_test = data2.iloc[idx[n_train + n_val :]]

In [139]:
print(f"Training set size: {len(df_train)}")
print(f"Validation set size: {len(df_val)}")
print(f"Test set size: {len(df_test)}")

Training set size: 5822
Validation set size: 1940
Test set size: 1942


In [140]:
X_train_val = pd.concat([X_train, X_val], axis=0)
y_train_val = pd.concat([y_train, y_val], axis=0)

print("Shape of combined training and validation features:", X_train_val.shape)
print("Shape of combined training and validation target:", y_train_val.shape)

Shape of combined training and validation features: (7764, 4)
Shape of combined training and validation target: (7764,)


In [141]:
X_train_val['horsepower'].fillna(0, inplace=True)
X_test['horsepower'].fillna(0, inplace=True)

print("Missing values in X_train_val after imputation:")
print(X_train_val.isnull().sum())
print("\nMissing values in X_test after imputation:")
print(X_test.isnull().sum())

Missing values in X_train_val after imputation:
engine_displacement    0
horsepower             0
vehicle_weight         0
model_year             0
dtype: int64

Missing values in X_test after imputation:
engine_displacement    0
horsepower             0
vehicle_weight         0
model_year             0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_val['horsepower'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['horsepower'].fillna(0, inplace=True)


In [142]:
from sklearn.linear_model import Ridge

model_ridge = Ridge(alpha=0.001, random_state=42)
model_ridge.fit(X_train_val, y_train_val)

0,1,2
,alpha,0.001
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [143]:
y_pred_test = model_ridge.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)
rmse_test_rounded = round(rmse_test, 3)

print(f"RMSE on the test set with Ridge (r=0.001) and 0 imputation: {rmse_test_rounded}")

RMSE on the test set with Ridge (r=0.001) and 0 imputation: 0.516


In [144]:
print(f"The RMSE on the test data for r=0.001 with zeros is: {rmse_test_rounded}")

The RMSE on the test data for r=0.001 with zeros is: 0.516
