### Set up dependency

In [1]:
import pandas as pd
import numpy as np

### Install dataset

In [2]:
data_url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
!wget $data_url

--2025-10-08 10:10:16--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8002::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-10-08 10:10:16 (7.03 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



In [30]:
df = pd.read_csv('car_fuel_efficiency.csv')

In [4]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [31]:
# Check the data type for each columns
df.dtypes

engine_displacement      int64
num_cylinders          float64
horsepower             float64
vehicle_weight         float64
acceleration           float64
model_year               int64
origin                  object
fuel_type               object
drivetrain              object
num_doors              float64
fuel_efficiency_mpg    float64
dtype: object

In [10]:
# Check the all columns
df.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [6]:
df.fuel_efficiency_mpg.head()

0    13.231729
1    13.688217
2    14.246341
3    16.912736
4    12.488369
Name: fuel_efficiency_mpg, dtype: float64

In [67]:
base = ['engine_displacement','horsepower','vehicle_weight','model_year']
base

['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

In [13]:
# Check how many null values in dataset
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [14]:
# Check the median for variable 'horsepower' with missing value
median_hp = df.horsepower.mean()
median_hp


149.65729212983547

## Preparing and spliting the dataset

In [32]:
# Shuffle the dataset and use seed '42'
n = len(df)
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)
idx

array([ 483, 7506, 8795, ..., 5390,  860, 7270])

In [33]:
# Spliting the dataset

n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [34]:
n, n_val, n_test, n_train

(9704, 1940, 1940, 5824)

In [46]:
# Filling mean value for the missing value of horsepower
df_copy = df.copy()

In [47]:
df_copy.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [49]:
df_train = df_copy.iloc[idx[:n_train]]
df_val = df_copy.iloc[idx[n_train:n_train+n_val]]
df_test = df_copy.iloc[idx[n_train+n_val:]]

In [50]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [51]:
len(df_train), len(df_val), len(df_test)

(5824, 1940, 1940)

In [60]:
y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)

In [61]:
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [39]:
df_copy['horsepower'] = df_copy.horsepower.fillna(median_hp)

In [41]:
df_copy.horsepower.isnull().sum()

0

In [43]:
df_copy.horsepower.mean()

149.6572921298355

In [44]:
# Check the mean value if using 0
df.horsepower.fillna(0).mean()

138.73835531739488

In [45]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [63]:
def prepare_X(df,fill_value):
    df_num = df[base]
    df_num = df_num.fillna(fill_value)
    X = df_num.values
    return X

In [48]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [90]:
# Compare filling value
fill_values = [0, df_train.horsepower.mean()]
for i in fill_values:
    X_train = prepare_X(df_train, i)
    X_val = prepare_X(df_val, i)
    w0, w = train_linear_regression(X_train, y_train)

    y_pred = w0 + X_val.dot(w)

    score = rmse(y_val, y_pred)
    
    print('The original value of score: {}:{}'.format(i,score))

    print('The filling value is {} and the score is {}'.format(i, round(score,2)))
    print()

The original value of score: 0:0.03997925782435426
The filling value is 0 and the score is 0.04

The original value of score: 149.54476367006487:0.03732771701506444
The filling value is 149.54476367006487 and the score is 0.04



In [147]:
# Adding the grelulization
def train_linear_regression_reg(X, y,r):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    
    reg = r*np.eye(XTX.shape[0])
    XTX_reg = XTX.copy()
    XTX_reg[1:, 1:] += r * np.eye(XTX.shape[0] - 1)   # 不惩罚 w0

    
    XTX_inv = np.linalg.inv(XTX_reg)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [105]:
r = [0, 0.01, 0.1, 1, 5, 10, 100]
X_train = prepare_X(df_train, 0)
X_val = prepare_X(df_val, 0)


for i in r:
 
    w0, w = train_linear_regression_reg(X_train, y_train,i)

    y_pred = w0 + X_val.dot(w)

    score = rmse(y_val, y_pred)
    
    print('The original value of score: {}:{}'.format(i,score))

    print('The regulization value is {} and the score is {}'.format(i, round(score,2)))
    print()

The original value of score: 0:0.03860764644114833
The regulization value is 0 and the score is 0.04

The original value of score: 0.01:0.03862981662456327
The regulization value is 0.01 and the score is 0.04

The original value of score: 0.1:0.03923784881650093
The regulization value is 0.1 and the score is 0.04

The original value of score: 1:0.040122014153865404
The regulization value is 1 and the score is 0.04

The original value of score: 5:0.040278388047348686
The regulization value is 5 and the score is 0.04

The original value of score: 10:0.04029958138531715
The regulization value is 10 and the score is 0.04

The original value of score: 100:0.04031899409479242
The regulization value is 100 and the score is 0.04



In [104]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
scores = []

for s in seeds:
    idx = np.arange(n)
    np.random.seed(s)
    np.random.shuffle(idx)

    
    X_train = prepare_X(df_train, 0)
    X_val = prepare_X(df_val, 0)
    w0, w = train_linear_regression(X_train, y_train)

    y_pred = w0 + X_val.dot(w)

    score = rmse(y_val, y_pred)
    scores.append(score)

score_std = np.std(np.array(scores))

print(f'The standard deviation of different seed is {round(score_std,3)}')

The standard deviation of different seed is 0.001


In [134]:
# Combining train and validation dataset
np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)

df_train = df_copy.iloc[idx[:n_train]]
df_val = df_copy.iloc[idx[n_train:n_train+n_val]]
df_test = df_copy.iloc[idx[n_train+n_val:]]

y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']



In [135]:
df_full_train = pd.concat([df_train,df_val])

In [136]:
df_full_train.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors
9066,240,2.0,136.0,4050.51229,11.5,2006,Europe,Diesel,All-wheel drive,-1.0
3073,190,2.0,141.0,3195.866942,17.2,2017,Europe,Diesel,All-wheel drive,-1.0
2476,200,3.0,184.0,3006.16437,9.9,2017,USA,Diesel,Front-wheel drive,0.0
658,200,4.0,,2947.14398,18.1,2022,Europe,Diesel,All-wheel drive,-1.0
954,250,,132.0,3114.371978,11.7,2001,Asia,Gasoline,All-wheel drive,1.0


In [137]:
df_full_train = df_full_train.reset_index(drop=True)

In [138]:
X_full_train = prepare_X(df_full_train,0)

In [139]:
X_full_train

array([[ 240.        ,  136.        , 4050.51228957, 2006.        ],
       [ 190.        ,  141.        , 3195.86694158, 2017.        ],
       [ 200.        ,  184.        , 3006.16436968, 2017.        ],
       ...,
       [ 180.        ,  154.        , 3346.96567067, 2018.        ],
       [ 210.        ,  152.        , 2500.17568746, 2020.        ],
       [ 260.        ,  174.        , 2702.25730066, 2011.        ]])

In [140]:
y_full_train = np.concatenate([y_train,y_val])

In [149]:
w0, w = train_linear_regression_reg(X_full_train, y_full_train,r=0.001)


X_test = prepare_X(df_test,0)
y_pred = w0 + X_test.dot(w)
score = rmse(y_test, y_pred)
print(f'The RMSE on the test dataset: {score:.4f}')

The RMSE on the test dataset: 0.0392
