In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# import data
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
# make dataframe
df_raw = pd.read_csv(url)

# select columns
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target = 'fuel_efficiency_mpg'

#only keep specific columns
df = df_raw[base + [target]].copy()

#preview
df.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


In [6]:
## Previewing and formatting ##

#preview columns for null values
df.isnull().sum()
# median of horsepower 149
df['horsepower'].median()

# add horsepower adjusted fields
df['horsepower_adj_zero'] = df['horsepower'].fillna(0)
#do mean later, calculated using training dataset only
#df['horsepower_adj_mean'] = df['horsepower'].fillna(df['horsepower'].mean())

In [8]:
## Splitting data and subbing na for mean using train ##

# splitting the dataset into training, validate and test sets
n = len(df)
n_val = int(len(df) * 0.2)
n_test = int(len(df) * 0.2)
n_train = n - n_val - n_test

# make sure they are split to equal total recores
n_val, n_test, n_train, n_val + n_test + n_train, len(df)

#shuffle records by setting a randon sequence
np.random.seed(42) # makes the random reproducable
idx = np.arange(n)
np.random.shuffle(idx)

#select random rows based on random sequence
df_train = df.iloc[idx[:n_train]].copy()
df_val = df.iloc[idx[n_train:n_train + n_val]].copy()
df_test = df.iloc[idx[n_train + n_val:]].copy()

#mean using training dataset only
hp_mean = df_train['horsepower'].mean()

df_train.loc[:, 'horsepower_adj_mean'] = df_train['horsepower'].fillna(hp_mean)
df_val.loc  [:, 'horsepower_adj_mean'] = df_val['horsepower'].fillna(hp_mean)
df_test.loc [:,'horsepower_adj_mean'] = df_test['horsepower'].fillna(hp_mean)

#test split
len(df_train), len(df_val), len(df_test), len(df_train) + len(df_val) + len(df_test), len(df)

# reset index after records have been randomly allocated
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [9]:
# set target variable?
y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)

# remove target variable from training data
df_train.drop(columns = [target])
df_val.drop(columns = [target])
df_test.drop(columns = [target])

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,horsepower_adj_zero,horsepower_adj_mean
0,140,148.0,2704.826005,2017,148.0,148.0
1,210,136.0,3001.227490,2004,136.0,136.0
2,210,152.0,3747.467392,2021,152.0,152.0
3,190,128.0,2705.833636,2014,128.0,128.0
4,230,156.0,2505.939657,2005,156.0,156.0
...,...,...,...,...,...,...
1935,210,163.0,1972.029124,2011,163.0,163.0
1936,160,126.0,3011.588014,2009,126.0,126.0
1937,290,187.0,2440.508039,2019,187.0,187.0
1938,260,129.0,1865.404480,2019,129.0,129.0


In [10]:
## linear regression formula ##

# modify to test different horsepower columns
base = ['engine_displacement', 'horsepower_adj_zero', 'vehicle_weight', 'model_year']

# prepare features
# transforms dataframe into a numpy array for regression modeling
def prepare_x(df):
    df_num = df[base].copy() #only columns listed in base
    df_num = df_num.fillna(0) # replaces any missing data with 0
    x = df_num.values
    return x # returns the feature matrix

x_train = prepare_x(df_train)
x_val = prepare_x(df_val)
x_test = prepare_x(df_test)

# define linear regression training function
# includes ridge regression to adjust for outliers
def train_linear_regression(x, y, r=0.0):
    ones = np.ones(x.shape[0])
    X = np.column_stack([ones, x])  # add bias term

    # matrix to regularise (ridge)
    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    reg[0, 0] = 0 # don't regularise bias
    
    # w = np.linalg.solve(X.T.dot(X), X.T.dot(y))  # closed-form solution
    w = np.linalg.solve(XTX + reg, X.T.dot(y)) # includes reg
    return w[0], w[1:]  # intercept and weights


In [15]:
## training and rmse ##

#RMSE defined
#Quantifies how good or bad the model is
# average of variances
def rmse (y, y_pred):
    error = y - y_pred  # residuals
    se = error ** 2     # squared errors
    mse = se.mean()     # mean squared errors
    return round(np.sqrt(mse), 2) #root of mse

# train model with different r values
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]

#Score for each r value
for r in r_values:
    w0, w = train_linear_regression(x_train, y_train, r=r)

    #predictions
    y_pred_train = w0 + x_train.dot(w)
    y_pred_val = w0 + x_val.dot(w)
    y_pred_test = w0 + x_test.dot(w)

    #rmse scores
    rmse_train = rmse(y_train, y_pred_train)
    rmse_val   = rmse(y_val, y_pred_val)
    rmse_test  = rmse(y_test, y_pred_test)

    #print results (from CoPilot)
    print(f"r={r:<6} | Train RMSE: {rmse_train} | Val RMSE: {rmse_val} | Test RMSE: {rmse_test}")



r=0      | Train RMSE: 0.04 | Val RMSE: 0.04 | Test RMSE: 0.04
r=0.01   | Train RMSE: 0.04 | Val RMSE: 0.04 | Test RMSE: 0.04
r=0.1    | Train RMSE: 0.04 | Val RMSE: 0.04 | Test RMSE: 0.04
r=1      | Train RMSE: 0.04 | Val RMSE: 0.04 | Test RMSE: 0.04
r=5      | Train RMSE: 0.04 | Val RMSE: 0.04 | Test RMSE: 0.04
r=10     | Train RMSE: 0.04 | Val RMSE: 0.04 | Test RMSE: 0.04
r=100    | Train RMSE: 0.04 | Val RMSE: 0.04 | Test RMSE: 0.04
