In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# Load the car fuel-efficiency dataset and keep only the modelling columns
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
df_raw = pd.read_csv(url)

# Regression target and the candidate feature columns
target = 'fuel_efficiency_mpg'
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

# Working frame: the selected columns plus a horsepower variant whose
# missing values are replaced by 0 (df_raw itself is left untouched)
df = df_raw[base + [target]].assign(
    horsepower_adj_zero=lambda frame: frame['horsepower'].fillna(0)
)

# Preview the first rows
df.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg,horsepower_adj_zero
0,170,159.0,3413.433759,2003,13.231729,159.0
1,130,97.0,3149.664934,2007,13.688217,97.0
2,170,78.0,3079.038997,2018,14.246341,78.0
3,220,,2542.392402,2009,16.912736,0.0
4,210,140.0,3460.87099,2009,12.488369,140.0


In [10]:
## Splitting data and filling missing horsepower with the training mean ##

# Split sizes: 20% validation, 20% test, the remainder for training
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Sanity check: the three split sizes must account for every record.
# (A bare expression in the middle of a cell is discarded, so assert instead.)
assert n_val + n_test + n_train == n, "split sizes do not add up to len(df)"

# Shuffle the row positions; the fixed seed makes the split reproducible
np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)

# Slice the shuffled positions into the three splits
df_train = df.iloc[idx[:n_train]].copy()
df_val = df.iloc[idx[n_train:n_train + n_val]].copy()
df_test = df.iloc[idx[n_train + n_val:]].copy()

# Sanity check: every row landed in exactly one split of the expected size
assert (len(df_train), len(df_val), len(df_test)) == (n_train, n_val, n_test), \
    "split frames do not match the planned sizes"
assert len(df_train) + len(df_val) + len(df_test) == len(df), \
    "split frames do not cover the whole dataset"

# Mean horsepower computed from the training split only, so no information
# from the validation/test rows is used for the imputation
hp_mean = df_train['horsepower'].mean()

df_train['horsepower_adj_mean'] = df_train['horsepower'].fillna(hp_mean)
df_val['horsepower_adj_mean'] = df_val['horsepower'].fillna(hp_mean)
df_test['horsepower_adj_mean'] = df_test['horsepower'].fillna(hp_mean)

# Reset the index now that the records have been randomly allocated
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Combined train + validation frame (train rows first, then validation rows)
df_train_val = pd.concat([df_train, df_val], ignore_index=True)

In [11]:
# Target vectors for each split.
# A log1p transform was considered; per the original note the target already
# looks normally distributed, so the raw values are used.
y_train = df_train[target].values
y_val = df_val[target].values
y_test = df_test[target].values
y_train_val = np.concatenate([y_train, y_val])

# Remove the target from the feature frames.
# DataFrame.drop returns a NEW frame, so the result has to be assigned back;
# calling it without assignment leaves the target column in place.
# NOTE: this cell consumes the target column - re-run the split cell before
# re-running this one.
df_train = df_train.drop(columns=[target])
df_val = df_val.drop(columns=[target])
df_test = df_test.drop(columns=[target])
df_train_val = df_train_val.drop(columns=[target])

# Show the test features to confirm the target column is gone
df_test

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,horsepower_adj_zero,horsepower_adj_mean
0,240,,3629.912976,2003,0.0,149.696588
1,230,183.0,2675.510371,2011,183.0,183.000000
2,270,176.0,3751.005987,2017,176.0,176.000000
3,130,174.0,2703.817442,2017,174.0,174.000000
4,240,,2765.931806,2017,0.0,149.696588
...,...,...,...,...,...,...
1935,130,190.0,2929.265698,2018,190.0,190.000000
1936,270,127.0,3489.985764,2017,127.0,127.000000
1937,240,177.0,3348.840052,2010,177.0,177.000000
1938,250,150.0,2985.704630,2022,150.0,150.000000


In [12]:
## linear regression formula ##

# Feature columns fed to the model.
# Swap the horsepower entry to compare the different imputation variants.
base = [
    'engine_displacement',
    'horsepower_adj_zero',
    'vehicle_weight',
    'model_year',
]

# prepare features
def prepare_x(df, features=None):
    """Turn a dataframe into a numeric feature matrix for regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the requested feature columns.
    features : list of str, optional
        Columns to use, in order. Defaults to the module-level ``base`` list,
        so existing ``prepare_x(df)`` calls behave as before. Passing the
        list explicitly lets different feature sets be compared without
        editing the global.

    Returns
    -------
    numpy.ndarray
        Array with one row per record and one column per feature; any
        missing value is replaced by 0. The input frame is not modified.
    """
    columns = base if features is None else list(features)
    # fillna returns a new frame, so no explicit copy is needed
    return df[columns].fillna(0).values

# Build one feature matrix per split (same order as the frames listed below)
x_train, x_val, x_test, x_train_val = (
    prepare_x(split)
    for split in (df_train, df_val, df_test, df_train_val)
)

# Linear regression fitted with the closed-form normal equations.
# The optional ridge term r adds r to the diagonal of X^T X (except for the
# bias entry); it regularises the weights rather than handling outliers.
def train_linear_regression(x, y, r=0.0):
    """Fit a (ridge-regularised) linear model and return its parameters.

    Parameters
    ----------
    x : numpy.ndarray
        Feature matrix, one row per record.
    y : numpy.ndarray
        Target values, one per record.
    r : float, default 0.0
        Ridge strength. With r=0 this is ordinary least squares.

    Returns
    -------
    tuple
        ``(intercept, weights)`` - the bias term and the array of feature weights.
    """
    # Design matrix: a leading column of ones carries the bias term
    bias_column = np.ones(x.shape[0])
    design = np.column_stack([bias_column, x])

    gram = np.dot(design.T, design)

    # Diagonal penalty; entry [0, 0] is zeroed so the bias is not regularised
    penalty = r * np.eye(gram.shape[0])
    penalty[0, 0] = 0

    # Solve (X^T X + penalty) w = X^T y
    coefficients = np.linalg.solve(gram + penalty, np.dot(design.T, y))
    intercept, weights = coefficients[0], coefficients[1:]
    return intercept, weights

In [15]:
## training and rmse ##

# RMSE: root of the mean squared residual.
# Quantifies how far predictions are from the targets, in the target's units
# (lower is better).
def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Predicted values; must broadcast against ``y``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y - y_pred) ** 2))`` at full float precision. No rounding
        is applied: rounding to 50 decimals is far beyond float64 precision
        and can only add floating-point noise, never remove any.
    """
    error = np.asarray(y, dtype=float) - np.asarray(y_pred, dtype=float)  # residuals
    return np.sqrt(np.mean(error ** 2))

# Ridge strengths to evaluate
r_values = [0.001]

# Fit and score the model once per ridge strength
for r in r_values:
    # The model is always fitted on the training split only
    w0, w = train_linear_regression(x_train, y_train, r=r)

    # Apply the fitted intercept and weights to every feature matrix
    y_pred_train, y_pred_val, y_pred_test, y_pred_train_val = (
        w0 + features.dot(w)
        for features in (x_train, x_val, x_test, x_train_val)
    )

    # RMSE of those predictions against the matching targets
    rmse_train = rmse(y_train, y_pred_train)
    rmse_val = rmse(y_val, y_pred_val)
    rmse_test = rmse(y_test, y_pred_test)
    rmse_train_val = rmse(y_train_val, y_pred_train_val)

    # One summary line per r value
    print(f"r={r:<6} | Train RMSE: {rmse_train} | Val RMSE: {rmse_val} | Test RMSE: {rmse_test} | Train Val RMSE: {rmse_train_val}")

r=0.001  | Train RMSE: 0.5219709782195826 | Val RMSE: 0.5131865908334511 | Test RMSE: 0.5158284328943855 | Train Val RMSE: 0.5197899258682204
