In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the car fuel-efficiency CSV and reduce it to the columns used for modelling.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
df_raw = pd.read_csv(url)

# Feature columns and the column being predicted.
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
target = 'fuel_efficiency_mpg'

# Slim working frame (features + target) built from the raw frame, with an
# extra column holding horsepower where missing values are replaced by 0.
df = df_raw.loc[:, base + [target]].assign(
    horsepower_adj_zero=lambda frame: frame['horsepower'].fillna(0)
)

# Preview the first rows.
df.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg,horsepower_adj_zero
0,170,159.0,3413.433759,2003,13.231729,159.0
1,130,97.0,3149.664934,2007,13.688217,97.0
2,170,78.0,3079.038997,2018,14.246341,78.0
3,220,,2542.392402,2009,16.912736,0.0
4,210,140.0,3460.87099,2009,12.488369,140.0


In [25]:
# set up functions

# zeroes instead of nulls
def prepare_x(df):
    """Build the feature matrix for the model from ``df``.

    Selects the columns listed in the module-level ``base`` list, replaces
    every missing value with 0, and returns the result as a NumPy array.
    The input frame is not modified.
    """
    features = df[base]
    return features.fillna(0).values

def train_linear_regression(x, y):
    """Fit ordinary least-squares linear regression with an intercept.

    Parameters
    ----------
    x : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target values, one per row of ``x``.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the intercept and ``w`` holds one
        coefficient per feature column, so predictions are ``w0 + x.dot(w)``.

    Notes
    -----
    The system is solved with ``np.linalg.lstsq`` on the design matrix
    instead of ``np.linalg.solve`` on the normal equations ``X^T X w = X^T y``.
    Forming ``X^T X`` squares the condition number and fails outright when
    features are collinear; ``lstsq`` stays stable and returns the
    minimum-norm solution in the rank-deficient case.
    """
    x = np.asarray(x)
    # Prepend a column of ones so the first weight acts as the intercept.
    ones = np.ones(x.shape[0])
    X = np.column_stack([ones, x])
    w, *_ = np.linalg.lstsq(X, y, rcond=None)
    return w[0], w[1:]

def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same length as ``y``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y - y_pred) ** 2))``.

    Notes
    -----
    The result is returned at full precision. Rounding to 50 decimals, as an
    earlier version did, is far beyond what a float64 can represent and has
    no useful effect; round at display time instead.
    """
    # Coerce to arrays so lists work and pandas objects are compared by
    # position rather than aligned on their index.
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y - y_pred) ** 2))

In [29]:
# Measure how much the validation RMSE varies with the random split seed.

seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_scores = []

# Split sizes depend only on the number of rows, so compute them once:
# 20% validation, 20% test, and the remainder for training.
n = len(df)
n_val   = int(n * 0.2)
n_test  = int(n * 0.2)
n_train = n - n_val - n_test

for seed in seeds:
    # Shuffle the row positions reproducibly for this seed.
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)

    df_train = df.iloc[idx[:n_train]].copy()
    df_val   = df.iloc[idx[n_train:n_train + n_val]].copy()
    # Held-out test rows; not used for scoring in this cell.
    df_test  = df.iloc[idx[n_train + n_val:]].copy()

    # The target is used on its original scale (no log transform).
    y_train = df_train[target].values
    y_val   = df_val[target].values

    # Feature matrices for the two splits.
    x_train = prepare_x(df_train)
    x_val = prepare_x(df_val)

    # Fit on the training split and predict the validation split.
    w0, w = train_linear_regression(x_train, y_train)
    y_pred_val = w0 + x_val.dot(w)

    # Record the validation score for this seed.
    score = rmse(y_val, y_pred_val)
    rmse_scores.append(score)
    print(f"Seed = {seed} | Validation RMSE: {score}")

Seed = 0 | Validation RMSE: 0.5206531296297993
Seed = 1 | Validation RMSE: 0.5213388912856962
Seed = 2 | Validation RMSE: 0.5228069974801947
Seed = 3 | Validation RMSE: 0.5159516741173191
Seed = 4 | Validation RMSE: 0.5109129460066764
Seed = 5 | Validation RMSE: 0.5283406460166361
Seed = 6 | Validation RMSE: 0.531391065817111
Seed = 7 | Validation RMSE: 0.5090670387374334
Seed = 8 | Validation RMSE: 0.5147399129351771
Seed = 9 | Validation RMSE: 0.5131865908330882


In [27]:
# Spread of the validation RMSE across the seeds (standard deviation), shown to 3 decimals.
round(np.asarray(rmse_scores).std(), 3)

np.float64(0.007)