In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the car fuel-efficiency CSV straight from its remote URL into a DataFrame.
# NOTE: requires network access every time this cell is run.
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv")

In [None]:
# Columns retained for the analysis. The last entry, fuel_efficiency_mpg, is
# the regression target that is later split off from the feature frames.
cols_to_keep=[
    "engine_displacement",
    "horsepower",
    "vehicle_weight",
    "model_year",
    "fuel_efficiency_mpg"
]

In [None]:
# Independent copy restricted to the selected columns, so later changes
# cannot affect the raw `df`.
df_filtered = df[cols_to_keep].copy()

In [None]:
# Histogram of the target variable, to inspect the shape of its distribution.
df_filtered.fuel_efficiency_mpg.hist()

### Q1

In [None]:
# Number of missing values in each retained column.
df_filtered.isna().sum()

### Q2

In [None]:
# Summary statistics for horsepower (the 50% row is the median).
# Read from the filtered working frame, like the surrounding cells, rather
# than from the raw `df`.
df_filtered.horsepower.describe()

In [None]:
# 60/20/20 split sizes. Validation and test each take int(20%) of the rows;
# the training partition absorbs whatever remains after truncation.
n = len(df_filtered)
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

In [None]:
# Row positions 0..n-1; this array is what gets shuffled to randomise the split.
idx = np.arange(n)

In [None]:
# Seed the global NumPy RNG immediately before the in-place shuffle so the
# permutation of idx (and therefore the split) is reproducible.
# NOTE: re-running this cell without re-creating idx shuffles an already
# shuffled array and yields a different permutation.
np.random.seed(42)
np.random.shuffle(idx)

In [None]:
# Cut the shuffled positions into three consecutive, non-overlapping slices:
# the first n_train rows for training, the next n_val for validation, and
# everything after that for testing.
df_train = df_filtered.iloc[idx[:n_train]]
df_val = df_filtered.iloc[idx[n_train:n_train+n_val]]
df_test = df_filtered.iloc[idx[n_train+n_val:]]

In [None]:
# Sanity check: sizes of the train / validation / test partitions.
len(df_train), len(df_val), len(df_test)

In [None]:
# Drop the shuffled original row labels; each partition gets a fresh
# 0..len-1 index.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
# Pull the target out as NumPy arrays before it is removed from the
# feature frames.
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

In [None]:
# Remove the target column from the feature frames so it cannot be used as
# a model input.
# NOTE: this mutates the frames in place; running the cell a second time
# without re-running the split raises KeyError because the column is gone.
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

### Q3

In [None]:
def train_linear_regression(X, y):
    """Fit ordinary least squares with the normal equation.

    A column of ones is prepended to ``X`` to model the bias term, and the
    weights are obtained as ``(X^T X)^-1 X^T y`` using an explicit inverse.

    Parameters
    ----------
    X : 2-D array-like with one row per sample (must expose ``.shape``).
    y : 1-D array-like of targets, one per row of ``X``.

    Returns
    -------
    tuple
        ``(bias, weights)`` where ``bias`` is the intercept and ``weights``
        holds one coefficient per column of ``X``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` is singular and cannot be inverted.
    """
    # Design matrix: bias column first, then the features.
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram_inv = np.linalg.inv(design.T.dot(design))
    coef = gram_inv.dot(design.T).dot(y)

    bias, weights = coef[0], coef[1:]
    return bias, weights

In [None]:
# Baseline fit on the training features. Missing values are replaced with 0
# first: a single NaN in the feature matrix propagates through X^T X and its
# inverse, which would make every learned weight NaN.
w0, w = train_linear_regression(X=df_train.fillna(0).values, y=y_train)

In [None]:
# In-sample predictions. Missing feature values are replaced with 0 so that
# rows containing NaN do not produce NaN predictions.
y_pred = w0 + df_train.fillna(0).values.dot(w)

In [None]:
#from matplotlib import pyplot as plt
#plt.hist(y_pred, color='r', alpha=0.5)
#plt.hist(y_train, color='blue', alpha=0.5)

In [None]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``.

    Both arguments must support element-wise subtraction and the result must
    expose ``.mean()`` (e.g. NumPy arrays of equal length).
    """
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())

In [None]:
def prepare_X(df):
    """Return the frame's values as a NumPy array with missing entries set to 0.

    Every column is filled, not only a specific one; the input frame itself
    is left unmodified because ``fillna`` returns a new object.
    """
    return df.fillna(0).values

In [None]:
# Variant 1: impute missing values with 0, fit on the training partition and
# report the error on the validation partition.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

w0, w = train_linear_regression(X=X_train, y=y_train)
y_pred = X_val.dot(w) + w0

print("RMSE:", rmse(y_val, y_pred))

In [None]:
def prepare_X_mean(df, mean=None):
    """Return the frame's values with missing ``horsepower`` entries imputed.

    Parameters
    ----------
    df : pandas.DataFrame containing a ``horsepower`` column.
    mean : float, optional
        Value used to replace missing ``horsepower`` entries. Pass the mean
        computed on the training partition when preparing validation or test
        data, so that no statistic of held-out data enters preprocessing.
        When omitted, the mean of ``df['horsepower']`` itself is used.

    Returns
    -------
    numpy.ndarray
        The values of a copy of ``df``; the input frame is not modified.
        Only ``horsepower`` is imputed, other columns are passed through.
    """
    if mean is None:
        mean = df['horsepower'].mean()

    # Work on a copy so the caller's frame keeps its missing values.
    df_num = df.copy()
    df_num['horsepower'] = df_num['horsepower'].fillna(mean)
    return df_num.values

In [None]:
# evaluate with filling horsepower with mean
# The imputation value is the mean of the TRAINING horsepower column. The
# validation frame is filled with that same value before it is prepared, so
# no statistic of the validation data leaks into preprocessing.
X_train = prepare_X_mean(df_train)
w0, w = train_linear_regression(X=X_train, y=y_train)

X_val = prepare_X_mean(df_val.fillna({'horsepower': df_train['horsepower'].mean()}))
y_pred = w0 + X_val.dot(w)

print("RMSE:", rmse(y_val, y_pred))

### Q4

In [None]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit linear regression with a ridge term added to the normal equation.

    A column of ones is prepended to ``X`` for the bias, ``r`` is added to
    every diagonal entry of ``X^T X`` (including the bias entry), and the
    weights are computed as ``(X^T X + r I)^-1 X^T y``.

    Parameters
    ----------
    X : 2-D array-like with one row per sample (must expose ``.shape``).
    y : 1-D array-like of targets, one per row of ``X``.
    r : float, regularisation strength added to the diagonal (default 0.001).

    Returns
    -------
    tuple
        ``(bias, weights)`` where ``bias`` is the intercept and ``weights``
        holds one coefficient per column of ``X``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularised matrix is singular (possible when ``r`` is 0).
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T.dot(design)
    # Ridge penalty: shift the diagonal by r.
    gram = gram + r * np.eye(gram.shape[0])

    coef = np.linalg.inv(gram).dot(design.T).dot(y)

    bias, weights = coef[0], coef[1:]
    return bias, weights

In [None]:
# Sweep the regularisation strength and report the validation RMSE for each.
# The feature matrices do not depend on r, so they are built once outside
# the loop instead of on every iteration.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w0, w = train_linear_regression_reg(X=X_train, y=y_train, r=r)
    y_pred = w0 + X_val.dot(w)

    print("RMSE (r=%.2f): %.2f" % (r, rmse(y_val, y_pred)))

### Q5

In [None]:
# Repeat the split / fit / validate cycle for several shuffle seeds to see
# how sensitive the validation RMSE is to the particular partition.

# Partition sizes depend only on the dataset, not on the seed, so they are
# computed once instead of on every iteration.
n = len(df_filtered)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

scores = []
for seed in range(10):
    # Fresh, seed-specific permutation of the row positions.
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    # Consecutive slices of the permutation: train, then validation, then test.
    df_train = df_filtered.iloc[idx[:n_train]].reset_index(drop=True)
    df_val = df_filtered.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True)
    df_test = df_filtered.iloc[idx[n_train+n_val:]].reset_index(drop=True)

    y_train = df_train.fuel_efficiency_mpg.values
    y_val = df_val.fuel_efficiency_mpg.values
    y_test = df_test.fuel_efficiency_mpg.values

    # The target must not remain among the features.
    del df_train['fuel_efficiency_mpg']
    del df_val['fuel_efficiency_mpg']
    del df_test['fuel_efficiency_mpg']

    X_train = prepare_X(df_train)
    w0, w = train_linear_regression(X=X_train, y=y_train)

    X_val = prepare_X(df_val)
    y_pred = w0 + X_val.dot(w)

    # Compute the score once and reuse it for both the record and the log line.
    scores.append(rmse(y_val, y_pred))

    print("SEED:", seed, "RMSE:", scores[-1])

In [None]:
# Spread of the validation RMSE across seeds: population standard deviation
# (np.std defaults to ddof=0), rounded to 3 decimals.
round(np.std(scores), 3)

### Q6

In [None]:
# Final model: split with seed 9, train on train + validation combined with
# r=0.001, and score once on the held-out test partition.
# A single seed is used, so no loop is needed.
seed = 9

n = len(df_filtered)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

idx = np.arange(n)
np.random.seed(seed)
np.random.shuffle(idx)

# Consecutive slices of the permutation: train, then validation, then test.
df_train = df_filtered.iloc[idx[:n_train]].reset_index(drop=True)
df_val = df_filtered.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True)
df_test = df_filtered.iloc[idx[n_train+n_val:]].reset_index(drop=True)

y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

# The target must not remain among the features.
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

# Merge train and validation, keeping features and targets in the same order.
df_full_train = pd.concat([df_train, df_val], ignore_index=True)
y_full_train = np.concatenate([y_train, y_val])

X_train = prepare_X(df_full_train)
w0, w = train_linear_regression_reg(X=X_train, y=y_full_train, r=0.001)

X_test = prepare_X(df_test)
y_pred = w0 + X_test.dot(w)

# Compute the test score once; `scores` stays a one-element list as before.
scores = [rmse(y_test, y_pred)]

print("SEED:", seed, "RMSE:", scores[0])