In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame
import math

%matplotlib inline

In [116]:
# Load the car fuel-efficiency dataset; the relative path is resolved
# against the current working directory.
df = pd.read_csv("car_fuel_efficiency.csv")

In [117]:
# Columns fed to the model as input features.
feature_columns = [
    "engine_displacement",
    "horsepower",
    "vehicle_weight",
    "model_year",
]

# Column the model is trained to predict.
target_col = "fuel_efficiency_mpg"

In [118]:
def prepare_X(df: DataFrame, base_features, fill_na_with=0):
    """Build the feature matrix for the model.

    Selects the ``base_features`` columns of ``df``, replaces missing values
    with ``fill_na_with`` and returns the result as a NumPy array. The input
    frame itself is not modified.
    """
    return df[base_features].copy().fillna(fill_na_with).values

In [119]:
def split_dataset(df: DataFrame, seed=42):
    """Shuffle ``df`` and split it 60%/20%/20% into train/validation/test.

    The shuffle is driven by a private ``np.random.RandomState(seed)``. It
    yields the same permutation as ``np.random.seed(seed)`` followed by
    ``np.random.shuffle``, but leaves NumPy's global random state untouched,
    so calling this function does not silently reseed unrelated code.

    Parameters
    ----------
    df : DataFrame
        Dataset to split; it is not modified.
    seed : int, default 42
        Seed for the shuffle; the same seed always gives the same split.

    Returns
    -------
    tuple of DataFrame
        ``(df_train, df_val, df_test)`` as independent copies of the shuffled
        rows. Validation and test each get ``int(0.2 * n)`` rows and the
        training set receives the remainder.
    """
    n = len(df)
    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    # Local generator: same stream as the legacy global seed, no side effect.
    rng = np.random.RandomState(seed)
    idx = np.arange(n)
    rng.shuffle(idx)
    df_shuffled = df.iloc[idx]

    # Each part is copied so callers can mutate one without touching the others.
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
    df_test = df_shuffled.iloc[n_train + n_val:].copy()

    return df_train, df_val, df_test

In [120]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit a linear regression with ridge-style regularization.

    A column of ones is prepended to ``X`` for the bias term, ``r`` is added
    to the diagonal of the Gram matrix, and the weights are obtained from the
    normal equation using an explicit matrix inverse.

    Returns a pair ``(w0, w)``: the bias term and the array of feature weights.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram = np.dot(design.T, design)
    gram_reg = gram + r * np.eye(gram.shape[0])

    gram_inv = np.linalg.inv(gram_reg)
    coefficients = np.dot(np.dot(gram_inv, design.T), y)

    return coefficients[0], coefficients[1:]

In [121]:
# Row counts for a 60/20/20 split of the full dataset.
# NOTE(review): none of the visible cells read these names (split_dataset
# derives its own sizes) — confirm whether this cell is still needed.
n = len(df)

n_val = n_test = int(n * 0.2)
n_train = n - n_val - n_test

### Question 5

* We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
* Try different seed values: `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]`.
* For each seed, do the train/validation/test split with 60%/20%/20% distribution.
* Fill the missing values with 0 and train a model without regularization.
* For each seed, evaluate the model on the validation dataset and collect the RMSE scores. 
* What's the standard deviation of all the scores? To compute the standard deviation, use `np.std`.
* Round the result to 3 decimal digits (`round(std, 3)`)

What's the value of std?

- 0.001
- 0.006
- 0.060
- 0.600

> Note: Standard deviation shows how different the values are.
> - If it's low, then all values are approximately the same.
> - If it's high, the values are different.
> - If standard deviation of scores is low, then our model is *stable*.

In [122]:
# Seeds used to repeat the train/validation/test split.
seed_values = list(range(10))

In [123]:
# Collect the validation RMSE obtained with each shuffling seed.
rmse_values = []

for seed in seed_values:
    df_train, df_validation, df_test = split_dataset(df, seed)

    # Missing values are filled with 0; r=0 turns regularization off.
    X_train = prepare_X(df_train, feature_columns, fill_na_with=0)
    y_train = df_train[target_col].values
    w0, weights = train_linear_regression_reg(X_train, y_train, r=0)

    # Validation data goes through the same preparation as the training data.
    X_validation = prepare_X(df_validation, feature_columns, fill_na_with=0)
    y_validation = df_validation[target_col].values

    y_prediction = w0 + X_validation.dot(weights)

    rmse = np.sqrt(np.mean((y_validation - y_prediction) ** 2))
    rmse_values.append(rmse)

    print(f"seed: {seed}, rmse: {rmse}")

seed: 0, rmse: 0.5206531296294218
seed: 1, rmse: 0.521338891285577
seed: 2, rmse: 0.5228069974803171
seed: 3, rmse: 0.515951674119676
seed: 4, rmse: 0.5109129460053851
seed: 5, rmse: 0.52834064601107
seed: 6, rmse: 0.5313910658146311
seed: 7, rmse: 0.5090670387381733
seed: 8, rmse: 0.5147399129511132
seed: 9, rmse: 0.5131865908224594


In [124]:
# Spread of the validation RMSE across seeds, shown rounded and at full precision.
rmse_std = np.std(rmse_values)
print(f"The standard deviation is: {round(rmse_std, 3)} ({rmse_std})")

The standard deviation is: 0.007 (0.006989446426433706)


### Question 6

* Split the dataset like previously, use seed 9.
* Combine train and validation datasets.
* Fill the missing values with 0 and train a model with `r=0.001`. 
* What's the RMSE on the test dataset?

Options:

- 0.15
- 0.515
- 5.15
- 51.5

In [125]:
# Split with seed 9; this rebinds df_train / df_validation / df_test,
# which the seed loop in an earlier cell also assigns.
df_train, df_validation, df_test = split_dataset(df, seed=9)

In [126]:
# Stack the training rows followed by the validation rows into one frame.
df_full_train = pd.concat([df_train, df_validation])
# Feature matrix for the combined rows, with missing values replaced by 0.
X_full_train = prepare_X(df_full_train, feature_columns, 0)

In [127]:
# Target values for each part of the split.
y_train = df_train[target_col].values
y_validation = df_validation[target_col].values

# Train targets first, then validation — the same order used to build df_full_train.
y_full_train = np.concatenate([y_train, y_validation])

In [128]:
# Fit on the combined train + validation data with regularization r=0.001.
w0, weights = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

In [129]:
# Prepare the test rows the same way as the training data (missing values -> 0).
X_test = prepare_X(df_test, feature_columns, 0)
y_test = df_test[target_col].values

# Linear model prediction: bias plus the weighted sum of the features.
y_prediction = w0 + X_test.dot(weights)

# Root mean squared error between the true and predicted test targets.
rmse = np.sqrt(np.mean((y_test - y_prediction) ** 2))

In [130]:
# Bare expression so the notebook displays the RMSE value as the cell output.
rmse

np.float64(0.5156261299169602)