In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load only the four feature columns used below plus the target column.
df = pd.read_csv(
    'car_fuel_efficiency.csv',
    usecols=[
        'engine_displacement',
        'horsepower',
        'vehicle_weight',
        'model_year',
        'fuel_efficiency_mpg',
    ],
)
df

In [None]:
# Boolean Series indexed by column name: True where the column has any NaN.
missing_values = df.isna().any()

In [None]:
# Median (50th percentile) of the horsepower column; NaNs are skipped by pandas.
df.loc[:, 'horsepower'].median()

In [None]:
# Hold out 20% of the rows for validation and 20% for testing (both truncated
# to whole rows); everything that remains is used for training.
n = len(df)
n_val = n_test = int(n * 0.2)
n_train = n - n_val - n_test
n_train, n_val, n_test

In [None]:
# Build one reproducible permutation of the row positions (seed 42) and reorder
# the frame with it. The seed is set immediately before the shuffle so that
# re-running this cell always yields the same `idx`.
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)

# Rows of `df` in shuffled order; the original index labels are kept.
df_shuffled = df.iloc[idx]
df_shuffled

In [None]:
# `df_shuffled` is already in permuted order, so the three splits are plain
# contiguous slices of it. Indexing it with `idx[...]` again would apply the
# permutation a second time and produce a different split than the seed implies.
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
df_test = df_shuffled.iloc[n_train + n_val:].copy()

In [None]:
# Targets for each split as NumPy arrays, transformed with log(1 + x).
# NOTE(review): the seed experiments and the final test evaluation further down
# train on the untransformed target -- confirm which one is intended.
y_train, y_val, y_test = (
    np.log1p(split['fuel_efficiency_mpg'].values)
    for split in (df_train, df_val, df_test)
)

In [None]:
# Remove the target from the feature frames so it cannot be used as an input.
# `errors='ignore'` keeps the cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError (which `del` would raise).
df_train = df_train.drop(columns=['fuel_efficiency_mpg'], errors='ignore')
df_val = df_val.drop(columns=['fuel_efficiency_mpg'], errors='ignore')
df_test = df_test.drop(columns=['fuel_efficiency_mpg'], errors='ignore')

In [None]:
def train_linear_regression(X_train, y_train):
    """Fit linear regression (no regularization) with the normal equation.

    A column of ones is prepended to ``X_train`` so the first element of the
    returned weight vector is the bias term and the remaining elements are the
    per-feature coefficients.

    Raises ``numpy.linalg.LinAlgError`` if the Gram matrix is singular.
    """
    design = np.column_stack([np.ones(X_train.shape[0]), X_train])

    # w = (X^T X)^-1 X^T y
    gram = design.T @ design
    return (np.linalg.inv(gram) @ design.T) @ y_train

In [None]:
def predict(X, w):
    """Return predictions for ``X`` given weights ``w`` (bias first).

    A column of ones is prepended to ``X`` so that ``w[0]`` acts as the
    intercept and ``w[1:]`` multiply the feature columns.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])
    return design @ w

In [None]:
def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``."""
    residual = y_true - y_pred
    mean_squared_error = np.mean(residual ** 2)
    return np.sqrt(mean_squared_error)

In [None]:
# Baseline imputation: replace every missing value with 0, fit on the training
# split, and score on the validation split.
df_train_fill_0 = df_train.fillna(value=0)
df_val_fill_0 = df_val.fillna(value=0)

w_0 = train_linear_regression(df_train_fill_0.values, y_train)
y_pred_0 = predict(df_val_fill_0.values, w_0)
rmse_fill_0 = rmse(y_val, y_pred_0)

# Validation RMSE rounded to two decimals, shown as the cell output.
with_zero = round(rmse_fill_0, 2)
with_zero

In [None]:
# First column flagged as containing missing values, and its mean computed on
# the training split only, so the validation rows do not influence the value
# used for imputation.
missing_col = missing_values[missing_values > 0].index[0]
mean_value = df_train[missing_col].mean()
print(f"Mean of '{missing_col}' in training set: {mean_value:.2f}")

# Fill each column with its OWN training-set mean. A bare `fillna(mean_value)`
# would write the mean of `missing_col` into every column that has gaps; when
# `missing_col` is the only column with NaNs the two are identical.
df_train_fill_mean = df_train.fillna(df_train.mean())
df_val_fill_mean = df_val.fillna(df_train.mean())

w_mean = train_linear_regression(df_train_fill_mean.values, y_train)
y_pred_mean = predict(df_val_fill_mean.values, w_mean)
rmse_fill_mean = rmse(y_val, y_pred_mean)

# No trailing semicolon: the rounded validation RMSE is the cell's output.
round(rmse_fill_mean, 2)

In [None]:
def train_linear_regression_reg(X_train, y_train, r=0.0):
    """Fit linear regression with the normal equation plus a ridge term.

    A column of ones is prepended to ``X_train`` (so the first returned weight
    is the bias), and ``r`` is added to every diagonal entry of the Gram
    matrix -- including the bias entry -- before it is inverted. With the
    default ``r=0.0`` no regularization is applied.
    """
    design = np.column_stack([np.ones(X_train.shape[0]), X_train])

    gram = design.T @ design
    # w = (X^T X + r*I)^-1 X^T y
    gram_regularized = gram + r * np.eye(gram.shape[0])
    return (np.linalg.inv(gram_regularized) @ design.T) @ y_train

In [None]:
# Zero-imputed feature frames used for the regularization sweep.
df_train_q4, df_val_q4 = df_train.fillna(value=0), df_val.fillna(value=0)



In [None]:
# Sweep the regularization strength and record the validation RMSE (rounded to
# two decimals) for each value, keyed by r.
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = {}

for r in r_values:
    # Fit on the zero-imputed training features with this r ...
    w = train_linear_regression_reg(df_train_q4.values, y_train, r=r)

    # ... and score on the zero-imputed validation features.
    y_pred = predict(df_val_q4.values, w)
    rmse_val = rmse(y_val, y_pred)

    rmse_scores[r] = round(rmse_val, 2)
    print(f"r = {r:6}: RMSE = {rmse_scores[r]}")


In [None]:
# Repeat the shuffle/split/fit/evaluate cycle for ten seeds to see how much the
# validation RMSE depends on the particular split.
seeds = list(range(10))
rmse_list = []

for seed in seeds:
    # Fresh permutation of the row positions for this seed.
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)
    df_shuffled_seed = df.iloc[idx].reset_index(drop=True)

    # Contiguous slices of the shuffled frame: training rows, then validation.
    df_train_seed = df_shuffled_seed.iloc[:n_train].copy()
    df_val_seed = df_shuffled_seed.iloc[n_train:n_train + n_val].copy()

    # Targets are taken as-is (no log1p here); features are zero-imputed.
    y_train_seed = df_train_seed['fuel_efficiency_mpg'].values
    y_val_seed = df_val_seed['fuel_efficiency_mpg'].values

    X_train_seed = df_train_seed.drop(columns='fuel_efficiency_mpg').fillna(0)
    X_val_seed = df_val_seed.drop(columns='fuel_efficiency_mpg').fillna(0)

    # Unregularized fit, scored on the validation slice.
    w = train_linear_regression(X_train_seed.values, y_train_seed)
    y_pred = predict(X_val_seed.values, w)
    rmse_val = rmse(y_val_seed, y_pred)

    rmse_list.append(rmse_val)
    print(f"Seed {seed}: RMSE = {rmse_val:.4f}")


In [None]:
# Population standard deviation (NumPy's default, ddof=0) of the per-seed
# validation RMSEs collected above.
std_rmse = np.std(rmse_list)
print(f"Standard deviation of RMSE: {round(std_rmse, 5)}")

In [None]:
# Rebuild the split with seed 9: permute the row positions, reorder the frame,
# and renumber the index from 0.
np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled_9 = df.iloc[idx].reset_index(drop=True)

# Contiguous slices: training rows, then validation rows, then the remainder
# as the test set.
val_end = n_train + n_val
df_train_9 = df_shuffled_9.iloc[:n_train].copy()
df_val_9 = df_shuffled_9.iloc[n_train:val_end].copy()
df_test_9 = df_shuffled_9.iloc[val_end:].copy()


In [None]:
# Stack the seed-9 training and validation rows into one frame with a fresh
# 0..N-1 index, for the final fit.
df_train_val = pd.concat([df_train_9, df_val_9], ignore_index=True)

In [None]:
# Targets (untransformed) for the combined train+validation frame and the
# seed-9 test frame.
y_train_val = df_train_val['fuel_efficiency_mpg'].values
y_test_9 = df_test_9['fuel_efficiency_mpg'].values

# Feature frames: drop the target column and replace missing values with 0.
X_train_val = df_train_val.drop(columns='fuel_efficiency_mpg').fillna(0)
X_test_9 = df_test_9.drop(columns='fuel_efficiency_mpg').fillna(0)


In [None]:
# Final model: fit on train+validation with r=0.001, then score once on the
# held-out seed-9 test rows.
w_final = train_linear_regression_reg(
    X_train_val.values,
    y_train_val,
    r=0.001,
)
y_pred_test = predict(X_test_9.values, w_final)
rmse_test = rmse(y_test_9, y_pred_test)
print(f"\nRMSE on test set (r=0.001): {round(rmse_test, 5)}")


