In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge

In [2]:
def fit(X, y, lam):
    """Fit a ridge regression without an intercept and return its weights.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Design matrix.
    y : array-like of shape (n_samples,)
        Target values.
    lam : float
        Regularization strength, forwarded to ``Ridge`` as ``alpha``.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        The ``coef_`` attribute of the fitted model.

    Raises
    ------
    AssertionError
        If the fitted coefficients are not a 1-D vector with one entry per
        column of ``X``.
    """
    ridge_reg = Ridge(alpha=lam, fit_intercept=False)
    ridge_reg.fit(X, y)
    w = ridge_reg.coef_

    # Expect exactly one weight per feature column of X, whatever the
    # number of features is.
    assert w.shape == (np.shape(X)[1],)
    return w

In [3]:
def calculate_RMSE(w, X, y):
    """Return the root-mean-squared error of the linear prediction ``X @ w``.

    Parameters
    ----------
    w : numpy.ndarray
        Weight vector applied to the columns of ``X``.
    X : numpy.ndarray
        Feature matrix.
    y : numpy.ndarray
        Reference values the predictions are compared against.

    Returns
    -------
    scalar
        Square root of the mean squared difference between ``y`` and ``X @ w``.
    """
    residuals = y - X @ w
    mean_squared_error = np.mean(residuals ** 2)
    rmse = np.sqrt(mean_squared_error)

    # The error must collapse to a single number, not an array.
    assert np.isscalar(rmse)
    return rmse

In [4]:
def average_LR_RMSE(X, y, lambdas, n_folds):
    """Cross-validate ridge regression and return the mean RMSE per lambda.

    The rows of ``X`` are split with ``RepeatedKFold`` (a single repeat,
    fixed ``random_state``). For every split and every value in ``lambdas``
    a model is fitted with ``fit`` on the training rows and scored with
    ``calculate_RMSE`` on the held-out rows.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        Feature matrix; indexed by integer row-index arrays.
    y : numpy.ndarray of shape (n_samples,)
        Target values; indexed the same way as ``X``.
    lambdas : sequence of float
        Regularization strengths to evaluate.
    n_folds : int
        Number of cross-validation folds.

    Returns
    -------
    numpy.ndarray of shape (len(lambdas),)
        RMSE averaged over the folds, in the order of ``lambdas``.
    """
    # Rows are folds, columns are regularization strengths.
    RMSE_mat = np.zeros((n_folds, len(lambdas)))
    rkf = RepeatedKFold(n_splits=n_folds, n_repeats=1, random_state=12)

    for fold, (train_index, test_index) in enumerate(rkf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        for j, lambd in enumerate(lambdas):
            w = fit(X_train, y_train, lambd)
            RMSE_mat[fold, j] = calculate_RMSE(w, X_test, y_test)

    # Average over folds, leaving one value per lambda.
    avg_RMSE = np.mean(RMSE_mat, axis=0)

    # One averaged score per candidate lambda, however many were supplied.
    assert avg_RMSE.shape == (len(lambdas),)
    return avg_RMSE

In [5]:
if __name__ == "__main__":
    # Read the training table: column "y" is passed on as the target and
    # the remaining columns form the feature matrix.
    data = pd.read_csv("train.csv")
    y = data.loc[:, "y"].to_numpy()
    data = data.drop(columns=["y"])
    X = data.to_numpy()

    # Show the first few feature rows as a quick sanity check.
    print(data.head())

    # Regularization strengths to compare and the number of CV folds.
    n_folds = 10
    lambdas = [0.1, 1, 10, 100, 200]
    avg_RMSE = average_LR_RMSE(X, y, lambdas=lambdas, n_folds=n_folds)

         x1   x2     x3   x4     x5     x6     x7      x8    x9    x10   x11  \
0   0.06724  0.0   3.24  0.0  0.460  6.333   17.2  5.2146   4.0  430.0  16.9   
1   9.23230  0.0  18.10  0.0  0.631  6.216  100.0  1.1691  24.0  666.0  20.2   
2   0.11425  0.0  13.89  1.0  0.550  6.373   92.4  3.3633   5.0  276.0  16.4   
3  24.80170  0.0  18.10  0.0  0.693  5.349   96.0  1.7028  24.0  666.0  20.2   
4   0.05646  0.0  12.83  0.0  0.437  6.232   53.7  5.0141   5.0  398.0  18.7   

      x12    x13  
0  375.21   7.34  
1  366.15   9.53  
2  393.74  10.50  
3  396.90  19.77  
4  386.40  12.34  


In [6]:
# Save results in the required format: the averaged RMSE values are written
# to ./results.csv, each formatted with 12 decimal places.
np.savetxt("./results.csv", avg_RMSE, fmt="%.12f")

In [7]:
# Show the averaged RMSE values (one per entry of `lambdas`).
print(avg_RMSE)

[5.48923359 5.48090642 5.4559287  5.89876715 6.20359347]
