#### General guidance

This serves as a template which will guide you through the implementation of this task. It is advised
to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps.
This is the jupyter notebook version of the template. For the python file version, please refer to the file `template_solution.py`.

First, we import necessary libraries:

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
import sklearn as sk

# Add any additional imports here (however, the task is solvable without using 
# any additional imports)
# import ...

# Seeded NumPy random generator, so that any randomized step is reproducible.
# NOTE(review): `rng` is not referenced by any of the visible cells below.
rng = np.random.default_rng(1234567890)

 #### Loading data

In [2]:
# Read the training set; the "y" column holds the regression labels.
data = pd.read_csv("train.csv")
# Detach the labels as a NumPy vector, leaving only the feature columns in `data`.
y = data.pop("y").to_numpy()
# Preview the first feature rows as a quick sanity check.
print(data.head())

         x1   x2     x3   x4     x5     x6     x7      x8    x9    x10   x11  \
0   0.06724  0.0   3.24  0.0  0.460  6.333   17.2  5.2146   4.0  430.0  16.9   
1   9.23230  0.0  18.10  0.0  0.631  6.216  100.0  1.1691  24.0  666.0  20.2   
2   0.11425  0.0  13.89  1.0  0.550  6.373   92.4  3.3633   5.0  276.0  16.4   
3  24.80170  0.0  18.10  0.0  0.693  5.349   96.0  1.7028  24.0  666.0  20.2   
4   0.05646  0.0  12.83  0.0  0.437  6.232   53.7  5.0141   5.0  398.0  18.7   

      x12    x13  
0  375.21   7.34  
1  366.15   9.53  
2  393.74  10.50  
3  396.90  19.77  
4  386.40  12.34  


#### Calculating the average RMSE

In [3]:
def calculate_RMSE(w, X, y):
    """This function takes test data points (X and y), and computes the empirical RMSE of
    predicting y from X using a linear model with weights w.

    The function works for any number of samples n and features d; in this
    notebook it is called with n = 15 test points and d = 13 features.

    Parameters
    ----------
    w: array of floats: dim = (d,), parameters of the linear model
    X: matrix of floats, dim = (n, d), inputs with d features
    y: array of floats, dim = (n,), input labels

    Returns
    ----------
    RMSE: float: dim = 1, RMSE value, i.e. sqrt(sum((y - X @ w)^2) / n)

    Raises
    ----------
    AssertionError: if the shapes of w, X and y do not fit together or n == 0
    """
    w, X, y = np.asarray(w), np.asarray(X), np.asarray(y)

    # Validate that the shapes are mutually consistent instead of pinning them
    # to one particular fold size.
    assert X.ndim == 2, "X must be a matrix of shape (n, d)"
    n_samples, n_features = X.shape
    assert n_samples > 0, "at least one sample is required"
    assert w.shape == (n_features,), "w needs one weight per feature"
    assert y.shape == (n_samples,), "y needs one label per sample"

    residuals = y - (X @ w)
    # The dot product of the residual vector with itself is the sum of squared errors.
    sum_sq_err = residuals @ residuals
    RMSE = np.sqrt(sum_sq_err / n_samples)
    assert np.isscalar(RMSE)
    return RMSE


#### Testing Functions

In [4]:
# dummyw = np.zeros((13,1))

In [5]:
# print('dummyw shape: ', '\n', dummyw.shape)
# unit = np.identity(13)
# 
# unit = np.vstack((unit, np.zeros((2,13))))
# print('unit shape: ', '\n', unit.shape)
# ydummy = np.ones((15,1))
# print('ydummy shape: ', '\n', ydummy.shape)
# 
# rmse = calculate_RMSE(dummyw, unit, ydummy)
# print('rmse: ', rmse)


In [6]:
# def merge_Xpartition(split, i, n_folds):
#     """
#     Parameters
#     split : Array of matrices splitting the training data into n_folds blocks
#     i : i-th iteration of cross-validation
#     n_folds : int
# 
#     Returns
#     -------
#     n_folds-1 blocks merged (omitting the i-th)
#     """
#     assert i < n_folds
#     partition = np.empty((0,13))
#     
#     for j in range(n_folds):
#         if i != j:
#             partition = np.concatenate((partition, split[j]))
#             
#     return partition
#     


In [7]:
# def merge_Ypartition(split, i, n_folds):
#     """
#     Parameters
#     split : Array of matrices splitting the training data into n_folds blocks
#     i : i-th iteration of cross-validation
#     n_folds : int
# 
#     Returns
#     -------
#     n_folds-1 blocks merged (omitting the i-th)
#     """
#     assert i < n_folds
#     partition = np.empty(0)
#     
#     for j in range(n_folds):
#         if i != j:
#             partition = np.concatenate((partition, split[j]))
#             
#     return partition
#     
    

In [8]:
def merge_XYpartition(split, i, n_folds):
    """Build the training set of fold i by merging every block except the i-th.

    Each block is a matrix whose last column holds the labels and whose
    remaining columns hold the features. The number of rows and columns is
    taken from the blocks themselves, so no rows are dropped for large inputs.

    Parameters
    ----------
    split : sequence of matrices splitting the stacked data (features plus
        label column) into n_folds blocks
    i : int, index of the held-out block (i-th iteration of cross-validation)
    n_folds : int, number of blocks in split

    Returns
    -------
    xs : matrix of floats, features of the n_folds-1 merged blocks (omitting the i-th)
    ys : array of floats, labels belonging to the rows of xs

    Raises
    ------
    AssertionError: if i is not a valid block index
    """
    # A negative i would match no block and silently merge all of them.
    assert 0 <= i < n_folds

    n_cols = np.shape(split[i])[1]
    # The empty float block keeps the result well defined (and float typed)
    # even when there is no other block to merge.
    blocks = [np.empty((0, n_cols))]
    blocks.extend(split[j] for j in range(n_folds) if j != i)
    # A single concatenate avoids re-copying the growing array on every step.
    partition = np.concatenate(blocks)

    xs = partition[:, :-1]
    ys = partition[:, -1]
    return xs, ys


#### Fitting the regressor

In [9]:
def fit(X, y, lam):
    """
    This function receives training data points, then fits the ridge regression on this data
    with regularization hyperparameter lambda. The weights w of the fitted ridge regression
    are returned.

    The function works for any number of samples n and features d; in this
    notebook it is called with n = 135 training points and d = 13 features.

    Parameters
    ----------
    X: matrix of floats, dim = (n, d), inputs with d features
    y: array of floats, dim = (n,), input labels
    lam: float. lambda parameter, used in regularization term

    Returns
    ----------
    w: array of floats: dim = (d,), optimal parameters of ridge regression

    Raises
    ----------
    AssertionError: if X is not a matrix or y does not hold one label per row of X
    """
    assert X.ndim == 2
    n_samples, n_features = X.shape
    assert y.shape == (n_samples,)

    # No intercept is fitted, so the model is exactly y ~ X @ w.
    # All other Ridge options are left at their defaults; only the tolerance is tightened.
    ridge_reg = Ridge(alpha=lam, fit_intercept=False, tol=0.0000001)
    ridge_reg.fit(X, y)
    w = ridge_reg.coef_

    assert w.shape == (n_features,)
    return w

#### Setup for computation

In [10]:
"""
Setup for the cross-validation loop, implementing 10-fold CV. In every iteration
(for every train-test split), the RMSE for every lambda is calculated,
and then averaged over iterations.

Parameters
----------
X: matrix of floats, dim = (150, 13), inputs with 13 features
y: array of floats, dim = (150, ), input labels
lambdas: list of floats, len = 5, values of lambda for which ridge regression is fitted and RMSE estimated
n_folds: int, number of folds (pieces in which we split the dataset), parameter K in KFold CV

Compute
----------
avg_RMSE: array of floats: dim = (5,), average RMSE value for every lambda
"""

# SETUP
X = data.to_numpy()
lambdas = [0.1, 1, 10, 100, 200]
n_lambdas = len(lambdas)
n_folds = 10

# One row per fold, one column per lambda.
RMSE_mat = np.zeros((n_folds, n_lambdas))

# CONCATENATING X AND Y BEFORE PARTITIONING
# y becomes the last column of XY, so features and labels stay aligned when the
# rows are split into folds. column_stack takes the number of rows from the data
# itself and leaves y untouched. (Splitting X and y separately with
# np.array_split would have worked as well.)
XY = np.column_stack((X, y))
XYSplit = np.array_split(XY, n_folds)



#### Performing computation

In [11]:
# Computation
# For every fold i, block i of XYSplit is the held-out test set and the remaining
# blocks form the training set. For every lambda a ridge regression is fitted on
# the training set and its RMSE on the test set is stored in RMSE_mat[i, j].

# TRAIN FOR EACH FOLD
for i in range(n_folds):
    # Training data: all blocks except the i-th.
    Xi, Yi = merge_XYpartition(XYSplit, i, n_folds)
    # Test data: the i-th block; its last column holds the labels.
    TXYi = XYSplit[i]
    TXi, TYi = TXYi[:, :-1], TXYi[:, -1]

    # Sanity checks: train and test rows together cover the whole data set,
    # both parts have the same features, and there is one label per row.
    assert Xi.shape[0] + TXi.shape[0] == XY.shape[0]
    assert Xi.shape[1] == TXi.shape[1]
    assert Yi.shape == (Xi.shape[0],)
    assert TYi.shape == (TXi.shape[0],)

    for j, lam in enumerate(lambdas):
        w = fit(Xi, Yi, lam)
        # Compute RMSE on test data
        RMSE_mat[i, j] = calculate_RMSE(w, TXi, TYi)


# avg_RMSE: array of floats, one average RMSE value (over the folds) for every lambda
avg_RMSE = np.mean(RMSE_mat, axis=0)
print("avgRMSE: \n", avg_RMSE)
assert avg_RMSE.shape == (n_lambdas,)

# Save results in the required format
np.savetxt("./results.csv", avg_RMSE, fmt="%.12f")

avgRMSE: 
 [5.5036383  5.48040028 5.46988555 5.93193113 6.2433465 ]


In [12]:
# PREVIOUS VALUES 
# SCORE: 8900
# [ 92.00570599  73.48037377  88.43292605 139.51550811 141.70518924] 
# SCORE: 1900
# [23.75577114 18.97255093 22.83328332 36.0227493  36.58812253]