## Ridge regression Cross-validation

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn import preprocessing

In [2]:
dat = pd.read_csv("./data/train.csv", header = 0)
y = dat['y']
X = dat.iloc[:,2:7]

In [3]:
fm2 = np.cos(X)
fm3 = np.multiply(X, X)
fm4 = np.exp(X)
fm5 = pd.DataFrame(np.ones(X.shape[0]))

frames = [X, fm3, fm4, fm2, fm5]
dat = pd.concat(frames, axis = 1)

In [4]:
# look at dataframe
dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x1      700 non-null    float64
 1   x2      700 non-null    float64
 2   x3      700 non-null    float64
 3   x4      700 non-null    float64
 4   x5      700 non-null    float64
 5   x1      700 non-null    float64
 6   x2      700 non-null    float64
 7   x3      700 non-null    float64
 8   x4      700 non-null    float64
 9   x5      700 non-null    float64
 10  x1      700 non-null    float64
 11  x2      700 non-null    float64
 12  x3      700 non-null    float64
 13  x4      700 non-null    float64
 14  x5      700 non-null    float64
 15  x1      700 non-null    float64
 16  x2      700 non-null    float64
 17  x3      700 non-null    float64
 18  x4      700 non-null    float64
 19  x5      700 non-null    float64
 20  0       700 non-null    float64
dtypes: float64(21)
memory usage: 115.0 KB


In [5]:
# scale all the features
min_max_scaler = preprocessing.MinMaxScaler()
dat_scaled = min_max_scaler.fit_transform(dat)

In [8]:
# ridge regression with cross-validation
regularization_param = np.linspace(0.01, 100, 1000)

ridge_model = RidgeCV(alphas= regularization_param, fit_intercept= False, normalize= True, cv = 10).fit(dat_scaled, y)
score = ridge_model.score(dat, y)
coefs = ridge_model.coef_

print(coefs)

[ 0.11474567 -0.06678273  0.14563605  0.29974673  0.10464111 -0.14308345
 -0.15877945 -0.07922056 -0.23689053  0.01577394  0.09395678 -0.08811665
  0.14189713  0.2698375   0.12425499  0.18535497  0.20101083  0.12142954
  0.28011013  0.02590653  0.        ]


In [9]:
# save coefficients
np.savetxt("results.csv", coefs, delimiter=",")

### try repeated cross-validation ridge regression for improvement
* The idea is that the way the data set is split into folds is random. Repeating the cross-validation multiple times improves the accuracy of the model
* Observation: the coefficients appear to be identical -> no change?

In [None]:
# array of the alpha values for each repetition (10 repetition)
alpha = np.zeros(10)

for i in range(0, 10):
    # ridge regression with cross-validation
    regularization_param = np.linspace(0.01, 100, 1000)

    ridge_model = RidgeCV(alphas= regularization_param, fit_intercept= False, normalize= True, cv = 10).fit(dat_scaled, y)
    score = ridge_model.score(dat, y)
    coefs = ridge_model.coef_

    alpha[i] = ridge_model.alpha_


In [None]:
 # use the mean of all alpha values
alpha_opt = np.mean(alpha)

# train model with this alpha value
ridge_regression = Ridge(alpha = alpha_opt,tol=1e-9, normalize= False, fit_intercept = False, max_iter=1000000).fit(dat_scaled, y)

In [None]:
coefs = ridge_regression.coef_
print(coefs)

# save coefficients
np.savetxt("repeatedkfold_results.csv", coefs, delimiter=",")

### try splitting into training and test data before scaling
* The thought is that if we scale the training and test data together then there is potentially some type of information from the test data contained in the scaled training data and the reported performance is then too optimistic

In [27]:
# ridge regression with cross-validation
regularization_param = np.linspace(0.01, 100, 1000)

# mse of each fold and regularization parameter
mse = np.zeros((10, len(regularization_param)))

(10, 1000)


In [28]:
# split data into folds
kf = KFold(n_splits = 10)    # maybe also try shuffeling the data points (shuffel = True)
fold_indx = 0

# loop over the sets
for train_indx, test_indx in kf.split(dat):
    dat_train = dat.iloc[train_indx,:]
    dat_test = dat.iloc[test_indx, :]
    y_train = dat.iloc[train_indx, :]
    y_test = dat.iloc[test_indx, :]

    # scale all the features
    min_max_scaler = preprocessing.MinMaxScaler()

    dat_train_scaled = min_max_scaler.fit_transform(dat_train)
    dat_test_scaled = min_max_scaler.transform(dat_test)
    
    for i in range(0, len(regularization_param)):
        # fit ridge model for all ridge regression parameters
        ridge_model = Ridge(alpha = regularization_param[i],tol=1e-9, normalize= False, fit_intercept = False, max_iter=1000000).fit(dat_train_scaled, y_train)
        # compute prediction for test data and determine the MSE
        pred = ridge_model.predict(dat_test)
        MSE = mean_squared_error(y_test, pred)
        mse[fold_indx, i] = MSE
        
    fold_indx = fold_indx + 1

In [34]:
# which regularization parameter yields the smallest mse for each training/test split yields the smallest mse

alpha_minimal_mse = np.argmin(mse, axis = 1)
print(alpha_minimal_mse)

[0 0 0 0 0 0 0 0 0 0]
