## 10-fold Cross-validation with Ridge regression

In [1]:
# import libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import sklearn
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, train_test_split

In [2]:
# define constants

# number of cross-validation folds
folds = 10
# candidate regularisation strengths (passed to Ridge as `alpha`)
lamda = np.asarray([1e-2, 1e-1, 1e0, 1e1, 1e2])

In [3]:
# load training data

# read the training set from the csv file
data_frame = pd.read_csv('train.csv')

# separate the 13 feature columns from the target column `y`
feature_columns = ['x1','x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8','x9','x10','x11','x12','x13']
x_values = data_frame.loc[:, feature_columns]
y_values = data_frame.loc[:, ['y']]

# show the first two rows as a quick sanity check
print(data_frame.head(2))

   Id     y       x1   x2     x3   x4     x5     x6     x7      x8    x9  \
0   0  22.6  0.06724  0.0   3.24  0.0  0.460  6.333   17.2  5.2146   4.0   
1   1  50.0  9.23230  0.0  18.10  0.0  0.631  6.216  100.0  1.1691  24.0   

     x10   x11     x12   x13  
0  430.0  16.9  375.21  7.34  
1  666.0  20.2  366.15  9.53  


In [4]:
# 10-fold cross validation with ridge regression
#
# The rows are partitioned once into `folds` disjoint validation folds and the
# same partition is reused for every regularisation strength, so each sample is
# held out exactly once and the RMSE values are comparable across `lamda`.

# performance[i, j] = RMSE of Ridge(alpha=lamda[i]) on validation fold j
performance = np.zeros([len(lamda), folds])

# fixed random_state so that Restart & Run All reproduces the same scores
k_fold = KFold(n_splits=folds, shuffle=True, random_state=0)
fold_indices = list(k_fold.split(x_values))

for i in range(0, len(lamda)):
    for j, (train_idx, test_idx) in enumerate(fold_indices):
        X_train, X_test = x_values.iloc[train_idx], x_values.iloc[test_idx]
        y_train, y_test = y_values.iloc[train_idx], y_values.iloc[test_idx]

        # NOTE: `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2;
        # it is kept so the scores stay comparable with the recorded outputs.
        ridge_reg = Ridge(alpha = lamda[i], normalize = True)
        ridge_reg.fit(X_train, y_train)

        y_predict = ridge_reg.predict(X_test)
        # flatten both sides to plain 1-D arrays so the RMSE is a true scalar,
        # independent of how NumPy reductions dispatch on DataFrames
        residuals = np.asarray(y_predict).ravel() - np.asarray(y_test).ravel()
        rmse = np.sqrt(np.mean(residuals**2))

        performance[i,j] = rmse

In [10]:
# mean RMSE

# average over the fold axis: one mean RMSE per entry of `lamda`
mean_rmse = performance.mean(axis=1)

In [12]:
# save to file

# write the mean RMSE values to sample.csv
np.savetxt(fname="sample.csv", X=mean_rmse, delimiter=",")

### Refit the best model and predict on the test data

In [5]:
# select lamda such that the rmse is minimal
# NOTE: `lamda_optimal` is the *index* into `lamda` of the best value, not the value itself
lamda_optimal = performance.mean(axis=1).argmin()
print(lamda_optimal)

# refit on the complete training set with the selected regularisation strength
ridge_reg_model = Ridge(normalize=True, alpha=lamda[lamda_optimal])
ridge_reg_model.fit(x_values, y_values)

0


Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=True, random_state=None, solver='auto', tol=0.001)

In [6]:
# load test data

# NOTE: this rebinds `data_frame`, which held the training data until now
data_frame = pd.read_csv('test.csv')

# keep only the 13 feature columns used for fitting
test_feature_columns = ['x1','x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8','x9','x10','x11','x12','x13']
x_values_test = data_frame.loc[:, test_feature_columns]

In [7]:
# prediction for test data
# predict on the features loaded from test.csv (`x_values_test`),
# not on the training features (`x_values`)
y_predict_test = ridge_reg_model.predict(x_values_test)

In [8]:
# save predictions

# flatten to 1-D so the predictions can be assigned as a single column
predictions = np.asarray(y_predict_test).ravel()

# there must be exactly one prediction per test row, otherwise the Id/y
# pairing written below would be meaningless
if len(predictions) != len(data_frame):
    raise ValueError(
        "expected one prediction per test row, got %d predictions for %d rows"
        % (len(predictions), len(data_frame))
    )

# assign the predictions as a new 'y' column next to 'Id';
# DataFrame.append would return a new frame with the rows stacked underneath,
# leaving `results` itself without any 'y' values
results = data_frame[['Id']].copy()
results['y'] = predictions
# save to file
results.to_csv('results_ns.csv', index = False)