In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Read Data

In [3]:
from helpers.utils import read_data

In [4]:
# Load the training set; read_data returns a pair that is unpacked into the
# design matrix X and the target vector y.
train_path = "./data/train.csv"
X, y = read_data(train_path)
# Display both shapes to confirm X and y have the same number of rows.
X.shape, y.shape

((150, 13), (150,))

## 2. Cross Validation

In [5]:
import helpers.config as config

from helpers.cross_validation import CrossValidator
from helpers.utils import rmse_scoring
from sklearn.linear_model import Ridge

In [6]:
# Ridge is a regression estimator; the variable name `classifier` is kept because
# the cross-validation cell refers to it. No intercept is fitted, and the tight
# tolerance is passed straight through to the solver.
classifier = Ridge(fit_intercept=False, tol=1e-10)
# Candidate regularisation strengths come from the project config module.
param_grid = dict(alpha=config.config_lambda)
# A single scorer, registered under the key that is later used for refit.
scoring = dict(rmse_score=rmse_scoring())

In [7]:
# Build the project's cross-validation wrapper around the estimator and grid,
# refitting on the "rmse_score" metric; config.config_K is forwarded as-is.
cv = CrossValidator(
    X,
    y,
    classifier,
    param_grid,
    config.config_K,
    scoring,
    refit="rmse_score",
)
# Run the search; fit() returns the results table shown in the next cell.
df_cv = cv.fit()

In [8]:
# Show the cross-validation results table returned by cv.fit() (one row per alpha).
df_cv

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_rmse_score,split1_test_rmse_score,split2_test_rmse_score,split3_test_rmse_score,split4_test_rmse_score,split5_test_rmse_score,split6_test_rmse_score,split7_test_rmse_score,split8_test_rmse_score,split9_test_rmse_score,mean_test_rmse_score,std_test_rmse_score,rank_test_rmse_score
0,0.0,0.0,0.001563,0.004688,0.1,{'alpha': 0.1},-7.441234,-5.128266,-7.707647,-4.540061,-4.075316,-5.109754,-6.551364,-6.04022,-4.887598,-3.554922,-5.503638,1.320576,3
1,0.0,0.0,0.0,0.0,1.0,{'alpha': 1.0},-7.477933,-4.883931,-7.702792,-4.500595,-4.072622,-5.151921,-6.561512,-6.09837,-4.881777,-3.472549,-5.4804,1.351858,2
2,0.001563,0.004688,0.0,0.0,10.0,{'alpha': 10.0},-7.581469,-4.452825,-7.727744,-4.339892,-4.194254,-5.394463,-6.552751,-6.36717,-4.705849,-3.382438,-5.469886,1.42898,1
3,0.0,0.0,0.0,0.0,100.0,{'alpha': 100.0},-8.196459,-3.552561,-7.779944,-4.946785,-4.894264,-7.119826,-7.13545,-7.592244,-4.302997,-3.798782,-5.931931,1.705047,4
4,0.0,0.0,0.0,0.0,200.0,{'alpha': 200.0},-8.507482,-3.603993,-7.889683,-5.240547,-5.242722,-7.8788,-7.478659,-7.993559,-4.45026,-4.147759,-6.243347,1.779898,5


In [9]:
# Inspect the best estimator stored on cv.clf after fitting.
cv.clf.best_estimator_

Ridge(alpha=10.0, fit_intercept=False, tol=1e-10)

In [10]:
# Mean RMSE for each alpha candidate, in the same row order as df_cv.
# NOTE(review): these values are positive while mean_test_rmse_score in df_cv is
# negative, so get_rmse() presumably flips the sign — confirm in helpers.
rmse_vals = cv.get_rmse()
rmse_vals

0    5.503638
1    5.480400
2    5.469886
3    5.931931
4    6.243347
Name: mean_test_rmse_score, dtype: float64

## 3. Sanity Check & Save Results

In [11]:
# Keep a handle on the best estimator for the manual sanity checks that follow.
model = cv.clf.best_estimator_

In [12]:
# Pull the fitted weight vector and intercept off the best estimator in one step,
# then display them together.
coeff, intercept = model.coef_, model.intercept_
(coeff, intercept)

(array([-0.04456046,  0.04496583,  0.0123375 ,  1.81172864, -0.067546  ,
         4.74036294,  0.02211344, -0.6632584 ,  0.36525954, -0.01283116,
        -0.22554368,  0.02190674, -0.65713917]),
 0.0)

In [13]:
# Reproduce the model's predictions by hand. The intercept is added explicitly so
# this check stays valid even if the estimator is ever fitted with
# fit_intercept=True; for the current no-intercept model it is 0.0, so the
# numbers are unchanged.
y_pred = X @ coeff + intercept
assert np.allclose(y_pred, model.predict(X)), (
    "manual prediction X @ coeff + intercept disagrees with model.predict(X)"
)
# RMSE over the full training set: sqrt(||y_pred - y||^2 / n).
rmse_manual = np.sqrt(np.linalg.norm(y_pred - y) ** 2 / y.shape[0])
rmse_manual

5.064693241230038

In [14]:
# Closed-form ridge solution without an intercept:
#     w = (X^T X + lambda * I)^{-1} X^T y
# Read lambda from the selected estimator instead of hard-coding it, so this
# check keeps tracking whichever alpha cross-validation picked.
best_lam = model.alpha
coeff_manual = np.linalg.pinv(X.T @ X + best_lam * np.eye(X.shape[1])) @ (X.T @ y)
assert np.allclose(coeff_manual, coeff), (
    f"closed-form ridge solution (alpha={best_lam}) disagrees with model.coef_"
)

In [15]:
from helpers.utils import save_data

In [16]:
# Write the per-alpha mean RMSE values to the submission file as a plain array.
out_path = "./outputs/submission.csv"
save_data(out_path, rmse_vals.to_numpy())