In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the training set from disk; the first row of the CSV holds column names.
filename = 'train.csv'
data = pd.read_csv(filename, header=0)
# Work on the raw value matrix and slice it positionally:
# column 1 becomes the target Y, columns 2..11 become the feature matrix X.
# (Column 0 is skipped - presumably a row id; verify against train.csv.)
array = data.values
Y, X = array[:, 1], array[:, 2:12]

In [3]:
# Set up cross validation, grid search, model
# Candidate regularisation strengths; passed to Ridge as its `alpha` parameter.
lambda_values = np.array([0.1, 1, 10, 100, 1000])
# Optional: widen the search to the solver as well by swapping in this grid.
# solvers = ['svd', 'cholesky', 'sparse_cg', 'lsqr', 'sag', 'saga']
# param_grid = dict(alpha=lambda_values, solver=solvers)
param_grid = dict(alpha=lambda_values)

k_folds = 10
seed = 42
# sklearn scorers follow "greater is better", hence the *negated* MSE.
scoring = 'neg_mean_squared_error'

# max_iter is an iteration count and must be an integer: scikit-learn documents
# it as int, and recent releases reject a float such as 1e6 during parameter
# validation. No intercept is fitted (fit_intercept=False).
model = Ridge(fit_intercept=False, max_iter=1000000, tol=1e-5, random_state=seed)
# An integer `cv` requests that many folds; every alpha in param_grid is scored
# on each fold, using all available cores (n_jobs=-1).
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring,
                    n_jobs=-1, cv=k_folds, verbose=1, return_train_score=True)

In [4]:
# Fit the Ridge() grid search: every candidate lambda is cross-validated on (X, Y).
# Left as the cell's last expression so the fitted search object is displayed.
grid.fit(X=X, y=Y)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.2s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=False, max_iter=1000000.0,
   normalize=False, random_state=42, solver='auto', tol=1e-05),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'alpha': array([1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=1)

In [5]:
# Uncomment the next line to print the full cv_results_ dict for inspection
# print(grid.cv_results_)

In [6]:
# The grid search was scored with negated MSE, so 'mean_test_score' holds one
# negative mean MSE per candidate lambda. Flip the sign back and take the
# square root to obtain one RMSE-style score per lambda.
scores = np.sqrt(np.negative(grid.cv_results_['mean_test_score']))
# Wrap the scores in a single-column frame, ready to be written out.
result = pd.DataFrame(data=scores)

In [7]:
# Alternatively, instead of all of above, simply use CV-specialized model RidgeCV.
# cv=None together with store_cv_values=True makes RidgeCV keep its per-sample
# cross-validation values in `cv_values_`, one column per alpha. Because a
# `scoring` is given, those values are used below as held-out predictions
# (see the RidgeCV documentation for `cv_values_`).
scoring = 'neg_mean_squared_error'
alphas2 = (0.1, 1, 10, 100, 1000)
# `normalize=False` is intentionally not passed: it is the default, and the
# parameter no longer exists in newer scikit-learn releases.
# NOTE(review): newer releases also rename store_cv_values/cv_values_ to
# store_cv_results/cv_results_ - adjust if the environment is upgraded.
model2 = RidgeCV(alphas=alphas2, fit_intercept=False,
                 scoring=scoring, cv=None, store_cv_values=True)
model2.fit(X, Y)
# One RMSE per alpha. Iterate over the columns of cv_values_ rather than a
# hard-coded range(5), so the loop cannot drift out of sync with `alphas2`.
scores2 = [np.sqrt(mean_squared_error(Y, cv_predictions))
           for cv_predictions in model2.cv_values_.T]
result2 = pd.DataFrame(scores2)

In [8]:
# Last step, whichever approach produced `result`: dump the scores to disk.
# Neither a header row nor the index column is written - only the raw values.
filename = 'result.csv'
result.to_csv(path_or_buf=filename, index=False, header=False)