In [None]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [None]:
# Read the training set from CSV (first row holds the column names)
filename = 'train.csv'
data = pd.read_csv(filename, header=0)
# Work on the raw value matrix: column 1 is the target,
# columns 2..11 (end-exclusive slice 2:12) are the features
array = data.values
Y, X = array[:, 1], array[:, 2:12]

In [None]:
# Set up cross validation, grid search, model
# Candidate regularisation strengths, searched as Ridge's `alpha`
lambda_values = np.array([0.1, 1, 10, 100, 1000])
# solvers = ['svd', 'cholesky', 'sparse_cg', 'lsqr', 'sag', 'saga']
# param_grid = dict(alpha=lambda_values, solver=solvers)
param_grid = dict(alpha=lambda_values)

k_folds = 10   # number of cross-validation folds
seed = 42      # passed as random_state to the Ridge estimators
scoring = 'neg_mean_squared_error'  # negated MSE: larger (closer to 0) is better

# max_iter has to be an integer; the float literal 1e6 is rejected by
# scikit-learn versions that validate constructor parameters.
model = Ridge(fit_intercept=False, max_iter=1000000, tol=1e-5, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring,
                    n_jobs=-1, cv=k_folds, verbose=1, return_train_score=True)

In [None]:
# Fit the Ridge() grid search: every candidate lambda is cross-validated on (X, Y)
grid.fit(X, y=Y)

In [None]:
# Print all results for inspection
# print(grid.cv_results_)

In [None]:
# The scorer is *negated* MSE, so flip the sign of the mean test score
# of each lambda first, then take the square root to arrive at an RMSE.
mean_mse = -grid.cv_results_['mean_test_score']
scores = np.sqrt(mean_mse)
result = pd.DataFrame(scores)

In [None]:
# Alternatively, i.s.o. GridSearchCV, use cross_val_score to obtain same result.
# For every lambda: average the per-fold negated MSEs, flip the sign, take the root.
scores2 = []
for alpha in lambda_values:
    # Use a dedicated name so the notebook-level `model` from the setup cell
    # is not silently rebound by this loop.
    ridge = Ridge(alpha=alpha, fit_intercept=False, random_state=seed)
    fold_neg_mse = cross_val_score(ridge, X, Y, scoring=scoring,
                                   cv=k_folds, n_jobs=-1)
    scores2.append(np.sqrt(-np.mean(fold_neg_mse)))

# Element-wise difference to the GridSearchCV RMSEs (sanity check of both routes)
print(scores - scores2)
result2 = pd.DataFrame(scores2)

In [None]:
# Different route: I.s.o. all above, simply use CV-specialized model RidgeCV.
# The alpha grid is taken from `lambda_values` so it cannot drift from the
# grid used by the other two routes. `normalize=False` was dropped: it only
# restated the default, and the argument no longer exists in scikit-learn >= 1.2.
# NOTE(review): newer scikit-learn releases rename `store_cv_values` /
# `cv_values_` to `store_results` / `cv_results_` -- confirm against the
# installed version.
model2 = RidgeCV(alphas=lambda_values, fit_intercept=False,
                 scoring=scoring, cv=None, store_cv_values=True)
model2.fit(X, Y)
# One column of cv_values_ per alpha; each column is compared against Y as
# per-sample cross-validated predictions to get an RMSE.
scores3 = []
for j in range(len(lambda_values)):
    scores3.append(np.sqrt(mean_squared_error(Y, model2.cv_values_[:, j])))
# Difference to the k-fold GridSearchCV RMSEs. RidgeCV with cv=None uses its
# own built-in scheme rather than k_folds, so small deviations are expected.
print(scores - scores3)
result3 = pd.DataFrame(scores3)

In [None]:
# Final step for all options: pick one of the result tables ...
final_result = result
#final_result = result2
#final_result = result3
# ... and write it out as a bare column of numbers (no header row, no index)
filename = 'result.csv'
final_result.to_csv(filename, header=False, index=False)