In [13]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [2]:
# Load training data CSV using Pandas
filename = 'train.csv'
data = pd.read_csv(filename, header=0)
# Separate training data into Y and X
array = data.values
Y = array[:, 1]
X = array[:, 2:12]

In [7]:
# Set up cross validation, grid search, model
lambda_values = np.array([0.1, 1, 10, 100, 1000])
# solvers = ['svd', 'cholesky', 'sparse_cg', 'lsqr', 'sag', 'saga']
# param_grid = dict(alpha=lambda_values, solver=solvers)
param_grid = dict(alpha=lambda_values)

k_folds = 10
seed = 3
scoring = 'neg_mean_squared_error'

model = Ridge(fit_intercept=False, max_iter=1e6, tol=1e-5, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring,
                    n_jobs=-1, cv=k_folds, verbose=1, return_train_score=True)

In [8]:
# Run grid search on lambdas with cross validation using Ridge() model
grid.fit(X, Y)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=False, max_iter=1000000.0,
   normalize=False, random_state=3, solver='auto', tol=1e-05),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'alpha': array([1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=1)

In [20]:
# Print all results for inspection
print(grid.cv_results_)

{'mean_fit_time': array([0.00184181, 0.00053635, 0.00057786, 0.0004853 , 0.00053957]), 'std_fit_time': array([1.50911368e-03, 1.14717953e-04, 8.04585559e-05, 1.26004580e-04,
       9.89849769e-05]), 'mean_score_time': array([0.00051959, 0.00017483, 0.00021541, 0.00017097, 0.0001909 ]), 'std_score_time': array([3.44478200e-04, 3.33479417e-05, 2.86714079e-05, 4.79284740e-05,
       4.17559428e-05]), 'param_alpha': masked_array(data=[0.1, 1.0, 10.0, 100.0, 1000.0],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 0.1}, {'alpha': 1.0}, {'alpha': 10.0}, {'alpha': 100.0}, {'alpha': 1000.0}], 'split0_test_score': array([  -1.27662752,   -1.28411655,   -1.37237021,   -3.59029884,
       -158.79950313]), 'split1_test_score': array([  -0.73739919,   -0.73734193,   -0.76785213,   -4.1467651 ,
       -332.42703463]), 'split2_test_score': array([  -1.22972532,   -1.22825692,   -1.22478756,   -2.26397263,
       -109.808105

In [10]:
# Extract mean scores for result w/ (convoluted) way to get RMSE out of sklearn
scores = np.sqrt(-grid.cv_results_['mean_test_score'])
result = pd.DataFrame(scores)

In [11]:
# Alternatively, i.s.o. GridSearchCV, use cross_val_score to obtain same result
scores2 = []
for l in lambda_values:
    model = Ridge(alpha=l, fit_intercept=False, random_state=seed)
    scores2.append(np.sqrt(-np.mean(cross_val_score(model, X, Y, scoring=scoring,
                                                    cv=k_folds, n_jobs=-1))))

print(scores-scores2)
result2 = pd.DataFrame(scores2)

[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  4.44089210e-16
 -3.55271368e-15]


In [53]:
# Alternatively, use KFold
def KFold_on_Ridge(n_splits=10, random_state=42, shuffle=False):
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)
    scores = []
    for l in lambda_values:
        model = Ridge(alpha=l, fit_intercept=False, random_state=seed)
        error = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = Y[train_index], Y[test_index]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            error.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        # print('Lambda: {}'.format(l), '10-fold mean RMSE: {:4f}'.format(np.mean(error)))
        scores.append(np.mean(error))
    return scores
scores3 = KFold_on_Ridge()
print(scores-scores3)
result3 = pd.DataFrame(scores3)

[4.33022515e-03 4.56203497e-03 4.17370282e-03 7.31525759e-01
 9.86472697e+00]


In [42]:
# And now run KFold several times w/ shuffle and take average
n_repeats = 500
multi_scores = np.ndarray(shape=(n_repeats, len(lambda_values)))
for i in range(n_repeats):
    multi_scores[i] = KFold_on_Ridge(random_state=i, shuffle=True)
scores5 = np.mean(multi_scores, axis=0)
print(scores-scores5)
result5 = pd.DataFrame(scores5)

[ 3.53170676e-03  3.47998630e-03 -4.16095358e-03  7.00041228e-01
  9.76853330e+00]


In [14]:
# Different route: I.s.o. all above, simply use CV-specialized model RidgeCV
model2 = RidgeCV(alphas=(0.1, 1, 10, 100, 1000), fit_intercept=False,
                 normalize=False, scoring=scoring, cv=None, store_cv_values=True)
model2.fit(X, Y)
scores4 = []
for j in range(5):
    scores4.append(np.sqrt(mean_squared_error(Y, model2.cv_values_[:,j])))
print(scores-scores4)
result4 = pd.DataFrame(scores4)

[2.39944956e-03 1.80933803e-03 2.59459648e-03 2.12597324e-01
 2.27163114e+00]


In [54]:
# Final step for all options: Write chosen final result to output file
filename = 'result.csv'
final_result = result3
#final_result = result2
#final_result = result3
final_result.to_csv(filename, header=False, index=False)