In [27]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.model_selection import cross_val_score

In [28]:
# Load training data CSV using Pandas
filename = 'train.csv'
data = pd.read_csv(filename, header=0)
# Separate training data into Y and X
array = data.values
Y = array[:, 1]
X = array[:, 2:12]

In [56]:
# Set up cross validation, grid search, model
lambda_values = np.array([0.1, 1, 10, 100, 1000])
param_grid = dict(alpha=lambda_values)

k_folds = 10
seed = 42
scoring = 'mean_squared_error'

model = Ridge(random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, n_jobs=-1, cv=k_folds, verbose=1)

In [63]:
# Run grid search on lambdas with cross validation using Ridge() model
grid.fit(X, Y)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'alpha': array([1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='mean_squared_error', verbose=1)

In [64]:
# Print results for inspection
print(grid.cv_results_)

{'mean_fit_time': array([0.00504372, 0.0008379 , 0.00079551, 0.00084801, 0.00089622]), 'std_fit_time': array([0.00492393, 0.00021194, 0.00018694, 0.00024715, 0.00033285]), 'mean_score_time': array([0.00127699, 0.00030353, 0.00027289, 0.00037868, 0.00030785]), 'std_score_time': array([1.28915921e-03, 7.00241049e-05, 6.39314163e-05, 2.90457480e-04,
       9.63838062e-05]), 'param_alpha': masked_array(data=[0.1, 1.0, 10.0, 100.0, 1000.0],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 0.1}, {'alpha': 1.0}, {'alpha': 10.0}, {'alpha': 100.0}, {'alpha': 1000.0}], 'split0_test_score': array([  -1.27662997,   -1.28414129,   -1.3726401 ,   -3.59409609,
       -158.59478507]), 'split1_test_score': array([  -0.73739915,   -0.7373416 ,   -0.76785466,   -4.14642822,
       -332.12405849]), 'split2_test_score': array([  -1.22973036,   -1.22830675,   -1.22522371,   -2.26435936,
       -109.95269056]), 'split3_test_score': 

In [65]:
# Extract mean scores for result with only (convoluted) way to get RMSE out of sklearn
scores = np.sqrt(-grid.cv_results_['mean_test_score'])
result = pd.DataFrame(scores)

In [66]:
# Write result to output file
filename = 'result.csv'
result.to_csv(filename, header=False, index=False)