In [191]:
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
import numpy as np
import pandas as pd  
import seaborn as seabornInstance 
from sklearn import metrics
from sklearn import svm, datasets, linear_model
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, SGDClassifier, LogisticRegressionCV, RidgeClassifierCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_validate, KFold, LeaveOneOut, GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_boston, load_digits, load_breast_cancer
from sklearn.metrics import accuracy_score

%matplotlib inline
rcParams['figure.figsize'] = 12, 10

# Tip: if you want the generated figure to be large, re-run this cell before the beginning of every exercise.

# 3. Model Selection

---------------------------------------

## a. Cross validation

------------------

### cross_validate(estimator, X, y=None, scoring=None, cv=’warn’, return_train_score=False, return_estimator=False)
*  Method to evaluate metric(s) by cross-validation and also record fit/score times. 
 
* #### Parameters
 1.    estimator: the object to use to fit the data
 2.    X: the data to fit, can be a list or an array.
 3.    y: the target variable to try to predict in the case of supervised learning
 4.    scoring: string, callable, list/tuple, dict or None
       * string: define model evaluation metric
           * Classification:
               * 'accuracy': accuracy classification score
               * 'f1': F1-score
               * 'precision': precision ratio true_positive/(true_positive + false_positive)
               * 'recall': recall ratio true_positive/(true_positive + false_negative)
           * Regression:
               * 'explained_variance': explained variance regression score function.
               * 'neg_mean_absolute_error': mean absolute error regression loss.
               * 'neg_mean_squared_error': mean squared error regression loss.
               
       * callable: a function to evaluate the predictions on the test set
       * list/tuple: for evaluating multiple metrics, list/tuple of (unique) strings
       * dict: for evaluating multiple metrics, dictionary with names as keys and callables as values.
 5.    cv: int, determine the cross-validation splitting strategy. 
       * None: default 3-fold cross validation.
       * integer: specify the number of folds in a KFold
       * an iterable yielding (train,test) splits as arrays of indices.
 6.    return_train_score: whether to include train scores.
 7.    return_estimator: whether to return the estimators fitted on each split.
 
*  #### Returns (of the method): scores ~ a dict of float arrays
 *    test_score: the score array for test scores on each cross-validation split.
 *    train_score: the score array for train scores on each cross-validation split. only when return_train_score=True
 *    fit_time: the time for fitting the estimator on the train set for each cross-validation split.
 *    score_time: the time for scoring the estimator on the test set for each cross-validation split.
 *    estimator: the estimator objects for each cross-validation split. only when return_estimator=True

In [192]:
# Example: cross-validating a regression model

# Load the diabetes dataset and keep only its first 150 samples
diabetes = datasets.load_diabetes()
X, y = diabetes.data[:150], diabetes.target[:150]

# Lasso (L1-regularized) linear regression with default settings
lasso = Lasso()

# Evaluate the model with 3-fold cross-validation
cv_results = cross_validate(estimator=lasso, X=X, y=y, cv=3)

# Inspect which result arrays were returned
print(cv_results.keys())

dict_keys(['fit_time', 'score_time', 'test_score'])


In [193]:
# Example: cross-validating a regression model (cont.)

# One test score per fold (3 folds were requested above)
print(cv_results["test_score"])

[0.33150734 0.08022311 0.03531764]


In [194]:
# Exercise 1: Cross-validation on regression problem (cont)

# Let us try to apply Lasso Linear Regression on the same data but with 4-fold cross validation
### YOUR CODE HERE. Fill in the "None".
# Hint: we use the evaluation metrics 'r2' and 'neg_mean_squared_error'
# Hint: we also want to retrieve the training score
scores = None
# Hint: print the train score applicable to r2
print(None)    
# Hint: print the train score applicable to neg_mean_squared_error
print(None)     
# Hint: print the test score applicable to r2
print(None)    
### END OF YOUR CODE.

[0.29177858 0.35449689 0.38995421 0.20300574]
[-4047.7288045  -3850.01747559 -3592.51190783 -3855.59792991]
[ 0.3392459   0.12286347  0.16482017 -0.04610521]


# 3. Model Selection

---------------------------------------

## b. Splitting Train and Test sets

------------------

### train_test_split(arrays, test_size=None, train_size=None, shuffle=True)
*  Split array or matrices into random train and test subsets.
 
* #### Parameters
 1.    arrays: sequence of indexables with same length/shape[0]
       can be lists, numpy arrays, scipy-sparse matrices or pandas dataframes.
 2.    test_size: float, int or None
         * float: between 0 and 1, proportion of the dataset to be in the test split.
         * int: absolute number of test samples
         * None: automatically set to be complement of train_size; if train_size is also None, then the ratio is set to be 0.25
 3.    train_size: float, int or None
         * float: between 0 and 1, proportion of the dataset to be in the train split.
         * int: absolute number of train samples
         * None: automatically set to be complement of test_size.
 4.    shuffle: whether to shuffle the data before splitting.
 
*  #### Returns: splitting
        List containing train-test split of inputs.
        e.g.: X_train, X_test, y_train, y_test = train_test_split()

In [195]:
# Exercise: Linear Regression on predicting wine quality revisited

# RELOAD EVERYTHING
# Load the dataset
dataset = pd.read_csv('winequality.csv')

# Fill missing values by propagating the last valid observation forward.
# DataFrame.ffill() is the direct spelling of fillna(method='ffill'), whose
# `method` argument is deprecated in recent pandas releases. As before, a
# missing value in the very first row has nothing to copy from and stays NaN.
dataset = dataset.ffill()

# Divide the data into attributes/features and output values
# Features (11 columns)
X = dataset[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
             'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']].values
# Output values
y = dataset['quality'].values

# Split the dataset into 70% train, 30% test (fixed seed so the split is reproducible)
### YOUR CODE HERE. 1 line of code. There is more than one way to split in this manner.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
### END OF YOUR CODE.

# L2-regularization Linear Regression
### YOUR CODE HERE. Fill in the "None". 2 lines of code.
# Hint: train L2-regularization Linear Regression with 0.1 as the regularization weight
regressor = None
None

# Print the coefficients associated with the 11 features selected into X.
# Hint: the first parameter of the function must be the coefficients.
# The row labels are dataset.columns[:-1], i.e. every column except the last one
# (assumes 'quality' is the last column of the CSV -- TODO confirm).
coeff_df = pd.DataFrame(None, dataset.columns[:-1], columns=['Coefficient']) 
### END OF YOUR CODE.

# Show the learned coefficients
coeff_df

Unnamed: 0,Coefficient
fixed acidity,0.005743
volatile acidity,-1.229988
citric acid,-0.103239
residual sugar,0.015001
chlorides,-1.82949
free sulfur dioxide,0.002147
total sulfur dioxide,-0.003042
density,-0.111951
pH,-0.471583
sulphates,0.818644


In [196]:
# Exercise: Linear Regression on predicting wine quality revisited (cont)

### YOUR CODE HERE. 1 line of code. 'y_pred' holds the predicted values on the test set.
y_pred = None
### END OF YOUR CODE

# Compare actual and predicted values side by side
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# Keep only the first 25 rows for display
df1 = df.head(25)
df1

Unnamed: 0,Actual,Predicted
0,6,5.788372
1,5,5.048661
2,7,6.574231
3,6,5.394111
4,5,5.91012
5,6,5.060175
6,5,5.404963
7,6,6.002679
8,4,4.823235
9,5,4.966325


# 3. Model Selection

---------------------------------------

## c. KFold

------------------

### KFold(n_splits=3, shuffle=False)
*  K-Folds cross-validator object.

    Provides train/test indices to split data in train/test sets. Split dataset into k consecutive folds (without shuffling by default).

    Each fold is then used once as a validation while the k - 1 remaining folds form the training set.
 
* #### Parameters
 1.    n_splits: number of folds, must be at least 2.
 2.    shuffle: whether to shuffle the data before splitting. 
 
*  #### Methods: (of the KFold object)
        * split(X, y=None): generate indices to split data into training and test set.

In [197]:
# Example: split the dataset into two halves; each half serves once as the test set

# Data:
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])

# Two consecutive folds, no shuffling
kf = KFold(n_splits=2, shuffle=False)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Select the training part first, then the held-out part
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

TRAIN: [2 3] TEST: [0 1]
TRAIN: [0 1] TEST: [2 3]


In [198]:
# Exercise: Linear Regression on Hitters dataset revisited

# Reload everything
# Read the data and remove rows with 'NaN' values.
df = pd.read_csv('Hitters.csv').dropna()

# Target (dependent variable)
y = df.Salary.values

# Drop the target column (Salary) and the League, Division and NewLeague columns;
# the remaining columns are cast to float64.
# No second dropna() is needed here: rows with NaN were already removed above, and
# dropping rows from X_ alone would break its alignment with y.
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')

# Feature matrix as a NumPy array
X = X_.values


# Good L1-regularization weight, we will see how this value is chosen shortly
best_alpha = 2.2410206769492604

# Define an L1-regularization LinearRegression object with maximum 10000 iterations and normalization.
# The regularization weight is not specified yet.
### YOUR CODE HERE. 1 line of code. 
lasso = None
### END OF YOUR CODE.

# Define a KFold object, in which we wish to split the data into 5 subsets. We also want to shuffle it before splitting.
### YOUR CODE HERE. 1 line of code.
kf = None
### END OF YOUR CODE.

# Do L1-regularization Linear Regression 5 times (once per fold)
### YOUR CODE HERE. Fill in the "None".
for train_index, test_index in None:
    
    # 2 lines of code
    X_train, X_test = None
    y_train, y_test = None
    
    # Set the regularization weight to be the value defined above
    None
    
    # Hint: fit our model to the data.
    None
    
    # Hint: 'y_pred' should be the predicted values on the test set.
    y_pred = None
### END OF YOUR CODE.

    # Compute the mean squared error on this fold's test set:
    print("Error of Lasso regression with alpha value " + str(best_alpha) + " is:  " + str(mean_squared_error(y_test, y_pred)))
    

Error of Lasso regression with alpha value 2.2410206769492604 is:  115817.5853601778
Error of Lasso regression with alpha value 2.2410206769492604 is:  112055.53774402301
Error of Lasso regression with alpha value 2.2410206769492604 is:  178887.24366243515
Error of Lasso regression with alpha value 2.2410206769492604 is:  74225.05711476142
Error of Lasso regression with alpha value 2.2410206769492604 is:  112757.52432805097


# 3. Model Selection

---------------------------------------

## d. LeaveOneOut

------------------

### LeaveOneOut()
*  Leave-one-out cross-validator object.

    Provides train/test indices to split data in train/test sets. Each sample is used once as a test set (singleton) while the remaining samples form the training set.
 
*  #### Methods: (of the LeaveOneOut object)
        * split(X, y=None): generate indices to split data into training and test set.

In [199]:
# Example: leave-one-out splitting on a toy dataset of 6 samples
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 3, 4, 5, 6])
loo = LeaveOneOut()

# Each sample is held out exactly once; enumerate(..., start=1) numbers the turns
# instead of a hand-maintained counter.
for i, (train_index, test_index) in enumerate(loo.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("Turn: ", i)
    print("Train set: \n", X_train, y_train)
    print("Test set: \n", X_test, y_test)
    print("\n")

Turn:  1
Train set: 
 [[ 3  4]
 [ 5  6]
 [ 7  8]
 [ 9 10]
 [11 12]] [2 3 4 5 6]
Test set: 
 [[1 2]] [1]


Turn:  2
Train set: 
 [[ 1  2]
 [ 5  6]
 [ 7  8]
 [ 9 10]
 [11 12]] [1 3 4 5 6]
Test set: 
 [[3 4]] [2]


Turn:  3
Train set: 
 [[ 1  2]
 [ 3  4]
 [ 7  8]
 [ 9 10]
 [11 12]] [1 2 4 5 6]
Test set: 
 [[5 6]] [3]


Turn:  4
Train set: 
 [[ 1  2]
 [ 3  4]
 [ 5  6]
 [ 9 10]
 [11 12]] [1 2 3 5 6]
Test set: 
 [[7 8]] [4]


Turn:  5
Train set: 
 [[ 1  2]
 [ 3  4]
 [ 5  6]
 [ 7  8]
 [11 12]] [1 2 3 4 6]
Test set: 
 [[ 9 10]] [5]


Turn:  6
Train set: 
 [[ 1  2]
 [ 3  4]
 [ 5  6]
 [ 7  8]
 [ 9 10]] [1 2 3 4 5]
Test set: 
 [[11 12]] [6]




In [200]:
# Exercise: Leave-One-Out Logistic Regression

# Load dataset
df = pd.read_csv("dataset.csv")

# Drop the columns that are not numerical data
df = df.drop(["date", "time", "username"], axis=1)

data = df.values

# Only a 41-row window of the data is used: leave-one-out fits one model per sample.
ROW_START, ROW_STOP = 19000, 19041
X = data[ROW_START:ROW_STOP, 1:]  # selected rows, every column except the first (features)
y = data[ROW_START:ROW_STOP, 0]   # selected rows, first column only (label)

# Build the splitter here so this cell does not depend on `loo` from an earlier cell
loo = LeaveOneOut()

# One score per held-out sample, collected so the results can be inspected afterwards
scores = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Logistic-loss SGD classifier with an L2 penalty; the fixed seed makes re-runs reproducible
    model = SGDClassifier(loss="log", penalty="l2", max_iter=1000, random_state=0)
    model.fit(X_train, y_train)

    score = model.score(X_test, y_test)
    scores.append(score)
    print(score)

1.0
0.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0


Question: can we use KFold() to apply Leave-One-Out?

Answer: yes, just set n_splits equal to the number of samples.

# 3. Model Selection

---------------------------------------

## e. Grid Search Cross Validation

------------------

### GridSearchCV(estimator, param_grid, scoring=None, cv=’warn’, return_train_score=False)
* Exhaustive search over specified parameter values for an estimator.

    Important members are fit, predict.

    GridSearchCV implements a “fit” and a “score” method. It also implements “predict”, “predict_proba”, “decision_function”, “transform” and “inverse_transform” if they are implemented in the estimator used.

    The parameters of the estimator used to apply these methods are optimized by cross-validated grid-search over a parameter grid.
 
* #### Parameters
 1.    estimator: estimator object; either estimator needs to provide a score function, or scoring must be passed.
 2.    param_grid: dict or list of dictionaries
       dictionary with parameter names (string) as keys and lists of parameter settings to try as values.
 3.    scoring: a single string or a callable to evaluate the predictions.
     * string: define model evaluation metric
           * Classification:
               * 'accuracy': accuracy classification score
               * 'f1': F1-score
               * 'precision': precision ratio true_positive/(true_positive + false_positive)
               * 'recall': recall ratio true_positive/(true_positive + false_negative)
           * Regression:
               * 'explained_variance': explained variance regression score function.
               * 'neg_mean_absolute_error': mean absolute error regression loss.
               * 'neg_mean_squared_error': mean squared error regression loss.
               
     * callable: a function to evaluate the predictions on the test set
 4.     cv: determines the cross-validating splitting strategy. 
         * None: default 3-fold cross validation.
         * integer: specify the number of folds in a KFold
         * an iterable yielding (train,test) splits as arrays of indices.
 5.     return_train_score: whether to include training score.
       
 
*  #### Attributes (of the GridSearchCV object):
 *    cv_results_: a dict with keys as column headers and values as columns, that can be imported into a pandas DataFrame.
 *    best_estimator_: estimator that was chosen by the search (associated with highest score/smallest loss)
 *    best_score_: mean cross-validated score of the best_estimator
 *    best_params_: parameter setting that gave the best results on the hold out data.
 *    best_index_: the index of the cv_results_ arrays which corresponds to the best candidate parameter setting. 
 *    n_splits_: the number of cross-validation splits.
 
*  #### Methods (on the GridSearchCV object)
 *    decision_function(X): call decision function on the estimator with the best found parameters.
 *    fit(X,y): run fit with all sets of parameters.
 *    get_params: get parameters for this estimator.
 *    inverse_transform(X): call inverse transform on the estimator with the best found parameters.
 *    predict(X): call predict on the estimator with the best found parameters.
 *    score(X, y): return the score on the given data, if the estimator has been refit.
 *    set_params(): set the parameters on this estimator.
 *    transform(X): call transform on the estimator with the best found parameters.

In [201]:
# Example: Grid Search Validation for Linear Regression

# Read the data and remove rows with 'NaN' values.
df = pd.read_csv('Hitters.csv').dropna()

# Target (dependent variable)
y = df.Salary

# Drop the target column (Salary) and the League, Division and NewLeague columns;
# the remaining columns are cast to float64.
X = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis = 1).astype('float64')

# Split into train and test sets, 50% each (fixed seed so the split is reproducible).
X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)

# Grid to search: 6 alpha values x 2 max_iter values, always with normalization
parameter_candidates = [{'alpha': [0.5, 1, 1.5, 2, 2.5, 3], 'max_iter': [100000, 200000], 'normalize': [True]}]

# Create the grid-search object around a Lasso regressor (3-fold CV, scored by negated mean absolute error)
# NOTE(review): `iid` (GridSearchCV) and `normalize` (Lasso) are not accepted by recent
# scikit-learn releases -- confirm the pinned version before re-running.
clf = GridSearchCV(estimator=Lasso(), param_grid=parameter_candidates, scoring = "neg_mean_absolute_error", cv=3, iid=False)

# Fit the grid search on the training features and targets
### YOUR CODE HERE. 1 line of code.
None
### END OF YOUR CODE.

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid=False, n_jobs=None,
             param_grid=[{'alpha': [0.5, 1, 1.5, 2, 2.5, 3],
                          'max_iter': [100000, 200000], 'normalize': [True]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_absolute_error', verbose=0)

In [202]:
# Example: Grid Search Validation for Linear Regression (cont)

# The refitted winner of the search carries the selected regularization weight
best_lasso = clf.best_estimator_
print('Best alpha:', best_lasso.alpha)

Best alpha: 1


In [203]:
# Exercise: Grid Search Validation for Linear Regression (cont)

# Do GridSearch Cross validation with Ridge Linear Regression
### YOUR CODE HERE. Fill in the "None".
# Define parameter_candidates for the parameter param_grid
# with option to choose maximum number of iterations between 100000 and 200000
# with option to choose alpha among [0, 0.5, 1, 1.5, 2, 2.5]
# and normalization must be done
parameter_candidates = [{None}]

# Hint: Create a grid-search object with the Ridge estimator and the parameter candidates; mean squared error should be used as the metric
clf = None

# Hint: Fit the grid search on the training data
None

# Hint: Print the best alpha:
print('Best alpha:', None) 

### END OF YOUR CODE.

Best alpha: 0.5


# 3. Model Selection

---------------------------------------

## f. Randomized Search Cross Validation

------------------

### RandomizedSearchCV(estimator, param_distributions,n_iter=10, scoring=None, cv=’warn’, return_train_score=False)
* Randomized search on hyper parameters.

    The parameters of the estimator used to apply these methods are optimized by cross-validated search over parameter settings.

    In contrast to GridSearchCV, not all parameter values are tried out, but rather a fixed number of parameter settings is sampled from the specified distributions. The number of parameter settings that are tried is given by n_iter.

    If all parameters are presented as a list, sampling without replacement is performed. If at least one parameter is given as a distribution, sampling with replacement is used. It is highly recommended to use continuous distributions for continuous parameters.
 
* #### Parameters
 1.    estimator: estimator object; either estimator needs to provide a score function, or scoring must be passed.
 2.    param_distributions: dictionary with parameters names (string) as keys and distributions/lists of parameters to try.
 3.    n_iter: number of parameter settings that are sampled.
 4.    scoring: a single string or a callable to evaluate the predictions.
     * string: define model evaluation metric
           * Classification:
               * 'accuracy': accuracy classification score
               * 'f1': F1-score
               * 'precision': precision ratio true_positive/(true_positive + false_positive)
               * 'recall': recall ratio true_positive/(true_positive + false_negative)
           * Regression:
               * 'explained_variance': explained variance regression score function.
               * 'neg_mean_absolute_error': mean absolute error regression loss.
               * 'neg_mean_squared_error': mean squared error regression loss.
               
     * callable: a function to evaluate the predictions on the test set
 5.     cv: determines the cross-validating splitting strategy. 
         * None: default 3-fold cross validation.
         * integer: specify the number of folds in a KFold
         * an iterable yielding (train,test) splits as arrays of indices.
 6.     return_train_score: whether to include training score.
       
 
*  #### Attributes (of the RandomizedSearchCV object):
 *    cv_results_: a dict with keys as column headers and values as columns, that can be imported into a pandas DataFrame.
 *    best_estimator_: estimator that was chosen by the search (associated with highest score/smallest loss)
 *    best_score_: mean cross-validated score of the best_estimator
 *    best_params_: parameter setting that gave the best results on the hold out data.
 *    best_index_: the index of the cv_results_ arrays which corresponds to the best candidate parameter setting. 
 *    n_splits_: the number of cross-validation splits.
 
*  #### Methods (on the RandomizedSearchCV object)
 *    decision_function(X): call decision function on the estimator with the best found parameters.
 *    fit(X,y): run fit with all sets of parameters.
 *    get_params: get parameters for this estimator.
 *    inverse_transform(X): call inverse transform on the estimator with the best found parameters.
 *    predict(X): call predict on the estimator with the best found parameters.
 *    score(X, y): return the score on the given data, if the estimator has been refit.
 *    set_params(): set the parameters on this estimator.
 *    transform(X): call transform on the estimator with the best found parameters.

In [204]:
# Exercise: Randomized Search Cross Validation with Lasso Linear Regression

### YOUR CODE HERE. Fill in the "None".
# Define a dictionary so that maximum number of iterations is a list of 10000 and 100000
# and alpha is a range of 0.5, 0.6, ..., 5.4, 5.5 (Hint: use np.arange())
# and normalization must be done
# NOTE(review): the output recorded for this cell reports a best alpha of 0.2, which lies
# outside the 0.5-5.5 range described here -- confirm the intended range.
parameter_candidates = None

# Hint: Create a randomized-search object with the Lasso estimator and the parameter candidates; 50 parameter settings are examined.
clf = None

# Fit the randomized search on the training data
None

# Print the best alpha:
print('Best alpha:', None) 

### END OF YOUR CODE.

Best alpha: 0.2


# 3. Model Selection

---------------------------------------

## g. Lasso Regression cross-validation

------------------

### LassoCV(eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, max_iter=1000, tol=0.0001, cv=’warn’)
* Lasso linear model with iterative fitting along a regularization path.
    
    The best model is selected by cross-validation.
 
* #### Parameters
 1.    eps: float, length of the path, = alpha_min/alpha_max.
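 2.    n_alphas / alphas: number of alphas along the regularization path / explicit list of alphas to try (if None, the alphas are set automatically).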
 3.    fit_intercept: whether to calculate the intercept. If False, data is supposed to be centered.
 4.    normalize: whether to normalize; ignored when fit_intercept=False
 5.    max_iter : maximum number of iterations.
 6.    tol: tolerance for the optimization
 7.    cv: determines the cross-validating splitting strategy. 
         * None: default 3-fold cross validation.
         * integer: specify the number of folds in a KFold
         * an iterable yielding (train,test) splits as arrays of indices.
       
 
*  #### Attributes (of the LassoCV object):
 *    alpha_: weight of regularization
 *    coef_: trained parameter vector
 *    intercept_: trained intercept
 *    alphas_: the grid of alphas used for fitting.
 
*  #### Methods (on the LassoCV object)
 *    fit(X,y): run fit with all sets of parameters.
 *    get_params: get parameters for this estimator.
 *    predict(X): call predict on the estimator with the best found parameters.
 *    score(X, y): return the score on the given data, if the estimator has been refit.
 *    set_params(): set the parameters on this estimator.

In [205]:
# Exercise: Lasso Linear Regression revisited

# In Exercise 1 of Lasso Linear Regression, the value 'best_alpha' = 2.2410206769492604 was used without any explanation.

# Let us find it by Lasso Cross-validation

### YOUR CODE HERE. Fill in the "None".
# Hint: define a Lasso cross-validation object with 10 folds, maximum 100000 iterations, normalization, and no specified range of alpha
lassocv = None
# Hint: train it on the train set
None
# Hint: print out the optimal alpha
print(None)
### END OF YOUR CODE.

2.2410206769492604


This is exactly the value that we used!

# 3. Model Selection

---------------------------------------

## h. Ridge Regression cross-validation

------------------

### RidgeCV(alphas=(0.1, 1.0, 10.0), fit_intercept=True, normalize=False, scoring=None, cv=None)
* Ridge regression with built-in cross-validation.


    By default, it performs Generalized Cross-Validation, which is a form of efficient Leave-One-Out cross-validation.
 
* #### Parameters
 1.    alphas: array of alpha values to try (regularization weights).
 2.    fit_intercept: whether to calculate the intercept. If False, data is supposed to be centered.
 3.    normalize: whether to normalize; ignored when fit_intercept=False
 4.    scoring: a single string or a callable to evaluate the predictions.
     * string: define model evaluation metric
           * Classification:
               * 'accuracy': accuracy classification score
               * 'f1': F1-score
               * 'precision': precision ratio true_positive/(true_positive + false_positive)
               * 'recall': recall ratio true_positive/(true_positive + false_negative)
           * Regression:
               * 'explained_variance': explained variance regression score function.
               * 'neg_mean_absolute_error': mean absolute error regression loss.
               * 'neg_mean_squared_error': mean squared error regression loss.
 5.    cv: determines the cross-validating splitting strategy. 
         * None: use the efficient Leave-One-Out (Generalized) cross-validation.
         * integer: specify the number of folds in a KFold
         * an iterable yielding (train,test) splits as arrays of indices.
       
 
*  #### Attributes (of the RidgeCV object):
 *    alpha_: weight of regularization
 *    coef_: trained parameter vector
 *    intercept_: trained intercept
 *    cv_values_: cross-validation values for each alpha. after fit() called, contain the mean squared errors.

*  #### Methods (on the RidgeCV object)
 *    fit(X,y): run fit with all sets of parameters.
 *    get_params: get parameters for this estimator.
 *    predict(X): call predict on the estimator with the best found parameters.
 *    score(X, y): return the score on the given data, if the estimator has been refit.
 *    set_params(): set the parameters on this estimator.

In [206]:
# Exercise: Ridge Linear Regression revisited

# Similarly, Exercise 3 of Ridge Linear Regression used the value 'best_alpha' = 0.5748784976988678 without any explanation.

# Let us find it by Ridge cross-validation

# Candidate alpha values: 100 points, log-spaced from 0.5 * 10^10 down to 0.5 * 10^-2
exponents = np.linspace(10, -2, 100)
alphas = 0.5 * (10 ** exponents)

### YOUR CODE HERE. Fill in the "None".
# Hint: define a Ridge cross-validation object, with the mean squared error metric, normalization, and the `alphas` array built earlier in this cell
ridgecv = None
# Hint: train it on the train set
None
# Hint: print out the optimal alpha
print(None)
### END OF YOUR CODE.

0.5748784976988678


This is also the value that we used!

# 3. Model Selection

---------------------------------------

## i. Logistic Regression cross-validation

------------------

### LogisticRegressionCV(Cs=10, fit_intercept=True, cv=’warn’, penalty=’l2’, scoring=None, solver=’lbfgs’, tol=0.0001, max_iter=100, l1_ratios=None)
* Logistic Regression CV (aka logit, MaxEnt) classifier.
 
* #### Parameters
 1.    Cs: each of the values in Cs describes the inverse of regularization strength.
 2.    fit_intercept: whether to calculate the intercept. If False, data is supposed to be centered.
 3.    cv: determines the cross-validating splitting strategy. 
         * None: default 3-fold cross validation.
         * integer: specify the number of folds in a KFold
         * an iterable yielding (train,test) splits as arrays of indices.
 4.    penalty: type of regularization ("l1", "l2", "elasticnet")
 5.    scoring: a single string or a callable to evaluate the predictions.
     * string: define model evaluation metric
           * Classification:
               * 'accuracy': accuracy classification score
               * 'f1': F1-score
               * 'precision': precision ratio true_positive/(true_positive + false_positive)
               * 'recall': recall ratio true_positive/(true_positive + false_negative)
           * Regression:
               * 'explained_variance': explained variance regression score function.
               * 'neg_mean_absolute_error': mean absolute error regression loss.
               * 'neg_mean_squared_error': mean squared error regression loss.
 6.    solver: optimization algorithm (‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’)
 7.    tol: tolerance for stopping criteria
 8.    max_iter: maximum number of iterations.
 9.    l1_ratios: list of floats, ratio of l1-regularization, only applicable when penalty='elasticnet'.
 
*  #### Attributes (of the LogisticRegressionCV object):
 *    classes_: a list of class labels known to the classifier.
 *    coef_: trained parameter vector
 *    intercept_: trained intercept
 *    Cs_: array of C i.e. inverse of regularization parameter values used for cross-validation.
 *    l1_ratios_: array of l1_ratios used for cross-validation.
 *    scores_: dict with classes as keys, values as grid of scores obtained during cross-validating each fold.
 *    C_: array of C that maps to the best scores across every class.
 *    l1_ratio_: array of l1_ratio_ that maps to the best scores across every class.

*  #### Methods (on the LogisticRegressionCV object)
 *    decision_function(X): predict confidence scores for samples.
 *    fit(X,y): fit the model according to the given training data.
 *    get_params: get parameters for this estimator.
 *    predict(X): predict class labels for samples in X.
 *    score(X, y): return the score using the scoring option on the given test data and labels.
 *    set_params(): set the parameters on this estimator.

In [207]:
# Exercise: simple LogisticRegressionCV

# Load the dataset
df1 = pd.read_csv('dataset_train_woed.csv')

# Features: every column except 'Unnamed: 0', 'ID' and the label column 'target'
feature_frame = df1.drop(columns=['Unnamed: 0', 'ID', 'target'])
X = feature_frame.values
# Labels
y = df1['target'].values

# Some insights into the data
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 26 columns):
Unnamed: 0    30000 non-null int64
ID            30000 non-null int64
LIMIT_BAL     30000 non-null float64
SEX           30000 non-null float64
EDUCATION     30000 non-null float64
MARRIAGE      30000 non-null float64
AGE           30000 non-null float64
PAY_0         30000 non-null float64
PAY_2         30000 non-null float64
PAY_3         30000 non-null float64
PAY_4         30000 non-null float64
PAY_5         30000 non-null float64
PAY_6         30000 non-null float64
BILL_AMT1     30000 non-null float64
BILL_AMT2     30000 non-null float64
BILL_AMT3     30000 non-null float64
BILL_AMT4     30000 non-null float64
BILL_AMT5     30000 non-null float64
BILL_AMT6     30000 non-null float64
PAY_AMT1      30000 non-null float64
PAY_AMT2      30000 non-null float64
PAY_AMT3      30000 non-null float64
PAY_AMT4      30000 non-null float64
PAY_AMT5      30000 non-null float64
PAY_AMT

In [208]:
# Exercise: simple LogisticRegressionCV
# NOTE: this cell uses the X and y arrays built in the previous (data-loading) cell.

# Define a K-Fold object of 5 folds, with shuffling enabled before splitting.
# Tip: also fixing the random seed makes the folds reproducible between runs.
### YOUR CODE HERE. 1 line of code
kf = None
### END OF YOUR CODE.

### YOUR CODE HERE. Fill in the "None".
# Loop over the (train_index, test_index) pairs of the K-Fold split.
for train_index, test_index in None:
    # Select the training / test rows of X and y for this fold.
    X_train, X_test = None
    y_train, y_test = None

    # 3 lines of code. Do LogisticRegressionCV: create the classifier,
    # fit it on the training fold, then predict labels for the test fold.
    clf = None
    None
    y_pred = None
    
    # Accuracy of this fold's predictions against the true test labels.
    acc = accuracy_score(y_test, y_pred)
    print(acc)
    
### END OF YOUR CODE.

0.8175
0.8186666666666667
0.8146666666666667
0.8231666666666667
0.8178333333333333


# 3. Model Selection

---------------------------------------

## j. Ridge classifier cross-validation

------------------

### RidgeClassifierCV(alphas=(0.1, 1.0, 10.0), fit_intercept=True, normalize=False, scoring=None, cv=None)
* Ridge classifier with built-in cross-validation.
 
* #### Parameters
 1.    alphas: array of alpha values, weights of regularization, to try.
 2.    fit_intercept: whether to calculate the intercept. If False, data is supposed to be centered.
 3.    normalize: whether to compute normalization; ignored when fit_intercept=False.
 4.    scoring: a single string or a callable to evaluate the predictions.
     * string: define model evaluation metric
           * Classification:
               * 'accuracy': accuracy classification score
               * 'f1': F1-score
               * 'precision': precision ratio true_positive/(true_positive + false_positive)
               * 'recall': recall ratio true_positive/(true_positive + false_negative)
           * Regression:
               * 'explained_variance': explained variance regression score function.
               * 'neg_mean_absolute_error': mean absolute error regression loss.
               * 'neg_mean_squared_error': mean squared error regression loss.
 5.    cv: determines the cross-validating splitting strategy. 
         * None: use the efficient Leave-One-Out cross-validation (the default).
         * integer: specify the number of folds in a KFold
         * an iterable yielding (train,test) splits as arrays of indices.
 
*  #### Attributes (of the RidgeClassifierCV object):
 *    cv_values_: cross-validation values for each alpha (only available if store_cv_values=True and cv=None). After fit() has been called, this contains the mean squared errors (by default) or the values of the custom scoring function.
 *    coef_: trained parameter vector
 *    intercept_: trained intercept
 *    alpha_: estimated regularization parameter

*  #### Methods (on the RidgeClassifierCV object)
 *    decision_function(X): predict confidence scores for samples.
 *    fit(X,y): fit the model according to the given training data.
 *    get_params(): get parameters for this estimator.
 *    predict(X): predict class labels for samples in X.
 *    score(X, y): return the score using the scoring option on the given test data and labels.
 *    set_params(): set the parameters on this estimator.

In [209]:
# Exercise: simple RidgeClassifierCV

# Load the data: feature matrix X and label vector y of the breast cancer dataset.
X, y = load_breast_cancer(return_X_y=True)

# Define a K-Fold object of 4 folds, with shuffling enabled before splitting.
# Tip: also fixing the random seed makes the folds reproducible between runs.
### YOUR CODE HERE. 1 line of code
kf = None
### END OF YOUR CODE.

### YOUR CODE HERE. Fill in the "None".
# Loop over the (train_index, test_index) pairs of the K-Fold split.
for train_index, test_index in None:
    # Select the training / test rows of X and y for this fold.
    X_train, X_test = None
    y_train, y_test = None

    # 3 lines of code. Define a RidgeClassifierCV object and train it on the training data.
    # Hint: make sure that alpha can have values: 0.001, 0.01, 0.1, 1
    clf = None
    None
    # 'score' is the average accuracy (presumably measured on this fold's test data)
    score = None
    print(score)
    
### END OF YOUR CODE.

0.951048951048951
0.9788732394366197
0.9436619718309859
0.9577464788732394


## References:
1. https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
2. https://scikit-learn.org/stable/modules/model_evaluation.html#scoring
3. https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html#sklearn.model_selection.train_test_split
4. https://chrisalbon.com/machine_learning/model_evaluation/cross_validation_parameter_tuning_grid_search/
5. https://www.kaggle.com/wilsonf/uci-credit-carefrom-python-woe-pkg/downloads/UCI_Credit_Card.csv/1
6. https://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model

That is the end of today's class. We hope you had fun!