# Exercise 1: K-Fold

In [3]:
import numpy as np
from sklearn.model_selection import KFold

# Toy dataset: 10 samples with 2 features each (values 1..20) and 10 targets (1..10).
# np.arange(...).reshape(...) already yields an ndarray, so no np.array() wrapper is needed.
X = np.arange(1, 21).reshape(10, -1)
y = np.arange(1, 11)

# shuffle=False keeps the samples in their original order, so every fold
# holds out a run of consecutive indices as its test set.
kf = KFold(n_splits=5, shuffle=False)

# enumerate(..., start=1) numbers the folds from 1 without a hand-maintained counter.
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print(f'Fold: {fold}')
    print("TRAIN:", train_index, "TEST:", test_index, "\n")

Fold: 1
TRAIN: [2 3 4 5 6 7 8 9] TEST: [0 1] 

Fold: 2
TRAIN: [0 1 4 5 6 7 8 9] TEST: [2 3] 

Fold: 3
TRAIN: [0 1 2 3 6 7 8 9] TEST: [4 5] 

Fold: 4
TRAIN: [0 1 2 3 4 5 8 9] TEST: [6 7] 

Fold: 5
TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9] 



# Exercise 2: Cross validation (k-fold)

In [11]:
#Exercise 2
# imports
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.model_selection import cross_validate
# Load the California housing dataset: feature matrix and regression target.
housing = fetch_california_housing()
X = housing['data']
y = housing['target']

# Hold out 10% of the samples as a test set; the fixed random_state makes the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=43
)

# Chain preprocessing and model into a single estimator: median imputation,
# standardisation, then linear regression.
pipeline = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('lr', LinearRegression()),
]
pipe = Pipeline(pipeline)

# 10-fold cross-validation on the training split only; train scores are not computed.
# No `scoring` argument is given, so the estimator's default scorer is used.
scores = cross_validate(pipe, X_train, y_train, cv=10, return_train_score=False)

# Show the score obtained on each fold, then their mean and standard deviation.
validation_scores = scores['test_score']
print("\nScores on validation sets :", validation_scores)
print("\nMean of scores on validation sets :", np.mean(validation_scores))
print("\nStandard deviation of scores on validation sets :", np.std(validation_scores))


Scores on validation sets : [0.62433594 0.61648956 0.62486602 0.59891024 0.59284295 0.61307055
 0.54630341 0.60742976 0.60014575 0.59574508]

Mean of scores on validation sets : 0.6020139252674299

Standard deviation of scores on validation sets : 0.02149838227734664


# Exercise 3: GridSearchCV

In [7]:
#Exercise 3

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Imported here so this cell is self-contained and runs on a fresh kernel.
from sklearn.model_selection import train_test_split

#1
# data
housing = fetch_california_housing()
X, y = housing['data'], housing['target']

# Build the train/test split inside this cell instead of relying on
# X_train / y_train left in the kernel by the Exercise 2 cell. The split
# parameters are the same as in Exercise 2, so the same samples are held out.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    shuffle=True,
                                                    random_state=43)

# Hyper-parameter grid: 3 x 3 = 9 candidate forests.
parameters = {'n_estimators': [20, 40, 60],
              'max_depth': [2, 7, 10]}

# Seed the forest so that repeated runs of the search give the same scores.
rf = RandomForestRegressor(random_state=43)

# 5-fold cross-validated search over the grid, using every available core.
# Scores are negated mean squared errors, so values closer to 0 are better.
gridsearch = GridSearchCV(rf,
                          parameters,
                          cv=5,
                          n_jobs=-1,
                          scoring='neg_mean_squared_error')

gridsearch.fit(X_train, y_train)

In [9]:
#2
# Print, in order: the best mean cross-validated score, the hyper-parameter
# combination that achieved it, and the full dictionary of per-candidate results.
for attribute in ('best_score_', 'best_params_', 'cv_results_'):
    print(getattr(gridsearch, attribute))

-0.2907671084843436
{'max_depth': 10, 'n_estimators': 40}
{'mean_fit_time': array([0.59473577, 1.14236765, 1.64283695, 1.66200109, 3.39975648,
       5.33733101, 2.43376188, 4.89586773, 6.49755025]), 'std_fit_time': array([0.05952681, 0.08721997, 0.07332778, 0.0277187 , 0.08860794,
       0.14292617, 0.08845411, 0.10033321, 0.80883501]), 'mean_score_time': array([0.00461836, 0.00736723, 0.01083469, 0.00795479, 0.01549487,
       0.02136774, 0.01232495, 0.0228581 , 0.02807684]), 'std_score_time': array([3.30651463e-04, 9.20711433e-05, 4.65594677e-04, 4.91309818e-04,
       2.25946851e-03, 5.59499220e-04, 2.25121765e-03, 3.56789423e-03,
       4.44917887e-03]), 'param_max_depth': masked_array(data=[2, 2, 2, 7, 7, 7, 10, 10, 10],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[20, 40, 60, 20, 40, 60, 20, 40, 60],
             mask=[False, False, 

In [10]:
#3
# Score the fitted search object on the held-out test split. The search was
# configured with scoring='neg_mean_squared_error', so the printed value is a
# negated mean squared error (closer to 0 is better).
test_score = gridsearch.score(X_test, y_test)
print(test_score)

-0.28259007168394673


# Exercise 4: Validation curve and Learning curve