# Chapter 24 - More Supervised Machine Learning Techniques with Scikit-learn
## Building Machine Learning and Deep Learning Models on Google Cloud Platform
### Ekaba Bisong

## Statistical tests to select the best  features using the SelectKBest module

In [0]:
# import packages
from sklearn import datasets
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [0]:
# load dataset
data = datasets.load_iris()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# display first 5 rows
X[0:5,:]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [0]:
# feature engineering. Let's see the best 3 features by setting k = 3
kBest_chi = SelectKBest(score_func=chi2, k=3)
fit_test = kBest_chi.fit(X, y)

In [0]:
# print test scores
fit_test.scores_

array([ 10.81782088,   3.7107283 , 116.31261309,  67.0483602 ])

In [0]:
# we can transform the dataset to subset only the important features.
adjusted_features = fit_test.transform(X)
adjusted_features[0:5,:]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2],
       [4.6, 1.5, 0.2],
       [5. , 1.4, 0.2]])

## Recursive Feature Elimination (RFE)

In [0]:
# import packages
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn import datasets

In [0]:
# load dataset
data = datasets.load_boston()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# feature engineering
linear_reg = LinearRegression()
rfe = RFE(estimator=linear_reg, n_features_to_select=6)
rfe_fit = rfe.fit(X, y)

In [0]:
# print the feature ranking
rfe_fit.ranking_

array([3, 5, 4, 1, 1, 1, 8, 1, 2, 6, 1, 7, 1])

## Feature Importances

In [0]:
# import packages
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn import datasets

In [0]:
# load dataset
data = datasets.load_iris()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# original data shape
X.shape

(150, 4)

In [0]:
# feature engineering
ada_boost_classifier = AdaBoostClassifier()
ada_boost_classifier.fit(X, y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [0]:
# print the feature importances
ada_boost_classifier.feature_importances_

array([0. , 0. , 0.5, 0.5])

In [0]:
# create a subset of data based on the relevant features
model = SelectFromModel(ada_boost_classifier, prefit=True)
new_data = model.transform(X)

In [0]:
# the irrelevant features have been removed
new_data.shape

(150, 2)

## Resampling Methods

### k-Fold cross validation

In [0]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [0]:
# load dataset
data = datasets.load_iris()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# initialize KFold - with shuffle = True, shuffle the data before splitting
kfold = KFold(n_splits=3, shuffle=True)

In [0]:
# create the model
knn_clf = KNeighborsClassifier(n_neighbors=3)

In [0]:
# fit the model using cross validation
cv_result = cross_val_score(knn_clf, X, y, cv=kfold)

In [0]:
# evaluate the model performance using accuracy metric
print("Accuracy: %.3f%% (%.3f%%)" % (cv_result.mean()*100.0, cv_result.std()*100.0))

Accuracy: 95.333% (0.943%)


### Leave-One-Out cross validation (LOOCV)

In [0]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [0]:
# load dataset
data = datasets.load_iris()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# initialize LOOCV
loocv = LeaveOneOut()

In [0]:
# create the model
knn_clf = KNeighborsClassifier(n_neighbors=3)

In [0]:
# fit the model using cross validation
cv_result = cross_val_score(knn_clf, X, y, cv=loocv)

In [0]:
# evaluate the model performance using accuracy metric
print("Accuracy: %.3f%% (%.3f%%)" % (cv_result.mean()*100.0, cv_result.std()*100.0))

Accuracy: 96.000% (19.596%)


## Model evaluation

### Regression evaluation metrics

In [0]:
# import packages
from sklearn.linear_model import LinearRegression
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [0]:
# load dataset
data = datasets.load_boston()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [0]:
# create the model
# setting normalize to true normalizes the dataset before fitting the model
linear_reg = LinearRegression(normalize = True)

In [0]:
# fit the model on the training set
linear_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [0]:
# make predictions on the test set
predictions = linear_reg.predict(X_test)

In [0]:
# evaluate the model performance using mean square error metric
print("Mean squared error: %.2f" % mean_squared_error(y_test, predictions))

Mean squared error: 28.52


In [0]:
# evaluate the model performance using mean absolute error metric
print("Mean absolute error: %.2f" % mean_absolute_error(y_test, predictions))

Mean absolute error: 3.61


In [0]:
# evaluate the model performance using r-squared error metric
print("R-squared score: %.2f" % r2_score(y_test, predictions))

R-squared score: 0.62


### Regression evaluation metrics implemented with cross validation

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [0]:
# load dataset
data = datasets.load_boston()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# initialize KFold - with shuffle = True, shuffle the data before splitting
kfold = KFold(n_splits=3, shuffle=True)

In [0]:
# create the model
linear_reg = LinearRegression(normalize = True)

In [0]:
# fit the model using cross validation - score with Mean square error (MSE)
mse_cv_result = cross_val_score(linear_reg, X, y, cv=kfold, scoring="neg_mean_squared_error")

In [0]:
# print mse cross validation output
print("Negtive Mean squared error: %.3f%% (%.3f%%)" % (mse_cv_result.mean(), mse_cv_result.std()))

Negtive Mean squared error: -22.875% (2.686%)


In [0]:
# fit the model using cross validation - score with Mean absolute error (MAE)
mae_cv_result = cross_val_score(linear_reg, X, y, cv=kfold, scoring="neg_mean_absolute_error")

In [0]:
# print mse cross validation output
print("Negtive Mean absolute error: %.3f%% (%.3f%%)" % (mae_cv_result.mean(), mse_cv_result.std()))

Negtive Mean absolute error: -3.451% (2.686%)


In [0]:
# fit the model using cross validation - score with R-squared
r2_cv_result = cross_val_score(linear_reg, X, y, cv=kfold, scoring="r2")

In [0]:
# print mse cross validation output
print("R-squared score: %.3f%% (%.3f%%)" % (r2_cv_result.mean(), r2_cv_result.std()))

R-squared score: 0.722% (0.028%)


### Classification evaluation metrics

In [0]:
# import packages
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report

In [0]:
# load dataset
data = datasets.load_iris()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [0]:
# create the model
logistic_reg = LogisticRegression()

In [0]:
# fit the model on the training set
logistic_reg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# make predictions on the test set
predictions = logistic_reg.predict(X_test)

In [0]:
# evaluate the model performance using accuracy
print("Accuracy score: %.2f" % accuracy_score(y_test, predictions))

Accuracy score: 0.97


In [0]:
# evaluate the model performance using log loss

### output the probabilities of assigning an observation to a class
predictions_probabilities = logistic_reg.predict_proba(X_test)

In [0]:
print("Log-Loss likelihood: %.2f" % log_loss(y_test, predictions_probabilities))

Log-Loss likelihood: 0.32


In [0]:
# evaluate the model performance using classification report
print("Classification report: \n", classification_report(y_test, predictions, target_names=data.target_names))

Classification report: 
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        15
  versicolor       1.00      0.92      0.96        12
   virginica       0.92      1.00      0.96        11

    accuracy                           0.97        38
   macro avg       0.97      0.97      0.97        38
weighted avg       0.98      0.97      0.97        38



### Classification evaluation metrics implemented with cross validation

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [0]:
# load dataset
data = datasets.load_iris()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# initialize KFold - with shuffle = True, shuffle the data before splitting
kfold = KFold(n_splits=3, shuffle=True)

In [0]:
# create the model
logistic_reg = LogisticRegression()

In [0]:
# fit the model using cross validation - score with accuracy
accuracy_cv_result = cross_val_score(logistic_reg, X, y, cv=kfold, scoring="accuracy")



In [0]:
# print accuracy cross validation output
print("Accuracy: %.3f%% (%.3f%%)" % (accuracy_cv_result.mean(), accuracy_cv_result.std()))

Accuracy: 0.953% (0.019%)


In [0]:
# fit the model using cross validation - score with Log-Loss
logloss_cv_result = cross_val_score(logistic_reg, X, y, cv=kfold, scoring="neg_log_loss")



In [0]:
# print mse cross validation output
print("Log-Loss likelihood: %.3f%% (%.3f%%)" % (logloss_cv_result.mean(), logloss_cv_result.std()))

Log-Loss likelihood: -0.346% (0.019%)


## Pipelines: Streamlining Machine Learning Workflows

In [0]:
# import packages
from sklearn.svm import SVC
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [0]:
# load dataset
data = datasets.load_iris()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# create the pipeline
estimators = [
    ('standardize' , StandardScaler()),
    ('svc', SVC())
]

In [0]:
# build the pipeline model
pipe = Pipeline(estimators)

In [0]:
# run the pipeline
kfold = KFold(n_splits=3, shuffle=True)
cv_result = cross_val_score(pipe, X, y, cv=kfold)

In [0]:
# evaluate the model performance
print("Accuracy: %.3f%% (%.3f%%)" % (cv_result.mean()*100.0, cv_result.std()*100.0))

Accuracy: 94.667% (0.943%)


### Pipelines using make_pipeline

In [0]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

In [0]:
# load dataset
data = datasets.load_boston()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# build the pipeline model
pipe = make_pipeline(
    PCA(n_components=9),
    RandomForestRegressor()
)

In [0]:
# run the pipeline
kfold = KFold(n_splits=4, shuffle=True)
cv_result = cross_val_score(pipe, X, y, cv=kfold)



In [0]:
# evaluate the model performance
print("Accuracy: %.3f%% (%.3f%%)" % (cv_result.mean()*100.0, cv_result.std()*100.0))

Accuracy: 71.706% (2.767%)


### Pipelines using FeatureUnion

In [0]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union

In [0]:
# load dataset
data = datasets.load_boston()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# construct pipeline for feature engineering - make_union similar to make_pipeline
feature_engr = make_union(
    RFE(estimator=RandomForestRegressor(n_estimators=100), n_features_to_select=6),
    PCA(n_components=9)
)

In [0]:
# build the pipeline model
pipe = make_pipeline(
    feature_engr,
    GradientBoostingRegressor(n_estimators=100)
)

In [0]:
# run the pipeline
kfold = KFold(n_splits=4, shuffle=True)
cv_result = cross_val_score(pipe, X, y, cv=kfold)

In [0]:
# evaluate the model performance
print("Accuracy: %.3f%% (%.3f%%)" % (cv_result.mean()*100.0, cv_result.std()*100.0))

Accuracy: 88.088% (3.096%)


## Model tuning

### Grid Search

In [0]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import datasets

In [0]:
# load dataset
data = datasets.load_boston()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# construct grid search parameters in a dictionary
parameters = {
    'n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
    'max_depth': [2, 4, 6, 8],
    'min_samples_leaf': [1,2,3,4,5]
    }

In [0]:
# create the model
rf_model = RandomForestRegressor()

In [0]:
# run the grid search
grid_search = GridSearchCV(estimator=rf_model, param_grid=parameters)

In [0]:
# fit the model
grid_search.fit(X,y)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [2, 4, 6, 8],

In [0]:
# evaluate the model performance
print("Best Accuracy: %.3f%%" %  (grid_search.best_score_*100.0))

Best Accuracy: 58.510%


In [0]:
# best set of hyper-parameter values
print("Best n_estimators: %d \nBest max_depth: %d \nBest min_samples_leaf: %d " %  \
            (grid_search.best_estimator_.n_estimators, \
            grid_search.best_estimator_.max_depth, \
            grid_search.best_estimator_.min_samples_leaf))

Best n_estimators: 2 
Best max_depth: 8 
Best min_samples_leaf: 2 


### Randomized Search

In [0]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import datasets

In [0]:
# load dataset
data = datasets.load_boston()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# construct grid search parameters in a dictionary
parameters = {
    'n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
    'max_depth': [2, 4, 6, 8],
    'min_samples_leaf': [1,2,3,4,5]
    }

In [0]:
# create the model
rf_model = RandomForestRegressor()

In [0]:
# run the grid search
randomized_search = RandomizedSearchCV(estimator=rf_model, param_distributions=parameters, n_iter=10)

In [0]:
# fit the model
randomized_search.fit(X,y)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_st

In [0]:
# evaluate the model performance
print("Best Accuracy: %.3f%%" %  (randomized_search.best_score_*100.0))

Best Accuracy: 57.770%


In [0]:
# best set of hyper-parameter values
print("Best n_estimators: %d \nBest max_depth: %d \nBest min_samples_leaf: %d " %  \
            (randomized_search.best_estimator_.n_estimators, \
            randomized_search.best_estimator_.max_depth, \
            randomized_search.best_estimator_.min_samples_leaf))

Best n_estimators: 14 
Best max_depth: 6 
Best min_samples_leaf: 3 
