<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Chapter-18---Introduction-to-Scikit-learn" data-toc-modified-id="Chapter-18---Introduction-to-Scikit-learn-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Chapter 18 - Introduction to Scikit-learn</a></span><ul class="toc-item"><li><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Load-Datasets" data-toc-modified-id="Load-Datasets-1.0.0.1"><span class="toc-item-num">1.0.0.1&nbsp;&nbsp;</span>Load Datasets</a></span></li><li><span><a href="#Preprocessing-the-Data" data-toc-modified-id="Preprocessing-the-Data-1.0.0.2"><span class="toc-item-num">1.0.0.2&nbsp;&nbsp;</span>Preprocessing the Data</a></span></li><li><span><a href="#Encoding-Categorical-Variables" data-toc-modified-id="Encoding-Categorical-Variables-1.0.0.3"><span class="toc-item-num">1.0.0.3&nbsp;&nbsp;</span>Encoding Categorical Variables</a></span></li><li><span><a href="#Imput-Missing-Data" data-toc-modified-id="Imput-Missing-Data-1.0.0.4"><span class="toc-item-num">1.0.0.4&nbsp;&nbsp;</span>Imput Missing Data</a></span></li><li><span><a href="#Generating-Higer-Order-Polynomial-Features" data-toc-modified-id="Generating-Higer-Order-Polynomial-Features-1.0.0.5"><span class="toc-item-num">1.0.0.5&nbsp;&nbsp;</span>Generating Higer Order Polynomial Features</a></span></li></ul></li></ul></li></ul></li><li><span><a href="#CH-24---ML-Techniques-by-Ekaba-Bisong" data-toc-modified-id="CH-24---ML-Techniques-by-Ekaba-Bisong-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>CH 24 - ML Techniques by Ekaba Bisong</a></span><ul class="toc-item"><li><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Statistical-tests-to-select-the-best--features-using-the-SelectKBest-module" data-toc-modified-id="Statistical-tests-to-select-the-best--features-using-the-SelectKBest-module-2.0.0.1"><span class="toc-item-num">2.0.0.1&nbsp;&nbsp;</span>Statistical tests to select the best  features using the SelectKBest module</a></span></li><li><span><a href="#Recursive-Feature-Elimination-(RFE)" data-toc-modified-id="Recursive-Feature-Elimination-(RFE)-2.0.0.2"><span class="toc-item-num">2.0.0.2&nbsp;&nbsp;</span>Recursive Feature Elimination (RFE)</a></span></li><li><span><a href="#Feature-Importances" data-toc-modified-id="Feature-Importances-2.0.0.3"><span class="toc-item-num">2.0.0.3&nbsp;&nbsp;</span>Feature Importances</a></span></li><li><span><a href="#Resampling-Methods" data-toc-modified-id="Resampling-Methods-2.0.0.4"><span class="toc-item-num">2.0.0.4&nbsp;&nbsp;</span>Resampling Methods</a></span></li><li><span><a href="#Model-evaluation" data-toc-modified-id="Model-evaluation-2.0.0.5"><span class="toc-item-num">2.0.0.5&nbsp;&nbsp;</span>Model evaluation</a></span></li><li><span><a href="#Pipelines:-Streamlining-Machine-Learning-Workflows" data-toc-modified-id="Pipelines:-Streamlining-Machine-Learning-Workflows-2.0.0.6"><span class="toc-item-num">2.0.0.6&nbsp;&nbsp;</span>Pipelines: Streamlining Machine Learning Workflows</a></span></li><li><span><a href="#Model-tuning" data-toc-modified-id="Model-tuning-2.0.0.7"><span class="toc-item-num">2.0.0.7&nbsp;&nbsp;</span>Model tuning</a></span></li></ul></li></ul></li></ul></li></ul></div>

In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [87]:
from sklearn import datasets
from sklearn import datasets as ds

In [88]:
from sklearn.feature_selection import SelectKBest, SelectFromModel, chi2, RFE

In [89]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,\
                            accuracy_score, log_loss, classification_report

In [90]:
from sklearn.model_selection import train_test_split, cross_val_score,\
                                    GridSearchCV, KFold, RandomizedSearchCV, LeaveOneOut

In [91]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier,GradientBoostingRegressor
from sklearn.svm import SVR, SVC

In [92]:
from sklearn.pipeline import make_pipeline, Pipeline, make_union

In [93]:
np.set_printoptions(3)

# Chapter 18 - Introduction to Scikit-learn
by Ekaba Bisong

#### Load Datasets

In [0]:
# load library
from sklearn import datasets
import numpy as np

In [0]:
# load iris
iris = datasets.load_iris()
iris.data.shape

(150, 4)

In [0]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [0]:
# import module
from sklearn.model_selection import train_test_split
# split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, shuffle=True)
X_train.shape


(112, 4)

In [0]:
X_test.shape

(38, 4)

In [0]:
y_train.shape

(112,)

In [0]:
y_test.shape

(38,)

#### Preprocessing the Data

##### Data Rescaling

In [0]:
# import packages
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler

# load dataset
data = datasets.load_iris()
# separate features and target
X = data.data
y = data.target

# print first 5 rows of X before rescaling
X[0:5,:]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [0]:
# rescale X
scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_X = scaler.fit_transform(X)

# print first 5 rows of X after rescaling
rescaled_X[0:5,:]

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667]])

##### Standardization

In [117]:
# import packages
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

# load dataset
data = datasets.load_iris()
# separate features and target
X = data.data
y = data.target

# print first 5 rows of X before standardization
X[0:5, :]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [118]:
# standardize X
scaler = StandardScaler().fit(X)
standardize_X = scaler.transform(X)

# print first 5 rows of X after standardization
standardize_X[0:5, :]

array([[-0.901,  1.019, -1.34 , -1.315],
       [-1.143, -0.132, -1.34 , -1.315],
       [-1.385,  0.328, -1.397, -1.315],
       [-1.507,  0.098, -1.283, -1.315],
       [-1.022,  1.249, -1.34 , -1.315]])

In [119]:
standardize_X.mean(axis=0)

array([-1.690e-15, -1.843e-15, -1.699e-15, -1.409e-15])

In [121]:
standardize_X.std(axis=0)

array([1., 1., 1., 1.])

##### Normalization

In [122]:
# import packages
from sklearn import datasets
from sklearn.preprocessing import Normalizer

# load dataset
data = datasets.load_iris()

# separate features and target
X = data.data
y = data.target

# print first 5 rows of X before normalization
X[0:5,:]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

- Normalize samples individually to unit norm.

- Each sample (i.e. each row of the data matrix) with at least one non zero component is rescaled independently of other samples so that its norm (l1, l2 or inf) equals one.

In [123]:
# normalize X
scaler = Normalizer().fit(X)
normalize_X = scaler.transform(X)

# print first 5 rows of X after normalization
normalize_X[0:5, :]

array([[0.804, 0.552, 0.221, 0.032],
       [0.828, 0.507, 0.237, 0.034],
       [0.805, 0.548, 0.223, 0.034],
       [0.8  , 0.539, 0.261, 0.035],
       [0.791, 0.569, 0.221, 0.032]])

In [130]:
np.linalg.norm(normalize_X, axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

##### Binarization

In [131]:
# import packages
from sklearn import datasets
from sklearn.preprocessing import Binarizer

# load dataset
data = datasets.load_iris()
# separate features and target
X = data.data
y = data.target

# print first 5 rows of X before binarization
X[0:5,:]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [132]:
# binarize X
scaler = Binarizer(threshold = 1.5).fit(X)
binarize_X = scaler.transform(X)

# print first 5 rows of X after binarization
binarize_X[0:5,:]

array([[1., 1., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 0., 0.]])

#### Encoding Categorical Variables

##### LabelEncoder

In [0]:
# import packages
from sklearn.preprocessing import LabelEncoder

# create dataset
data = np.array([[5,8,"calabar"],[9,3,"uyo"],[8,6,"owerri"],
                    [0,5,"uyo"],[2,3,"calabar"],[0,8,"calabar"],
                    [1,8,"owerri"]])
data

array([['5', '8', 'calabar'],
       ['9', '3', 'uyo'],
       ['8', '6', 'owerri'],
       ['0', '5', 'uyo'],
       ['2', '3', 'calabar'],
       ['0', '8', 'calabar'],
       ['1', '8', 'owerri']], dtype='<U21')

In [0]:
# separate features and target
X = data[:, :2]
y = data[:, -1]

In [0]:
# encode y
encoder = LabelEncoder()
encode_y = encoder.fit_transform(y)

In [0]:
# adjust dataset with encoded targets
data[:,-1] = encode_y
data

array([['5', '8', '0'],
       ['9', '3', '2'],
       ['8', '6', '1'],
       ['0', '5', '2'],
       ['2', '3', '0'],
       ['0', '8', '0'],
       ['1', '8', '1']], dtype='<U21')

##### OneHotEncoder

In [0]:
# import packages
from sklearn.preprocessing import OneHotEncoder

# create dataset
data = np.array([[5,"efik", 8,"calabar"],[9,"ibibio",3,"uyo"],[8,"igbo",6,"owerri"],
                    [0,"ibibio",5,"uyo"],[2,"efik",3,"calabar"],[0,"efik",8,"calabar"],
                    [1,"igbo",8,"owerri"]])

# separate features and target
X = data[:,:3]
y = data[:,-1]

# print the feature or design matrix X
X

array([['5', 'efik', '8'],
       ['9', 'ibibio', '3'],
       ['8', 'igbo', '6'],
       ['0', 'ibibio', '5'],
       ['2', 'efik', '3'],
       ['0', 'efik', '8'],
       ['1', 'igbo', '8']], dtype='<U21')

In [0]:
# one_hot_encode X
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
encode_categorical = X[:, 1].reshape(len(X[:,1]), 1)
one_hot_encode_X = one_hot_encoder.fit_transform(encode_categorical)

# print one_hot encoded matrix - use todense() to print sparse matrix
# or convert to array with toarray()
one_hot_encode_X.todense()

matrix([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [0., 0., 1.]])

In [0]:
# remove categorical label
X = np.delete(X, 1, axis=1)
# append encoded matrix
X = np.append(X, one_hot_encode_X.toarray(), axis=1)
X

array([['5', '8', '1.0', '0.0', '0.0'],
       ['9', '3', '0.0', '1.0', '0.0'],
       ['8', '6', '0.0', '0.0', '1.0'],
       ['0', '5', '0.0', '1.0', '0.0'],
       ['2', '3', '1.0', '0.0', '0.0'],
       ['0', '8', '1.0', '0.0', '0.0'],
       ['1', '8', '0.0', '0.0', '1.0']], dtype='<U32')

#### Imput Missing Data

In [0]:
# import packages
from sklearn.impute import SimpleImputer

# create dataset
data = np.array([[5,np.nan,8],[9,3,5],[8,6,4],
                 [np.nan,5,2],[2,3,9],[np.nan,8,7],
                 [1,np.nan,5]])
data

array([[ 5., nan,  8.],
       [ 9.,  3.,  5.],
       [ 8.,  6.,  4.],
       [nan,  5.,  2.],
       [ 2.,  3.,  9.],
       [nan,  8.,  7.],
       [ 1., nan,  5.]])

In [0]:
# impute missing values - axix=0: impute along columns
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit_transform(data)

array([[5., 5., 8.],
       [9., 3., 5.],
       [8., 6., 4.],
       [5., 5., 2.],
       [2., 3., 9.],
       [5., 8., 7.],
       [1., 5., 5.]])

#### Generating Higer Order Polynomial Features

In [0]:
# import packages
from sklearn.preprocessing import PolynomialFeatures

# create dataset
data = np.array([[5,8],[9,3],[8,6],
                 [5,2],[3,9],[8,7],
                 [1,5]])
data

array([[5, 8],
       [9, 3],
       [8, 6],
       [5, 2],
       [3, 9],
       [8, 7],
       [1, 5]])

In [0]:
# create polynomial features
polynomial_features = PolynomialFeatures(2)
data = polynomial_features.fit_transform(data)
data

array([[ 1.,  5.,  8., 25., 40., 64.],
       [ 1.,  9.,  3., 81., 27.,  9.],
       [ 1.,  8.,  6., 64., 48., 36.],
       [ 1.,  5.,  2., 25., 10.,  4.],
       [ 1.,  3.,  9.,  9., 27., 81.],
       [ 1.,  8.,  7., 64., 56., 49.],
       [ 1.,  1.,  5.,  1.,  5., 25.]])

# CH 24 - ML Techniques by Ekaba Bisong

#### Statistical tests to select the best  features using the SelectKBest module

In [0]:
# load dataset
data = datasets.load_iris()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# display first 5 rows
X[0:5,:]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [0]:
# feature engineering. Let's see the best 3 features by setting k = 3
kBest_chi = SelectKBest(score_func=chi2, k=3)
fit_test = kBest_chi.fit(X, y)

In [0]:
# print test scores
fit_test.scores_

array([ 10.81782088,   3.7107283 , 116.31261309,  67.0483602 ])

In [0]:
# we can transform the dataset to subset only the important features.
adjusted_features = fit_test.transform(X)
adjusted_features[0:5,:]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2],
       [4.6, 1.5, 0.2],
       [5. , 1.4, 0.2]])

#### Recursive Feature Elimination (RFE)

In [57]:
# load dataset
data = datasets.load_boston()

In [58]:
# separate features and target
X = data.data
y = data.target

In [59]:
# feature engineering
linear_reg = LinearRegression()
rfe = RFE(estimator=linear_reg, n_features_to_select=6)
rfe_fit = rfe.fit(X, y)

In [60]:
# print the feature ranking
rfe_fit.ranking_

array([3, 5, 4, 1, 1, 1, 8, 1, 2, 6, 1, 7, 1])

#### Feature Importances

In [0]:
# load dataset
data = datasets.load_iris()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# original data shape
X.shape

(150, 4)

In [0]:
# feature engineering
ada_boost_classifier = AdaBoostClassifier()
ada_boost_classifier.fit(X, y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [0]:
# print the feature importances
ada_boost_classifier.feature_importances_

array([0. , 0. , 0.5, 0.5])

In [0]:
# create a subset of data based on the relevant features
model = SelectFromModel(ada_boost_classifier, prefit=True)
new_data = model.transform(X)

In [0]:
# the irrelevant features have been removed
new_data.shape

(150, 2)

#### Resampling Methods

##### k-Fold cross validation

In [0]:
# load dataset
data = datasets.load_iris()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# initialize KFold - with shuffle = True, shuffle the data before splitting
kfold = KFold(n_splits=3, shuffle=True)

In [0]:
# create the model
knn_clf = KNeighborsClassifier(n_neighbors=3)

In [0]:
# fit the model using cross validation
cv_result = cross_val_score(knn_clf, X, y, cv=kfold)

In [0]:
# evaluate the model performance using accuracy metric
print("Accuracy: %.3f%% (%.3f%%)" % (cv_result.mean()*100.0, cv_result.std()*100.0))

Accuracy: 95.333% (0.943%)


##### Leave-One-Out cross validation (LOOCV)

In [0]:
# load dataset
data = datasets.load_iris()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# initialize LOOCV
loocv = LeaveOneOut()

In [0]:
# create the model
knn_clf = KNeighborsClassifier(n_neighbors=3)

In [0]:
# fit the model using cross validation
cv_result = cross_val_score(knn_clf, X, y, cv=loocv)

In [0]:
# evaluate the model performance using accuracy metric
print("Accuracy: %.3f%% (%.3f%%)" % (cv_result.mean()*100.0, cv_result.std()*100.0))

Accuracy: 96.000% (19.596%)


#### Model evaluation

##### Regression evaluation metrics

In [7]:
# load dataset
data = datasets.load_boston() # print(data.DESCR)

In [8]:
# separate features and target
X = data.data
y = data.target

In [9]:
# split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [10]:
# create the model
# setting normalize to true normalizes the dataset before fitting the model
linear_reg = LinearRegression(normalize = True)

In [11]:
# fit the model on the training set
linear_reg.fit(X_train, y_train)

LinearRegression(normalize=True)

In [12]:
# make predictions on the test set
predictions = linear_reg.predict(X_test)

In [13]:
# evaluate the model performance using mean square error metric
print("Mean squared error: %.2f" % mean_squared_error(y_test, predictions))

Mean squared error: 23.81


In [14]:
# evaluate the model performance using mean absolute error metric
print("Mean absolute error: %.2f" % mean_absolute_error(y_test, predictions))

Mean absolute error: 3.54


In [15]:
# evaluate the model performance using r-squared error metric
print("R-squared score: %.2f" % r2_score(y_test, predictions))

R-squared score: 0.76


##### Regression evaluation metrics implemented with cross validation

In [16]:
# load dataset
data = datasets.load_boston()

In [17]:
# separate features and target
X = data.data
y = data.target

In [18]:
# initialize KFold - with shuffle = True, shuffle the data before splitting
kfold = KFold(n_splits=3, shuffle=True)

In [19]:
# create the model
linear_reg = LinearRegression(normalize = True)

In [20]:
# fit the model using cross validation - score with Mean square error (MSE)
mse_cv_result = cross_val_score(linear_reg, X, y, cv=kfold, scoring="neg_mean_squared_error")

In [21]:
# print mse cross validation output
print("Negtive Mean squared error: %.3f%% (%.3f%%)" % (mse_cv_result.mean(), mse_cv_result.std()))

Negtive Mean squared error: -23.646% (4.658%)


In [22]:
# fit the model using cross validation - score with Mean absolute error (MAE)
mae_cv_result = cross_val_score(linear_reg, X, y, cv=kfold, scoring="neg_mean_absolute_error")

In [23]:
# print mse cross validation output
print("Negtive Mean absolute error: %.3f%% (%.3f%%)" % (mae_cv_result.mean(), mse_cv_result.std()))

Negtive Mean absolute error: -3.482% (4.658%)


In [24]:
# fit the model using cross validation - score with R-squared
r2_cv_result = cross_val_score(linear_reg, X, y, cv=kfold, scoring="r2")

In [25]:
# print mse cross validation output
print("R-squared score: %.3f%% (%.3f%%)" % (r2_cv_result.mean(), r2_cv_result.std()))

R-squared score: 0.712% (0.034%)


##### Classification evaluation metrics

In [17]:
# load dataset
data = datasets.load_iris()

In [18]:
# separate features and target
X = data.data
y = data.target

In [19]:
# split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [21]:
# create the model
logistic_reg = LogisticRegression()

NameError: name 'LogisticRegression' is not defined

In [31]:
# fit the model on the training set
logistic_reg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [32]:
# make predictions on the test set
predictions = logistic_reg.predict(X_test)

In [46]:
# evaluate the model performance using accuracy
print("Accuracy score: %.2f" % accuracy_score(y_test, predictions))

Accuracy score: 0.95


In [34]:
# evaluate the model performance using log loss

### output the probabilities of assigning an observation to a class
predictions_probabilities = logistic_reg.predict_proba(X_test)

In [35]:
print("Log-Loss likelihood: %.2f" % log_loss(y_test, predictions_probabilities))

Log-Loss likelihood: 0.20


In [36]:
# evaluate the model performance using classification report
print("Classification report: \n", classification_report(y_test, predictions, target_names=data.target_names))

Classification report: 
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         9
  versicolor       1.00      0.87      0.93        15
   virginica       0.88      1.00      0.93        14

    accuracy                           0.95        38
   macro avg       0.96      0.96      0.95        38
weighted avg       0.95      0.95      0.95        38



##### Classification evaluation metrics implemented with cross validation

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold

In [48]:
# load dataset
data = datasets.load_iris()

In [49]:
# separate features and target
X = data.data
y = data.target

In [50]:
# initialize KFold - with shuffle = True, shuffle the data before splitting
kfold = KFold(n_splits=3, shuffle=True)

In [51]:
# create the model
logistic_reg = LogisticRegression()

In [52]:
# fit the model using cross validation - score with accuracy
accuracy_cv_result = cross_val_score(logistic_reg, X, y, cv=kfold, scoring="accuracy")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [53]:
# print accuracy cross validation output
print("Accuracy: %.3f%% (%.3f%%)" % (accuracy_cv_result.mean(), accuracy_cv_result.std()))

Accuracy: 0.967% (0.025%)


In [54]:
# fit the model using cross validation - score with Log-Loss
logloss_cv_result = cross_val_score(logistic_reg, X, y, cv=kfold, scoring="neg_log_loss")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [55]:
# print mse cross validation output
print("Log-Loss likelihood: %.3f%% (%.3f%%)" % (logloss_cv_result.mean(), logloss_cv_result.std()))

Log-Loss likelihood: -0.148% (0.005%)


#### Pipelines: Streamlining Machine Learning Workflows

In [0]:
# load dataset
data = datasets.load_iris()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# create the pipeline
estimators = [
    ('standardize' , StandardScaler()),
    ('svc', SVC())
]

In [0]:
# build the pipeline model
pipe = Pipeline(estimators)

In [0]:
# run the pipeline
kfold = KFold(n_splits=3, shuffle=True)
cv_result = cross_val_score(pipe, X, y, cv=kfold)

In [0]:
# evaluate the model performance
print("Accuracy: %.3f%% (%.3f%%)" % (cv_result.mean()*100.0, cv_result.std()*100.0))

Accuracy: 94.667% (0.943%)


##### Pipelines using make_pipeline

In [0]:
# load dataset
data = datasets.load_boston()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# build the pipeline model
pipe = make_pipeline(
    PCA(n_components=9),
    RandomForestRegressor()
)

In [0]:
# run the pipeline
kfold = KFold(n_splits=4, shuffle=True)
cv_result = cross_val_score(pipe, X, y, cv=kfold)



In [0]:
# evaluate the model performance
print("Accuracy: %.3f%% (%.3f%%)" % (cv_result.mean()*100.0, cv_result.std()*100.0))

Accuracy: 71.706% (2.767%)


##### Pipelines using FeatureUnion

In [0]:
# load dataset
data = datasets.load_boston()

In [0]:
# separate features and target
X = data.data
y = data.target

In [0]:
# construct pipeline for feature engineering - make_union similar to make_pipeline
feature_engr = make_union(
    RFE(estimator=RandomForestRegressor(n_estimators=100), n_features_to_select=6),
    PCA(n_components=9)
)

In [0]:
# build the pipeline model
pipe = make_pipeline(
    feature_engr,
    GradientBoostingRegressor(n_estimators=100)
)

In [0]:
# run the pipeline
kfold = KFold(n_splits=4, shuffle=True)
cv_result = cross_val_score(pipe, X, y, cv=kfold)

In [0]:
# evaluate the model performance
print("Accuracy: %.3f%% (%.3f%%)" % (cv_result.mean()*100.0, cv_result.std()*100.0))

Accuracy: 88.088% (3.096%)


#### Model tuning

##### Grid Search

In [46]:
boston_houses = ds.load_boston()
# print(boston_houses['DESCR'])

In [35]:
df = pd.DataFrame(boston_houses['data'], columns=boston_houses['feature_names'])
df['MedianValue'] = boston_houses['target']*1000
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MedianValue
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24000.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21600.0
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34700.0
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33400.0
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22400.0
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20600.0
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23900.0
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22000.0


In [14]:
# separate features and target
X = boston_houses.data
y = boston_houses.target

In [47]:
# construct grid search parameters in a dictionary
parameters = {
    'n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
    'max_depth': [2, 4, 6, 8],
    'min_samples_leaf': [1,2,3,4,5]
    }

In [17]:
# create the model
rf = RandomForestRegressor()

In [18]:
# run the grid search
grid_search = GridSearchCV(estimator=rf, param_grid=parameters)

In [19]:
# fit the model
grid_search.fit(X,y)

GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'max_depth': [2, 4, 6, 8],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'n_estimators': [2, 4, 6, 8, 10, 12, 14, 16]})

In [20]:
# evaluate the model performance
print("Best Accuracy: %.3f%%" %  (grid_search.best_score_*100.0))

Best Accuracy: 63.311%


In [0]:
# best set of hyper-parameter values
print("Best n_estimators: %d \nBest max_depth: %d \nBest min_samples_leaf: %d " %  \
            (grid_search.best_estimator_.n_estimators, \
            grid_search.best_estimator_.max_depth, \
            grid_search.best_estimator_.min_samples_leaf))

Best n_estimators: 2 
Best max_depth: 8 
Best min_samples_leaf: 2 


##### Randomized Search

In [38]:
# load dataset
data = datasets.load_boston()

In [39]:
# separate features and target
X = data.data
y = data.target

In [40]:
# construct grid search parameters in a dictionary
parameters = {
    'n_estimators': [2, 4, 6, 8, 10, 12, 14, 16],
    'max_depth': [2, 4, 6, 8],
    'min_samples_leaf': [1,2,3,4,5]
}

In [41]:
# create the model
rf_model = RandomForestRegressor()

In [42]:
# run the grid search
randomized_search = RandomizedSearchCV(estimator=rf_model, param_distributions=parameters, n_iter=10)

In [43]:
# fit the model
randomized_search.fit(X,y)

RandomizedSearchCV(estimator=RandomForestRegressor(),
                   param_distributions={'max_depth': [2, 4, 6, 8],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'n_estimators': [2, 4, 6, 8, 10, 12, 14,
                                                         16]})

In [44]:
# evaluate the model performance
print("Best Accuracy: %.3f%%" %  (randomized_search.best_score_*100.0))

Best Accuracy: 57.369%


In [45]:
# best set of hyper-parameter values
print("Best n_estimators: %d \nBest max_depth: %d \nBest min_samples_leaf: %d " %  \
            (randomized_search.best_estimator_.n_estimators, \
            randomized_search.best_estimator_.max_depth, \
            randomized_search.best_estimator_.min_samples_leaf))

Best n_estimators: 8 
Best max_depth: 4 
Best min_samples_leaf: 3 
