## Phase 1: Script Configuration 

### 1.1. Loading User-defined libraries developed by Author 

In [66]:
import numpy             as np
import pandas            as pd
import sklearn           as sk
import statistics        as st
import pprint            as pp
import math              as mth
import seaborn           as sns
import matplotlib        as mpl
import os                as os
import matplotlib.pyplot as plt
import atexit
import functools 
import importlib

from time                  import clock

from sklearn.datasets      import load_digits
from sklearn.datasets      import fetch_olivetti_faces

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import minmax_scale

from sklearn.linear_model  import LinearRegression
from sklearn.linear_model  import LogisticRegression
from sklearn.linear_model  import LogisticRegressionCV
from sklearn.linear_model  import Ridge
from sklearn.linear_model  import Lasso
from sklearn.linear_model  import SGDClassifier

from sklearn.svm           import SVC

from sklearn.ensemble      import RandomForestClassifier

from sklearn.decomposition import PCA
from sklearn.decomposition import NMF

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis    as LDA 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

from sklearn.model_selection import KFold 
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import cross_validate 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict

from sklearn.metrics         import mean_squared_error
from sklearn.metrics         import mean_absolute_error
from sklearn.metrics         import confusion_matrix
from sklearn.metrics         import accuracy_score
from sklearn.metrics         import recall_score
from sklearn.metrics         import precision_score
from sklearn.metrics         import roc_curve
from sklearn.metrics         import f1_score
from sklearn.metrics         import roc_auc_score
from sklearn.metrics         import classification_report

from tensorflow                    import keras
from tensorflow.keras.datasets     import cifar10
from tensorflow.keras              import Sequential

from tensorflow.keras.layers       import Dense
from tensorflow.keras.layers       import Conv2D
from tensorflow.keras.layers       import Flatten
from tensorflow.keras.layers       import MaxPooling2D
from tensorflow.keras.utils        import to_categorical

%matplotlib inline

### 1.2. Loading user-defined libraries for different pre-processing and model build functions

In [67]:
# ----------------------------------------------------------------------
# Loading User-defined libraries developed by Author
# ----------------------------------------------------------------------
import timing
import common
importlib.reload(timing)
importlib.reload(common)

1:52:24.467 - Start Program
None
Library COMMON loaded.


<module 'common' from 'C:\\Users\\HP\\Google Drive\\Notebooks\\Python\\Statistical Learning\\Innovations\\common.py'>

## Phase 2: Data Acquisition

### 2.1 Load Original Datasets

In [68]:
# ----------------------------------------------------------------------
# Fetching the Cifar Data and displaying the shape
# ----------------------------------------------------------------------
(X_CFR_train, y_CFR_train), (X_CFR_test, y_CFR_test) = cifar10.load_data()
X_CFR_train.shape, y_CFR_train.shape, X_CFR_test.shape, y_CFR_test.shape

((50000, 32, 32, 3), (50000, 1), (10000, 32, 32, 3), (10000, 1))

### 2.2 Creating the Subsets including dogs and horses

In [69]:
# ----------------------------------------------------------------------
# Retaining the images of dogs and horses only in Training and Test DS
# ----------------------------------------------------------------------
img_train_index = np.where((y_CFR_train == 5) | (y_CFR_train == 7))[0]
img_test_index  = np.where((y_CFR_test  == 5) | (y_CFR_test  == 7))[0]

X_CFR_train = X_CFR_train[img_train_index]
y_CFR_train = y_CFR_train[img_train_index]

X_CFR_test  = X_CFR_test [img_test_index]
y_CFR_test  = y_CFR_test [img_test_index]

### 2.3 Replacing Labels in Training and Test with boolean values

In [70]:
# ----------------------------------------------------------------------
# Replacing the response values in the Displaying the final shape
# ----------------------------------------------------------------------
y_CFR_train = np.where(y_CFR_train == 7, 1, 0) 
y_CFR_test  = np.where(y_CFR_test == 7, 1, 0) 

### 2.4 Splitting the Data Subsets into training, validation and test datasets

In [71]:
# -----------------------------------------------------------------------------------
# Splitting the X_Train and Y_Train by 80/20 split to create validation dataset
# -----------------------------------------------------------------------------------
X_CFR_train, X_CFR_valid, y_CFR_train, y_CFR_valid = train_test_split(X_CFR_train, 
                                                                      y_CFR_train, 
                                                                      test_size    = .2,  
                                                                      stratify     = y_CFR_train, 
                                                                      random_state = 1)

# ----------------------------------------------------------------------
# Displaying the final shape
# ----------------------------------------------------------------------
X_CFR_train.shape, y_CFR_train.shape, X_CFR_valid.shape, y_CFR_valid.shape, X_CFR_test.shape, y_CFR_test.shape

((8000, 32, 32, 3),
 (8000, 1),
 (2000, 32, 32, 3),
 (2000, 1),
 (2000, 32, 32, 3),
 (2000, 1))

## Phase 3: Pre-processing for model build exercise

### 3.1. Refreshing the data-sets from master data subset for dogs and horses

In [72]:
# --------------------------------------------------------------------
# Reloading the Training and Test Data sets from the master dataset
# --------------------------------------------------------------------
X_train = X_CFR_train
y_train = y_CFR_train

X_valid = X_CFR_valid
y_valid = y_CFR_valid

X_test  = X_CFR_test
y_test  = y_CFR_test

# --------------------------------------------------------------------
# Showing the current shapes
# --------------------------------------------------------------------
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

((8000, 32, 32, 3),
 (8000, 1),
 (2000, 32, 32, 3),
 (2000, 1),
 (2000, 32, 32, 3),
 (2000, 1))

### 3.2. Flattening the feature training, validation and test datasets

In [73]:
#------------------------------------------------------------------------
# New Modular Code for doing Random Forest Classification for images
#------------------------------------------------------------------------
X_train = common.f_flatten_img_ds(X_train)
X_valid = common.f_flatten_img_ds(X_valid)
X_test  = common.f_flatten_img_ds(X_test)

y_train = y_train.reshape(8000)
y_valid = y_valid.reshape(2000)
y_test  = y_test.reshape(2000)

# --------------------------------------------------------------------
# Showing the current shapes
# --------------------------------------------------------------------
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape 

((8000, 3072), (8000,), (2000, 3072), (2000,), (2000, 3072), (2000,))

In [74]:
#--------------------------------------------------------------------------------
# Function for building PCA Components for all X training, validation and 
# test datasets
#--------------------------------------------------------------------------------
def f_build_PC_model(arg_X_train, arg_X_valid, arg_X_test, arg_DR_type, arg_exp_var):

    #Build a default PCA model
    if arg_DR_type == 'PCA':
        dr_model = PCA()
    elif arg_DR_type == 'NMF':
        dr_model = NMF()
    else:
        None
        
        
    dr_model.fit_transform(arg_X_train)

    # Calculating optimal k to have x% (say) variance 
    k = 0
    total = sum(dr_model.explained_variance_)
    current_sum = 0

    while(current_sum / total < arg_exp_var):
        current_sum += dr_model.explained_variance_[k]
        k += 1

    ## Applying PCA with k calculated above
    dr_model2 = PCA(n_components = k, whiten = True)

    X_train_pca = dr_model2.fit_transform(arg_X_train)
    X_valid_pca = dr_model2.transform(arg_X_valid)
    X_test_pca  = dr_model2.transform(arg_X_test)
    
    return (X_train_pca, X_valid_pca, X_test_pca, k)
# --------------------- END OF FUNCTION --------------------------	


### 3.3. Dimensional Reduction using PCA with 95% variance

In [75]:
#------------------------------------------------------------------------
# Creating the PC Factors with 95% variance using PCA
#------------------------------------------------------------------------
X_train_PCA, X_valid_PCA, X_test_PCA, PCA_Factors = f_build_PC_model(X_train, X_valid, X_test, 'PCA', 0.99)
print('PC Components = ', PCA_Factors)

PC Components =  598


### 3.4. Dimensional Reduction using NMF with 95% variance

In [76]:
#------------------------------------------------------------------------
# Creating the PC Factors with 95% variance using NMF
#------------------------------------------------------------------------
#X_train_NMF, X_valid_NMF, X_test_NMF, NMF_Factors = f_build_PC_model(X_train, X_valid, X_test, 'NMF', 0.95)
#print('NMF Components = ', NMF_Factors)

## Phase 4: Model Building

In [14]:
#------------------------------------------------------------------------
# Function for generating Decision Tree based Model
#------------------------------------------------------------------------
def f_build_RF_CV(arg_X_train, arg_y_train, 
                  arg_X_valid, arg_y_valid, 
                  arg_X_test, arg_y_test,
                  arg_model_type, arg_random_state, arg_fold):

    #------------------------------------------------------------------------
    # Initiating the Random Forest Model
    #------------------------------------------------------------------------
    print('stage 0 completed.')

    if arg_model_type == 'DecisionTree':
        None
    elif arg_model_type == 'RF':
        model = RandomForestClassifier(random_state = arg_random_state)
    elif arg_model_type == 'GBD':
        model = GradientBoostingClassifier()
    elif arg_model_type == 'SGD':
        model = SGDClassifier(random_state = arg_random_state)
    else:
        None
    print('stage 1 completed.')
    #------------------------------------------------------------------------
    # Fitting the Decision Tree Type Model
    #------------------------------------------------------------------------
    model.fit(arg_X_train, arg_y_train)    
    print('stage 2 completed.')
    
    #------------------------------------------------------------------------
    # Doing the Cross-validation on Valdation dataset
    #------------------------------------------------------------------------
    cv_val = cross_validate(model, arg_X_valid, arg_y_valid, cv = arg_fold)
    print('stage 3 completed.')
   
    #------------------------------------------------------------------------
    # Doing Predictions on Test dataset
    #------------------------------------------------------------------------
    y_test_pred = cross_val_predict(model, arg_X_test, arg_y_test, 
                                    cv = arg_fold, method = "predict")
    print('stage 4 completed.')

    #------------------------------------------------------------------------
    # Creating a dictionary to store model metrics
    #------------------------------------------------------------------------
    model_metrics = {}

    #------------------------------------------------------------------------
    # Collecting metrics
    #------------------------------------------------------------------------
    model_accuracy      = accuracy_score (arg_y_test, y_test_pred)
    #model_recall        = recall_score   (arg_y_test, y_test_pred)
    #model_precision     = precision_score(arg_y_test, y_test_pred)
    #model_roc_score     = roc_curve      (arg_y_test, y_test_pred)
    #model_f1_score      = f1_score       (arg_y_test, y_test_pred)
    
    #------------------------------------------------------------------------
    # Storing metrics in the dictionary
    #------------------------------------------------------------------------
    model_metrics['accuracy_score']  = model_accuracy
    #model_metrics['recall_score']    = model_recall
    #model_metrics['precision_score'] = model_precision
    #model_metrics['roc_score']       = model_roc_score
    #model_metrics['f1_score']        = model_f1_score

    print('Ready to return.')
    return model_metrics
# --------------------- END OF FUNCTION --------------------------	


### 4.1 RandomForest 5-fold cross-validation based model

In [15]:
# ----------------------------------------------------------------------
# Displaying the final shape
# ----------------------------------------------------------------------
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

((8000, 3072), (8000, 1), (2000, 3072), (2000, 1), (2000, 3072), (2000, 1))

In [57]:
y_train = y_train.ravel()
y_valid = y_valid.ravel()
y_test = y_test.ravel()

In [16]:
#------------------------------------------------------------------------
# Random Forest Model with 5 fold cross-validation
#------------------------------------------------------------------------
#from common import f_build_RF_CV

model1 = f_build_RF_CV(X_train, y_train, X_valid, y_valid, X_test, y_test,
                              'RF', 42, 5)
model1

stage 0 completed.
stage 1 completed.




stage 2 completed.


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


stage 3 completed.


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


stage 4 completed.
Ready to return.


{'accuracy_score': 0.7555}

###  4.2 Gradient Boosting Decision Trees 5-fold cross-validation based model

In [None]:
#--------------------------------------------------------------------------------
# Gradient Boosting Decision Tree Model with 5 fold cross-validation
#--------------------------------------------------------------------------------
model2 = common.f_build_RF_CV(X_train, y_train, X_valid, y_valid, X_test, y_test,
                              'GBD', 42, 5)
model2

### 4.3 Gradient Descent Decision Trees 5-fold cross-validation based model

In [None]:
#--------------------------------------------------------------------------------
# Stoichastic Gradient Descent Decision Tree Model with 5 fold cross-validation
#--------------------------------------------------------------------------------
model3 = common.f_build_RF_CV(X_train, y_train, X_valid, y_valid, X_test, y_test,
                              'SGD', 42, 5)
model3

### 4.4 RandomForest 5-fold cross-validation based model using PCA factors

In [None]:
#------------------------------------------------------------------------
# Random Forest Model with 5 fold cross-validation
#------------------------------------------------------------------------
model4 = f_build_RF_CV(X_train_PCA, y_train, X_valid_PCA, y_valid, X_test_PCA, y_test,
                              'RF', 42, 5)
model4

###  4.5 Gradient Boosting Decision Trees 5-fold cross-validation based model using PCA factors

In [None]:
model5 = common.f_build_RF_CV(X_train_PCA, y_train, X_valid_PCA, y_valid, X_test_PCA, y_test,
                              'GBD', 42, 5)
model5

### 4.6 Gradient Descent Decision Trees 5-fold cross-validation based model using PCA factors

In [None]:
#--------------------------------------------------------------------------------
# Stoichastic Gradient Descent Decision Tree Model with 5 fold cross-validation
#--------------------------------------------------------------------------------
model6 = common.f_build_RF_CV(X_train_PCA, y_train, X_valid_PCA, y_valid, X_test_PCA, y_test,
                              'SGD', 42, 5)
model6

### 4.7. Support Vector based model

In [68]:
#--------------------------------------------------------------------------------
# Function for building PCA Components for all X training, validation and 
# test datasets
#--------------------------------------------------------------------------------
def f_build_SVM_model(arg_X_train, arg_y_train, arg_X_valid, arg_y_valid, arg_X_test, arg_y_test,
                      arg_model_type, arg_kernel, arg_gamma, arg_C_vals, arg_cv, arg_metric):

    # -----------------------------------------------------------------------
    # Running a loop over the model dictionary
    # -----------------------------------------------------------------------
    print('MODEL: ', arg_model_type)

    # -----------------------------------------------------------------------
    # Applying GridSearch approach only for SVC and SVM
    # -----------------------------------------------------------------------
    if arg_model_type == 'SVC': 
        mdl = SVC(kernel = arg_kernel, probability = True)
        gs = GridSearchCV(mdl, param_grid = {'C':arg_C_vals}, cv = arg_cv, scoring = arg_metric).fit(X_train, y_train)    
        print('For Model ', arg_model_type,  ', Best C Value is : ', gs.best_params_)
    elif arg_model_type == 'SVM': 
        mdl = SVC(kernel = arg_kernel, gamma = arg_gamma, probability = True).fit(X_train, y_train)
        gs = GridSearchCV(mdl, param_grid = {'C':arg_C_vals}, cv = arg_cv, scoring = arg_metric).fit(X_train, y_train)    
        print('For Model ', arg_model_type,  ', Best C Value is : ', gs.best_params_)
    elif arg_model_type == 'LDA': 
        mdl = LDA()
        gs = mdl.fit(arg_X_train, arg_y_train)
    elif arg_model_type == 'QDA': 
        mdl = QDA()
        gs = mdl.fit(arg_X_train, arg_y_train)
    else:
        None    

    # -----------------------------------------------------------------------
    # Doing Training and Test Data Set Predictions
    # -----------------------------------------------------------------------
    #preds_train_ds = gs.predict(X_train)
    #probs_train_ds = gs.predict_proba(X_train)
    y_test_pred  = gs.predict(arg_X_test)
    y_test_probs = gs.predict_proba(arg_X_test)

    #------------------------------------------------------------------------
    # Creating a dictionary to store model metrics
    #------------------------------------------------------------------------
    model_metrics = {}

    #------------------------------------------------------------------------
    # Collecting metrics
    #------------------------------------------------------------------------
    model_accuracy      = accuracy_score (arg_y_test, y_test_pred)
    model_recall        = recall_score   (arg_y_test, y_test_pred)
    model_precision     = precision_score(arg_y_test, y_test_pred)
    model_roc_score     = roc_curve      (arg_y_test, y_test_pred)
    model_f1_score      = f1_score       (arg_y_test, y_test_pred)
    
    #------------------------------------------------------------------------
    # Storing metrics in the dictionary
    #------------------------------------------------------------------------
    model_metrics['accuracy_score']  = model_accuracy
    model_metrics['recall_score']    = model_recall
    model_metrics['precision_score'] = model_precision
    model_metrics['roc_score']       = model_roc_score
    model_metrics['f1_score']        = model_f1_score
    
    return model_metrics


In [None]:
# -----------------------------------------------------------------------
# Setting up the range of tuning parameter from 0.1 and 10
# -----------------------------------------------------------------------
c_vals = list(np.arange(0.1, 2.1, 1.0))

# -----------------------------------------------------------------------
# Setting up Model Dictionary for SVC, SVM, LDA and QDA Models
# -----------------------------------------------------------------------
model7 = f_build_SVM_model(X_train, y_train, X_valid, y_valid, X_test, y_test,
                      'SVM', 'rbf', 'scale', c_vals, 4, 'accuracy')

model7

### 4.8. Feed-forward Neural Network and Convolutional Neural Network based Image Classifications

In [77]:
# --------------------------------------------------------------------
# Reloading the Training and Test Data sets from the master dataset
# --------------------------------------------------------------------
X_train = X_CFR_train
y_train = y_CFR_train
X_valid = X_CFR_valid
y_valid = y_CFR_valid
X_test  = X_CFR_test
y_test  = y_CFR_test

In [78]:
# --------------------------------------------------------------------
# Displaying the shapes
# --------------------------------------------------------------------
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

((8000, 32, 32, 3),
 (8000, 1),
 (2000, 32, 32, 3),
 (2000, 1),
 (2000, 32, 32, 3),
 (2000, 1))

In [79]:
# --------------------------------------------------------------------
# Transforming training and test response values
# --------------------------------------------------------------------
print('Example Y variable before transformation:', y_train[0])
y_train = to_categorical(y_train)
y_valid = to_categorical(y_valid)
y_test  = to_categorical(y_test)

print('Example Y variable after transformation:', y_train[0])

Example Y variable before transformation: [1]
Example Y variable after transformation: [0. 1.]


In [80]:
# --------------------------------------------------------------------
# Displaying the shapes
# --------------------------------------------------------------------
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

((8000, 32, 32, 3),
 (8000, 2),
 (2000, 32, 32, 3),
 (2000, 2),
 (2000, 32, 32, 3),
 (2000, 2))

In [87]:
from common import f_build_CNN
importlib.reload(common)

arg_conv_layers = [[32, (3,3), 1, 'relu', 'same', (32,32,3)],
                   [20, (3,3), 1, 'relu', 'same', (32,32,3)]]

arg_conv_layers = [[10, (3,3), 1, 'relu', 'same', (32,32,3)]]
arg_pool_layers = [[(2,2), 2]]
arg_out_layers = [[2, 'relu'], [2, 'softmax']]
arg_compile_parms = [['adam', 'categorical_crossentropy', ['accuracy']]] 
arg_fit_parms = [[30, 30]]

model1 = f_build_CNN(X_train, y_train, X_test, y_test, 
                arg_conv_layers, 
                arg_pool_layers, 
                arg_out_layers,
                arg_compile_parms,
                arg_fit_parms)

Library COMMON loaded.
Current CONV Item :  [10, (3, 3), 1, 'relu', 'same', (32, 32, 3)]
Current POOL Item :  [(2, 2), 2]
Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 32, 32, 10)        280       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 16, 16, 10)        0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 2560)              0         
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 5122      
_________________________________________________________________
dense_12 (Dense)             (None, 2)                 6         
Total params: 5,408
Trainable params: 5,408
Non-trainable params: 0
_____________________________________________________________

In [None]:
f_CNN_feature_maps('CNN', X_train, y_train, X_test, y_test, 10):


In [64]:
from common import f_build_ANN

arg_in_layers     = [[8, 'relu', (32, 32, 3)], 
                     [16, 'relu', (32, 32, 3)], 
                     [32, 'relu', (32, 32, 3)], 
                     [64, 'relu', (32, 32, 3)]]
arg_out_layers    = [[2, 'softmax']]
arg_compile_parms = [['adam', 'categorical_crossentropy', ['accuracy']]] 
arg_fit_parms = [[20, 20]]
#arg_in_layers.reverse()
#arg_in_layers
model1 = f_build_ANN(X_train, y_train, X_test, y_test, 
                     arg_in_layers, arg_out_layers, arg_compile_parms, arg_fit_parms)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 32, 32, 8)         32        
_________________________________________________________________
dense_7 (Dense)              (None, 32, 32, 16)        144       
_________________________________________________________________
dense_8 (Dense)              (None, 32, 32, 32)        544       
_________________________________________________________________
dense_9 (Dense)              (None, 32, 32, 64)        2112      
_________________________________________________________________
flatten_2 (Flatten)          (None, 65536)             0         
_________________________________________________________________
dense_10 (Dense)             (None, 2)                 131074    
Total params: 133,906
Trainable params: 133,906
Non-trainable params: 0
________________________________________________