# Student Template Experiments 7: Feature Engineering and Model Comparison

# Imports

In [1]:
# Imports
import numpy as np
import pandas as pd
import graphviz
import mglearn as mglearn
import mglearn.plots
import mglearn.datasets
import matplotlib.pyplot as plt
# ensures plots are inlined for the notebook presentation
%matplotlib inline 
import scipy as sp
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, ward
import time
from IPython.display import display
import seaborn as sns
import os
import tensorflow as tf

In [2]:
# Sci-Kit Imports
import sklearn as sklearn
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import silhouette_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, classification_report
# from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.linear_model import Ridge
# from sklearn.linear_model import Lasso
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
# from sklearn.svm import LinearSVC
# from sklearn.svm import SVC
# from sklearn.neural_network import MLPClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# from sklearn.decomposition import NMF
# from sklearn.manifold import TSNE
# from sklearn.cluster import KMeans
# from sklearn.cluster import AgglomerativeClustering
# from sklearn.cluster import DBSCAN

# import sklearn.datasets as datasets
# from sklearn.datasets import make_blobs
# from sklearn.datasets import load_breast_cancer
# from sklearn.datasets import fetch_lfw_people
# from sklearn.datasets import load_digits
# from sklearn.datasets import make_moons
# from sklearn.datasets import make_circles
from sklearn.datasets import load_wine

In [3]:
# Mute warnings
import warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
# warnings.simplefilter(action='ignore', category=WARNING)

# Preprocess Wine data and transform

In [4]:
# Load Wine data set, and describe
# from sklearn.datasets import load_wine
# wine = load_wine(return_X_y=False)

# print(wine['DESCR'])

# review key, value
# for key,value in wine.items():
#     print(key,'\n',value,'\n')

In [5]:
# Create Pandas DataFrame


In [6]:
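# Plot the alcohol distribution for each Target class of the wine df with seaborn; the three
# classes should appear as relatively separate distributions
# (sns.distplot is deprecated since seaborn 0.11 -- prefer sns.histplot or sns.kdeplot)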


In [7]:
# Plot Correlation Heatmap


# NOTE: total_phenols and flavanoids are highly correlated (about 0.8 or above)
#       on our correlation heatmap

# Random Forest Classifier model 1

In [8]:
# Random Forest with no feature selection to remove unnecessary variables
# Create Train Test Split


# fit X_train and y_train


# score y and print


# NOTE: Basic RFC has a ~93% accuracy score (your results may vary)

### Feature selection (drop feature) with multicollinearity 

In [9]:
# Dropping highly correlated total_phenols (vs. flavanoids) for optimized analysis


In [10]:
# Random Forest Model with feature selection
# X = wine_df with the Target column dropped (so we can use the 'engineered' dataframe)


# y = the Target column selected from wine_df



# Train Test Split


# score accuracy


# NOTE: Random Forest model 1 with Feature Selection to remove multicollinearity improves score to 96%

# Feature Extraction via PCA on Random Forest model 1

In [11]:
# Run PCA as feature extraction: the reduced components are new features that need not match any original feature
# Run RandomForestClassifier 'as is'
# make pipeline with PCA and RandomForestClassifier


# accuracy score


# score the PCA + RFC pipeline with .score (for a classifier this returns mean accuracy, not R^2)


# NOTE: Prediction accuracy with PCA using slightly fewer than 13 components remains the same as the base model,
#      since the RFC also mitigates multicollinearity as part of its process; accuracy remains ~96%

# Random Forest model 2 with hyperparameter tuning

In [12]:
# Run RandomForestClassifier with additional hyperparameter tuning


# Fit to training data for X and y

# Predict y by applying fitted RFC to X test features


# Score y test holdout against y predictions


# Score train and tests for comparison as well


# NOTE: Prediction accuracy improves with additional hyperparameter tuning on RFC model
#       increasing prediction accuracy to ~98.15%

# Random Forest model 3 - with additional hyperparameter tuning examples

In [13]:
#///////////////// Multiple RFCs with Tuning, but no improvement //////////////////////
# Run RandomForestClassifier with additional hyperparameter tuning ~96%
# rf3 = RandomForestClassifier(criterion='gini', n_estimators=288, n_jobs=2, 
#                             random_state=321, max_depth=5, bootstrap=True)

# Run RandomForestClassifier with additional hyperparameter tuning ~96%
# rf4 = RandomForestClassifier(criterion='entropy', n_estimators=288, n_jobs=2, 
#                             random_state=321, max_depth=5, bootstrap=True, verbose=2)

# Run RandomForestClassifier with additional hyperparameter tuning ~94%
# rf5 = RandomForestClassifier (bootstrap=True, class_weight=None, criterion='gini',
#             max_depth=2, max_features='auto', max_leaf_nodes=None,
#             min_impurity_decrease=0.0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
#             oob_score=False, random_state=0, verbose=0, warm_start=False)
#///////////////////////////////////////////////////////////////////////////////////////


# Simplest model, but no further improvement over ~98.15%

# Create Random Forest model

# Fit to training data for X and y


# Predict y by applying fitted RFC to X test features


# Score y test holdout against y predictions


# Score train and tests for additional evaluation


# NOTE: Prediction accuracy does not improve with additional hyperparameter tuning on the RFC model,
#      remaining at ~98.15%

# Apply Kfold and StratifiedKFold Cross-Validations to RF model 6

In [14]:
# from package.module import class
# module: model_selection, class: StratifiedKFold
# StratifiedKFold - when the estimator is a classifier and y is either binary or multiclass,
# scikit-learn's cross-validation helpers use StratifiedKFold by default for an integer cv.

# Comparing StratifiedKFold classes
# Stratified Kfold 5 Splits


# Stratified Kfold 10 Splits


# Stratified Kfold 5 Splits, shuffle, random state


# NOTE: Stratified K-fold with 5 splits, shuffling, and a fixed random state applied to RF model 6
#       appears to give the best performance, without the overfitting seen with 10 splits

# Apply Grid Search for-loop method to Wine data set

In [15]:
# Apply simple Grid Search to find the best parameters for the Wine data set 


 

        # for each combination of parameters, train a RandomForest

        # fit to the training data

        # evaluate the RFC on the test set

        # if we got a better score, store the score and parameters



# NOTE: Model with Grid Search shows signs of overfitting. Will rebuild on combo train+valid and
#       evaluate on Test set.

# Rebuild Model on Combo Train+Valid and Eval on Test

In [16]:
# split the data into train + validation set and test set

# split the train + validation set into training and validation set

        # for each combination of parameters, train a RandomForest

        # fit to the training data

        # evaluate the RFC model on the validation set

        # if we got a better score, store the score and parameters


# rebuild the model on the combined training and validation set,
# and evaluate it on the test set


# NOTE: The best score on the validation set is 100%, the same as the simple grid search
#       above, indicating that the simple grid search was overfitting. The test set score is 98%, so we
#       can likely only claim to classify new data at 98% accuracy.