## Install standalone library

In [3]:
## stop word list
!pip install stop_words

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stop_words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-py3-none-any.whl size=32911 sha256=6dad6623bfc7a66e1a90752cce189aa28eb6f670f315067942b39a1b076693fe
  Stored in directory: /root/.cache/pip/wheels/fb/86/b2/277b10b1ce9f73ce15059bf6975d4547cc4ec3feeb651978e9
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23


In [4]:
## interpret library
!pip install interpret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import necessary library 

In [5]:
## Normal libs to work with data
## adding support for large, multi-dimensional arrays and matrices.
import numpy as np 
## data structures and operations for manipulating numerical tables and time series.
import pandas as pd 

################################################################################
# Visualization
## the output of plotting commands is displayed inline within frontends like the Jupyter notebook, directly below the code cell that produced it.
%matplotlib inline 
## provides an implicit, MATLAB-like, way of plotting
import matplotlib.pyplot as plt 
## provides a high-level interface for drawing attractive and informative statistical
import seaborn as sns

################################################################################
## Utility
from collections import Counter
import string

################################################################################
## Feature engineering
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from stop_words import get_stop_words


################################################################################
## ML models
## split data into train and test
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics, linear_model, tree, naive_bayes, neighbors, ensemble, neural_network, svm, decomposition, manifold


################################################################################
## Evaluate model
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics

################################################################################
## Interpretation
## import shap
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

In [6]:
## download external text libs data
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Get data

In [7]:
## get data from remote file if needed
## url = "https://docs.google.com/spreadsheets/d/1ZVemCFQ_cWCEjriTFBLYHGM33q56eISk/edit?usp=sharing&ouid=102981063366545209715&rtpof=true&sd=true"
## s = requests.get(url).text

## get data by uploading file
df = pd.read_excel('Womens_Clothing_E_Commerce_Reviews.xlsx', sheet_name='Reviews', index_col=0)

In [8]:
## check 5 rows of dataset
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0.0,767.0,33.0,,Absolutely wonderful - silky and sexy and comf...,4.0,1.0,0.0,Initmates,Intimate,Intimates
1.0,1080.0,34.0,,Love this dress! it's sooo pretty. i happene...,5.0,1.0,4.0,General,Dresses,Dresses
2.0,1077.0,60.0,Some major design flaws,I had such high hopes for this dress and reall...,3.0,0.0,0.0,General,Dresses,Dresses
3.0,1049.0,50.0,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5.0,1.0,0.0,General Petite,Bottoms,Pants
4.0,847.0,47.0,Flattering shirt,This shirt is very flattering to all due to th...,5.0,1.0,6.0,General,Tops,Blouses


## Feature engineering

#### Fill NaN value with ' '. Avoid nan string when combine texts

In [9]:
## avoid nan string when combine texts
df['Title'] = df['Title'].fillna('')
df['Review Text'] = df['Review Text'].fillna('')

#### Combine Title & Review Text column into 1 Review Description column

In [10]:
## combine Title & Review Text column into 1 Review Description column
df = df.assign(ReviewDescription = df['Title'].astype(str) + ' ' + df['Review Text'].astype(str))
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,ReviewDescription
0.0,767.0,33.0,,Absolutely wonderful - silky and sexy and comf...,4.0,1.0,0.0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and com...
1.0,1080.0,34.0,,Love this dress! it's sooo pretty. i happene...,5.0,1.0,4.0,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happen...
2.0,1077.0,60.0,Some major design flaws,I had such high hopes for this dress and reall...,3.0,0.0,0.0,General,Dresses,Dresses,Some major design flaws I had such high hopes ...
3.0,1049.0,50.0,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5.0,1.0,0.0,General Petite,Bottoms,Pants,"My favorite buy! I love, love, love this jumps..."
4.0,847.0,47.0,Flattering shirt,This shirt is very flattering to all due to th...,5.0,1.0,6.0,General,Tops,Blouses,Flattering shirt This shirt is very flattering...


#### Drop NaN values

In [11]:
## drop NaN values in categorical columns
df = df.dropna()
df.isnull().sum()

Clothing ID                0
Age                        0
Title                      0
Review Text                0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
Division Name              0
Department Name            0
Class Name                 0
ReviewDescription          0
dtype: int64

#### Remove special character in ReviewDescription column

In [12]:
## remove special character
df['ReviewDescription'] = df['ReviewDescription'].str.replace(r"[^a-zA-Z ]","")
df['ReviewDescription'] = df['ReviewDescription'].str.replace(r"[0-9]","")
df.head()

  
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,ReviewDescription
0.0,767.0,33.0,,Absolutely wonderful - silky and sexy and comf...,4.0,1.0,0.0,Initmates,Intimate,Intimates,Absolutely wonderful silky and sexy and comf...
1.0,1080.0,34.0,,Love this dress! it's sooo pretty. i happene...,5.0,1.0,4.0,General,Dresses,Dresses,Love this dress its sooo pretty i happened ...
2.0,1077.0,60.0,Some major design flaws,I had such high hopes for this dress and reall...,3.0,0.0,0.0,General,Dresses,Dresses,Some major design flaws I had such high hopes ...
3.0,1049.0,50.0,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5.0,1.0,0.0,General Petite,Bottoms,Pants,My favorite buy I love love love this jumpsuit...
4.0,847.0,47.0,Flattering shirt,This shirt is very flattering to all due to th...,5.0,1.0,6.0,General,Tops,Blouses,Flattering shirt This shirt is very flattering...


#### Binning rating

In [13]:
## if 1-3 stars = 0 
## if 4-5 stars = 1
bins = [0, 3, 5]
labels = [0,1]
df['Binned Rating'] = pd.cut(df['Rating'], bins=bins, labels=labels)
df.tail()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,ReviewDescription,Binned Rating
23476.0,1104.0,34.0,Great dress for many occasions,I was very happy to snag this dress at such a ...,5.0,1.0,0.0,General Petite,Dresses,Dresses,Great dress for many occasions I was very happ...,1
23477.0,862.0,48.0,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",3.0,1.0,0.0,General Petite,Tops,Knits,Wish it was made of cotton It reminds me of ma...,0
23478.0,1104.0,31.0,"Cute, but see through","This fit well, but the top was very see throug...",3.0,0.0,1.0,General Petite,Dresses,Dresses,Cute but see through This fit well but the top...,0
23479.0,1084.0,28.0,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,3.0,1.0,2.0,General,Dresses,Dresses,Very cute dress perfect for summer parties and...,0
23480.0,1104.0,52.0,Please make more like this one!,This dress in a lovely platinum is feminine an...,5.0,1.0,22.0,General Petite,Dresses,Dresses,Please make more like this one This dress in a...,1


#### Copy df

In [14]:
## copy df into another dataframe to save original data
rating_class = df.copy()

#### Remove unfrequent words

In [15]:
## get a frequent word dictionary
words_fdist = FreqDist(word for word in word_tokenize(rating_class['ReviewDescription'].str.cat(sep=' ')))

## convert dict to df
words_df = pd.DataFrame.from_dict(words_fdist,\
                                       orient='index').\
                                rename(columns={0:'freq'})

## get common list words                                
common_l = words_df[words_df.freq > 50].index.to_list()

stop_words = list(get_stop_words('en'))
common_l = [w for w in common_l if not w in stop_words]
common_l.remove('im')
common_l.remove('c')
print(common_l)

['Absolutely', 'wonderful', 'silky', 'sexy', 'comfortable', 'Love', 'dress', 'pretty', 'happened', 'find', 'store', 'glad', 'bc', 'never', 'ordered', 'online', 'petite', 'bought', 'love', 'length', 'hits', 'just', 'little', 'knee', 'definitely', 'true', 'midi', 'someone', 'truly', 'design', 'flaws', 'I', 'high', 'hopes', 'really', 'wanted', 'work', 'initially', 'small', 'usual', 'size', 'found', 'fact', 'zip', 'medium', 'ok', 'overall', 'top', 'half', 'fit', 'nicely', 'bottom', 'tight', 'layer', 'several', 'somewhat', 'cheap', 'layers', 'flaw', 'sewn', 'zipper', 'My', 'favorite', 'buy', 'jumpsuit', 'fun', 'flirty', 'fabulous', 'every', 'time', 'wear', 'get', 'nothing', 'great', 'compliments', 'Flattering', 'shirt', 'This', 'flattering', 'due', 'adjustable', 'front', 'tie', 'perfect', 'leggings', 'sleeveless', 'pairs', 'well', 'cardigan', 'Not', 'dresses', 'one', 'feet', 'tall', 'usually', 'p', 'brand', 'package', 'lot', 'skirt', 'long', 'full', 'frame', 'take', 'away', 'garment', 'colo

In [16]:
## print number of common words
len(common_l)

1503

In [17]:
def remove_unfreq_words(review, common_l):
    ## tokenizer
    nopunc = []
    word_lst = []
    for word in review.split():
        if word.lower() not in common_l:
            review.replace(word, '')
        else:
            nopunc.append(word.lower()) ## lower text

    nopunc = ' '.join(nopunc)

    return nopunc

In [18]:
## remove unfrequen words
rating_class['ReviewDescription'] = rating_class['ReviewDescription'].apply(remove_unfreq_words, common_l=common_l)
rating_class['ReviewDescription']

0.0              absolutely wonderful silky sexy comfortable
1.0        love dress pretty happened find store glad bc ...
2.0        design flaws high hopes dress really wanted wo...
3.0        favorite buy love love love jumpsuit fun flirt...
4.0        flattering shirt shirt flattering due adjustab...
                                 ...                        
23476.0    great dress many occasions happy snag dress gr...
23477.0    wish made cotton reminds maternity clothes sof...
23478.0    cute see fit well top see never worked glad ab...
23479.0    cute dress perfect summer bought dress wedding...
23480.0    please make like one dress lovely feminine fit...
Name: ReviewDescription, Length: 23467, dtype: object

In [19]:
rating_class.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,ReviewDescription,Binned Rating
0.0,767.0,33.0,,Absolutely wonderful - silky and sexy and comf...,4.0,1.0,0.0,Initmates,Intimate,Intimates,absolutely wonderful silky sexy comfortable,1
1.0,1080.0,34.0,,Love this dress! it's sooo pretty. i happene...,5.0,1.0,4.0,General,Dresses,Dresses,love dress pretty happened find store glad bc ...,1
2.0,1077.0,60.0,Some major design flaws,I had such high hopes for this dress and reall...,3.0,0.0,0.0,General,Dresses,Dresses,design flaws high hopes dress really wanted wo...,0
3.0,1049.0,50.0,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5.0,1.0,0.0,General Petite,Bottoms,Pants,favorite buy love love love jumpsuit fun flirt...,1
4.0,847.0,47.0,Flattering shirt,This shirt is very flattering to all due to th...,5.0,1.0,6.0,General,Tops,Blouses,flattering shirt shirt flattering due adjustab...,1


#### Text processing in general

In [20]:
lemmatizer = WordNetLemmatizer()

def text_process(review):

    def lemmatization(inputs):  # Ref.1
        return [lemmatizer.lemmatize(word=kk) for kk in inputs]

    ## tokenizer
    nopunc = []
    word_lst = []
    for word in review:
        if word not in string.punctuation:
            nopunc.append(word.lower()) ## lower text

    nopunc = ''.join(nopunc)

    ## stop words
    for word in nopunc.split():
        if word.lower() not in stopwords.words('english'):
            word_lst.append(word)

    lemma_lst = lemmatization(word_lst)
    return lemma_lst

In [21]:
## get text column to preprocess
X_review=rating_class['ReviewDescription']

## get y column to later join to processed X
y=rating_class['Binned Rating']

In [22]:
## tf idf 
bow_transformer=TfidfVectorizer(analyzer=text_process).fit(X_review)
X_review = bow_transformer.transform(X_review)

#### Get features matrix from vectorized tf-idf

In [23]:
## get feature names
features_df = pd.DataFrame(X_review.toarray(), columns = bow_transformer.get_feature_names())
features_df



Unnamed: 0,able,absolutely,accessory,accurate,across,actual,actually,add,added,adding,...,yes,yesterday,yet,yoga,youd,youll,youre,zip,zipped,zipper
0,0.000000,0.386802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173584,0.0,0.152412
3,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
4,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23462,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
23463,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
23464,0.295261,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
23465,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000


#### Combine multiple sources into a consolidated dataframe

In [24]:
## combine y into X, for undersampling
features_df = features_df.merge(y.rename('Binned Rating'), left_index=True, right_index=True)
features_df

Unnamed: 0,able,absolutely,accessory,accurate,across,actual,actually,add,added,adding,...,yesterday,yet,yoga,youd,youll,youre,zip,zipped,zipper,Binned Rating
0.0,0.000000,0.386802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
1.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
2.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.173584,0.0,0.152412,0
3.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
4.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23462.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
23463.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
23464.0,0.295261,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1
23465.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,1


In [25]:
## get the 'Recommended IND' column since it has high correlation with y
features_df['Recommended IND'] = df['Recommended IND']

## Prepare input

TODO: Change variable names

#### Prepare X, y

In [26]:
## random state to get the same result when run code, parameter for reproducibility
rand = 10
np.random.seed(rand)

In [27]:
## prepare X and y
X = features_df.drop(['Binned Rating'], axis=1).copy()
y = features_df['Binned Rating']

#### Random sampling

In [28]:
# instantiating the random undersampler
rus = RandomUnderSampler() 
# resampling X, y
X, y = rus.fit_resample(X, y)

In [29]:
# new class distribution
print(Counter(y))

Counter({0: 5270, 1: 5270})


#### Train test split

In [30]:
## train test split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.33, 
                                                    random_state=rand)

The following are the main topics that will be covered in this section:
- Reviewing traditional model interpretation methods
- Understanding the limitations of traditional model interpretation methods
- Studying intrinsically interpretable (white-box) models
- Recognizing the trade-off between performance and interpretability
- Discovering newer interpretable (glass-box) models

This includes model performance
evaluation methods such as RMSE, R-squared, AUC, ROC curves, and the many metrics
derived from confusion matrices. We will also explore several dimensionality reduction
visualization techniques that can be leveraged for interpretation purposes. We will then
examine the limitations of these traditional methods and explain what exactly makes
"white-box" models intrinsically interpretable and why we cannot always use whitebox models. To answer this question, we'll consider the trade-off between prediction
performance and model interpretability. Finally, we will discover some new "glass-box"
models such as EBM and skope-rules that attempt to not compromise in this trade-off.

# Modeling

Each model is its own dictionary
and the function that creates it in the model attribute. This structure will be used later to
store the fitted model neatly, and its metrics. Model classes in this dictionary have been
chosen to represent several model families and to illustrate important concepts that we
will discuss later

In [45]:
class_models = {
    ## Generalized Linear Models (GLMs)
    'logistic':{'model': linear_model.LogisticRegression()},
    'ridge':{'model': linear_model.RidgeClassifierCV()},
    # cv=5,  alphas=[1e-3, 1e-2, 1e-1, 1], class_weight='balanced' #testing

    ## SVC
    'SVC':{'model': SVC(probability=True)},
    # , gamma='auto', random_state=rand #testing

    ## Tree
    'decision_tree':{'model': tree.DecisionTreeClassifier(random_state=rand)},            

    ## Nearest Neighbors
    'knn':{'model': neighbors.KNeighborsClassifier()},
    # n_neighbors=7 #testing

    ## Naive Bayes
    'naive_bayes':{'model': naive_bayes.GaussianNB()},

    ## Ensemble Methods
    'gradient_boosting':{'model':ensemble.GradientBoostingClassifier()},
    # n_estimators=210 #testing

    'random_forest':{'model':ensemble.RandomForestClassifier(random_state=rand)},
    # max_depth=11, class_weight='balanced', #testing

    ## Neural Networks
    'mlp':{'model': neural_network.MLPClassifier(random_state=rand)}
    # hidden_layer_sizes=(7,), max_iter=500,  early_stopping=True, #testing
}

Some explanation for each models:
- logistic: it returns a prob between 0 and 1. If the outcome closer to 1, it denotes as positive class and negative class otherwise. Since we binned rating into 0/1, this is the binary classification problem with 0.5 is the threshold
- ridge: it converts the target values to -1 and hold value 1 for positive class and performs ridge regression (values between -1 and 1, then convert back to 0-1 scale). Firstly, it splits the data into different sets (same size, this time we put sets = 5). Secondly, it removes each feature to see how well model performs. Thirdly, with regularization techniques, it penalized the insignificant features.
- svc: SVM is a family of model classes that operate in high-dimensional space to find an
optimal hyperplane, where they attempt to separate the classes with the maximum margin
between them. Support vectors are the points closest to the decision boundary (the
dividing hyperplane) that would change it if were removed. To find the best hyperplane,
they use a cost function called hinge loss and a computationally cheap method to operate
in high-dimensional space, called the kernel trick, and even though a hyperplane suggests
linear separability, it's not always limited to a linear kernel. If probability=True, the scikit-learn
implementation uses cross-validation and then fits a logistic regression model to the SVC's
scores to produce the probabilities
- decision tree: its algorithm chooses the condition to split the branch of a tree, our example we have the tree depth of 7.
- knn: it chooses the most frequent labels. In our scenario, we use k=7
- naive bayes: it made assumptions that features are independent.
- gradient boosting: an ensemble method, which leverage boosting (in sequence). It trains weak model iteratively. In our case, we applied to decision tree with max_trees=210
- random forest: similar to decision tree, but it use multiple trees instead of only 1 tree, thus reduce the overfitting chance.
- mlp: it uses logistic function on the output layer, which produce prob between 0-1. There are 7 neurons in the first layer since binary classification tends to require fewer of them to
achieve an optimal result. 

In [46]:
## iterate over our dictionary of models (class_models)
for model_name in class_models.keys():
    
    ## fit them to the training data
    fitted_model = class_models[model_name]['model'].fit(X_train, y_train)
    y_train_pred = fitted_model.predict(X_train.values)
    
    ## except for ridge (which doesn't output probabilities)
    if model_name == 'ridge':
        y_test_pred = fitted_model.predict(X_test.values)
    else:
        y_test_prob = fitted_model.predict_proba(X_test.values)[:,1]
        y_test_pred = np.where(y_test_prob > 0.5, 1, 0)
    
    ## predict both probabilities and the class 
    class_models[model_name]['fitted'] = fitted_model
    class_models[model_name]['probs'] = y_test_prob
    class_models[model_name]['preds'] = y_test_pred
    class_models[model_name]['Accuracy_train'] = metrics.accuracy_score(y_train, y_train_pred)
    class_models[model_name]['Accuracy_test'] = metrics.accuracy_score(y_test, y_test_pred)
    class_models[model_name]['Recall_train'] = metrics.recall_score(y_train, y_train_pred)
    class_models[model_name]['Recall_test'] = metrics.recall_score(y_test, y_test_pred)

    if model_name != 'ridge':
        class_models[model_name]['ROC_AUC_test'] = metrics.roc_auc_score(y_test, y_test_prob)
    else:
        class_models[model_name]['ROC_AUC_test'] = 0
    class_models[model_name]['F1_test'] = metrics.f1_score(y_test, y_test_pred)
    class_models[model_name]['MCC_test'] = metrics.matthews_corrcoef(y_test, y_test_pred)

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [47]:
## convert the dictionary to a DataFrame
class_metrics = pd.DataFrame.from_dict(class_models, 
                                       'index')[['Accuracy_train', 
                                                 'Accuracy_test',
                                                 'Recall_train', 
                                                 'Recall_test',
                                                 'ROC_AUC_test', 
                                                 'F1_test', 
                                                 'MCC_test']]

## display the metrics in a sorted and color-coded fashion
class_metrics.sort_values(by='ROC_AUC_test', ascending=False).style.background_gradient(cmap='plasma', 
                                                                                        low=0.3, 
                                                                                        high=1,
                                                                                        subset=['Accuracy_train', 
                                                                                                'Accuracy_test']).background_gradient(cmap='viridis', 
                                                                                                                                      low=1, 
                                                                                                                                      high=0.3,
                                                                                                                                      subset=['Recall_train', 
                                                                                                                                              'Recall_test',
                                                                                                                                              'ROC_AUC_test', 
                                                                                                                                              'F1_test', 
                                                                                                                                              'MCC_test'])

Unnamed: 0,Accuracy_train,Accuracy_test,Recall_train,Recall_test,ROC_AUC_test,F1_test,MCC_test
random_forest,0.995893,0.864329,0.999721,0.959811,0.910199,0.873118,0.743761
logistic,0.875372,0.86979,0.988821,0.990544,0.906993,0.880946,0.76353
SVC,0.875089,0.86979,0.992174,0.992317,0.903808,0.881134,0.764225
gradient_boosting,0.879054,0.869503,0.993013,0.990544,0.900407,0.880715,0.763056
mlp,0.995893,0.834148,0.999721,0.880615,0.887975,0.837785,0.672268
knn,0.881603,0.854556,0.97289,0.947991,0.872841,0.863759,0.723416
decision_tree,0.995893,0.806554,0.999721,0.808511,0.802502,0.802581,0.613034
naive_bayes,0.739697,0.656223,0.708496,0.632388,0.679348,0.641487,0.311533
ridge,0.874805,0.870078,0.992174,0.992908,0.0,0.881427,0.764931


Comment on the evaluation parameters:
- accuracy: measure the effectiveness of a model, which show the percentage of correct predictions over all data. 
- recall: in binary classification, recall is called sensitivity. It can be viewed as the probability that a relevant row is retrieved by the query. Since we undersampled, we used only recall.
- ROC-AUC: acronym for Receiver Operating Characteristic and was designed to separate signal from noise. ROC plots the proportion of true
positive rate (Recall) on the x axis and the false positive rate on the y axis. AUC
stands for area under the curve, which is a number between 0 and 1 that assesses
the prediction ability of the classifier. 1 being perfect, 0.5 being as good as a coin
toss, and anything lower meaning that if we inverted the results of our prediction,
we would have a better prediction
- F1 score: the harmonic average of precision and recall. Used when the dataset is imbalanced
- MCC: The Matthews correlation coefficient drawn from biostatistics. It is optimal for imbalanced
classification tasks

A single metric will not tell the whole
story, and interpretation is about telling the most relevant and sufficiently complete
story

# Discovering newer interpretable (glass-box) models

## Explainable Boosting Machine (EBM)


Microsoft's InterpretML framework 

In [None]:
ebm_mdl = ExplainableBoostingClassifier()

In [None]:
## sample a portion of the data
import math 
sample_size = 0.1
sample_idx = np.random.choice(X_train.shape[0], 
                              math.ceil(X_train.shape[0]*sample_size), 
                              replace=False)

In [None]:
ebm_mdl.fit(X_train.iloc[sample_idx],
            y_train.iloc[sample_idx])

ExplainableBoostingClassifier(feature_names=['able', 'absolutely', 'accessory',
                                             'accurate', 'across', 'actual',
                                             'actually', 'add', 'added',
                                             'adding', 'addition', 'additional',
                                             'adjustable', 'adorable', 'adore',
                                             'afraid', 'ag', 'age', 'ago',
                                             'agree', 'air', 'airy', 'aline',
                                             'allows', 'almost', 'alone',
                                             'along', 'already', 'also',
                                             'altered', ...],
                              feature_types=['continuous', 'cont...
                                             'categorical', 'continuous',
                                             'continuous', 'continuous',
                                

#### Global interpretation

In [None]:
show(ebm_mdl.explain_global())

Feature importance plot

#### Local interpretation

In [None]:
pd.set_option('display.max_colwidth', None)
df['ReviewDescription'].iloc[1440:1441]

1440.0    Makes my butt look amazing Perfection i am  and the size  fit me like a glove beautiful color too highly recommend
Name: ReviewDescription, dtype: object

In [None]:
ebm_lcl = ebm_mdl.explain_local(X_test.iloc[77:78],  
                                y_test[77:78], 
                                name='EBM')
show(ebm_lcl)