# A Machine Learning journey from customer reviews to business insights
# *Part 3: Modelling*

*Author: Federica Lionetto*  
*Email: federica.lionetto@gmail.com*  
*Date: 17 November 2020*  
*License: Creative Commons BY-NC-SA*

*Based on the dataset available at:*
- https://www.kaggle.com/efehandanisman/skytrax-airline-reviews

### Further readings

- "What can we learn from five-star airlines: a web scraping project from Skytrax", https://nycdatascience.com/blog/student-works/web-scraping/what-can-we-learn-from-five-star-airlines-a-web-scraping-project-from-skytrax/
- Pipelines for data processing: https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html
- LightGBM, https://lightgbm.readthedocs.io/en/latest/index.html
- LightGBM parameters, https://lightgbm.readthedocs.io/en/latest/Parameters.html
- SHAP decision plots, https://slundberg.github.io/shap/notebooks/plots/decision_plot.html

## 0 - Configuration

In [None]:
# When True, the NLP input files are loaded and the 'polarity' feature is
# added to the numerical features.
use_review_text = True
# Only relevant when use_review_text is True: also append the feature names
# listed in '../Results/VecReviewTextCleanFeats.csv' to the numerical features.
use_count_vectorization = True

In [None]:
# Pick the input (types + data) and output files matching the feature set.
if use_review_text:
    # With review text.
    df_types_filename, df_filename, df_out_filename = (
        '../Results/NLPFinalDataLightTypes.csv',
        '../Results/NLPFinalDataLight.csv',
        '../Results/Preds-WithText.csv',
    )
else:
    # Without review text.
    df_types_filename, df_filename, df_out_filename = (
        '../Results/PreprocessedDataLightTypes.csv',
        '../Results/PreprocessedDataLight.csv',
        '../Results/Preds-WithoutText.csv',
    )

In [None]:
# Define numerical and categorical features.

# Numerical features shared by every configuration.
base_num_feats = ['date_flown_month',
                  'date_flown_year',
                  'review_date_date_flown_distance_days',
                  'review_characters',
                  'has_layover_num',
                  'seat_comfort',
                  'cabin_service',
                  'food_bev',
                  'entertainment',
                  'ground_service',
                  'value_for_money']

# Categorical features shared by every configuration.
cat_feats = ['airline',
             'traveller_type',
             'cabin']

num_feats = list(base_num_feats)
if use_review_text:
    # With review text: the 'polarity' feature is always added.
    num_feats.append('polarity')
    if use_count_vectorization:
        # The file holds the vectorized feature names separated by ', '.
        with open('../Results/VecReviewTextCleanFeats.csv', 'r') as f:
            raw_vec_feats = f.read().split(', ')
        # Strip surrounding whitespace (e.g. a trailing newline at the end of
        # the file) and drop empty entries so that every name can match a
        # column of the dataset.
        vec_feats = [name.strip() for name in raw_vec_feats if name.strip()]
        num_feats += vec_feats

# Full list of model inputs: numerical features first, then categorical ones.
feats = num_feats + cat_feats

In [None]:
# Set this variable to the desired method for data transformation.
# Possible options are: scaling_and_one_hot_encoding, label_encoding, no_transformation.
transform_dataset = 'label_encoding'

## 1 - Import modules and helper functions

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_palette('Set2')
import scipy.sparse

import datetime as dt
import dateutil

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer

from sklearn.metrics import roc_curve, accuracy_score, roc_auc_score, confusion_matrix 

import lightgbm as lgb

import shap

import importlib

In [None]:
# Debugging capabilities.
import pdb

In [None]:
# Needed for Colab.
!git clone https://github.com/FedericaLionetto/UZHMLWorkshop2020-NLP
os.chdir('UZHMLWorkshop2020-NLP/')

In [None]:
import sys  
sys.path.insert(0, './helper_functions')

In [None]:
# Related to recommendation.
import assign_label_recommended

# Related to modelling.
import plot_roc_curve
import plot_feature_importance
import plot_confusion_matrix

# Related to visualization.
import plot_hist_sns

## 2 - Load the input data

In [None]:
# Type of each field in the input data.
# Build a {column name: dtype} mapping from the 'index' and 'dtypes' columns.
df_dtype = pd.read_csv(df_types_filename)
dict_dtype = dict(zip(df_dtype['index'], df_dtype['dtypes']))
# The 'recommended' column is always read as Boolean.
dict_dtype['recommended'] = 'bool'

In [None]:
# Input data.
# With keep_default_na=False, only the '_' marker is parsed as a missing value.
df = pd.read_csv(df_filename,
                 dtype=dict_dtype,
                 keep_default_na=False,
                 na_values=['_'])
# Remove the unnamed first column of the CSV file.
df = df.drop(columns=['Unnamed: 0'])

In [None]:
df.head()

In [None]:
df.shape

Get the names of the columns in the dataset.

In [None]:
# List of column names, in dataset order.
cols = list(df.columns)
print('Columns in the dataset:')
print(cols)

Get the total number of customer reviews in the dataset.

In [None]:
n_reviews = df.shape[0]
print('Number of customer reviews in the dataset: {:d}'.format(n_reviews))

## 3 - Predict whether the customer would recommend the product

### 3.1 - Add the label to the dataset

In [None]:
# Compute the label row by row with the helper function.
df['label'] = df.apply(assign_label_recommended.assign_label_recommended, axis=1)

In [None]:
df.head()

### 3.2 - Convert Boolean features to numerical

In [None]:
# Integer copy of the Boolean layover flag.
df['has_layover_num'] = df['has_layover'].astype(int)

# Columns cast to integer regardless of the selected feature set.
always_int_cols = ['date_flown_day',
                   'date_flown_month',
                   'date_flown_year',
                   'seat_comfort',
                   'cabin_service',
                   'ground_service',
                   'food_bev',
                   'value_for_money',
                   'entertainment']
for col in always_int_cols:
    df[col] = df[col].astype(int)

# Every selected numerical feature is cast to integer as well, except the
# ones whose name contains 'polarity'.
int_feats = [feat for feat in num_feats if 'polarity' not in feat]
for feat in int_feats:
    df[feat] = df[feat].astype(int)

In [None]:
df.head()

### 3.3 - Select features for training

In [None]:
X = df[feats]
y = df['label'].values

### 3.4 - Check class balance

In [None]:
# Fraction of records in each class (label 1: recommended, label 0: not recommended).
n_total = y.shape[0]
f_rec = np.sum(y == 1) / n_total
f_not_rec = np.sum(y == 0) / n_total
# Typos in the original messages ("recommeded", "recommed") are fixed here.
print('Fraction of customers that recommended the service: {:.2f}'.format(f_rec))
print('Fraction of customers that did not recommend the service: {:.2f}'.format(f_not_rec))

### 3.5 - Scale numerical features and apply one-hot encoding to categorical features

Before feeding the selected features to the Machine Learning model, we can transform them to allow the model to correctly interpret them.  

We can specify how to impute missing values. In this exercise, we use the `SimpleImputer` from `sklearn`.

We might want to scale numerical features, so that they have values in a common range.   
In this exercise, we use the `StandardScaler` available in `sklearn` to standardize the features, that is, to subtract their mean and divide by their standard deviation. We transform `x` to `z = (x-u)/s`, where `u` is the mean and `s` is the standard deviation of the feature in the training dataset. We can specify whether or not we want to subtract the mean with the option `with_mean=True/False` and whether or not we want to divide by the standard deviation with the option `with_std=True/False`. As a result, with the default options, all the numerical features will have zero mean and unit standard deviation in the training dataset. 

In addition to numerical features, we might want to transform categorical features as well.
Different algorithms require categorical features to have different formats. Two common options are one-hot encoding and label encoding.  
1) One-hot encoding allows us to encode categorical features as one-hot vectors. The categorical feature is transformed into binary features, one for each category. By default, the encoder derives the categories based on the unique values in each feature.  
   Let us consider the following example. The categorical feature `cabin` can have four possible values: `Economy Class`, `Premium Economy`, `Business Class` and `First Class`. One-hot encoding transforms this feature, with four possible values, into four new features, called `cabin_Economy Class`, `cabin_Premium Economy`, `cabin_Business Class` and `cabin_First Class`, with each new feature having two possible values, `0` or `1`, depending on the value of the original feature. A record with `cabin` equal to `Economy Class` will have `cabin_Economy Class` equal to `1` and the other three new features equal to `0`.  
   This leads to sparse data (most of the elements in the dataset will have the value `0`) if the features can have many possible values.  
2) Label encoding allows us to encode categorical features as numbers.  
   For example, the categorical feature `cabin` can be encoded as one feature with values `0`, `1`, `2` and `3`.

We use a pipeline to define the data processing, so that we can repeat the same steps for the training and test datasets. In particular, the parameters of the data processing are defined based on the training dataset and are then applied to the test dataset. This is particularly important if the Machine Learning model has to be used in a live system and has to make predictions on new data.

In [None]:
# Numerical features: replace NaN with the column mean, then standardize.
num_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
num_proc = make_pipeline(num_imputer, StandardScaler())

# Categorical features: replace missing values with the constant 'missing',
# then one-hot encode (categories unseen during fit are ignored at transform time).
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
cat_encoder = OneHotEncoder(handle_unknown='ignore')
cat_proc = make_pipeline(cat_imputer, cat_encoder)

# Single preprocessing step: numerical columns first, categorical columns second.
preprocessor = make_column_transformer(
    (num_proc, num_feats),
    (cat_proc, cat_feats),
)

### 3.6 - Dataset split for training and test

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)

#### 3.6.1 - Dataset transformation before training

Transform the training and test datasets as specified in the preprocessor.

In [None]:
X_train_transformed = preprocessor.fit_transform(X_train)

The order of the features after the data processing is the same that is specified in the pipeline, in this case starting with the numerical features and continuing with the categorical features.

In [None]:
# Names of the one-hot encoded columns produced by the categorical pipeline
# (second transformer of the fitted preprocessor).
one_hot_encoder = preprocessor.transformers_[1][1]['onehotencoder']
if hasattr(one_hot_encoder, 'get_feature_names_out'):
    # scikit-learn >= 1.0; get_feature_names was removed in 1.2.
    cat_feats_one_hot = one_hot_encoder.get_feature_names_out(cat_feats)
else:
    # Older scikit-learn releases only provide get_feature_names.
    cat_feats_one_hot = one_hot_encoder.get_feature_names(cat_feats)
print(cat_feats_one_hot)

# Column names after the transformation: numerical features first, then the
# one-hot encoded categorical features.
all_feats = list(num_feats) + list(cat_feats_one_hot)
print(all_feats)

# Map the positional column index of the transformed matrix to its feature name.
dict_for_renaming_cols = dict(enumerate(all_feats))
print(dict_for_renaming_cols)

In [None]:
def _as_named_frame(matrix):
    """Wrap a transformed feature matrix in a DataFrame with named columns.

    Sparse matrices become sparse DataFrames, anything else a dense one.
    The positional column labels are renamed using dict_for_renaming_cols.
    """
    if scipy.sparse.issparse(matrix):
        frame = pd.DataFrame.sparse.from_spmatrix(matrix)
    else:
        frame = pd.DataFrame(matrix)
    frame.rename(columns=dict_for_renaming_cols, inplace=True)
    return frame


# Training dataset (already transformed when the preprocessor was fitted).
X_train_transformed_2 = _as_named_frame(X_train_transformed)

# Test dataset, transformed with the preprocessor fitted on the training dataset.
X_test_transformed = preprocessor.transform(X_test)
X_test_transformed_2 = _as_named_frame(X_test_transformed)

# Full dataset, transformed with the same fitted preprocessor.
X_transformed = preprocessor.transform(X)
X_transformed_2 = _as_named_frame(X_transformed)

In [None]:
X_train.shape

In [None]:
X_train.head()

In [None]:
X_train_transformed_2.shape

In [None]:
X_train_transformed_2.head()

In [None]:
X_test.shape

In [None]:
X_test_transformed_2.shape

#### 3.6.2 - Dataset transformation before training according to label encoding

In [None]:
lb_make = LabelEncoder()

In [None]:
X_label_enc = X.copy()
X_train_label_enc = X_train.copy()
X_test_label_enc = X_test.copy()

In [None]:
# Encode each categorical feature as integer codes.
# The encoder is fitted ONCE per feature, on the full dataset, and the same
# fitted mapping is then applied to the full, training and test datasets.
# Fitting it separately on each dataset (as done previously) can assign
# different codes to the same category whenever a split does not contain
# every category, which makes the training and test encodings inconsistent.
for feat in cat_feats:
    print('Feature:', feat)
    lb_make.fit(X_label_enc[feat])
    X_label_enc[feat] = lb_make.transform(X_label_enc[feat])
    X_train_label_enc[feat] = lb_make.transform(X_train_label_enc[feat])
    X_test_label_enc[feat] = lb_make.transform(X_test_label_enc[feat])

In [None]:
X_label_enc[cat_feats].head()

### 3.7 - Model training and test

In [None]:
# Select the feature matrices used for training, prediction and SHAP
# according to the transformation chosen in the configuration.
if transform_dataset == 'scaling_and_one_hot_encoding':
    print('Method for data transformation: scaling and one hot encoding')
    X_train_for_model = X_train_transformed_2
    X_test_for_model = X_test_transformed_2
    X_for_model = X_transformed_2
    X_test_for_shap = X_test_transformed_2
    X_for_shap = X_transformed_2
elif transform_dataset == 'label_encoding':
    print('Method for data transformation: label encoding')
    X_train_for_model = X_train_label_enc
    X_test_for_model = X_test_label_enc
    X_for_model = X_label_enc
    X_test_for_shap = X_test_label_enc
    X_for_shap = X_label_enc
elif transform_dataset == 'no_transformation':
    print('Method for data transformation: no transformation')
    X_train_for_model = X_train
    X_test_for_model = X_test
    X_for_model = X
    X_test_for_shap = X_test
    X_for_shap = X
else:
    # Fail here with a clear message instead of a NameError further down,
    # where the variables above would be undefined.
    raise ValueError(
        'Unknown value for transform_dataset: {!r}. Possible options are: '
        'scaling_and_one_hot_encoding, label_encoding, no_transformation.'.format(transform_dataset)
    )

In [None]:
cat_feats

#### 3.7.1 - Training and test on transformed features

In [None]:
# LightGBM.
# Only the label-encoded dataset declares its categorical columns explicitly;
# every other transformation relies on the default of lgb.Dataset.
if transform_dataset == 'label_encoding':
    dataset_kwargs = {'categorical_feature': cat_feats}
else:
    dataset_kwargs = {}
train_data = lgb.Dataset(X_train_for_model, label=y_train, **dataset_kwargs)
test_data = lgb.Dataset(X_test_for_model, label=y_test, **dataset_kwargs)

# Hyper-parameters.
# NOTE(review): 'n_estimators' is documented as an alias of the number of
# boosting iterations and appears to take precedence over the 2500 rounds
# passed to lgb.train below - confirm which value is intended.
params = {'metric': 'binary_logloss', # Possible options are 'auc', 'binary_logloss', 'multi_logloss'.
          'boosting_type': 'gbdt', # Gradient boosting decision tree.
          'objective': 'binary', # 'binary' for binary classification, 'multiclass' for multi classification, 'regression' for regression.
          'feature_fraction': 0.5,
          'num_leaves': 30,
          'max_depth': -1,
          'n_estimators': 200,
          'min_data_in_leaf': 100,
          # 'min_child_weight': 0.1,
          'reg_alpha': 2,
          'reg_lambda': 5,
          'subsample': 0.8,
          'verbose': -1,
          # 'num_class': 4 # Number of classes minus 1 for multiclass classification.
          # 'num_threads': 4
}

num_boost_round = 2500 # Epochs.
early_stopping_rounds = 30 # Stop when the validation metric has not improved for this many rounds.
eval_period = 10 # Log the validation metric every this many rounds.

if hasattr(lgb, 'log_evaluation'):
    # Recent LightGBM releases: early stopping and logging are configured
    # through callbacks (the equivalent keyword arguments of lgb.train were
    # removed in LightGBM 4).
    lgbm = lgb.train(params,
                     train_data,
                     num_boost_round,
                     valid_sets=[test_data],
                     callbacks=[lgb.early_stopping(early_stopping_rounds),
                                lgb.log_evaluation(eval_period)]
                     )
else:
    # Older LightGBM releases without the log_evaluation callback.
    lgbm = lgb.train(params,
                     train_data,
                     num_boost_round,
                     valid_sets=test_data,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=eval_period
                     )

# Predicted probability and hard prediction (threshold 0.5 via rounding)
# for every record of the full dataset (training and test records together).
y_prob = lgbm.predict(X_for_model)
y_pred = y_prob.round(0)

clf_roc_auc_score = roc_auc_score(y, y_prob)
clf_accuracy_score = accuracy_score(y, y_pred)

print('Model overall ROC AUC score: {:.3f}'.format(clf_roc_auc_score))
print('Model overall accuracy: {:.3f}'.format(clf_accuracy_score))

In [None]:
# Cross-checks.
print('Min value of prediction: {:.3f}'.format(y_pred.min()))
print('Max value of prediction: {:.3f}'.format(y_pred.max()))
print('Min value of probability: {:.3f}'.format(y_prob.min()))
print('Max value of probability: {:.3f}'.format(y_prob.max()))

In [None]:
print(np.sum(y==0))
print(np.sum(y==1))

In [None]:
# Baseline: accuracy of a classifier that always predicts the most frequent
# class. Using the fraction of label 1 alone would understate the baseline
# whenever label 0 is the majority class.
dummy_accuracy = max(np.mean(y == 1), np.mean(y == 0))
print('Accuracy of dummy classifier: %.2f' % dummy_accuracy)

In [None]:
# Binary confusion matrix: rows are true labels, columns are predicted labels.
conf_mat = confusion_matrix(y, y_pred)
(tn, fp), (fn, tp) = conf_mat

# Fraction of true positives among all actual positives (recall).
sensitivity = tp / (tp + fn)
# Fraction of true negatives among all actual negatives.
specificity = tn / (tn + fp)
# Fraction of true positives among all predicted positives.
precision = tp / (tp + fp)

print('Sensitivity/Recall: {:.2f}'.format(sensitivity))
print('Specificity: {:.2f}'.format(specificity))
print('Precision: {:.2f}'.format(precision))

In [None]:
plot_confusion_matrix.plot_confusion_matrix(y=y, y_pred=y_pred, normalize_str='true', figsize_w=4, figsize_h=4, filename='../Results/03/ConfusionMatrix.png')

In [None]:
# True positive rate and false positive rate.
fpr, tpr, _ = roc_curve(y, y_prob)

In [None]:
plot_roc_curve.plot_roc_curve(fpr=fpr, tpr=tpr, clf_name='LightGBM', figsize_w=6, figsize_h=6, filename='../Results/03/ROCCurve.png')

In [None]:
feats_names = lgbm.feature_name()

In [None]:
feats_importances = lgbm.feature_importance()

In [None]:
# Feature indices sorted by decreasing importance.
feats_indices = np.flip(np.argsort(feats_importances))

In [None]:
feats_importances

In [None]:
feats_names

In [None]:
# Keep at most the 30 most important features, in decreasing order of importance.
n_top = min(30, len(feats_importances))
top_indices = feats_indices[:n_top]
top_features_names = [feats_names[idx] for idx in top_indices]
top_features_importances = [feats_importances[idx] for idx in top_indices]

# Print the feature ranking
print("Feature ranking:")
for rank, (idx, name, importance) in enumerate(
        zip(top_indices, top_features_names, top_features_importances), start=1):
    print("%d. feature %d - %s (%f)" % (rank, idx, name, importance))

In [None]:
feats_indices

In [None]:
plot_feature_importance.plot_feature_importance(features_names=top_features_names, features_importances=top_features_importances, figsize_w=6, figsize_h=6, filename='../Results/03/FeatureImportance.png')

In [None]:
# Plot feature importance.
ax = lgb.plot_importance(lgbm, max_num_features=10)

In [None]:
lgb.create_tree_digraph(lgbm)

In [None]:
# Output table: hard prediction and predicted probability for every record.
df_out = pd.DataFrame({'y_pred': y_pred, 'y_prob': y_prob})

In [None]:
plot_hist_sns.plot_hist_sns(df=df_out,
             feat='y_prob',
             bins=30,
             title='Distribution of model prediction',
             x_label='Predicted probability of being recommended',
             y_label='Entries / bin',
             filename='../Results/03/HistModelPredictions.png')

### 3.8 - Model explainability

#### 3.8.1 - Explainer, expected value, SHAP values and SHAP interaction values

In [None]:
shap.initjs()

In [None]:
explainer = shap.TreeExplainer(model=lgbm)

In [None]:
# SHAP values for the test dataset and for the full dataset.
# When the explainer returns a list, keep the element at index 1.
# NOTE(review): index 1 presumably corresponds to the positive class (recommended) - confirm.
raw_shap_values_test = explainer.shap_values(X_test_for_shap)
raw_shap_values = explainer.shap_values(X_for_shap)
shap_values_test = raw_shap_values_test[1] if isinstance(raw_shap_values_test, list) else raw_shap_values_test
shap_values = raw_shap_values[1] if isinstance(raw_shap_values, list) else raw_shap_values

In [None]:
# SHAP interaction values are only computed for the scaled, one-hot encoded dataset.
if transform_dataset == 'scaling_and_one_hot_encoding':
    raw_int_values_test = explainer.shap_interaction_values(X_test_for_shap)
    raw_int_values = explainer.shap_interaction_values(X_for_shap)
    # When the explainer returns a list, keep the element at index 1.
    shap_int_values_test = raw_int_values_test[1] if isinstance(raw_int_values_test, list) else raw_int_values_test
    shap_int_values = raw_int_values[1] if isinstance(raw_int_values, list) else raw_int_values

In [None]:
# shap_values[0].shape
# shap_values[1].shape
# shap_int_values_test.shape
# shap_int_values.shape

In [None]:
# Default probabilities used as fallback / reference base values.
default_prob_for_expected_value = 0.5
default_prob_for_expected_value_pos = 0.999
default_prob_for_expected_value_neg = 0.001

# Convert the probabilities to log-odds, the scale used with link='logit' in the plots.
default_expected_value = np.log(default_prob_for_expected_value / (1 - default_prob_for_expected_value))
default_expected_value_pos = np.log(default_prob_for_expected_value_pos / (1 - default_prob_for_expected_value_pos))
default_expected_value_neg = np.log(default_prob_for_expected_value_neg / (1 - default_prob_for_expected_value_neg))

expected_value = explainer.expected_value

if expected_value is None:
    # Check for None BEFORE formatting: '{:.2f}'.format(None) raises a TypeError.
    print('Explainer expected value: not available, using the default')
    expected_value = default_expected_value
else:
    # The explainer may return a scalar, a list or a NumPy array.
    # With more than one entry keep the one at index 1 (as for the SHAP
    # values); with a single entry keep that one.
    expected_value_entries = np.ravel(expected_value)
    if expected_value_entries.size > 1:
        expected_value = float(expected_value_entries[1])
    else:
        expected_value = float(expected_value_entries[0])
    print('Explainer expected value: {:.2f}'.format(expected_value))

expected_value_pos = default_expected_value_pos
expected_value_neg = default_expected_value_neg

print('Expected value used in the plots: {:.2f} for all records, {:.2f} for strong recommended and {:.2f} for strong not recommended'.format(expected_value, expected_value_pos, expected_value_neg))

In [None]:
# Probability thresholds defining the subsets of interest.
th_strong_rec = 0.995
th_strong_not_rec = 0.005
th_misclassified = 0.8

# Boolean masks over the records of the full dataset.
is_strong_rec = y_prob > th_strong_rec
is_strong_not_rec = y_prob < th_strong_not_rec
# Predicted probability further than the threshold from the true label.
is_misclassified = np.abs(y_prob - y) > th_misclassified

# Subset corresponding to high probability to recommend.
X_strong_rec = X_for_shap[is_strong_rec]
shap_values_strong_rec = shap_values[is_strong_rec]
# Subset corresponding to low probability to recommend.
X_strong_not_rec = X_for_shap[is_strong_not_rec]
shap_values_strong_not_rec = shap_values[is_strong_not_rec]
# Misclassified records.
X_misclassified = X_for_shap[is_misclassified]
shap_values_misclassified = shap_values[is_misclassified]

n_strong_rec = X_strong_rec.shape[0]
n_strong_not_rec = X_strong_not_rec.shape[0]
n_misclassified = X_misclassified.shape[0]

print('Number of customer reviews with prediction of recommendation > {:.3f}: {:d}'.format(th_strong_rec,n_strong_rec))
print('Number of customer reviews with prediction of recommendation < {:.3f}: {:d}'.format(th_strong_not_rec,n_strong_not_rec))
print('Number of customer reviews with misclassified prediction of recommendation: {:d}'.format(n_misclassified))

#### 3.8.2 - Summary plot

The **summary plot** shows the feature importance based on the SHAP values.

In [None]:
shap.summary_plot(shap_values, X_for_shap, plot_type='bar')

In [None]:
# One bar summary plot per traveller type, restricted to the matching records.
for traveller_type in ('Business', 'Family Leisure'):
    is_selected = df['traveller_type'] == traveller_type
    shap.summary_plot(shap_values[is_selected], X_for_shap[is_selected], plot_type='bar')

In [None]:
X_for_shap

In [None]:
shap.summary_plot(shap_values, X_for_shap)

#### 3.8.3 - Dependence plot

The **dependence plot** allows us to visualize how the SHAP value associated with a certain feature changes as a function of the value of that feature. The color scale adds information on the value of a second feature, showing possible interactions between the two features.

In [None]:
# Dependence of the SHAP value of 'value_for_money' on the feature value,
# coloured by the cabin feature(s) available for the chosen transformation.
if 'value_for_money' in X_for_shap.columns:
    if transform_dataset == 'scaling_and_one_hot_encoding':
        for cabin_feat in ('cabin_Economy Class', 'cabin_Business Class'):
            shap.dependence_plot('value_for_money', shap_values, X_for_shap, interaction_index=cabin_feat)
    elif transform_dataset == 'label_encoding':
        shap.dependence_plot('value_for_money', shap_values, X_for_shap, interaction_index='cabin')

In [None]:
# Dependence of the SHAP value of 'polarity' on the feature value.
# 'polarity' is only part of the features when the review text is used, so
# check that the column exists (as done for 'value_for_money') instead of
# failing when it is absent.
if 'polarity' in X_for_shap.columns.tolist():
    if transform_dataset=='scaling_and_one_hot_encoding':
        shap.dependence_plot('polarity', shap_values, X_for_shap, interaction_index='cabin_Economy Class')
        shap.dependence_plot('polarity', shap_values, X_for_shap, interaction_index='cabin_Business Class')
    elif transform_dataset=='label_encoding':
        shap.dependence_plot('polarity', shap_values, X_for_shap, interaction_index='cabin')

In [None]:
# Dependence of the SHAP value of 'review_characters' on the feature value,
# coloured by the cabin feature(s) available for the chosen transformation.
if transform_dataset == 'scaling_and_one_hot_encoding':
    for cabin_feat in ('cabin_Economy Class', 'cabin_Business Class'):
        shap.dependence_plot('review_characters', shap_values, X_for_shap, interaction_index=cabin_feat)
elif transform_dataset == 'label_encoding':
    shap.dependence_plot('review_characters', shap_values, X_for_shap, interaction_index='cabin')

#### 3.8.4 - Decision plot

**SHAP decision plots** allow us to visualize how a model arrives at a certain prediction, thus giving some insight into how decisions are made.  
For each feature, from the bottom to the top, we see how the prediction changes when a certain feature is taken into account. The contribution of the features at the bottom is usually small (lower importance), while the contribution of the features at the top becomes larger and larger (higher importance).  
Individual predictions can be highlighted using a dotted line style.   
In the decision plot, we can look at the SHAP values or at the SHAP interaction values.  

By looking at several predictions in an aggregated form, we can identify typical prediction paths.  
For example, we can look for patterns among the most positive or most negative customer reviews, or look at the customer reviews that are misclassified by the model and try to understand why this is happening.

In [None]:
shap_values[:20].shape

In [None]:
X_for_shap.iloc[:20].head()

In [None]:
shap.decision_plot(base_value=expected_value, shap_values=shap_values[:20], features=X_for_shap.iloc[:20], link='logit', color_bar=True, highlight=0)

In [None]:
if transform_dataset=='scaling_and_one_hot_encoding':
    shap.decision_plot(base_value=expected_value, shap_values=shap_int_values[:20], features=X_for_shap.iloc[:20], link='logit', color_bar=True, highlight=0)

In [None]:
shap.decision_plot(base_value=expected_value, 
                   shap_values=shap_values_strong_rec, 
                   features=X_strong_rec, 
                   link='logit', 
                   color_bar=True, 
                   feature_order='hclust', 
                   ignore_warnings=True, 
                   xlim=(0.98,1.))

In [None]:
shap.decision_plot(base_value=expected_value, 
                   shap_values=shap_values_strong_not_rec, 
                   features=X_strong_not_rec, 
                   link='logit', 
                   color_bar=True, 
                   feature_order='hclust', 
                   ignore_warnings=True,
                   xlim=(0.,0.02))

In [None]:
shap.decision_plot(base_value=expected_value, shap_values=shap_values_misclassified, features=X_misclassified, link='logit', color_bar=True, feature_order='hclust', ignore_warnings=True)

In [None]:
shap.decision_plot(base_value=expected_value, 
                   shap_values=shap_values[df['traveller_type']=='Business'], 
                   features=X_for_shap[df['traveller_type']=='Business'], 
                   link='logit', 
                   color_bar=True, 
                   feature_order='hclust', 
                   ignore_warnings=True)

#### 3.8.5 - Force plot

In [None]:
shap.force_plot(base_value=expected_value, shap_values=shap_values[0], features=X_for_shap.iloc[0], link='logit')

In [None]:
X_for_shap.iloc[0].head()

In [None]:
shap.force_plot(base_value=expected_value, shap_values=shap_values[20], features=X_for_shap.iloc[20], link='logit')

In [None]:
X_for_shap.iloc[20].head()

## 4 - Save output

In [None]:
df_out.to_csv(df_out_filename)