# DSCI 552 Project
### Data Preparation

Data source:
https://archive.ics.uci.edu/ml/datasets/Online+Shoppers+Purchasing+Intention+Dataset

SHAP:
https://shap.readthedocs.io/en/latest/index.html

----

## TODO List:
- [ ] New SMOTE method
- [x] Scale all numerical data
- [ ] DAC and SHAP together modeling for each model
- [ ] Start compiling results from each model


In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import pprint
import warnings
from collections import Counter

# data preprocessing
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, TransformerMixin
from imblearn.over_sampling import SMOTE  # try for imbalanced classes
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV

# classification models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)

# metrics
from sklearn.metrics import accuracy_score, auc, roc_auc_score, classification_report

from google.colab import files

In [6]:
# import shap
# shap.initjs()
# shap.__version__

In [7]:
## Helper function to get feature names after Column Transformer
"""
Code used from
https://johaupt.github.io/scikit-
learn/tutorial/python/data%20processing/ml%20pipeline/model%20interpretation/columnTransformer_feature_names.html
See reference in technical report for full citation.
"""

def get_feature_names(column_transformer):
    """Get feature names from all transformers.
    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # Remove the internal helper function
    #check_is_fitted(column_transformer)
    
    # Turn loopkup into function for better handling with pipeline later
    def get_names(trans):
        # >> Original get_feature_names() method
        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            return []
        if trans == 'passthrough':
            if hasattr(column_transformer, '_df_columns'):
                if ((not isinstance(column, slice))
                        and all(isinstance(col, str) for col in column)):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
        if not hasattr(trans, 'get_feature_names'):
        # >>> Change: Return input column names if no method avaiable
            # Turn error into a warning
            warnings.warn("Transformer %s (type %s) does not "
                                 "provide get_feature_names. "
                                 "Will return input column names if available"
                                 % (str(name), type(trans).__name__))
            # For transformers without a get_features_names method, use the input
            # names to the column transformer
            if column is None:
                return []
            else:
                return [name + "__" + f for f in column]

        return [name + "__" + f for f in trans.get_feature_names()]
    
    ### Start of processing
    feature_names = []
    
    # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
    if type(column_transformer) == sklearn.pipeline.Pipeline:
        l_transformers = [(name, trans, None, None) for step, name, trans in column_transformer._iter()]
    else:
        # For column transformers, follow the original method
        l_transformers = list(column_transformer._iter(fitted=True))
    
    
    for name, trans, column, _ in l_transformers: 
        if type(trans) == sklearn.pipeline.Pipeline:
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # if pipeline has no transformer that returns names
            if len(_names)==0:
                _names = [name + "__" + f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))
    
    return feature_names

In [8]:
!wget https://github.com/EricaXia/dsci_552_project/raw/main/data/online_shoppers_intention.csv
df = pd.read_csv('online_shoppers_intention.csv')
print(df.shape)
df.head(3)

--2021-03-31 19:15:38--  https://github.com/EricaXia/dsci_552_project/raw/main/data/online_shoppers_intention.csv
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/EricaXia/dsci_552_project/main/data/online_shoppers_intention.csv [following]
--2021-03-31 19:15:39--  https://raw.githubusercontent.com/EricaXia/dsci_552_project/main/data/online_shoppers_intention.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1059732 (1.0M) [text/plain]
Saving to: ‘online_shoppers_intention.csv.1’


2021-03-31 19:15:39 (13.2 MB/s) - ‘online_shoppers_intention.csv.1’ saved [1059732/1059732]

(12330, 18)


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False


Target variable to predict is **Revenue**

In [9]:
df.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,2.124006,2.357097,3.147364,4.069586
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,0.911325,1.717277,2.401591,4.025169
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157213,0.016813,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


In [10]:
# class label counts for Revenue
counts = df.iloc[:,-1].value_counts()
print(counts)
n_false = counts.iloc[0]
n_true = counts.iloc[1]
total = n_false + n_true
print(f"{round(n_false / total, 2) * 100}% False") 
print(f"{round(n_true / total, 2) * 100}% True")

False    10422
True      1908
Name: Revenue, dtype: int64
85.0% False
15.0% True


# Data Cleaning:
1. Encoding categorical or boolean variables

In [11]:
## encode target variable using LabelEncoder (so the target var stays in 1 col and not 2)
labeler = LabelEncoder()
df['Revenue'] = labeler.fit_transform(df['Revenue'])

In [12]:
cat_cols = list(df.select_dtypes(include=['object', 'bool']).columns)
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
print("Categorical:", cat_cols)
print("Numerical:", num_cols)

Categorical: ['Month', 'VisitorType', 'Weekend']
Numerical: Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay',
       'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'Revenue'],
      dtype='object')


In [13]:
## Transform pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')), 
    ('std_scaler', StandardScaler()),  ## standardize num input
])

pipeline = ColumnTransformer([
    ("numerical", num_pipeline, num_cols),
    ("categorical", OneHotEncoder(), cat_cols)
])

In [14]:
arr = pipeline.fit_transform(df)  
print(arr.shape) 

(12330, 30)


In [15]:
feature_names = get_feature_names(pipeline)
print(len(feature_names))
feature_names2 = [x for x in feature_names if x != 'numerical__Revenue']

30




## Fix class imbalance with SMOTE

In [16]:
# transformed df
df2 = pd.DataFrame(arr, columns=feature_names)

# drop Weekend=False to avoid multicollinearity
df2.drop('categorical__x2_False', axis=1, inplace=True)
print(df2.shape)
df2.head(3)

(12330, 29)


Unnamed: 0,numerical__Administrative,numerical__Administrative_Duration,numerical__Informational,numerical__Informational_Duration,numerical__ProductRelated,numerical__ProductRelated_Duration,numerical__BounceRates,numerical__ExitRates,numerical__PageValues,numerical__SpecialDay,numerical__OperatingSystems,numerical__Browser,numerical__Region,numerical__TrafficType,numerical__Revenue,categorical__x0_Aug,categorical__x0_Dec,categorical__x0_Feb,categorical__x0_Jul,categorical__x0_June,categorical__x0_Mar,categorical__x0_May,categorical__x0_Nov,categorical__x0_Oct,categorical__x0_Sep,categorical__x1_New_Visitor,categorical__x1_Other,categorical__x1_Returning_Visitor,categorical__x2_True
0,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,-1.233426,-0.790293,-0.894178,-0.762629,-0.427872,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.590903,-0.457683,1.171473,-0.317178,-0.308821,-0.136078,-0.207952,-0.894178,-0.514182,-0.427872,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,2.058618,-0.790293,2.437081,-0.265735,-0.427872,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [17]:
## Train/Test split
# Do this BEFORE oversampling so that some of the same examples won't be in both the train and the test sets
X = df2.drop(['numerical__Revenue'], axis=1)
y = df2['numerical__Revenue']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(8261, 28) (4069, 28) (8261,) (4069,)


In [18]:
# define X and Y
# n_features = arr.shape[1]
# X = arr[:, : n_features - 2]
# y = arr[:, -1:]


## SMOTE
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)
X_test, y_test = oversample.fit_resample(X_test, y_test)

y_train, y_test = y_train.ravel(), y_test.ravel()
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(13972, 28) (6872, 28) (13972,) (6872,)


  TARGET_KIND, type_of_target(y)))
  TARGET_KIND, type_of_target(y)))


In [21]:
## Save arrays for other models use in other notebooks
## This just saves locally, can upload to Github, etc later

with open('X_train.npy', 'wb') as f:
    np.save(f, X_train)
files.download('X_train.npy')
with open('X_test.npy', 'wb') as f:
    np.save(f, X_test)
files.download('X_test.npy')
with open('y_train.npy', 'wb') as f:
    np.save(f, y_train)
files.download('y_train.npy')
with open('y_test.npy', 'wb') as f:
    np.save(f, y_test)
files.download('y_test.npy')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>