In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_csv('online_shoppers_intention.csv')

In [3]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


# Data Pre-Processing and Modelling

Data pre-processing is the process of performing operations on the raw data to make it compatible for machine learning algorithms.

It shall include the following steps:

- **Data Cleaning**: Since our data does not have any missing values, the process of data cleaning will primarily include removing outliers so that they do not mess with our algorithm.
- **Categorical Feature Encoding** and **Continuous Feature Scaling**: For our project, we will do this using a data pipeline to make the process more streamlined and organized.
- **Model Selection**: We will use the cleaned and tranformed data to evaluate a few machine learning models and select the best one
- **Hyper-parameter Tuning**: Select the best set of hyper-parameters for the chosen model
- **Final Evaluation**: Evaluate the final model fitted with the train data on the test data

## Data Cleaning:

Most columns in our data have quite a few extreme outliers (please check the Exploratory Data Analysis file). For the purpose of this project, we will term an extreme outlier as a value that has a z-score greater than 3.

We may lose quite some data if we delete every observation containing an outlier value. To avoid the loss, we will only delete those observations which contain more outliers than a specified threshold value.

Here's the **step-by-step** process:

- Assign a z-score to every value in every numberic(int or float) column
- Calculate the number of features having z-score>3 for every row
- Delete the rows where number or outliers is greater than the specified threshold value

In [4]:
##The zscore function will calculate the z-score for every value in a particular column
## z-score is basically the distance between the value and the mean expressed in terms of standard-deviation
## value-mean = z-score*standard-deviation
def zscore(x,col):
    mean, std = np.mean(df[col]), np.std(df[col])
    return (x-mean)/std


##The cleaned_data function takes a dataframe as input and returns the same after removing observations with too many outliers
##We will consider a value having |z-score|>3 as an outlier
##The function calculates the number of outliers present in each observations
##It deletes the observations having more outliers than the specified threshold value

def cleaned_data(data,threshold=5):
    ##creating an array of all numeric features
    numerics=['int64','float64']
    num_cols=data.select_dtypes(include=numerics).columns
    ##z_cols keeps track of the new columns created in the function
    z_cols=[]
    for col in num_cols:
        data[f'{col}_z']=(abs(data[col].apply(lambda x:zscore(x,col)))>3)
        z_cols.append(f'{col}_z')
    ##df['Total_Outliers'] contains the total number of outliers for every observation
    data['Total_Outliers']=np.sum(data[z_cols],axis=1)
    z_cols.append('Total_Outliers')
    ##delete observations with number of outliers greater than threshold
    data.drop(data[data['Total_Outliers']>threshold].index,axis=0,inplace=True)
    ##delete all columns created in this function
    data.drop(z_cols,axis=1,inplace=True)
    data.reset_index(inplace=True)
    data.drop('index',axis=1,inplace=True)
    return data

In [5]:
df=cleaned_data(df,threshold=3)

In [6]:
##Checking the number of rows in the cleaned data
df.shape[0]

12265

In [7]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [8]:
##Splitting the data into training and testing halves
from sklearn.model_selection import StratifiedShuffleSplit
split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=101)

for train_idx, test_idx in split.split(df,df['Revenue']):
    train=df.iloc[train_idx]
    test=df.iloc[test_idx]

In [9]:
train.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
8576,6,148.208333,2,139.5,98,3714.352664,0.005,0.01343,0.0,0.0,Nov,2,2,1,2,Returning_Visitor,False,False
9934,11,250.25,0,0.0,169,8276.292565,0.002406,0.016052,8.174729,0.0,Nov,4,1,1,8,Returning_Visitor,True,False
10371,10,143.583333,1,92.0,245,7793.158685,0.001587,0.00461,59.424589,0.0,Nov,3,2,1,8,Returning_Visitor,False,True
5052,0,0.0,0,0.0,12,1287.0,0.0,0.016667,0.0,0.4,May,4,2,2,13,Returning_Visitor,False,False
1206,5,710.333333,0,0.0,23,7533.272727,0.0,0.007407,0.0,0.0,Mar,2,2,1,2,Returning_Visitor,False,False


In [10]:
test.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
5668,0,0.0,0,0.0,9,140.2,0.0,0.044444,0.0,0.0,Sep,2,2,9,2,New_Visitor,True,False
6008,8,116.972222,9,2252.033333,19,1135.880556,0.009677,0.011842,0.0,0.0,Jul,3,2,4,2,Returning_Visitor,True,False
5236,0,0.0,0,0.0,25,375.6,0.0,0.004,0.0,0.4,May,2,2,1,4,Returning_Visitor,False,False
3195,4,118.903226,6,104.5,111,2748.624538,0.00542,0.006678,119.641474,0.0,May,1,1,2,3,Returning_Visitor,False,False
1975,0,0.0,1,0.0,16,485.666667,0.0,0.0375,0.0,0.0,Mar,2,2,6,3,Returning_Visitor,True,False


## Feature Encoding/Scaling using Data Pipeline

Feature Encoding/Scaling is the process of bringing raw data into a format best suited for machine learning algorithms. A number of algorithms only support encoded categorical features. Similarly, a lot of algorithms can only give correct predictions when the continuous features have been scaled (Eg. distance-based algorithms like KNN). Instead of doing the work manually, we shall use Pipeline and ColumnTransformer from the Scikit-Learn library alongside custom functions to automate the task.

Here's the **step-by-step** process:
- Make all the necessary imports (OneHotEncoder, StandardScaler etc.)
- Define a **numeric_pipeline** function for continuous attributes that returns a Pipeline for imputation and scaling alongside an array having the names of all continuous attributes
- Create a master **data_transform** function that uses OneHotEncoder() and numeric_pipeline to transform categorical and continuous attributes respectively. It returns the features as a pandas DataFrame and the label(target) as a pandas Series.

In [11]:
##Necessary imports
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
##for continuous features
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
##for categorical features
from sklearn.preprocessing import OneHotEncoder

In [12]:
def numeric_pipeline(data):
    ##list of all numeric attributes in data
    num_attrs=data.select_dtypes(include=['int64','float64']).columns
    ##pipeline for numeric data tranformation
    num_pipeline=Pipeline([('impute',SimpleImputer(strategy='median')),
                          ('scale',StandardScaler(with_mean=False))])
    return num_attrs, num_pipeline

In [13]:
def data_transform(data):
    
    ##convert categories in the following categorical cols to string from int to avoid confusion
    cols=['OperatingSystems','Browser','Region','TrafficType','Weekend','Revenue']
    for col in cols:
        df[col]=df[col].apply(lambda x:f'{x}')
        
    ##use numeric_pipeline func to get num_attrs and num_pipeline
    num_attrs, num_pipeline=numeric_pipeline(data)
    ##creating a list of categorical features
    cat_attrs=data.select_dtypes(exclude=['int64','float64']).columns
    
    main_pip=ColumnTransformer([('num',num_pipeline,num_attrs),
                               ('cat',OneHotEncoder(),cat_attrs)])
    
    ##data in sparse matrix form
    my_data=main_pip.fit_transform(data)
    
    ##Creating labels for our data
    labels=[]
    for col in data.columns:
        if col in num_attrs:
            labels.append(col)
        else:
            labels.extend(np.sort([f'{col}_{x}' for x in df[col].unique()]))
            
    ##Creating a dataframe using the data and the labels
    transformed_df=pd.DataFrame(data=my_data,columns=labels)
    
    ##'Weekend' and 'Revenue' are boolean columns and hence can be represented by one column alone
    transformed_df.drop(['Weekend_False','Revenue_False'],axis=1,inplace=True)
    
    ##Separating the dataframe into feature and label sets
    features, label=transformed_df.drop('Revenue_True',axis=1), transformed_df['Revenue_True']
    
    
    return features, label

In [14]:
##Creating features and label sets for train data
X_train, y_train=data_transform(test)

In [15]:
X_train.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,...,Month_Oct,Month_Sep,OperatingSystems,Browser,Region,TrafficType,VisitorType_New_Visitor,VisitorType_Other,VisitorType_Returning_Visitor,Weekend_True
0,0.0,0.0,0.0,0.0,0.226709,0.088502,0.0,0.889136,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,2.477873,0.755669,7.125538,16.889713,0.478607,0.717027,0.194155,0.236896,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.629746,0.237098,0.0,0.080022,0.0,1.941821,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.238936,0.768144,4.750359,0.783725,2.796074,1.735075,0.108741,0.133604,6.892892,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.791726,0.0,0.403038,0.306578,0.0,0.750208,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [16]:
y_train.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Revenue_True, dtype: float64

## Model Selection

We will now use the cleaned and transformed data to select the best classifiction algorithm for our data. We will perform cross-validation on X_train and y_train for each algorithm and score them based on roc_auc. We will select the algorithm that gives the best result.

**Models** to evaluate:
- Decision Trees Classifier
- K Neighbors Classifier
- Support Vector Classifier

Here's **the step-by-step** process:
- Import the necessary algorithms and cross_val_score from scikit-learn
- Evaluate every algorithm on X_train and y_train
- Select the algorithm that gives the best score

In [17]:
##import all algorithms to be tested
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [18]:
##import cross_val_score
from sklearn.model_selection import cross_val_score

In [19]:
##create a function to evaluate model
def model_evaluation(model,X,y,folds):
    scores=cross_val_score(estimator=model,X=X,y=y,cv=folds,scoring='roc_auc')
    print(f'{str(model)}')
    print(f'scores: {scores}')
    print(f'mean_score: : {np.mean(scores)}')

In [20]:
##evaluating DecisionTreeClassifier()
model_evaluation(model=DecisionTreeClassifier(),X=X_train,y=y_train,folds=5)

DecisionTreeClassifier()
scores: [0.7161859  0.69587825 0.74017121 0.74385542 0.73421687]
mean_score: : 0.7260615295188851


In [21]:
##evaluating KNeighborsClassifier()
model_evaluation(model=KNeighborsClassifier(),X=X_train,y=y_train,folds=5)

KNeighborsClassifier()
scores: [0.79363782 0.79427711 0.7387603  0.78830522 0.81264257]
mean_score: : 0.785524604897322


In [22]:
##evaluating SVC()
model_evaluation(model=SVC(),X=X_train,y=y_train,folds=5)

SVC()
scores: [0.87064103 0.8862714  0.89606848 0.85012048 0.85583936]
mean_score: : 0.8717881501715364


Model Selected : **Support Vector Classifier**

## Hyper-Parameter Tuning

Hyperparameters are the configurations of a machine learning model that can be set externally by the user. They differ greatly from parameters, which are the configurations that the algorithm learns by itself during training. Hyperparameters give us some control over the learning process of the algorithm and correctly tuning then can greatly improve performance.

For the purpose of this project we will use **GridSearchCV** from sklearn.model_selection for hyperparameter-tuning. It takes in a parameter grid for the model and evaluates all possible combinations of hyperparameters before giving us the best combination.

Here's the **step-by-step** process:

- Import GridSearchCV from sklearn.model_selection
- Specify a parameter grid for SVC()
- Instantiate a GridSearchCV instance
- Fit the instance with X_train and y_train
- Create a model with the best hyperparameter combination

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
##'C' introduces a penalty for every mis-classified point - High value of 'C' means greater variance - Low 'C' means high bias
##'kernel' specifies the kernel function for the classifier
##'degree' specifies the degree of the kernel if it is polynomial
param_grid={'C':np.linspace(0.2,0.4,2),
           'kernel':['linear','poly','rbf'],
           'degree':[1,2,3]}

In [25]:
##Instantiating
gscv=GridSearchCV(estimator=SVC(),param_grid=param_grid,cv=5,scoring='roc_auc',verbose=4)

In [26]:
##Fit with train data
gscv.fit(X_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] C=0.2, degree=1, kernel=linear ..................................
[CV] ...... C=0.2, degree=1, kernel=linear, score=0.885, total=   0.1s
[CV] C=0.2, degree=1, kernel=linear ..................................
[CV] ...... C=0.2, degree=1, kernel=linear, score=0.883, total=   0.1s
[CV] C=0.2, degree=1, kernel=linear ..................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ...... C=0.2, degree=1, kernel=linear, score=0.900, total=   0.1s
[CV] C=0.2, degree=1, kernel=linear ..................................
[CV] ...... C=0.2, degree=1, kernel=linear, score=0.878, total=   0.1s
[CV] C=0.2, degree=1, kernel=linear ..................................
[CV] ...... C=0.2, degree=1, kernel=linear, score=0.872, total=   0.1s
[CV] C=0.2, degree=1, kernel=poly ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s


[CV] ........ C=0.2, degree=1, kernel=poly, score=0.872, total=   0.1s
[CV] C=0.2, degree=1, kernel=poly ....................................
[CV] ........ C=0.2, degree=1, kernel=poly, score=0.877, total=   0.0s
[CV] C=0.2, degree=1, kernel=poly ....................................
[CV] ........ C=0.2, degree=1, kernel=poly, score=0.910, total=   0.1s
[CV] C=0.2, degree=1, kernel=poly ....................................
[CV] ........ C=0.2, degree=1, kernel=poly, score=0.867, total=   0.1s
[CV] C=0.2, degree=1, kernel=poly ....................................
[CV] ........ C=0.2, degree=1, kernel=poly, score=0.866, total=   0.1s
[CV] C=0.2, degree=1, kernel=rbf .....................................
[CV] ......... C=0.2, degree=1, kernel=rbf, score=0.883, total=   0.1s
[CV] C=0.2, degree=1, kernel=rbf .....................................
[CV] ......... C=0.2, degree=1, kernel=rbf, score=0.888, total=   0.1s
[CV] C=0.2, degree=1, kernel=rbf .....................................
[CV] .

[CV] ...... C=0.4, degree=2, kernel=linear, score=0.878, total=   0.1s
[CV] C=0.4, degree=2, kernel=linear ..................................
[CV] ...... C=0.4, degree=2, kernel=linear, score=0.874, total=   0.1s
[CV] C=0.4, degree=2, kernel=poly ....................................
[CV] ........ C=0.4, degree=2, kernel=poly, score=0.869, total=   0.1s
[CV] C=0.4, degree=2, kernel=poly ....................................
[CV] ........ C=0.4, degree=2, kernel=poly, score=0.861, total=   0.1s
[CV] C=0.4, degree=2, kernel=poly ....................................
[CV] ........ C=0.4, degree=2, kernel=poly, score=0.903, total=   0.1s
[CV] C=0.4, degree=2, kernel=poly ....................................
[CV] ........ C=0.4, degree=2, kernel=poly, score=0.868, total=   0.1s
[CV] C=0.4, degree=2, kernel=poly ....................................
[CV] ........ C=0.4, degree=2, kernel=poly, score=0.834, total=   0.1s
[CV] C=0.4, degree=2, kernel=rbf .....................................
[CV] .

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:    7.1s finished


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': array([0.2, 0.4]), 'degree': [1, 2, 3],
                         'kernel': ['linear', 'poly', 'rbf']},
             scoring='roc_auc', verbose=4)

In [30]:
##Check best hyper-parameter combination
gscv.best_params_

{'C': 0.2, 'degree': 1, 'kernel': 'linear'}

In [31]:
##Check the score of the best estimator
gscv.best_score_

0.8835379835129993

In [32]:
##Create best model
svc_classifier=SVC(C=0.2,degree=1,kernel='linear')

## Final Evaluation

In this section we will fit our model on the train data and evaluate it on the test data. We will use roc_auc score for scoring.

In [33]:
from sklearn.metrics import roc_auc_score

In [34]:
##creating X_test and y_test using the data_transform function
X_test, y_test=data_transform(test)

In [35]:
##fitting the model with the train data
svc_classifier.fit(X_train,y_train)

SVC(C=0.2, degree=1, kernel='linear')

In [36]:
##prediciting
predictions=svc_classifier.predict(X_test)

In [37]:
##evaluating
roc_auc_score(y_test,predictions)

0.6776159774714687