In [None]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn import model_selection

## Importing Data

In [None]:
# Load the data set directly from the UCI machine-learning-databases URL
# (the cell needs network access when it runs).
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00468/online_shoppers_intention.csv')

## Exploratory Data Analysis

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Summary statistics with pandas' default column selection.
df.describe()

In [None]:
# Summary restricted to the object- and bool-typed columns.
df.describe(include=['object','bool'])

In [None]:
# Continuous features that are plotted as histograms and later transformed
# to reduce their skew.
num_col = [
    'Administrative_Duration',
    'Informational_Duration',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
]

### Data Visualizations

In [None]:
# Box plots of the frame's columns on one wide canvas, to eyeball outliers.
fig, ax = plt.subplots(figsize=(30, 10))
df.boxplot(ax=ax)

In [None]:
# Histograms of the continuous features in the raw frame.
df.hist(column=num_col,figsize=(20,20))

In [None]:
# Pie chart of the share of rows in each Revenue class.
plt.figure(figsize=(10, 10))
revenue_counts = df['Revenue'].value_counts()
revenue_counts.plot(
    kind='pie',
    autopct='%1.1f',
    textprops={'fontsize': 15},
    startangle=90,
    explode=(0.1, 0),  # offsets the first wedge; expects exactly two classes
    colors=['slategray', 'cornflowerblue'],
)
plt.title('Revenue', fontsize=18)
plt.ylabel('')

In [None]:
# Bar chart of the Revenue class counts.
# The column is passed as `x=` explicitly: since seaborn 0.12 the only
# positional argument of countplot is `data`, so a positionally passed Series
# is no longer interpreted as the variable to count.
plt.title('Number of Customers adding Revenue')
sns.countplot(x=df['Revenue'])

In [None]:
# Discrete features to compare against the Revenue outcome.
column1l = [
    'Administrative', 'Informational', 'ProductRelated', 'SpecialDay',
    'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'Month',
    'VisitorType', 'Weekend',
]

# One count plot per feature, split by Revenue, on a 6x2 grid.
plt.figure(figsize=(30, 30))
for plot_number, i in enumerate(column1l, start=1):
    ax = plt.subplot(6, 2, plot_number, adjustable='datalim')
    # Keyword `x=` is required: since seaborn 0.12 the only positional
    # argument of countplot is `data`. `ax=` pins the plot to this subplot
    # rather than relying on the implicit "current axes".
    sns.countplot(x=df[i], hue=df['Revenue'], ax=ax)
    ax.set_title('Customers adding Revenue based on ' + i)

# Compute the layout once, after every subplot exists, instead of
# recomputing it on each loop iteration.
plt.tight_layout()

In [None]:
# Work on a copy so that the raw frame `df` is not modified by the
# transformations and encodings applied to `df1` below.
df1=df.copy()
df1.head()

In [None]:
# Histograms of the continuous features on the fresh copy (same values as `df`
# at this point in the notebook).
df1.hist(column=num_col,figsize=(20,20))

In [None]:
# Transform the continuous features to reduce their skew. Every transform
# reads from the raw frame `df` and writes into `df1`, so re-running this cell
# produces the same result.
#
# Box-Cox requires strictly positive input, hence the +1 shift on its inputs.
# st.boxcox returns (transformed values, fitted lambda). The lambdas are bound
# to descriptive names and must never be unpacked into `pd`: doing so rebinds
# the pandas alias to a float and breaks every later `pd.` call, such as the
# `pd.DataFrame(...)` that builds the results table.
df1['Administrative_Duration'] = 1 / (df['Administrative_Duration'] + 1)  # reciprocal
df1['Informational_Duration'], lambda_informational = st.boxcox(df['Informational_Duration'] + 1)
df1['ProductRelated_Duration'], lambda_product_related = st.boxcox(df['ProductRelated_Duration'] + 1)
df1['BounceRates'] = df['BounceRates'] ** 0.2  # fifth root
df1['ExitRates'] = df['ExitRates'] ** 0.2  # fifth root
df1['PageValues'], lambda_page_values = st.boxcox(df['PageValues'] + 1)

In [None]:
# Histograms of the same features after the transforms, for comparison with
# the earlier plots.
df1.hist(column=num_col,figsize=(20,20))

## Data preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the Weekend and Revenue values with integer codes.
# LabelEncoder numbers the sorted distinct values 0..n-1; the same encoder
# object is refitted for each column in turn.
le = LabelEncoder()
for flag_column in ('Weekend', 'Revenue'):
    df1[flag_column] = le.fit_transform(df1[flag_column])

df1.head()

In [None]:
# Frequency of each TrafficType value; the most frequent ones are one-hot
# encoded in the cells below.
df1['TrafficType'].value_counts()

In [None]:
# The ten most frequent TrafficType values, most common first.
traffic_counts = df1['TrafficType'].value_counts().sort_values(ascending=False)
top_10_traffic = list(traffic_counts.head(10).index)
top_10_traffic

In [None]:
def one_hot_top_x(dataframe,variable,top_x_labels):
    """Add 0/1 indicator columns for selected values of a column, in place.

    For every ``label`` in ``top_x_labels`` a column named
    ``f"{variable}_{label}"`` is added to ``dataframe``; it holds 1 where
    ``dataframe[variable] == label`` and 0 elsewhere. Rows whose value is not
    in ``top_x_labels`` therefore get 0 in every indicator column.

    The columns are written to the frame passed in as ``dataframe`` rather
    than to a module-level frame, so the helper works on any DataFrame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify in place; must contain the column ``variable``.
    variable : str
        Name of the column to encode.
    top_x_labels : iterable
        Values of ``variable`` that each get an indicator column.

    Returns
    -------
    None
    """
    for label in top_x_labels:
        dataframe[variable+'_'+str(label)] = np.where(dataframe[variable]==label,1,0)

In [None]:
# Add one TrafficType_<value> indicator column per entry of top_10_traffic.
one_hot_top_x(df1,'TrafficType',top_10_traffic)
df1.head()

In [None]:
# The eight most frequent Browser values, most common first.
browser_counts = df1['Browser'].value_counts().sort_values(ascending=False)
top_8_browser = list(browser_counts.head(8).index)
top_8_browser

In [None]:
# Add one Browser_<value> indicator column per entry of top_8_browser.
one_hot_top_x(df1,'Browser',top_8_browser)
df1.head()

In [None]:
# Histogram of the Month column.
df1['Month'].hist()

In [None]:
# The eight most frequent Month values, most common first.
month_counts = df1['Month'].value_counts().sort_values(ascending=False)
top_8_month = list(month_counts.head(8).index)
top_8_month

In [None]:
# Add one Month_<value> indicator column per entry of top_8_month.
one_hot_top_x(df1,'Month',top_8_month)
df1.head()

In [None]:
# The five most frequent OperatingSystems values, most common first.
os_counts = df1['OperatingSystems'].value_counts().sort_values(ascending=False)
top_5_os = list(os_counts.head(5).index)
top_5_os

In [None]:
# Add one OperatingSystems_<value> indicator column per entry of top_5_os.
one_hot_top_x(df1,'OperatingSystems',top_5_os)
df1.head()

In [None]:
# Up to five most frequent VisitorType values (head() defaults to 5 rows),
# most common first.
visitor_counts = df1['VisitorType'].value_counts().sort_values(ascending=False)
labels = list(visitor_counts.head().index)
labels

In [None]:
def one_hot_encode(dataframe,variable,labels):
    """Add a 0/1 indicator column for each given value of a column, in place.

    For every ``label`` in ``labels`` a column named
    ``f"{variable}_{label}"`` is added to ``dataframe``; it holds 1 where
    ``dataframe[variable] == label`` and 0 elsewhere.

    The columns are written to the frame passed in as ``dataframe`` rather
    than to a module-level frame, so the helper works on any DataFrame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify in place; must contain the column ``variable``.
    variable : str
        Name of the column to encode.
    labels : iterable
        Values of ``variable`` that each get an indicator column.

    Returns
    -------
    None
    """
    for label in labels:
        dataframe[variable+'_'+str(label)] = np.where(dataframe[variable]==label,1,0)
# Add one VisitorType_<value> indicator column per entry of `labels`.
one_hot_encode(df1,'VisitorType',labels)

In [None]:
# Preview the frame with all indicator columns added.
df1.head()

In [None]:
# Remove the original categorical columns; their selected levels now live in
# the indicator columns. Region is removed as well, although no indicator
# columns were created for it above.
df_final = df1.drop(
    columns=[
        'Month',
        'OperatingSystems',
        'Browser',
        'TrafficType',
        'VisitorType',
        'Region',
    ]
)
df_final.head()

In [None]:
# (rows, columns) of the modelling frame, target column included.
df_final.shape

In [None]:
# Separate the target column from the feature matrix.
y = df_final['Revenue']
X = df_final.drop(columns='Revenue')

## Data Normalization

In [None]:
# Standardise the features before PCA.
# NOTE(review): the scaler (and the PCA in the next cell) are fitted on the
# complete data set, while the cross-validation further down splits X_pca
# afterwards, so the held-out folds have already influenced the
# preprocessing. Consider fitting both inside a Pipeline per fold.
ss=StandardScaler()
Xs_pca=ss.fit_transform(X)

In [None]:
# A float n_components between 0 and 1 asks PCA to keep as many components
# as are needed to explain that fraction (here 95%) of the variance.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(Xs_pca)

In [None]:
# (rows, number of retained principal components).
X_pca.shape

## Model Building

In [None]:
# Candidate classifiers to compare on the PCA features; library defaults are
# used apart from the arguments shown.
LR_pca=LogisticRegression()
rfc_pca=RandomForestClassifier(n_estimators=100,random_state=0)  # fixed seed for repeatable runs
knn_pca=KNeighborsClassifier()

In [None]:
# (display name, estimator) pairs consumed by the evaluation loop.
models_pca = [
    ('Logistic', LR_pca),
    ('Random Forest', rfc_pca),
    ('KNN', knn_pca),
]

### Performance Analysis

In [None]:
# Cross-validated comparison of the candidate models on the PCA features.
#
# cross_validate scores every metric from a single fit per fold, so each
# model is trained 10 times in total instead of 10 times per metric. The
# folds come from one seeded KFold, which yields the same splits every time
# it is used.
results_pca=[]           # per-fold ROC-AUC arrays, one entry per model
acc_score_pca=[]
auc_score_pca=[]
bias_pca=[]              # sample variance of the per-fold ROC-AUC ('Variance Error' column)
f1_score_pca=[]
precision_score_pca=[]
recall_score_pca=[]
names_pca=[]

# Loop-invariant: the splitter and the metric list are the same for all models.
kfold=model_selection.KFold(shuffle=True,n_splits=10,random_state=0)
scoring=['roc_auc','f1_weighted','accuracy','precision_weighted','recall_weighted']

for name,model in models_pca:
    # Returns a dict with one 'test_<metric>' array (one value per fold)
    # for every entry of `scoring`.
    cv_scores=model_selection.cross_validate(model,X_pca,y,cv=kfold,scoring=scoring)

    cv_results=cv_scores['test_roc_auc']
    results_pca.append(cv_results)
    bias_pca.append(np.var(cv_results,ddof=1))
    auc_score_pca.append(np.mean(cv_results))

    f1_score_pca.append(np.mean(cv_scores['test_f1_weighted']))
    acc_score_pca.append(np.mean(cv_scores['test_accuracy']))
    precision_score_pca.append(np.mean(cv_scores['test_precision_weighted']))
    # NOTE(review): support-weighted recall is arithmetically equal to
    # accuracy, so this column is expected to mirror 'Accuracy Score'.
    recall_score_pca.append(np.mean(cv_scores['test_recall_weighted']))

    names_pca.append(name)

# One row per model, one column per averaged metric.
result_pca_df=pd.DataFrame({'Model':names_pca,
                           'Accuracy Score':acc_score_pca,
                            'ROC-AUC Score':auc_score_pca,
                            'Variance Error':bias_pca,
                            'F1 Score':f1_score_pca,
                            'Precision Score':precision_score_pca,
                            'Recall Score':recall_score_pca})

In [None]:
# Display the cross-validated metric table, one row per model.
result_pca_df