In [7]:
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
full_data = pd.read_csv("../data/online_shoppers_intention.csv")
cols = list(full_data.columns)

In [3]:
full_data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

## Data Preprocessing

### Feature Transformation

In [24]:
data = full_data.copy()
booleanDic = {True:1, False:0}

#data["Weekend"] = data["Weekend"].map(booleanDic)
#data["Revenue"] = data["Revenue"].map(booleanDic)
labeler = LabelEncoder()
data['Revenue'] = labeler.fit_transform(data['Revenue'])

# X = data.copy().drop("Revenue",axis=1).to_numpy()
# y = data["Revenue"].to_numpy()

In [25]:
num_cols = cols[:10]
cat_cols = cols[10:]


cat_cols.remove('Month')
cat_cols.remove('VisitorType')
cat_cols.remove('Weekend')
ohe_cols = ["Month","VisitorType","Weekend"]

In [26]:
num_pipeline = Pipeline([
        ('standard_scaler', StandardScaler())
    ])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", OneHotEncoder(), ohe_cols)
],remainder="passthrough")

In [27]:
data_prepared = full_pipeline.fit_transform(data)

In [28]:
encoded_cols = list(full_pipeline.transformers_[1][1].get_feature_names(ohe_cols))

In [29]:
columns = num_cols+encoded_cols+cat_cols
drop= ["Weekend_False"]

In [30]:
dataDF = pd.DataFrame(data_prepared, columns=columns).reset_index(drop=True)
dataDF = dataDF.copy().drop(columns=drop)

In [57]:
X = dataDF.copy().drop("Revenue",axis=1).to_numpy()
y = dataDF["Revenue"].to_numpy()

columns = list(dataDF.columns)
columns.remove("Revenue")

In [63]:
columns

['Administrative',
 'Administrative_Duration',
 'Informational',
 'Informational_Duration',
 'ProductRelated',
 'ProductRelated_Duration',
 'BounceRates',
 'ExitRates',
 'PageValues',
 'SpecialDay',
 'Month_Aug',
 'Month_Dec',
 'Month_Feb',
 'Month_Jul',
 'Month_June',
 'Month_Mar',
 'Month_May',
 'Month_Nov',
 'Month_Oct',
 'Month_Sep',
 'VisitorType_New_Visitor',
 'VisitorType_Other',
 'VisitorType_Returning_Visitor',
 'Weekend_True',
 'OperatingSystems',
 'Browser',
 'Region',
 'TrafficType']

## Splitting into Train and Test

In [58]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

## SMOTE

In [59]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImPipeline
# https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/

In [60]:
from collections import Counter

In [61]:
oversample = SMOTE()
y=y.astype('float')

X_train1, y_train1 = oversample.fit_resample(X_train, y_train)
X_test1, y_test1 = oversample.fit_resample(X_test, y_test)

counter = Counter(y_train1)
print(counter)
counter = Counter(y_test1)
print(counter)

X1 = pd.DataFrame(X_train1,columns=columns).reset_index(drop=True)
#X1["Revenue"] = y1

Counter({0.0: 6986, 1.0: 6986})
Counter({0.0: 3436, 1.0: 3436})


In [62]:
np.save("X_train.npy",X_train1)
np.save("X_test.npy",X_test1)
np.save("y_train.npy",y_train1)
np.save("y_test.npy",y_test1)

#### SMOTE with Undersampling

In [42]:
over = SMOTE(sampling_strategy=0.30)
under = RandomUnderSampler(sampling_strategy=0.5)

steps = [('o', over), ('u', under)]
pipeline = ImPipeline(steps=steps)

X_train2, y_train2 = oversample.fit_resample(X_train, y_train)
X_test2, y_test2 = oversample.fit_resample(X_test, y_test)

# X2 = pd.DataFrame(X2,columns=columns).reset_index(drop=True)
# X2["Revenue"] = y2