In [7]:
import pandas as pd

In [8]:
# Read the online shoppers intention CSV into a DataFrame. The path is
# relative to the directory the notebook is run from.
df = pd.read_csv(
    "online-shoppers-intention/online_shoppers_intention.csv"
)

# Preview the first five rows (five is also the default of head()).
df.head(n=5)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [10]:
# Group the DataFrame's columns by dtype family.
#
# The dtype predicates from pandas.api.types are used instead of comparing
# against 'O' / 'bool'. Defining "numerical" as "neither object nor bool"
# would also sweep in string-, category- or datetime-typed columns, which
# must not be passed to a numeric scaler later on.
bool_col = [
    col for col in df.columns
    if pd.api.types.is_bool_dtype(df[col].dtype)
]

# is_numeric_dtype() reports True for booleans as well, so they are
# excluded explicitly to keep the three lists disjoint.
numerical_col = [
    col for col in df.columns
    if pd.api.types.is_numeric_dtype(df[col].dtype) and col not in bool_col
]

# Text-like columns: object, pandas string, or pandas categorical dtype.
categorical_col = [
    col for col in df.columns
    if pd.api.types.is_object_dtype(df[col].dtype)
    or pd.api.types.is_string_dtype(df[col].dtype)
    or isinstance(df[col].dtype, pd.CategoricalDtype)
]

print(f"Numerical Columns in the dataset:\n{numerical_col}\n")
print(f"Boolen Columns in the dataset:\n{bool_col}\n")
print(f"Categorical Columns in the dataset:\n{categorical_col}")

Numerical Columns in the dataset:
['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'OperatingSystems', 'Browser', 'Region', 'TrafficType']

Boolen Columns in the dataset:
['Weekend', 'Revenue']

Categorical Columns in the dataset:
['Month', 'VisitorType']


In [11]:
# Exclude 'Informational_Duration' from numerical_col because it is one of
# the target columns and must not be used as an input feature.
# The list is rebuilt by filtering rather than calling list.remove(), so the
# cell is idempotent: running it a second time no longer raises ValueError
# when the column has already been removed.
numerical_col = [col for col in numerical_col if col != 'Informational_Duration']

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import xgboost as xgb
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
import numpy as np


# Separate the three prediction targets from the feature matrix.
target_columns = ['Revenue', 'Weekend', 'Informational_Duration']
X = df.drop(columns=target_columns)
y_revenue, y_weekend, y_duration = (df[name] for name in target_columns)

# One shared 80/20 split so that the features and all three targets stay
# row-aligned; random_state pins the shuffle for reproducibility.
(
    X_train, X_test,
    y_revenue_train, y_revenue_test,
    y_weekend_train, y_weekend_test,
    y_duration_train, y_duration_test,
) = train_test_split(
    X,
    y_revenue,
    y_weekend,
    y_duration,
    test_size=0.2,
    random_state=42,
)


# Feature preprocessing: one-hot encode the text columns and standardize the
# numeric ones through a single ColumnTransformer.

# Scaler for every column listed in numerical_col.
numeric_transformer = StandardScaler()

# Dense one-hot encoding; drop='first' omits one indicator per feature.
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)

# Output column order follows the order of this list: one-hot block first,
# standardized numeric block second.
preprocessing_steps = [
    ('onehot', categorical_transformer, ["VisitorType", 'Month']),
    ('std', numeric_transformer, numerical_col),
]
column_transformer = ColumnTransformer(transformers=preprocessing_steps)

# Learn the encoding/scaling parameters on the training split only, then
# reuse them unchanged on the test split.
X_train_transformed = column_transformer.fit_transform(X_train)
X_test_transformed = column_transformer.transform(X_test)

# Label-encode the two classification targets.
#
# Each target gets its own LabelEncoder. Previously a single encoder was
# fitted on Weekend and then re-fitted on Revenue, so the Weekend test labels
# were transformed with the Revenue-fitted encoder. That only produced the
# right values because both targets happen to share the same set of classes.
weekend_label_encoder = LabelEncoder()
revenue_label_encoder = LabelEncoder()

# Fit on the training labels only, then encode them.
y_weekend_train_encoded = weekend_label_encoder.fit_transform(y_weekend_train)
y_revenue_train_encoded = revenue_label_encoder.fit_transform(y_revenue_train)

# Encode the test labels with the encoder fitted on the matching target.
y_weekend_test_encoded = weekend_label_encoder.transform(y_weekend_test)
y_revenue_test_encoded = revenue_label_encoder.transform(y_revenue_test)

# Backward-compatible alias: before this change `label_encoder` ended up
# fitted on Revenue, so it keeps referring to the Revenue encoder.
label_encoder = revenue_label_encoder


# Regression model for "Informational_Duration"

# Hyperparameters for the XGBoost regressor, collected in one place.
duration_model_params = {
    "colsample_bytree": 0.5826334695315012,
    "gamma": 0.01563640674119393,
    "learning_rate": 0.052340148070636965,
    "max_depth": 1,
    "n_estimators": 747,
    "reg_alpha": 0.005522117123602399,
    "reg_lambda": 0.8154614284548342,
    "subsample": 0.8534286719238086,
}
xgb_regressor = xgb.XGBRegressor(**duration_model_params)

# Train on the transformed training features.
xgb_regressor.fit(X_train_transformed, y_duration_train)

# Predicted durations for the held-out test rows.
y_pred_duration = xgb_regressor.predict(X_test_transformed)

# RMSE is the square root of the mean squared error, so it is expressed in
# the same units as the target.
mse = mean_squared_error(y_duration_test, y_pred_duration)
rmse = np.sqrt(mse)

print("Root Mean Squared Error for 'Informational_Duration' model:", rmse)

# Classification model for "Revenue"

# Support vector classifier with library-default settings; fit() returns the
# fitted estimator itself, so construction and training are chained.
revenue_clf = SVC().fit(X_train_transformed, y_revenue_train_encoded)

# Predicted class labels for the held-out test rows.
y_pred_revenue = revenue_clf.predict(X_test_transformed)

# Share of test rows whose predicted label equals the encoded true label.
accuracy = accuracy_score(y_revenue_test_encoded, y_pred_revenue)

print("Accuracy for 'revenue' model:", accuracy)

# Classification model for "Weekend"

# XGBoost classifier with library-default settings; fit() returns the fitted
# estimator itself, so construction and training are chained.
clf_weekend = xgb.XGBClassifier().fit(X_train_transformed, y_weekend_train_encoded)

# Predicted class labels for the held-out test rows.
y_pred_weekend = clf_weekend.predict(X_test_transformed)

# Share of test rows whose predicted label equals the encoded true label.
# Note: this rebinds `accuracy`, replacing the value stored for the revenue model.
accuracy = accuracy_score(y_weekend_test_encoded, y_pred_weekend)

print("Accuracy for 'weekend' model:", accuracy)


Root Mean Squared Error for 'Informational_Duration' model: 112.16358496888647
Accuracy for 'revenue' model: 0.8844282238442822
Accuracy for 'weekend' model: 0.7493917274939172
