# Libraries

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Loading Data

In [10]:
df = pd.read_csv("C:/Users/goodb/PRIME BATCH/MINI PROJECTS/Project-2_Visitor_to_purchase_prdiction/shop_smart_ecommerce.csv")

# df.head()
# df.describe()
# df.info()
# df.isnull().sum()
# df.head()

# Feature Selection

In [11]:
X = df.drop(columns=["Revenue"])
y = df["Revenue"].astype(int)

num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object", "category"]).columns

# Train Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Preprocessing of data

In [21]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ]
)

X_train

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
4263,8,732.250000,0,0.0,34,1345.755952,0.005128,0.013342,12.274195,0.8,May,3,3,3,2,Returning_Visitor,False
5905,0,0.000000,0,0.0,4,157.200000,0.040000,0.100000,0.000000,0.0,Oct,1,8,3,1,Returning_Visitor,True
9434,0,0.000000,0,0.0,4,42.000000,0.000000,0.050000,0.000000,0.0,Dec,2,10,1,2,Returning_Visitor,False
3505,2,338.000000,0,0.0,17,1205.566667,0.012500,0.037500,19.236250,0.8,May,3,2,2,2,Returning_Visitor,False
2067,0,0.000000,0,0.0,32,827.646212,0.000000,0.000587,54.676348,0.0,Mar,2,2,7,2,Returning_Visitor,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2419,4,69.000000,1,0.0,19,580.500000,0.010526,0.049123,22.329347,0.0,May,2,2,1,3,Returning_Visitor,True
1200,4,16.666667,0,0.0,68,2593.741667,0.002941,0.012843,0.000000,0.0,Mar,2,2,4,8,Returning_Visitor,False
2398,0,0.000000,0,0.0,17,433.133333,0.023529,0.051961,0.000000,1.0,May,3,2,4,4,Returning_Visitor,True
11106,0,0.000000,0,0.0,73,2784.800000,0.002740,0.015318,6.135982,0.0,Nov,2,2,5,2,Returning_Visitor,True


# Decision Tree Classifier

In [14]:
dt = DecisionTreeClassifier(
    max_depth = 6,
    min_samples_leaf = 30,
    class_weight = "balanced",
    random_state = 42
)

# Pipelining Preprocessed data

In [22]:
pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", dt)
    ]
)


4722    0
6835    0
5524    1
663     0
136     0
       ..
4648    0
1804    0
4278    0
6244    0
7573    0
Name: Revenue, Length: 2466, dtype: int64

In [16]:
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

print("F1 Score:",f1_score(y_test, y_pred))
print("\nClassification Report:\n",classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n",confusion_matrix(y_test, y_pred))

F1 Score: 0.6278381046396841

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.85      0.90      2084
           1       0.50      0.83      0.63       382

    accuracy                           0.85      2466
   macro avg       0.73      0.84      0.77      2466
weighted avg       0.89      0.85      0.86      2466


Confusion Matrix:
 [[1771  313]
 [  64  318]]


# Hyperparameter Tuning

In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "model__max_depth": [4, 6, 8],
    "model__min_samples_leaf": [20, 30, 50]
}

grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1 # "n_jobes=-1" used to use all CPU cores for faster computation
)

grid.fit(X_train, y_train)

print("Best F1 Score:", grid.best_score_)
print("Best params:", grid.best_params_)

Best F1 Score: 0.6343735129725652
Best params: {'model__max_depth': 4, 'model__min_samples_leaf': 50}


In [32]:
# Creating best model
dt = DecisionTreeClassifier(
    max_depth = 4,
    min_samples_leaf = 50,
    class_weight = "balanced",
    random_state = 42
)

best_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", dt)
    ]
)

best_model.fit(X_train, y_train)

In [33]:
print("Enter customer session details:\n")

data = {
    "Administrative": [int(input("Administrative pages: "))],
    "Administrative_Duration": [float(input("Administrative duration: "))],
    "Informational": [int(input("Informational pages: "))],
    "Informational_Duration": [float(input("Informational duration: "))],
    "ProductRelated": [int(input("Product related pages: "))],
    "ProductRelated_Duration": [float(input("Product related duration: "))],
    "BounceRates": [float(input("Bounce rate: "))],
    "ExitRates": [float(input("Exit rate: "))],
    "PageValues": [float(input("Page value: "))],
    "SpecialDay": [float(input("Special day (0–1): "))],
    "Month": [input("Month (e.g. May): ")],
    "OperatingSystems": [int(input("Operating system (number): "))],
    "Browser": [int(input("Browser (number): "))],
    "Region": [int(input("Region (number): "))],
    "TrafficType": [int(input("Traffic type (number): "))],
    "VisitorType": [input("Visitor type (Returning_Visitor / New_Visitor): ")],
    "Weekend": [input("Weekend (True/False): ") == "True"]
}

user_df = pd.DataFrame(data)


Enter customer session details:



Administrative pages:  2
Administrative duration:  120
Informational pages:  0
Informational duration:  0
Product related pages:  18
Product related duration:  650
Bounce rate:  0.02
Exit rate:  0.07
Page value:  14.9
Special day (0–1):  0
Month (e.g. May):  Jan
Operating system (number):  5
Browser (number):  8
Region (number):  4
Traffic type (number):  7
Visitor type (Returning_Visitor / New_Visitor):  Returning_Visitor
Weekend (True/False):  True


In [35]:
prediction = best_model.predict(user_df)
probability = best_model.predict_proba(user_df)



if prediction[0] == 1:
    print("\nCustomer is likely to BUY")
else:
    print("\nCustomer is NOT likely to buy")

print(f"Purchase probability: {probability[0][1]:.2f}")



Customer is NOT likely to buy
Purchase probability: 0.49
