In [1]:
# Imports
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
)

# Set seaborn's plotting theme to the "whitegrid" style.
sns.set_theme(style="whitegrid")



In [2]:
# Load dataset
# The path is relative, so it is resolved against the kernel's working directory.
data_path = Path("../../data/raw/online_shoppers_intention.csv")
df = pd.read_csv(data_path)

# Sanity check: print (rows, columns), then display the first rows.
print(df.shape)
df.head()



(12330, 18)


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
# Target & features
# Rebind df to a new frame whose Revenue column is cast from bool to int (0/1).
df = df.assign(Revenue=df["Revenue"].astype(int))

# Features are every column except the target; the target is the Revenue column.
X = df.drop(columns="Revenue")
y = df["Revenue"]

X.head()


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True


In [4]:
# Numeric & categorical feature lists
# "number" matches every numeric dtype rather than only int64/float64, and the
# categorical list is its complement, so each column of X lands in exactly one
# of the two lists. Boolean columns are not numeric dtypes and therefore end up
# in the categorical list.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)



Numeric features: ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'OperatingSystems', 'Browser', 'Region', 'TrafficType']
Categorical features: ['Month', 'VisitorType', 'Weekend']


In [5]:
# Feature Engineering

# Months flagged as "high season". This is only a rough approximation and
# should be called out as such in the write-up.
high_season_months = ["Nov", "Dec", "Mar", "May"]

# assign() returns a new frame, so X itself is left untouched.
X_fe = X.assign(
    # 1) Total_Duration: time summed over the three page categories
    Total_Duration=lambda d: (
        d["Administrative_Duration"]
        + d["Informational_Duration"]
        + d["ProductRelated_Duration"]
    ),
    # 2) PageValue_per_Product: the small constant keeps the denominator
    #    non-zero when ProductRelated is 0
    PageValue_per_Product=lambda d: d["PageValues"] / (d["ProductRelated"] + 1e-3),
    # 3) Is_HighSeason: 1 when Month is one of high_season_months, else 0
    Is_HighSeason=lambda d: d["Month"].isin(high_season_months).astype(int),
    # 4) Is_ReturningVisitor: 1 when VisitorType is "Returning_Visitor", else 0
    Is_ReturningVisitor=lambda d: d["VisitorType"].eq("Returning_Visitor").astype(int),
)

print(X.shape, "->", X_fe.shape)
X_fe.head()


(12330, 17) -> (12330, 21)


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,...,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Total_Duration,PageValue_per_Product,Is_HighSeason,Is_ReturningVisitor
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,...,1,1,1,1,Returning_Visitor,False,0.0,0.0,0,1
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,...,2,2,1,2,Returning_Visitor,False,64.0,0.0,0,1
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,...,4,1,9,3,Returning_Visitor,False,0.0,0.0,0,1
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,...,3,2,2,4,Returning_Visitor,False,2.666667,0.0,0,1
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,...,3,3,1,4,Returning_Visitor,True,627.5,0.0,0,1


FE sonrası numeric / categorical listelerini güncelle

In [6]:
# Update feature lists after FE
# "number" matches every numeric dtype, so the engineered 0/1 flags created with
# .astype(int) are picked up even where the default integer dtype is not int64.
numeric_features_fe = X_fe.select_dtypes(include="number").columns.tolist()
# Everything non-numeric (including the boolean Weekend column) is treated as
# categorical. Taking the complement guarantees no column of X_fe is missing
# from both lists, which would make the ColumnTransformer built from these
# lists drop it without warning.
categorical_features_fe = X_fe.select_dtypes(exclude="number").columns.tolist()

print("Numeric (FE):", numeric_features_fe)
print("Categorical (FE):", categorical_features_fe)

Numeric (FE): ['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'Total_Duration', 'PageValue_per_Product', 'Is_HighSeason', 'Is_ReturningVisitor']
Categorical (FE): ['Month', 'VisitorType', 'Weekend']


In [7]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the Revenue class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_fe, y, test_size=0.2, stratify=y, random_state=42
)

X_train.shape, X_test.shape


((9864, 21), (2466, 21))

In [8]:
# Preprocessing: scale numeric, one-hot encode categorical

# Numeric columns are standardized.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical columns are one-hot encoded; categories that were not seen during
# fit are ignored at transform time instead of raising an error.
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each post-FE feature list to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features_fe),
        ("cat", categorical_transformer, categorical_features_fe),
    ]
)


In [9]:
# Pipeline with Logistic Regression (balanced)
log_reg_fe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        (
            "clf",
            LogisticRegression(
                solver="liblinear",
                class_weight="balanced",
                max_iter=2000,
                random_state=42,
            ),
        ),
    ]
)
log_reg_fe.fit(X_train, y_train)

# Hard labels feed accuracy and the classification report.
y_pred_log = log_reg_fe.predict(X_test)
acc_log = accuracy_score(y_test, y_pred_log)

# The second predict_proba column (class 1) feeds ROC-AUC.
y_proba_log = log_reg_fe.predict_proba(X_test)[:, 1]
roc_log = roc_auc_score(y_test, y_proba_log)

print(f"FE + LogisticRegression - Accuracy: {acc_log:.4f}")
print(f"FE + LogisticRegression - ROC-AUC: {roc_log:.4f}")
print("Classification report (FE + LogReg):\n", classification_report(y_test, y_pred_log))


FE + LogisticRegression - Accuracy: 0.8479
FE + LogisticRegression - ROC-AUC: 0.8959
Classification report (FE + LogReg):
               precision    recall  f1-score   support

           0       0.95      0.87      0.91      2084
           1       0.51      0.75      0.60       382

    accuracy                           0.85      2466
   macro avg       0.73      0.81      0.75      2466
weighted avg       0.88      0.85      0.86      2466



In [None]:
# Pipeline with RandomForest
# clone() gives this pipeline its own unfitted copy of the preprocessing step.
# Pipeline.fit fits its steps in place, so passing `preprocessor` directly would
# refit the very instance that log_reg_fe already holds.
rf_fe = Pipeline(
    steps=[
        ("preprocess", clone(preprocessor)),
        (
            "clf",
            RandomForestClassifier(
                n_estimators=200,
                max_depth=None,
                n_jobs=-1,
                random_state=42,
                class_weight="balanced",
            ),
        ),
    ]
)

rf_fe.fit(X_train, y_train)

# Hard labels feed accuracy and the classification report; the second
# predict_proba column (class 1) feeds ROC-AUC.
y_pred_rf = rf_fe.predict(X_test)
y_proba_rf = rf_fe.predict_proba(X_test)[:, 1]

acc_rf = accuracy_score(y_test, y_pred_rf)
roc_rf = roc_auc_score(y_test, y_proba_rf)

print(f"FE + RandomForest - Accuracy: {acc_rf:.4f}")
print(f"FE + RandomForest - ROC-AUC: {roc_rf:.4f}")
print("Classification report (FE + RF):\n", classification_report(y_test, y_pred_rf))


FE + RandomForest - Accuracy: 0.8966
FE + RandomForest - ROC-AUC: 0.9247
Classification report (FE + RF):
               precision    recall  f1-score   support

           0       0.92      0.96      0.94      2084
           1       0.73      0.53      0.62       382

    accuracy                           0.90      2466
   macro avg       0.82      0.75      0.78      2466
weighted avg       0.89      0.90      0.89      2466



# Feature Engineering Bulguları

Veriye domain bilgisine dayalı yeni değişkenler eklendi:

1. **Total_Duration**: Tüm sayfa kategorilerindeki toplam süre.
2. **PageValue_per_Product**: PageValues / (ProductRelated + 0.001); paydadaki küçük sabit, ProductRelated = 0 olduğunda sıfıra bölmeyi önler.
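3. **Is_HighSeason**: Yüksek sezon olarak kabul edilen aylar (Nov, Dec, Mar, May) için 1, diğer aylar için 0 değerini alan ikili değişken; bu ay seçimi yaklaşık bir varsayımdır.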
4. **Is_ReturningVisitor**: Ziyaretçi tipinin dönüş ziyaretçisi olup olmadığı.

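Bu değişkenler modele eklenince Logistic Regression sonuçları iyileşmiş,  
RandomForest modeli ise belirgin şekilde daha yüksek performans göstermiştir.  
Not: Karşılaştırmanın dayandığı FE öncesi (baseline) sonuçlar bu bölümde gösterilmemiştir. Buradaki sonuçlara göre RandomForest, Logistic Regression'a kıyasla accuracy (0.897'ye karşı 0.848) ve ROC-AUC (0.925'e karşı 0.896) açısından daha iyidir; ancak Revenue=1 sınıfındaki recall değeri daha düşüktür (0.53'e karşı 0.75).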

FE sonrası RandomForest modeli:
- Accuracy ≈ 0.897
- ROC-AUC ≈ 0.925
- Revenue=1 F1-score ≈ 0.62

Bu nedenle optimizasyon aşamasında FE + RandomForest pipeline'ı kullanılacaktır.
