# Prediction (Supervised learning)

In [2]:
import pandas as pd
df = pd.read_csv("online_shoppers_intention.csv")

df.head()
df.info()
df["Revenue"].value_counts(normalize=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

Revenue
False    0.845255
True     0.154745
Name: proportion, dtype: float64

In [3]:
X = df.drop("Revenue", axis=1)
y = df["Revenue"]

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Define categorical and numeric features explicitly
categorical_features = [
    "Month", "VisitorType", "Weekend",
    "Browser", "Region", "TrafficType", "OperatingSystems"
]

numeric_features = [c for c in X_train.columns if c not in categorical_features]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

logreg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=1000, class_weight="balanced"))
])

dt_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", DecisionTreeClassifier(
        class_weight="balanced",
        random_state=42
    ))
])

rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=300,
        class_weight="balanced_subsample",
        random_state=42,
        n_jobs=1
    ))
])

In [7]:
from sklearn.metrics import classification_report, roc_auc_score, log_loss

def evaluate (model, X_train, y_train, X_test, y_test, name="model"):
    model.fit(X_train,y_train)
    y_pred = logreg_pipeline.predict(X_test)
    
    y_proba = logreg_pipeline.predict_proba(X_test)[:, 1]
    
    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test,y_proba))
    print("Log-loss:", log_loss(y_test, y_proba))

evaluate(logreg_pipeline, X_train, y_train, X_test, y_test, "Logistic Regression")
evaluate(dt_pipeline, X_train, y_train, X_test, y_test, "Decision Tree")
evaluate(rf_pipeline, X_train, y_train, X_test, y_test, "Random Forest")


=== Logistic Regression ===
              precision    recall  f1-score   support

       False       0.95      0.86      0.90      2084
        True       0.49      0.74      0.59       382

    accuracy                           0.84      2466
   macro avg       0.72      0.80      0.75      2466
weighted avg       0.88      0.84      0.85      2466

ROC-AUC: 0.8932442142074746
Log-loss: 0.45588421375820165

=== Decision Tree ===
              precision    recall  f1-score   support

       False       0.95      0.86      0.90      2084
        True       0.49      0.74      0.59       382

    accuracy                           0.84      2466
   macro avg       0.72      0.80      0.75      2466
weighted avg       0.88      0.84      0.85      2466

ROC-AUC: 0.8932442142074746
Log-loss: 0.45588421375820165

=== Random Forest ===
              precision    recall  f1-score   support

       False       0.95      0.86      0.90      2084
        True       0.49      0.74      0.59   

In [9]:
import numpy as np
from sklearn.metrics import roc_curve

logreg_pipeline.fit(X_train,y_train)
proba = logreg_pipeline.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, proba)
youden = tpr - fpr 
best_idx = np.argmax(youden)
best_threshold = thresholds[best_idx]

print("Best threshold (Youden):", best_threshold)
print("Youden's Index:", youden[best_idx])

Best threshold (Youden): 0.4075656607707226
Youden's Index: 0.6184492166695139
