In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/online-shoppers-purchase-intention/online_shoppers_intention.csv


# **INPUT**

In [2]:
shoppers_data=pd.read_csv("/kaggle/input/online-shoppers-purchase-intention/online_shoppers_intention.csv")

# **PREPROCESSING**

In [3]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd

shoppers_data['Session_Length'] = shoppers_data['Administrative_Duration'] + shoppers_data['Informational_Duration'] + shoppers_data['ProductRelated_Duration']
# Create 'Visited_All_Three'
shoppers_data['Visited_All_Three'] = ((shoppers_data['Administrative'] > 0) & (shoppers_data['Informational'] > 0) & (shoppers_data['ProductRelated'] > 0)).astype(int)

# One-hot encode 'VisitorType'
one_hot_encoder = OneHotEncoder(sparse=False)
visitor_type_encoded = one_hot_encoder.fit_transform(shoppers_data[['VisitorType']])
visitor_type_df = pd.DataFrame(visitor_type_encoded, columns=one_hot_encoder.get_feature_names_out(['VisitorType']))
shoppers_data = pd.concat([shoppers_data, visitor_type_df], axis=1)
shoppers_data.drop('VisitorType', axis=1, inplace=True)

# Cyclical encoding for 'Month'
months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'June': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
shoppers_data['Month'] = shoppers_data['Month'].map(months)
shoppers_data['Month_Sin'] = np.sin(2 * np.pi * shoppers_data['Month']/12)
shoppers_data['Month_Cos'] = np.cos(2 * np.pi * shoppers_data['Month']/12)
shoppers_data.drop('Month', axis=1, inplace=True)

# Encode 'Weekend' and 'Revenue'
shoppers_data['Weekend'] = shoppers_data['Weekend'].astype(int)
shoppers_data['Revenue'] = shoppers_data['Revenue'].astype(int)

# Now you can save or further process the modified DataFrame




# Hard Voted Ensemble 

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix, roc_auc_score
from category_encoders import TargetEncoder
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier
import warnings
import numpy as np

# Ignore FutureWarnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Assuming 'shoppers_data' is already loaded and is your DataFrame
# Identify numerical columns
numerical_columns = ['Administrative', 'Informational', 'ProductRelated', 'Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay','Month_Sin','Month_Cos','Session_Length']

# Preparing the data
X = shoppers_data.drop('Revenue', axis=1)
y = shoppers_data['Revenue']

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RobustScaler
scaler = RobustScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Apply SVMSMOTE for oversampling
svmsmote = SVMSMOTE(sampling_strategy=0.6, random_state=42)
X_resampled, y_resampled = svmsmote.fit_resample(X_train, y_train)

# Apply RandomUnderSampler for undersampling to 80% of the majority class size
rus = RandomUnderSampler(sampling_strategy=0.8, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_resampled, y_resampled)

# Configure AdaBoost
ada_boost = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=180,
    learning_rate=0.1,
    random_state=1
)

# Configure LightGBM
lgbm = LGBMClassifier(
    boosting_type='dart',
    colsample_bytree=0.8,
    learning_rate=0.05,
    max_depth=7,
    n_estimator=150,
    num_leaves=31,
    subsample=0.8,
    subsample_freq=1,
    min_child_weight=0.001,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=12
)

# Ensemble: Soft Voting
ensemble = VotingClassifier(
    estimators=[('ada', ada_boost), ('lgbm', lgbm)],
    voting='hard'
)

# Encoding categorical features
encoder = TargetEncoder(cols=['OperatingSystems', 'Browser', 'Region', 'TrafficType'], smoothing=50, min_samples_leaf=10)
X_resampled = encoder.fit_transform(X_resampled, y_resampled)
X_test_encoded = encoder.transform(X_test)

# Fit the ensemble on the resampled training data
ensemble.fit(X_resampled, y_resampled)

# Predicting and evaluating on the original (non-resampled) test set
predictions = ensemble.predict(X_test_encoded)
#probs = ensemble.predict_proba(X_test_encoded)[:, 1]  # Probability estimates needed for AUC
report_dict = classification_report(y_test, predictions, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()
report_df['precision'] = report_df['precision'].apply(lambda x: format(x, '.4f'))
report_df['recall'] = report_df['recall'].apply(lambda x: format(x, '.4f'))
report_df['f1-score'] = report_df['f1-score'].apply(lambda x: format(x, '.4f'))
report_df['support'] = report_df['support'].apply(lambda x: int(x))

# Print the formatted classification report
print("Classification Report with Controlled Decimal Places")
print(report_df)

# Calculate and print F1 Score for the Positive Class
f1 = f1_score(y_test, predictions, pos_label=1)
print(f"F1 Score for the Positive Class: {f1:.4f}")

# Compute AUC score
#auc_score = roc_auc_score(y_test, probs)
#print(f"AUC Score: {auc_score:.4f}")

# Compute and print the confusion matrix
cm = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(cm)

# Calculate TPR, FPR, etc from the confusion matrix
tn, fp, fn, tp = cm.ravel()
tpr = tp / (tp + fn)  # True Positive Rate
fpr = fp / (fp + tn)  # False Positive Rate
tnr = tn / (tn + fp)  # True Negative Rate
fnr = fn / (tp + fn)  # False Negative Rate

print(f"True Positive Rate (TPR): {tpr:.4f}")
print(f"False Positive Rate (FPR): {fpr:.4f}")
print(f"True Negative Rate (TNR): {tnr:.4f}")
print(f"False Negative Rate (FNR): {fnr:.4f}")


[LightGBM] [Info] Number of positive: 5020, number of negative: 6275
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004293 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3746
[LightGBM] [Info] Number of data points in the train set: 11295, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.444444 -> initscore=-0.223144
[LightGBM] [Info] Start training from score -0.223144
Classification Report with Controlled Decimal Places
             precision  recall f1-score  support
0               0.9381  0.9299   0.9340     2055
1               0.6643  0.6934   0.6786      411
accuracy        0.8905  0.8905   0.8905        0
macro avg       0.8012  0.8117   0.8063     2466
weighted avg    0.8925  0.8905   0.8914     2466
F1 Score for the Positive Class: 0.6786
Confusion Matrix:
[[1911  144]
 [ 126  285]]
True Positive Rate (TPR): 0.6934
False Positive Rate (FPR): 0.0701
True Nega

# Soft Voted Ensemble


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix, roc_auc_score
from category_encoders import TargetEncoder
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier
import warnings
import numpy as np

# Ignore FutureWarnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Assuming 'shoppers_data' is already loaded and is your DataFrame
# Identify numerical columns
numerical_columns = ['Administrative', 'Informational', 'ProductRelated', 'Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay','Month_Sin','Month_Cos','Session_Length']

# Preparing the data
X = shoppers_data.drop('Revenue', axis=1)
y = shoppers_data['Revenue']

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RobustScaler
scaler = RobustScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Apply SVMSMOTE for oversampling
svmsmote = SVMSMOTE(sampling_strategy=0.6, random_state=42)
X_resampled, y_resampled = svmsmote.fit_resample(X_train, y_train)

# Apply RandomUnderSampler for undersampling to 80% of the majority class size
rus = RandomUnderSampler(sampling_strategy=0.8, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_resampled, y_resampled)

# Configure AdaBoost
ada_boost = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=180,
    learning_rate=0.1,
    random_state=1
)

# Configure LightGBM
lgbm = LGBMClassifier(
    boosting_type='dart',
    colsample_bytree=0.8,
    learning_rate=0.05,
    max_depth=7,
    n_estimator=150,
    num_leaves=31,
    subsample=0.8,
    subsample_freq=1,
    min_child_weight=0.001,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=12
)

# Ensemble: Soft Voting
ensemble = VotingClassifier(
    estimators=[('ada', ada_boost), ('lgbm', lgbm)],
    voting='soft'
)

# Encoding categorical features
encoder = TargetEncoder(cols=['OperatingSystems', 'Browser', 'Region', 'TrafficType'], smoothing=50, min_samples_leaf=10)
X_resampled = encoder.fit_transform(X_resampled, y_resampled)
X_test_encoded = encoder.transform(X_test)

# Fit the ensemble on the resampled training data
ensemble.fit(X_resampled, y_resampled)

# Predicting and evaluating on the original (non-resampled) test set
predictions = ensemble.predict(X_test_encoded)
probs = ensemble.predict_proba(X_test_encoded)[:, 1]  # Probability estimates needed for AUC
report_dict = classification_report(y_test, predictions, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()
report_df['precision'] = report_df['precision'].apply(lambda x: format(x, '.4f'))
report_df['recall'] = report_df['recall'].apply(lambda x: format(x, '.4f'))
report_df['f1-score'] = report_df['f1-score'].apply(lambda x: format(x, '.4f'))
report_df['support'] = report_df['support'].apply(lambda x: int(x))

# Print the formatted classification report
print("Classification Report with Controlled Decimal Places")
print(report_df)

# Calculate and print F1 Score for the Positive Class
f1 = f1_score(y_test, predictions, pos_label=1)
print(f"F1 Score for the Positive Class: {f1:.4f}")

# Compute AUC score
auc_score = roc_auc_score(y_test, probs)
print(f"AUC Score: {auc_score:.4f}")

# Compute and print the confusion matrix
cm = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(cm)

# Calculate TPR, FPR, etc from the confusion matrix
tn, fp, fn, tp = cm.ravel()
tpr = tp / (tp + fn)  # True Positive Rate
fpr = fp / (fp + tn)  # False Positive Rate
tnr = tn / (tn + fp)  # True Negative Rate
fnr = fn / (tp + fn)  # False Negative Rate

print(f"True Positive Rate (TPR): {tpr:.4f}")
print(f"False Positive Rate (FPR): {fpr:.4f}")
print(f"True Negative Rate (TNR): {tnr:.4f}")
print(f"False Negative Rate (FNR): {fnr:.4f}")


[LightGBM] [Info] Number of positive: 5020, number of negative: 6275
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004457 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3746
[LightGBM] [Info] Number of data points in the train set: 11295, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.444444 -> initscore=-0.223144
[LightGBM] [Info] Start training from score -0.223144
Classification Report with Controlled Decimal Places
             precision  recall f1-score  support
0               0.9566  0.9119   0.9337     2055
1               0.6430  0.7932   0.7102      411
accuracy        0.8921  0.8921   0.8921        0
macro avg       0.7998  0.8526   0.8220     2466
weighted avg    0.9043  0.8921   0.8965     2466
F1 Score for the Positive Class: 0.7102
AUC Score: 0.9325
Confusion Matrix:
[[1874  181]
 [  85  326]]
True Positive Rate (TPR): 0.7932
False Positive Rate (FPR)