In [1]:
# 📦 Install dependencies
!pip install -q pandas scikit-learn faker

# 📁 Step 1: Generate Enhanced Synthetic Bot Data
import random, time, uuid, math
import pandas as pd
from faker import Faker


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.8/1.9 MB[0m [31m24.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
fake = Faker()

def shannon_entropy(s):
    prob = [float(s.count(c)) / len(s) for c in dict.fromkeys(list(s))]
    return -sum([p * math.log(p) / math.log(2.0) for p in prob]) if s else 0

def request_path_depth(url):
    return len([p for p in url.split('/') if p])

In [3]:

def generate_log_entry(is_bot=False):
    timestamp_start = time.time()
    session_id = str(uuid.uuid4())

    # Device type based on UA
    device_type = "Headless" if is_bot and random.random() < 0.5 else random.choice(["Desktop", "Mobile"])
    if device_type == "Headless":
        user_agent = "Mozilla/5.0 (compatible; HeadlessChrome/88.0.4324.96)"
    else:
        user_agent = fake.user_agent()

    referer = fake.url()
    url_paths = [fake.uri_path(deep=random.randint(1, 4)) for _ in range(random.randint(5, 20))]
    unique_pages = list(set(url_paths))
    repeated_paths_ratio = 1 - len(unique_pages) / len(url_paths)
    path_depth_avg = sum([request_path_depth(p) for p in url_paths]) / len(url_paths)

    clicks = len(url_paths)
    session_duration = random.uniform(2, 20) if not is_bot else random.uniform(0.3, 5.0)
    time_between_requests = session_duration / clicks if clicks != 0 else 0.1

    ua_entropy = shannon_entropy(user_agent)
    referer_entropy = shannon_entropy(referer)
    click_rate = clicks / session_duration if session_duration > 0 else clicks

    suspicious_ua = int(any(bot_kw in user_agent.lower() for bot_kw in ["python", "bot", "scrapy", "headless", "phantomjs"]))
    cookies_enabled = int(random.random() > 0.1 if not is_bot else random.random() < 0.3)
    time_of_day = int(time.localtime(timestamp_start).tm_hour)

    return {
        "timestamp": timestamp_start,
        "session_id": session_id,
        "user_agent": user_agent,
        "referer": referer,
        "device_type": device_type,
        "clicks_per_session": clicks,
        "session_duration": session_duration,
        "time_between_requests": time_between_requests,
        "ua_entropy": ua_entropy,
        "referer_entropy": referer_entropy,
        "click_rate": click_rate,
        "suspicious_ua": suspicious_ua,
        "cookies_enabled": cookies_enabled,
        "time_of_day": time_of_day,
        "request_path_depth": path_depth_avg,
        "num_unique_pages": len(unique_pages),
        "repeated_paths_ratio": repeated_paths_ratio,
        "is_bot": int(is_bot)
    }

logs = [generate_log_entry(is_bot=random.random() > 0.7) for _ in range(10000)]
df = pd.DataFrame(logs)
df.to_csv("grinch_bot_logs_extended.csv", index=False)
df.head()


Unnamed: 0,timestamp,session_id,user_agent,referer,device_type,clicks_per_session,session_duration,time_between_requests,ua_entropy,referer_entropy,click_rate,suspicious_ua,cookies_enabled,time_of_day,request_path_depth,num_unique_pages,repeated_paths_ratio,is_bot
0,1750053000.0,d664dff9-1d08-4e0c-9d64-353469e60b5a,Mozilla/5.0 (Windows; U; Windows NT 5.1) Apple...,http://www.murray-greene.biz/,Desktop,19,3.80168,0.200088,5.063045,4.064203,4.99779,0,1,5,2.368421,18,0.052632,0
1,1750053000.0,e5c0de20-9a86-4c66-9e9e-062b4ba8e71a,Mozilla/5.0 (compatible; HeadlessChrome/88.0.4...,http://www.ware.com/,Headless,14,2.851435,0.203674,4.793774,3.484184,4.909809,1,1,5,2.357143,13,0.071429,1
2,1750053000.0,a3eae170-e90c-474a-858f-022ebf35788d,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_3;...,https://www.freeman.org/,Desktop,15,1.826806,0.121787,5.047272,3.855389,8.211054,0,1,5,3.066667,15,0.0,1
3,1750053000.0,71a61054-4539-4f6e-b38b-ee83c01c9a2b,Mozilla/5.0 (compatible; HeadlessChrome/88.0.4...,http://parks.biz/,Headless,13,0.80752,0.062117,4.793774,3.572469,16.098676,1,1,5,2.923077,13,0.0,1
4,1750053000.0,e7b9468a-70d0-44f7-babb-002ce60e79fc,Mozilla/5.0 (X11; Linux i686) AppleWebKit/531....,https://www.hughes.com/,Mobile,15,13.118517,0.874568,5.059172,3.64249,1.143422,0,1,5,2.733333,15,0.0,0


In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import joblib


In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from urllib.parse import urlparse
import math

df = pd.read_csv("grinch_bot_logs_extended.csv")

# Extra features
df["user_agent_length"] = df["user_agent"].apply(len)
df["referer_domain"] = df["referer"].apply(lambda r: urlparse(r).netloc)
df["referer_domain_freq"] = df["referer_domain"].map(df["referer_domain"].value_counts(normalize=True))
df["click_entropy"] = df["clicks_per_session"].apply(lambda x: -x*np.log2(x) if x > 0 else 0)
df["click_variance"] = df["click_rate"] * df["time_between_requests"]  # Approx variance

# Encode categorical
df["device_type"] = LabelEncoder().fit_transform(df["device_type"])

# Final feature set
features = [
    'clicks_per_session', 'session_duration', 'time_between_requests', 'ua_entropy',
    'referer_entropy', 'click_rate', 'suspicious_ua', 'cookies_enabled', 'time_of_day',
    'request_path_depth', 'num_unique_pages', 'repeated_paths_ratio', 'device_type',
    'user_agent_length', 'referer_domain_freq', 'click_entropy', 'click_variance'
]

X = df[features]
y = df["is_bot"]


In [20]:
!pip install -q xgboost scikit-learn joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
import joblib

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_all_scaled = scaler.transform(X)  # For full dataset inference

# XGBoost model
xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_model.fit(X_train_scaled, y_train)
xgb_test_preds = xgb_model.predict(X_test_scaled)

print("🎯 XGBoost Model Report")
print(confusion_matrix(y_test, xgb_test_preds))
print(classification_report(y_test, xgb_test_preds))


Parameters: { "use_label_encoder" } are not used.



🎯 XGBoost Model Report
[[1387   32]
 [  75  506]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      1419
           1       0.94      0.87      0.90       581

    accuracy                           0.95      2000
   macro avg       0.94      0.92      0.93      2000
weighted avg       0.95      0.95      0.95      2000



In [21]:
# Random Forest model
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_scaled, y_train)
rf_test_preds = rf_model.predict(X_test_scaled)

print("🌲 Random Forest Report")
print(confusion_matrix(y_test, rf_test_preds))
print(classification_report(y_test, rf_test_preds))


🌲 Random Forest Report
[[1396   23]
 [  70  511]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      1419
           1       0.96      0.88      0.92       581

    accuracy                           0.95      2000
   macro avg       0.95      0.93      0.94      2000
weighted avg       0.95      0.95      0.95      2000



In [22]:
# Predict on full dataset for ensemble
xgb_all_preds = xgb_model.predict(X_all_scaled)
rf_all_preds = rf_model.predict(X_all_scaled)

# Ensemble: if either detects as bot → classify as bot
ensemble_preds = np.logical_or(xgb_all_preds, rf_all_preds).astype(int)

print("🤖 Ensemble (Union) Report")
print(confusion_matrix(y, ensemble_preds))
print(classification_report(y, ensemble_preds))

🤖 Ensemble (Union) Report
[[7066   34]
 [  67 2833]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      7100
           1       0.99      0.98      0.98      2900

    accuracy                           0.99     10000
   macro avg       0.99      0.99      0.99     10000
weighted avg       0.99      0.99      0.99     10000

