# importing the dataset

In [15]:
import numpy as np
import pandas as pd

path = '../dataset/bot_detection_data.csv'
df = pd.read_csv(path)

print("First 5 rows:\n", df.head())
print("\nColumn Info:")
print(df.info())

First 5 rows:
    User ID        Username                                              Tweet  \
0   132131           flong  Station activity person against natural majori...   
1   289683  hinesstephanie  Authority research natural life material staff...   
2   779715      roberttran  Manage whose quickly especially foot none to g...   
3   696168          pmason  Just cover eight opportunity strong policy which.   
4   704441          noah87                      Animal sign six data good or.   

   Retweet Count  Mention Count  Follower Count  Verified  Bot Label  \
0             85              1            2353     False          1   
1             55              5            9617      True          0   
2              6              2            4363      True          0   
3             54              5            2242      True          1   
4             26              3            8438     False          1   

       Location           Created At            Hashtags  
0     

# preprocessing and feature engineering

In [16]:
# Datetime conversion
df['Created At'] = pd.to_datetime(df['Created At'], errors='coerce', utc=True)
now = pd.Timestamp.utcnow().tz_convert('UTC')
df['account_age_days'] = (now - df['Created At']).dt.days

# handling missing values for hashtags
df['Hashtags'] = df['Hashtags'].fillna("")
df['hashtag_count'] = df['Hashtags'].apply(lambda x: len(x.split()) if x else 0)

df['username_length'] = df['Username'].apply(len)
df['digit_count'] = df['Username'].str.count(r'\d')

df['tweet_length'] = df['Tweet'].apply(len)

df['Verified'] = df['Verified'].astype(int)

df['retweet_per_hashtag'] = df['Retweet Count'] / (df['hashtag_count'] + 1)

# print(df.head)

features = [
    "account_age_days",
    "hashtag_count",
    "username_length",
    "digit_count",
    "tweet_length",
    "retweet_per_hashtag",
    "Mention Count",
    "Follower Count",
    "Verified"
]


X = df[features]
y = df["Bot Label"]

print(X.head())
print(y.head())

   account_age_days  hashtag_count  username_length  digit_count  \
0              1972              0                5            0   
1              1044              2               14            0   
2              1154              2               10            0   
3              1512              4                6            0   
4              2000              2                6            2   

   tweet_length  retweet_per_hashtag  Mention Count  Follower Count  Verified  
0            83            85.000000              1            2353         0  
1            77            18.333333              5            9617         1  
2            61             2.000000              2            4363         1  
3            49            10.800000              5            2242         1  
4            29             8.666667              3            8438         0  
0    1
1    0
2    0
3    1
4    1
Name: Bot Label, dtype: int64


# splitting the dataset

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Training set shape: (40000, 9) (40000,)
Test set shape: (10000, 9) (10000,)


# Logistic Regression Model

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled =  scaler.transform(X_test)

lr = LogisticRegression(max_iter=2000, random_state=42)
lr.fit(X_train_scaled, y_train)

y_pred = lr.predict(X_test_scaled)

print("Logistic Regression Results:")
print(classification_report(y_test, y_pred, digits=4))

Logistic Regression Results:
              precision    recall  f1-score   support

           0     0.4919    0.4768    0.4842      4996
           1     0.4932    0.5084    0.5007      5004

    accuracy                         0.4926     10000
   macro avg     0.4926    0.4926    0.4925     10000
weighted avg     0.4926    0.4926    0.4925     10000



# Random Forest Model

In [19]:
from sklearn.ensemble import RandomForestClassifier

rf_limited = RandomForestClassifier(n_estimators=300, max_depth=20, random_state=42, n_jobs=-1)
rf_limited.fit(X_train, y_train)
print("Max_depth=20:\n")
print(classification_report(y_test, rf_limited.predict(X_test), digits=4))

Max_depth=20:

              precision    recall  f1-score   support

           0     0.4949    0.4954    0.4951      4996
           1     0.4957    0.4952    0.4955      5004

    accuracy                         0.4953     10000
   macro avg     0.4953    0.4953    0.4953     10000
weighted avg     0.4953    0.4953    0.4953     10000



# Gradient Boosting - LightGBM

In [20]:
import lightgbm as lgb

# LightGBM Classifier
lgb_clf = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

lgb_clf.fit(X_train, y_train)
y_pred = lgb_clf.predict(X_test)

print("LightGBM Results:\n")
print(classification_report(y_test, y_pred, digits=4))

[LightGBM] [Info] Number of positive: 20014, number of negative: 19986
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001655 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 883
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500350 -> initscore=0.001400
[LightGBM] [Info] Start training from score 0.001400
LightGBM Results:

              precision    recall  f1-score   support

           0     0.4943    0.5108    0.5024      4996
           1     0.4947    0.4782    0.4863      5004

    accuracy                         0.4945     10000
   macro avg     0.4945    0.4945    0.4944     10000
weighted avg     0.4945    0.4945    0.4944     10000



# Gradient Boosting - XGBoost

In [21]:
import xgboost as xgb
from sklearn.metrics import classification_report

xgb_clf = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

print("XGBoost Results:\n")
print(classification_report(y_test, y_pred, digits=4))


XGBoost Results:

              precision    recall  f1-score   support

           0     0.4978    0.5044    0.5011      4996
           1     0.4986    0.4920    0.4953      5004

    accuracy                         0.4982     10000
   macro avg     0.4982    0.4982    0.4982     10000
weighted avg     0.4982    0.4982    0.4982     10000



# SVM

In [22]:
from sklearn.svm import SVC

svm_clf = SVC(kernel="linear", probability=True, random_state=42)
svm_clf.fit(X_train_scaled, y_train)

y_pred = svm_clf.predict(X_test_scaled)

print("SVM Results:\n")
print(classification_report(y_test, y_pred, digits=4))

SVM Results:

              precision    recall  f1-score   support

           0     0.5003    0.5036    0.5019      4996
           1     0.5011    0.4978    0.4994      5004

    accuracy                         0.5007     10000
   macro avg     0.5007    0.5007    0.5007     10000
weighted avg     0.5007    0.5007    0.5007     10000



# Neural Network - MLP

In [23]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),  # 2 hidden layers with 64 and 32 neurons
    activation="relu",
    solver="adam",
    max_iter=500,
    random_state=42
)
mlp.fit(X_train_scaled, y_train)

y_pred = mlp.predict(X_test_scaled)

print("Neural Network (MLP) Results:\n")
print(classification_report(y_test, y_pred, digits=4))

Neural Network (MLP) Results:

              precision    recall  f1-score   support

           0     0.4991    0.5088    0.5039      4996
           1     0.4999    0.4902    0.4950      5004

    accuracy                         0.4995     10000
   macro avg     0.4995    0.4995    0.4995     10000
weighted avg     0.4995    0.4995    0.4995     10000



# Evaluation metrics

In [24]:
def evaluate_model(name, model, X_test, y_test, scaled=False):
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, digits=4)
    return { "name": name, "report": report }

results = []

results.append(evaluate_model("Logistic Regression", lr, X_test_scaled, y_test))
results.append(evaluate_model("Random Forest", rf_limited, X_test, y_test))
results.append(evaluate_model("LightGBM", lgb_clf, X_test, y_test))
results.append(evaluate_model("XGBoost", xgb_clf, X_test, y_test))
results.append(evaluate_model("Support Vector Machine", svm_clf, X_test_scaled, y_test))
results.append(evaluate_model("Multilayer Perceptron", mlp, X_test_scaled, y_test))

df_results = pd.DataFrame({
    res["name"]: {
        "Accuracy": res["report"]["accuracy"],
        "Precision (weighted)": res["report"]["weighted avg"]["precision"],
        "Recall (weighted)": res["report"]["weighted avg"]["recall"],
        "F1 (weighted)": res["report"]["weighted avg"]["f1-score"],
        "Recall (Bot=1)": res["report"]["1"]["recall"],
        "F1 (Bot=1)": res["report"]["1"]["f1-score"]
    }
    for res in results
}).T

print("\n Final Model Comparison:\n")
print(df_results.sort_values("F1 (Bot=1)", ascending=False))


 Final Model Comparison:

                        Accuracy  Precision (weighted)  Recall (weighted)  \
Logistic Regression       0.4926              0.492580             0.4926   
Support Vector Machine    0.5007              0.500703             0.5007   
Random Forest             0.4953              0.495300             0.4953   
XGBoost                   0.4982              0.498205             0.4982   
Multilayer Perceptron     0.4995              0.499508             0.4995   
LightGBM                  0.4945              0.494507             0.4945   

                        F1 (weighted)  Recall (Bot=1)  F1 (Bot=1)  
Logistic Regression          0.492473        0.508393    0.500689  
Support Vector Machine       0.500696        0.497802    0.499449  
Random Forest                0.495300        0.495204    0.495451  
XGBoost                      0.498181        0.492006    0.495273  
Multilayer Perceptron        0.499457        0.490208    0.495006  
LightGBM                 

# Saving the models

In [25]:
import os
import pickle
import json
from pathlib import Path

BASE_DIR = Path(os.getcwd())
ARTIFACT_DIR = BASE_DIR.parent / "backend" / "artifacts"
ARTIFACT_DIR.mkdir(exist_ok=True, parents=True)

# ---- best model so far ----
with open(ARTIFACT_DIR / "mlp_model.pkl", "wb") as f:
    pickle.dump(mlp, f)

# Saving the scaler used for MLP
with open(ARTIFACT_DIR / "scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Saving the feature schema
feature_columns = list(X.columns)
with open(ARTIFACT_DIR / "feature_columns.json", "w") as f:
    json.dump(feature_columns, f, indent=2)

print("MLP model, scaler, and schema saved in backend/artifacts/")

# ---- Saving Logistic Regression Model as backup ----
with open(ARTIFACT_DIR / "logreg_model.pkl", "wb") as f:
    pickle.dump(lr, f)

print("Logistic Regression saved as backup")


MLP model, scaler, and schema saved in backend/artifacts/
Logistic Regression saved as backup
