In [1]:
import os
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed,
)
import evaluate

In [2]:
# Config (v1 baseline)

CSV_PATH = "bot_detection_data.csv"   
MODEL_NAME = "bert-base-uncased"
MAX_LEN = 128
SEED = 42

# Try likely column names first (we'll auto-detect)
TEXT_COL_CANDIDATES = ["text", "tweet", "status", "content"]
LABEL_COL_CANDIDATES = ["bot", "label", "target", "is_bot"]

set_seed(SEED)

In [3]:
df = pd.read_csv(CSV_PATH)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
display(df.head(5))

Shape: (50000, 11)
Columns: ['User ID', 'Username', 'Tweet', 'Retweet Count', 'Mention Count', 'Follower Count', 'Verified', 'Bot Label', 'Location', 'Created At', 'Hashtags']


Unnamed: 0,User ID,Username,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags
0,132131,flong,Station activity person against natural majori...,85,1,2353,False,1,Adkinston,2020-05-11 15:29:50,
1,289683,hinesstephanie,Authority research natural life material staff...,55,5,9617,True,0,Sanderston,2022-11-26 05:18:10,both live
2,779715,roberttran,Manage whose quickly especially foot none to g...,6,2,4363,True,0,Harrisonfurt,2022-08-08 03:16:54,phone ahead
3,696168,pmason,Just cover eight opportunity strong policy which.,54,5,2242,True,1,Martinezberg,2021-08-14 22:27:05,ever quickly new I
4,704441,noah87,Animal sign six data good or.,26,3,8438,False,1,Camachoville,2020-04-13 21:24:21,foreign mention


In [4]:
def pick_column(candidates, available_cols):
    for c in candidates:
        if c in available_cols:
            return c
    return None

TEXT_COL = "Tweet"
LABEL_COL = "Bot Label"

if TEXT_COL is None or LABEL_COL is None:
    raise ValueError(
        f"Could not auto-detect text/label columns.\n"
        f"Available columns: {df.columns.tolist()}\n"
        f"Please set TEXT_COL and LABEL_COL manually."
    )

print(f"Using TEXT_COL={TEXT_COL}, LABEL_COL={LABEL_COL}")

Using TEXT_COL=Tweet, LABEL_COL=Bot Label


In [5]:
# Keep only rows with non-null text/label
df = df[[TEXT_COL, LABEL_COL]].dropna().copy()

# Convert text to string
df[TEXT_COL] = df[TEXT_COL].astype(str)

# Convert label to int (works for bool / 0/1 / "0"/"1")
df[LABEL_COL] = df[LABEL_COL].astype(int)

# Optional sanity check
unique_labels = sorted(df[LABEL_COL].unique().tolist())
print("Unique labels:", unique_labels)

if not set(unique_labels).issubset({0, 1}):
    raise ValueError(f"Expected binary labels 0/1, got {unique_labels}")

df = df.rename(columns={TEXT_COL: "text", LABEL_COL: "labels"})

display(df.head(5))
print(df["labels"].value_counts(dropna=False))

Unique labels: [0, 1]


Unnamed: 0,text,labels
0,Station activity person against natural majori...,1
1,Authority research natural life material staff...,0
2,Manage whose quickly especially foot none to g...,0
3,Just cover eight opportunity strong policy which.,1
4,Animal sign six data good or.,1


labels
1    25018
0    24982
Name: count, dtype: int64


In [6]:
dataset = Dataset.from_pandas(df, preserve_index=False)

# 80/20 split -> train/temp
split1 = dataset.train_test_split(test_size=0.2, seed=SEED)
train_ds = split1["train"]
temp_ds = split1["test"]

# temp split into val/test -> 10/10 total
split2 = temp_ds.train_test_split(test_size=0.5, seed=SEED)
val_ds = split2["train"]
test_ds = split2["test"]

print(train_ds)
print(val_ds)
print(test_ds)

Dataset({
    features: ['text', 'labels'],
    num_rows: 40000
})
Dataset({
    features: ['text', 'labels'],
    num_rows: 5000
})
Dataset({
    features: ['text', 'labels'],
    num_rows: 5000
})


In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LEN,
    )

train_ds = train_ds.map(tokenize_fn, batched=True)
val_ds = val_ds.map(tokenize_fn, batched=True)
test_ds = test_ds.map(tokenize_fn, batched=True)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [8]:
keep_cols = {"input_ids", "attention_mask", "labels"}

def keep_only(ds_split):
    remove_cols = [c for c in ds_split.column_names if c not in keep_cols]
    return ds_split.remove_columns(remove_cols)

train_ds = keep_only(train_ds)
val_ds = keep_only(val_ds)
test_ds = keep_only(test_ds)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print(train_ds)

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 40000
})


In [9]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


In [10]:
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "precision": precision.compute(predictions=preds, references=labels, average="binary")["precision"],
        "recall": recall.compute(predictions=preds, references=labels, average="binary")["recall"],
        "f1": f1.compute(predictions=preds, references=labels, average="binary")["f1"],
    }

In [11]:
os.makedirs("ml/bert_v1/checkpoints", exist_ok=True)

args = TrainingArguments(
    output_dir="ml/bert_v1/checkpoints",
    eval_strategy="epoch",          # if error, use evaluation_strategy="epoch"
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,             # v1 short run
    weight_decay=0.01,

    logging_steps=50,
    save_total_limit=2,
    report_to="none",
    fp16=False,                     # set True on supported NVIDIA GPU
)

In [12]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,   
    compute_metrics=compute_metrics,
)

In [13]:
print("\nValidation metrics:")
val_metrics = trainer.evaluate(eval_dataset=val_ds)
for k, v in val_metrics.items():
    print(f"{k}: {v}")

print("\nTest metrics:")
test_metrics = trainer.evaluate(eval_dataset=test_ds)
for k, v in test_metrics.items():
    print(f"{k}: {v}")


Validation metrics:


  super().__init__(loader)


eval_loss: 0.6953549981117249
eval_model_preparation_time: 0.0026
eval_accuracy: 0.4964
eval_precision: 0.495208379763762
eval_recall: 0.8977777777777778
eval_f1: 0.638322321172077
eval_runtime: 6.6387
eval_samples_per_second: 753.161
eval_steps_per_second: 23.649

Test metrics:


  super().__init__(loader)


eval_loss: 0.6955486536026001
eval_model_preparation_time: 0.0026
eval_accuracy: 0.4934
eval_precision: 0.49337525263867055
eval_recall: 0.8880355699272433
eval_f1: 0.6343294355420817
eval_runtime: 6.3534
eval_samples_per_second: 786.982
eval_steps_per_second: 24.711


In [14]:
SAVE_DIR = "ml/bert_v1/model"
os.makedirs(SAVE_DIR, exist_ok=True)

trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print(f"Saved model to {SAVE_DIR}")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Saved model to ml/bert_v1/model


In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

LABELS = {0: "human", 1: "bot"}  # check if your dataset uses reversed labels

device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

tok = AutoTokenizer.from_pretrained("ml/bert_v1/model")
mdl = AutoModelForSequenceClassification.from_pretrained("ml/bert_v1/model").to(device)
mdl.eval()

def predict_text(text):
    inputs = tok(text, truncation=True, max_length=128, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = mdl(**inputs).logits
        probs = torch.softmax(logits, dim=-1).squeeze().cpu().tolist()
    pred = int(np.argmax(probs))
    return {
        "label": LABELS[pred],
        "confidence": float(probs[pred]),
        "prob_human": float(probs[0]),
        "prob_bot": float(probs[1]),
    }

predict_text("This is a sample tweet for testing the model.")

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

{'label': 'bot',
 'confidence': 0.5287707448005676,
 'prob_human': 0.47122928500175476,
 'prob_bot': 0.5287707448005676}

In [None]:
# This is a basic model of bert. Lat we can test other models to find the best one. We can also do hyperparameter tuning and try to improve the model.