In [15]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold, GroupShuffleSplit, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [16]:
from transformers import set_seed

In [17]:
torch.cuda.empty_cache()

In [18]:
set_seed(69)

In [19]:
# Load datasets
df_bots_tweets = pd.read_csv('./content_polluters_tweets.txt',
                            sep='\t',
                            header=None,
                            names=["UserID", "TweetID", "Tweet", "CreatedAt"])

df_humans_tweets = pd.read_csv('./legitimate_users_tweets.txt',
                              sep='\t',
                              header=None,
                              names=["UserID", "TweetID", "Tweet", "CreatedAt"])

df_bots_tweets['is_bot'] = 1
df_humans_tweets['is_bot'] = 0

df_bots_tweets = df_bots_tweets.dropna(subset=['Tweet'])
df_humans_tweets = df_humans_tweets.dropna(subset=['Tweet'])

df_bots_tweets = df_bots_tweets.iloc[:25_000]
df_humans_tweets = df_humans_tweets.iloc[:25_000]

df_combined = pd.concat([df_bots_tweets, df_humans_tweets], axis=0, ignore_index=True)

df_combined = df_combined.dropna(subset=['Tweet'])
df_combined['Tweet'] = df_combined['Tweet'].astype(str)
df_combined = df_combined[df_combined['Tweet'].str.strip() != '']

In [20]:
df_humans_tweets.shape

(25000, 5)

In [21]:
df_bots_tweets.shape

(25000, 5)

In [22]:
df_combined.shape

(50000, 5)

In [23]:
type(df_combined['Tweet'])

pandas.core.series.Series

In [24]:
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset, random_split

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [26]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

In [27]:
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.cpu().numpy()

In [28]:
df_combined['Tweet Embedding'] = df_combined['Tweet'].apply(get_bert_embedding)
df_combined.head(5)

Unnamed: 0,UserID,TweetID,Tweet,CreatedAt,is_bot,Tweet Embedding
0,6301,5599519501,MELBOURNE ENQUIRY: Seeking a variety of acts f...,2009-11-10 15:14:31,1,"[0.14196883, -0.20637092, 0.53259736, -0.00723..."
1,6301,5600313663,THE BURLESQUE BOOTCAMP SYDNEY - Open Date tick...,2009-11-10 15:46:05,1,"[0.2789788, -0.2811198, 0.6468443, 0.18883522,..."
2,6301,5600328557,THE BURLESQUE BOOTCAMP SYDNEY - Open Date tick...,2009-11-10 15:46:40,1,"[0.28744933, -0.25366488, 0.67137194, 0.101002..."
3,6301,5600338093,THE BURLESQUE BOOTCAMP SYDNEY - Open Date tick...,2009-11-10 15:47:03,1,"[0.21801575, -0.22536404, 0.63394326, 0.078535..."
4,6301,5600564863,"Come to ""The Burlesque Bootcamp - Sydney"" Satu...",2009-11-10 15:56:03,1,"[0.15251878, -0.28992698, 0.8231413, -0.060256..."


In [29]:
X = df_combined['Tweet Embedding']
y = df_combined['is_bot']
groups = df_combined['UserID']

In [30]:
X_tensor = torch.tensor(X.tolist(), dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32).unsqueeze(1)

  X_tensor = torch.tensor(X.tolist(), dtype=torch.float32)


In [31]:
train_size = int(0.8 * len(X))
test_size = len(X) - train_size
train_dataset, test_dataset = random_split(TensorDataset(X_tensor, y_tensor), [train_size, test_size])

In [205]:
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2048, shuffle=False)

In [206]:
from torch import nn, optim

class BotPredictionNN(nn.Module):
  def __init__(self, input_dim):
      super(BotPredictionNN, self).__init__()
      self.fc = nn.Sequential(
        nn.Dropout(0.4),
        nn.Linear(input_dim, 256),
        nn.GELU(),
        nn.Dropout(0.4),
        nn.Linear(256, 256),
        nn.GELU(),
        nn.Dropout(0.4),
        nn.Linear(256, 1),
        nn.Sigmoid()
      )

  def forward(self, x):
    return self.fc(x)

In [207]:
num_epochs = 1000

input_dim = X_tensor.shape[1]
model = BotPredictionNN(input_dim).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = optim.lr_scheduler.LinearLR(optimizer, total_iters=20)
# scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.93)

In [208]:
class EarlyStopper:
    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = float('inf')

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [209]:
early_stopper = EarlyStopper(patience=20, min_delta=0.003)
best_test_loss = float("inf")

for epoch in range(num_epochs):
    model.train()
    running_train_loss = 0.0   
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()

    model.eval()
    running_test_loss = 0.0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_test_loss += loss.item()

    if (tl := running_test_loss / len(test_loader)) < best_test_loss:
        best_test_loss = tl
        torch.save(model.state_dict(), "best_model.pth")

    scheduler.step()
    print(
        f'Epoch [{epoch + 1}/{num_epochs}], '
        f'Training Loss: {running_train_loss / len(train_loader):.4f}, '
        f'Test Loss: {running_test_loss / len(test_loader):.4f}, '
        f'Best Test Loss: {best_test_loss:.4f}'
    )

    if early_stopper.early_stop(running_test_loss):
        break

print("Training complete.")

Epoch [1/1000], Training Loss: 0.6917, Test Loss: 0.6886, Best Test Loss: 0.6886
Epoch [2/1000], Training Loss: 0.6854, Test Loss: 0.6813, Best Test Loss: 0.6813
Epoch [3/1000], Training Loss: 0.6757, Test Loss: 0.6692, Best Test Loss: 0.6692
Epoch [4/1000], Training Loss: 0.6599, Test Loss: 0.6499, Best Test Loss: 0.6499
Epoch [5/1000], Training Loss: 0.6380, Test Loss: 0.6271, Best Test Loss: 0.6271
Epoch [6/1000], Training Loss: 0.6181, Test Loss: 0.6108, Best Test Loss: 0.6108
Epoch [7/1000], Training Loss: 0.6062, Test Loss: 0.6020, Best Test Loss: 0.6020
Epoch [8/1000], Training Loss: 0.5992, Test Loss: 0.5943, Best Test Loss: 0.5943
Epoch [9/1000], Training Loss: 0.5922, Test Loss: 0.5874, Best Test Loss: 0.5874
Epoch [10/1000], Training Loss: 0.5872, Test Loss: 0.5809, Best Test Loss: 0.5809
Epoch [11/1000], Training Loss: 0.5822, Test Loss: 0.5747, Best Test Loss: 0.5747
Epoch [12/1000], Training Loss: 0.5774, Test Loss: 0.5694, Best Test Loss: 0.5694
Epoch [13/1000], Training

In [210]:
model.load_state_dict(torch.load("best_model.pth"))

  model.load_state_dict(torch.load("best_model.pth"))


<All keys matched successfully>

In [211]:
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            predicted = (outputs >= 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.4f}')

evaluate_model(model, test_loader)

Test Accuracy: 0.7840
