# In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import json
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.utils import shuffle
import torch.nn.functional as F

# Load a JSON file into a DataFrame
def get_data(filename):
    """Read the JSON document at ``filename`` and return it as a DataFrame.

    The file is opened as UTF-8 text, parsed with ``json.load``, and the
    parsed object is passed unchanged to ``pd.DataFrame``.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        records = json.load(handle)
    return pd.DataFrame(records)

# Read the raw user records from disk
data = get_data('output_file copy.json')

# Report how many rows each class has before balancing
print("Original class distribution:")
print(data['user_class'].value_counts())

# Partition the rows by their user_class label
bots = data.loc[data['user_class'] == 'bot']
humans = data.loc[data['user_class'] == 'human']

# Downsample both classes to the size of the smaller one (fixed seed)
min_class_size = min(bots.shape[0], humans.shape[0])
bots_balanced = bots.sample(n=min_class_size, random_state=1)
humans_balanced = humans.sample(n=min_class_size, random_state=1)

# Stack the two equally sized classes, shuffle the rows with a fixed seed,
# and renumber the index from 0
balanced_data = shuffle(
    pd.concat([bots_balanced, humans_balanced]),
    random_state=1,
).reset_index(drop=True)

# Report the class counts after balancing
print("Balanced class distribution:")
print(balanced_data['user_class'].value_counts())

# Define dataset class
class UserDataset(Dataset):
    """Map-style dataset serving text/label samples from a DataFrame.

    Each sample is a dict holding the row's ``action_blocstring`` value under
    ``'text'`` and a binary ``'label'``: 1 when ``user_class`` equals
    ``'bot'``, 0 for any other value.
    """

    def __init__(self, data):
        """Store the backing DataFrame.

        Args:
            data: DataFrame with ``action_blocstring`` and ``user_class``
                columns.
        """
        self.data = data

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Rows are addressed by position (``iloc``), not by index label, so the
        dataset works with any DataFrame index rather than only a freshly
        reset 0..n-1 index. An out-of-range position raises ``IndexError``,
        which also lets plain ``for sample in dataset`` iteration terminate.

        Returns:
            dict with keys ``'text'`` and ``'label'``.
        """
        text = self.data['action_blocstring'].iloc[idx]
        user_class = self.data['user_class'].iloc[idx]
        return {
            'text': text,
            'label': 1 if user_class == 'bot' else 0,
        }

# Create dataset
dataset = UserDataset(balanced_data)

# Split dataset into train, validation, and test sets (60% / 20% / 20%).
# The test set absorbs whatever is left after the two int() truncations.
train_size = int(0.6 * len(dataset))
val_size = int(0.2 * len(dataset))
test_size = len(dataset) - train_size - val_size

# Seed the split like the class balancing and shuffle (random_state=1).
# Without a generator the split, and the vocabulary later built from the
# train split, would differ on every run.
split_generator = torch.Generator().manual_seed(1)
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# Character-level tokenizer
def tokenizer(text):
    """Split ``text`` into a list of its individual characters."""
    return [char for char in text]

# Token counts
from collections import Counter, OrderedDict

# Count character frequencies over the training split only
token_counts = Counter()
for entry in train_dataset:
    token_counts.update(tokenizer(entry['text']))

# Order tokens from most to least frequent (ties keep first-seen order)
sorted_by_freq_tuples = token_counts.most_common()
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# Ids 0 and 1 are reserved for the special tokens; real tokens start at 2
vocab = {"<pad>": 0, "<unk>": 1}
vocab.update(
    (token, idx) for idx, token in enumerate(ordered_dict, start=2)
)

# Report the vocabulary size, special tokens included
print('Vocab-size:', len(vocab))

# Map tokens to their vocabulary ids
def encode(tokens):
    """Return the vocabulary id of each token in ``tokens``.

    Tokens missing from the module-level ``vocab`` are mapped to the id of
    ``"<unk>"``.
    """
    ids = []
    for token in tokens:
        ids.append(vocab.get(token, vocab["<unk>"]))
    return ids

# Text pipeline: raw string -> list of vocabulary ids
def text_pipeline(x):
    """Tokenize ``x`` and look each token up in ``vocab`` (``<unk>`` if absent)."""
    return [vocab.get(token, vocab["<unk>"]) for token in tokenizer(x)]


# Label pipeline: class label -> float target
def label_pipeline(x):
    """Convert label ``x`` to a Python float."""
    return float(x)

# Collate batch function
def collate_batch(batch):
    """Assemble a list of samples into padded batch tensors.

    Each sample is a dict with ``'text'`` and ``'label'`` entries. Texts are
    encoded with ``text_pipeline`` into int64 tensors and right-padded to the
    longest sequence in the batch. Labels go through ``label_pipeline``.

    Returns:
        Tuple ``(padded_text, labels, lengths)``, each moved to the
        module-level ``device``. ``lengths`` holds the unpadded length of
        every sequence.
    """
    encoded_texts = [
        torch.tensor(text_pipeline(sample['text']), dtype=torch.int64)
        for sample in batch
    ]
    labels = torch.tensor([label_pipeline(sample['label']) for sample in batch])
    seq_lengths = torch.tensor([seq.size(0) for seq in encoded_texts])

    # batch_first=True yields a (batch, max_len) tensor
    padded_texts = nn.utils.rnn.pad_sequence(encoded_texts, batch_first=True)

    return padded_texts.to(device), labels.to(device), seq_lengths.to(device)

# Attention-based LSTM Model
class AttentionLSTM(nn.Module):
    """LSTM encoder with additive attention pooling and a sigmoid classifier.

    Token ids are embedded, run through a (multi-layer) LSTM, pooled into one
    vector per sequence by a learned attention over timesteps, and passed
    through a two-layer head that ends in a sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimension.
        rnn_hidden_size: LSTM hidden size.
        fc_hidden_size: Width of the hidden fully connected layer.
        num_layers: Number of stacked LSTM layers.
        dropout_rate: Dropout applied between LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size, num_layers=2, dropout_rate=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # Inter-layer dropout does nothing on a single-layer LSTM and only
        # triggers a warning, so it is disabled in that case.
        self.rnn = nn.LSTM(
            embed_dim,
            rnn_hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_rate if num_layers > 1 else 0.0,
        )
        self.attn = nn.Linear(rnn_hidden_size, 1)  # Attention layer to calculate attention scores
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        """Score a batch of padded sequences.

        Args:
            text: (batch, seq_len) tensor of token ids, padded with id 0.
            lengths: 1-D tensor with the unpadded length of each sequence.
                Every length must be at least 1.

        Returns:
            Tuple ``(out, attention_weights)``. ``out`` is (batch, 1) with
            sigmoid probabilities. ``attention_weights`` is
            (batch, max(lengths), 1), sums to 1 over the time axis, and is
            exactly 0 on padded timesteps.
        """
        # Embedding layer
        embedded = self.embedding(text)

        # Pack the sequences so the LSTM skips padded timesteps
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True
        )

        # RNN output
        packed_out, _ = self.rnn(packed)

        # Unpack sequence (restore padded sequences; padded steps are zeros)
        rnn_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        # Padded timesteps would otherwise score as the attention bias and
        # take softmax mass, making a sample's output depend on how much
        # padding its batch adds. Set their scores to -inf so they get
        # weight 0.
        positions = torch.arange(rnn_out.size(1), device=rnn_out.device)
        valid = positions.unsqueeze(0) < lengths.to(rnn_out.device).unsqueeze(1)
        scores = self.attn(rnn_out).masked_fill(~valid.unsqueeze(-1), float('-inf'))

        # Attention mechanism: normalise scores over the time axis
        attention_weights = F.softmax(scores, dim=1)

        # Weighted sum of the RNN outputs
        weighted_sum = torch.sum(attention_weights * rnn_out, dim=1)

        # Pass the weighted sum through the fully connected layers
        out = self.fc1(weighted_sum)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)

        return out, attention_weights

# Initialize model, optimizer, and loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lr = 0.001
embed_dim = 64
rnn_hidden_size = 128
fc_hidden_size = 64
num_layers = 2
dropout_rate = 0.5

# Pass num_layers and dropout_rate explicitly. Otherwise the model silently
# uses its own defaults and editing the two settings above has no effect.
model = AttentionLSTM(
    len(vocab),
    embed_dim,
    rnn_hidden_size,
    fc_hidden_size,
    num_layers=num_layers,
    dropout_rate=dropout_rate,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# The model ends in a sigmoid, so it is paired with plain BCELoss
loss_fn = nn.BCELoss()

# Create dataloaders (only the training loader is shuffled)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)

# Training function
def train_epoch(dataloader, model, optimizer, loss_fn):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: Iterable yielding ``(text_batch, label_batch, lengths)``
            that exposes ``.dataset`` with a length.
        model: Callable as ``model(text_batch, lengths)`` returning
            ``(predictions, extra)``. Predictions are probabilities, one per
            sample.
        optimizer: Optimizer over the model's parameters.
        loss_fn: Loss taking ``(predictions, labels)`` of equal shape.

    Returns:
        Tuple ``(accuracy, loss)``, both averaged over
        ``len(dataloader.dataset)``.
    """
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred, _ = model(text_batch, lengths)
        # Flatten (batch, 1) to (batch,) so predictions line up with labels.
        # Comparing (batch, 1) with (batch,) broadcasts to (batch, batch) and
        # inflates accuracy. A bare squeeze() turns a batch of one into a 0-d
        # tensor whose shape no longer matches the (1,) target.
        pred = pred.reshape(-1)
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
        # loss is a batch mean, so weight it by batch size before averaging
        total_loss += loss.item() * label_batch.size(0)
    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

# Evaluation function
def evaluate_epoch(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` without updating weights.

    Args:
        dataloader: Iterable yielding ``(text_batch, label_batch, lengths)``
            that exposes ``.dataset`` with a length.
        model: Callable as ``model(text_batch, lengths)`` returning
            ``(predictions, extra)``. Predictions are probabilities, one per
            sample.
        loss_fn: Loss taking ``(predictions, labels)`` of equal shape.

    Returns:
        Tuple ``(accuracy, loss)``, both averaged over
        ``len(dataloader.dataset)``.
    """
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred, _ = model(text_batch, lengths)
            # Flatten (batch, 1) to (batch,) so predictions line up with
            # labels. Comparing (batch, 1) with (batch,) broadcasts to
            # (batch, batch) and inflates accuracy. A bare squeeze() turns a
            # batch of one into a 0-d tensor that no longer matches the
            # (1,) target.
            pred = pred.reshape(-1)
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred >= 0.5).float() == label_batch).float().sum().item()
            # loss is a batch mean, so weight it by batch size
            total_loss += loss.item() * label_batch.size(0)
    return total_acc / len(dataloader.dataset), total_loss / len(dataloader.dataset)

# Train the model with early stopping
num_epochs = 40
early_stop_patience = 10
# Start from +inf and pre-initialise the bookkeeping so the first epoch
# always counts as an improvement. With a finite threshold, a first
# validation loss above it would reach `patience_counter += 1` before the
# counter exists and raise NameError.
best_val_loss = float('inf')
best_epoch = 0
patience_counter = 0
train_accuracies = []
train_losses = []
valid_accuracies = []
valid_losses = []

for epoch in range(num_epochs):
    acc_train, loss_train = train_epoch(train_loader, model, optimizer, loss_fn)
    acc_valid, loss_valid = evaluate_epoch(val_loader, model, loss_fn)

    # Keep the per-epoch history for plotting
    train_accuracies.append(acc_train)
    train_losses.append(loss_train)
    valid_accuracies.append(acc_valid)
    valid_losses.append(loss_valid)

    print(f'Epoch {epoch + 1} - train_accuracy: {acc_train:.4f}, val_accuracy: {acc_valid:.4f}, train_loss: {loss_train:.4f}, val_loss: {loss_valid:.4f}')

    # Reset patience on any strict improvement in validation loss
    if loss_valid < best_val_loss:
        best_val_loss = loss_valid
        best_epoch = epoch
        patience_counter = 0
    else:
        patience_counter += 1

    # NOTE: stopping keeps the weights of the last epoch run, not those of
    # the best epoch; no checkpoint is saved or restored here.
    if patience_counter >= early_stop_patience:
        print(f"Early stopping at epoch {epoch + 1}. Best validation loss was {best_val_loss:.4f} at epoch {best_epoch + 1}.")
        break

# Plot the training and validation loss and accuracy
epochs_completed = len(train_losses)
epoch_axis = range(1, epochs_completed + 1)  # 1-based epoch numbers for the x axis

# One row per subplot:
# (position, y label, title, train series, train label, valid series, valid label)
panels = [
    (1, 'Loss', 'Training and Validation Loss',
     train_losses, 'Training Loss', valid_losses, 'Validation Loss'),
    (2, 'Accuracy', 'Training and Validation Accuracy',
     train_accuracies, 'Training Accuracy', valid_accuracies, 'Validation Accuracy'),
]

plt.figure(figsize=(14, 5))
for position, y_label, title, train_series, train_label, valid_series, valid_label in panels:
    plt.subplot(1, 2, position)
    plt.plot(epoch_axis, train_series, label=train_label, color='red')
    plt.plot(epoch_axis, valid_series, label=valid_label, color='blue')
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()

plt.show()
