In [19]:
%pip install torch transformers

Note: you may need to restart the kernel to use updated packages.


In [20]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [21]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader, Dataset
import pandas as pd

In [22]:
# Define the Transformer model
class CustomTransformerModel(nn.Module):
    """Pretrained BERT encoder followed by a single linear classification layer.

    Args:
        input_size: Number of input features of the linear layer. It has to
            equal the width of the encoder's ``pooler_output``.
        output_size: Number of values produced per example by the linear layer.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Encoder weights are loaded from the 'bert-base-uncased' checkpoint.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and project its pooled output through ``self.fc``.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The output of the linear layer applied to ``pooler_output``.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(encoded['pooler_output'])

In [23]:
# Custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        dataframe: Frame holding a 'description' column (text) and a 'label'
            column whose values are convertible with ``int()``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., truncation=True,
            padding='max_length', return_tensors='pt')``; its result must
            provide 'input_ids' and 'attention_mask' tensors with a leading
            axis of size 1.
        max_length: Length passed to the tokenizer for truncation and padding.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx``.

        Returns:
            dict with 'input_ids' and 'attention_mask' (leading axis removed)
            and 'label' as a 0-d ``torch.long`` tensor.

        Raises:
            ValueError: If the row's label cannot be converted by ``int()``
                (for example a NaN left behind by an incomplete label mapping).
        """
        # Look the row up once instead of once per column.
        row = self.data.iloc[idx]
        text = str(row['description'])
        label = int(row['label'])

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

        # Drop only the leading size-1 axis. A bare squeeze() would also remove
        # the sequence axis when max_length == 1 and yield a 0-d tensor.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long),
        }

In [24]:
# Read the training and development splits from their Excel workbooks
# (training file first, then the development file).
train_data, dev_data = (
    pd.read_excel(workbook) for workbook in ('train_new.xlsx', 'dev_new.xlsx')
)

In [25]:
# Tokenizer
# Instantiate the pretrained 'bert-base-uncased' tokenizer. The same object is
# later handed to CustomDataset, which calls it on every 'description' string.
# NOTE(review): the recorded output of this cell shows the HEAD request to
# huggingface.co timing out while later cells still ran, so the files were
# presumably served from a local cache — confirm before relying on a fresh
# environment.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

'HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/vocab.txt (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000299C973F210>, 'Connection to huggingface.co timed out. (connect timeout=10)'))' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt


In [26]:
# Build the classifier and the optimizer that updates all of its parameters.
# input_size is the feature width fed to the model's linear layer; output_size
# is the number of values it emits per example (labels in this notebook are
# mapped to 0 and 1).
input_size, output_size = 768, 2
model = CustomTransformerModel(input_size=input_size, output_size=output_size)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

'HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000299D355FC90>, 'Connection to huggingface.co timed out. (connect timeout=10)'))' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/config.json


In [27]:
# Training settings: number of passes over the training data and the number of
# examples per batch.
num_epochs, batch_size = 4, 8

# Objective used to compare the model's outputs with the integer class labels.
criterion = nn.CrossEntropyLoss()

In [28]:
# Wrap the training frame in a dataset and serve it in shuffled mini-batches.
# CustomDataset reads the 'label' column only when an item is requested, so the
# label-mapping cell that follows must have run before this loader is iterated.
train_dataset = CustomDataset(dataframe=train_data, tokenizer=tokenizer, max_length=128)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [29]:
label_mapping = {'bot': 0, 'human': 1}


def encode_labels(labels, mapping):
    """Convert a Series of class names into their integer codes.

    Values that already are one of the integer codes are kept, so running this
    cell a second time does not turn every label into NaN the way a plain
    ``Series.map(mapping)`` would.

    Args:
        labels: pandas Series of raw labels (class names or existing codes).
        mapping: dict from class name to integer code.

    Returns:
        An ``int64`` Series with the same index as ``labels``.

    Raises:
        ValueError: If a value is neither a key nor a code of ``mapping``
            (this includes missing values).
    """
    valid_codes = set(mapping.values())

    def _encode(value):
        if value in mapping:
            return mapping[value]
        if value in valid_codes:
            return int(value)
        raise ValueError(
            f'Unexpected label {value!r}; expected one of {sorted(mapping)}'
        )

    return labels.map(_encode).astype('int64')


# Assign back into the same frames: the datasets built from them hold a
# reference to these objects and read the 'label' column lazily.
train_data['label'] = encode_labels(train_data['label'], label_mapping)
dev_data['label'] = encode_labels(dev_data['label'], label_mapping)

In [32]:
# Fine-tune the model for `num_epochs` passes over `train_loader` and report
# the mean per-batch loss after each pass.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        input_ids, attention_mask, labels = (
            batch[key] for key in ('input_ids', 'attention_mask', 'label')
        )

        # Clear gradients left over from the previous step before the new
        # forward/backward pass.
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader)}')

Epoch 1/4, Loss: 0.4240371968492447
Epoch 2/4, Loss: 0.30372580426584284
Epoch 3/4, Loss: 0.2426694126268587
Epoch 4/4, Loss: 0.19965338585038023


In [33]:
# Measure classification accuracy on the development split.
model.eval()

dev_dataset = CustomDataset(dataframe=dev_data, tokenizer=tokenizer, max_length=128)
dev_loader = DataLoader(dataset=dev_dataset, batch_size=batch_size, shuffle=False)

correct, total = 0, 0
with torch.no_grad():  # gradients are not needed for scoring
    for batch in dev_loader:
        input_ids, attention_mask, labels = (
            batch[key] for key in ('input_ids', 'attention_mask', 'label')
        )

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The predicted class is the position of the largest output per row.
        predicted = torch.max(outputs, dim=1).indices

        total += labels.shape[0]
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Validation Accuracy: {accuracy}')

Validation Accuracy: 0.616
