In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer

from tqdm.notebook import tqdm

In [2]:
train_df = pd.read_parquet("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet")

texts = train_df['prompt'] + " " + train_df['response_a'] + " " + train_df['response_b']
labels = train_df['winner'].map(lambda x: 0 if x == 'model_a' else 1)

In [3]:
MAX_LEN = 4096
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_text(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='pt')

In [4]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = tokenize_text(self.texts[idx])
        return {"input_ids": encoding["input_ids"].squeeze(0), 
                "attention_mask": encoding["attention_mask"].squeeze(0),
                "label": torch.tensor(self.labels[idx], dtype=torch.long)}

In [5]:
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=1)

train_dataset = TextDataset(X_train.tolist(), y_train.tolist())
val_dataset = TextDataset(X_val.tolist(), y_val.tolist())
test_dataset = TextDataset(X_test.tolist(), y_test.tolist())

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [6]:
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_filters, filter_sizes, num_classes):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=f) for f in filter_sizes
        ])
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, input_ids):
        x = self.embedding(input_ids).permute(0, 2, 1)  # Reshape for Conv1d
        x = [torch.relu(conv(x)).max(dim=2)[0] for conv in self.convs]
        x = torch.cat(x, dim=1)
        x = self.dropout(x)
        return self.fc(x)

In [7]:
VOCAB_SIZE = tokenizer.vocab_size
EMBED_DIM = 32
NUM_FILTERS = 64
FILTER_SIZES = [3, 4, 5]
NUM_CLASSES = 2

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = TextCNN(VOCAB_SIZE, EMBED_DIM, NUM_FILTERS, FILTER_SIZES, NUM_CLASSES).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

cuda


In [8]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total = 0
        for batch in tqdm(train_loader):
            input_ids, labels = batch['input_ids'].to(device), batch['label'].to(device)
            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
        accuracy = correct / total
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}, Accuracy: {accuracy:.4f}")

In [None]:
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=5)

  0%|          | 0/1211 [00:00<?, ?it/s]

Epoch 1, Loss: 0.7259990688594878, Accuracy: 0.5023


  0%|          | 0/1211 [00:00<?, ?it/s]

Epoch 2, Loss: 0.7052910024227926, Accuracy: 0.4989


  0%|          | 0/1211 [00:00<?, ?it/s]

Epoch 3, Loss: 0.702501532216391, Accuracy: 0.5006


  0%|          | 0/1211 [00:00<?, ?it/s]

In [None]:
torch.save(model.state_dict(), "textcnn_model.pth")