In [1]:
%pip install transformers torch scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from transformers import DebertaTokenizer
import torch

class TextDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every lookup.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of label values, aligned with ``texts``;
            each one is converted to a ``torch.long`` tensor on access.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Value forwarded to the tokenizer as ``max_length`` (together
            with ``padding='max_length'`` and ``truncation=True``).
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of samples, measured on ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors plus the label.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` (both flattened
            to 1-D) and ``'labels'`` (a ``torch.long`` tensor).
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_attention_mask': True,
            'padding': 'max_length',
            'truncation': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(sample_text, **tokenizer_options)
        # return_tensors='pt' yields a leading batch axis; flatten drops it.
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item

def load_data(filename):
    """Read a ``label -> [texts]`` JSON file and flatten it into parallel lists.

    The file is expected to contain a single JSON object whose values are
    iterables of texts; every text is paired with the key it was stored under.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        tuple: ``(texts, labels)`` — two lists of equal length where
        ``labels[i]`` is the key that held ``texts[i]``. Order follows the
        order of keys in the file, then the order of items under each key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON is UTF-8 by specification. Without an explicit encoding, open()
    # falls back to the platform locale (e.g. cp1252 on Windows), which
    # mis-decodes or rejects non-ASCII text.
    with open(filename, 'r', encoding='utf-8') as file:
        data = json.load(file)

    texts, labels = [], []
    # Single pass keeps the two lists aligned by construction.
    for label, samples in data.items():
        before = len(texts)
        texts.extend(samples)
        labels.extend([label] * (len(texts) - before))
    return texts, labels

# Load and preprocess data
# The tokenizer is looked up by checkpoint name; the first run downloads its files.
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
texts, labels = load_data('C:/Users/Maamar/Desktop/CS_ANLP_KaggComp/Data/augmented_train.json')
# Replace the label keys returned by load_data with encoded class ids. The fitted
# encoder is kept because a later cell calls inverse_transform on the predictions.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Split data
# 10% of the samples are held out; random_state pins the shuffle for repeatability.
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1, random_state=42)

# Create datasets
# Both datasets share the tokenizer and use max_len=128.
# NOTE(review): val_dataset is built here but no later cell in this notebook reads it.
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_len=128)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_len=128)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

In [4]:
from transformers import DebertaForSequenceClassification

# Load the pretrained checkpoint for sequence classification; num_labels is the
# number of distinct values in the encoded `labels` array from the data cell.
model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=len(set(labels)))


pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from torch.optim import AdamW
from tqdm import tqdm

# Use CUDA when torch reports it as available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Rebinds `model` to the result of moving it onto the chosen device.
model = model.to(device)
# AdamW over every model parameter with a fixed learning rate of 5e-6.
optimizer = AdamW(model.parameters(), lr=5e-6)

def train(model, data_loader, optimizer, device, epochs=4):
    """Run a plain training loop over ``data_loader`` for ``epochs`` passes.

    Args:
        model: Callable as ``model(input_ids, attention_mask=..., labels=...)``
            whose result exposes a ``.loss`` tensor; switched to train mode
            once before the first epoch.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer whose gradients are cleared and stepped per batch.
        device: Device every batch tensor is moved to before the forward pass.
        epochs: Number of full passes over ``data_loader`` (default 4).

    Returns:
        None. Progress and the latest batch loss are shown on a tqdm bar.
    """
    batch_fields = ('input_ids', 'attention_mask', 'labels')
    model.train()
    for epoch in range(epochs):
        progress = tqdm(data_loader, leave=True)
        for batch in progress:
            # Move the three tensors to the target device in a fixed order.
            on_device = {field: batch[field].to(device) for field in batch_fields}
            result = model(
                on_device['input_ids'],
                attention_mask=on_device['attention_mask'],
                labels=on_device['labels'],
            )
            batch_loss = result.loss
            # Clear stale gradients, backpropagate, then update the weights.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            progress.set_description(f'Epoch {epoch+1}')
            progress.set_postfix(loss=batch_loss.item())

# Batches of 8 training samples, reshuffled by the loader (shuffle=True).
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# `epochs` is not passed, so train() uses its default of 4.
train(model, train_loader, optimizer, device)


Epoch 1: 100%|██████████| 84/84 [13:49<00:00,  9.87s/it, loss=2.41]
Epoch 2: 100%|██████████| 84/84 [13:14<00:00,  9.46s/it, loss=0.756]
Epoch 3: 100%|██████████| 84/84 [12:59<00:00,  9.28s/it, loss=0.647]
Epoch 4: 100%|██████████| 84/84 [13:00<00:00,  9.29s/it, loss=0.168]


In [6]:
def load_test_data(filename, encoding='utf-8'):
    """Read a text file and return one stripped string per line.

    Every line produces exactly one entry — blank lines become ``''`` — so the
    position of an entry in the result equals its 0-based line number.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            rather than the platform locale (e.g. cp1252 on Windows), which
            would mis-decode non-ASCII text. Pass another value if the file
            is known to use a different encoding.

    Returns:
        list[str]: The lines with leading and trailing whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content does not match ``encoding``.
    """
    with open(filename, 'r', encoding=encoding) as file:
        # Iterating the handle streams lines; no intermediate readlines() list.
        return [line.strip() for line in file]

# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
test_texts = load_test_data('C:/Users/Maamar/Desktop/CS_ANLP_KaggComp/Data/test_shuffle.txt')
# TextDataset requires a labels sequence, so a zero is supplied for every test text.
test_dataset = TextDataset(test_texts, [0]*len(test_texts), tokenizer, max_len=128)  # Dummy labels for compatibility
# shuffle=False keeps batches in the same order as test_texts.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [7]:
def predict(model, data_loader, device):
    """Return the arg-max class index for every sample in ``data_loader``.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``; switched to eval mode before use.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``
            and ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        list: One NumPy integer scalar per sample, in the order the batches
        were yielded.
    """
    model.eval()
    collected = []
    # Inference only: gradient tracking is disabled for the whole loop.
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            logits = model(ids, attention_mask=mask).logits
            # argmax over dim 1 picks the highest-scoring class per row.
            best = torch.argmax(logits, dim=1)
            collected += list(best.cpu().numpy())
    return collected

# Generate predictions
# test_loader was built with shuffle=False, so the result lines up with test_texts.
test_predictions = predict(model, test_loader, device)


In [8]:
# Map predicted class ids back to the original label values, using the same
# label_encoder that was fitted with fit_transform on the training labels.
predicted_labels = label_encoder.inverse_transform(test_predictions)


In [9]:
import pandas as pd

# Two-column frame: 'ID' is the 0-based position of each prediction and
# 'Label' is the decoded label at that position.
submission_df = pd.DataFrame({
    'ID': range(len(predicted_labels)),
    'Label': predicted_labels
})

# Save to CSV file for submission
# index=False keeps the DataFrame index out of the file; the path is relative
# to the kernel's working directory.
submission_df.to_csv('submission_deberta.csv', index=False)
