In [1]:
%pip install transformers torch


Note: you may need to restart the kernel to use updated packages.


In [4]:
import json
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
import torch
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of raw texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of integer class ids, parallel to ``texts``.
        tokenizer: Object exposing ``encode_plus`` (here a ``BertTokenizer``).
        max_len: Length the tokenizer is asked to pad/truncate every item to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of items, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (each flattened to
            1-D) and ``labels`` (a ``torch.long`` scalar tensor).
        """
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        # Fixed-length encoding: pad up to max_len and truncate anything longer.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # return_tensors='pt' yields a leading batch axis; flatten drops it.
        item = {
            name: encoded[name].flatten()
            for name in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

def load_data(path, encoding='utf-8'):
    """Read a ``{label: [text, ...]}`` JSON file into parallel lists.

    Args:
        path: Location of the JSON file.
        encoding: Text encoding used to read the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding
            (which is typically not UTF-8 on Windows).

    Returns:
        ``(texts, labels)``: two equally long lists where ``texts[i]`` is a
        whitespace-stripped text and ``labels[i]`` is the JSON key it was
        listed under. Order follows the file: key by key, text by text.
    """
    with open(path, encoding=encoding) as f:
        data = json.load(f)
    texts = []
    labels = []
    for label, text_list in data.items():
        for text in text_list:
            texts.append(text.strip())
            labels.append(label)
    return texts, labels

# Settings for the training split, gathered in one place.
TRAIN_JSON_PATH = 'C:/Users/Maamar/Desktop/CS_ANLP_KaggComp/Data/augmented_train.json'
TRAIN_MAX_LEN = 128
TRAIN_BATCH_SIZE = 16

# Tokenizer matching the checkpoint that is fine-tuned further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the training texts with their string labels (the JSON keys).
train_texts, train_labels = load_data(TRAIN_JSON_PATH)

# Replace the string labels with integer ids; the fitted encoder is kept so
# predictions can be mapped back to label strings later.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)

train_dataset = CustomDataset(
    train_texts,
    train_labels,
    tokenizer,
    max_len=TRAIN_MAX_LEN,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
)


In [5]:
# Use the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# The classification head gets one output per distinct encoded training label.
num_labels = len(set(train_labels))
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
).to(device)

# NOTE(review): AdamW here is the class imported from `transformers` at the
# top of the notebook; upstream appears to deprecate it in favour of
# torch.optim.AdamW - confirm against the installed transformers version.
LEARNING_RATE = 2e-5
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``loss`` attribute.
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        Sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0
    for batch in data_loader:
        ids, mask, targets = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Passing labels makes the model compute the loss itself.
        batch_loss = model(ids, attention_mask=mask, labels=targets).loss

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(data_loader)

# Training loop: a fixed number of full passes over the training loader,
# reporting the mean batch loss after each pass.
NUM_EPOCHS = 3
for epoch in range(NUM_EPOCHS):
    loss = train_epoch(
        model,
        train_loader,
        optimizer,
        device,
    )
    print(f'Epoch {epoch + 1}, Loss: {loss:.2f}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 2.13
Epoch 2, Loss: 1.31
Epoch 3, Loss: 0.68


In [6]:
def load_test_data(path, encoding='utf-8'):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        path: Location of the text file (one sample per line).
        encoding: Text encoding used to read the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        List with one stripped string per line, in file order. Blank lines
        are kept as empty strings so positions still match line numbers.
    """
    with open(path, 'r', encoding=encoding) as file:
        # Iterating the handle streams lines instead of materialising them twice.
        return [line.strip() for line in file]

TEST_TEXT_PATH = 'C:/Users/Maamar/Desktop/CS_ANLP_KaggComp/Data/test_shuffle.txt'
TEST_MAX_LEN = 128
TEST_BATCH_SIZE = 32

test_texts = load_test_data(TEST_TEXT_PATH)

# CustomDataset requires a label per text; the test file has none, so zeros
# stand in as dummy labels (predict() never reads them).
placeholder_labels = [0] * len(test_texts)
test_dataset = CustomDataset(
    test_texts,
    placeholder_labels,
    tokenizer,
    max_len=TEST_MAX_LEN,
)

# shuffle=False keeps predictions in the same order as the lines of the file.
test_loader = DataLoader(
    test_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
)

def predict(model, data_loader, device):
    """Return the arg-max class index for every item yielded by ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``logits`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Flat list of predicted class indices (numpy integer scalars), in the
        order the loader produced the items.
    """
    model.eval()
    collected = []
    # Inference only: no gradients need to be tracked.
    with torch.no_grad():
        for batch in data_loader:
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            # Highest-scoring class along dim 1 for each row of the batch.
            batch_classes = logits.argmax(dim=1)
            collected.extend(batch_classes.cpu().numpy())
    return collected

# Score the test set, then map the integer class ids back to label strings
# with the encoder fitted on the training labels.
predictions = predict(model, test_loader, device)
predicted_labels = label_encoder.inverse_transform(predictions)

# Export predictions
# NOTE(review): the next cell writes 'predictions.csv' again with an added ID
# column, so this label-only file is overwritten when the notebook runs through.
import pandas as pd
label_only_columns = {'Label': predicted_labels}
submission_df = pd.DataFrame(label_only_columns)
submission_df.to_csv('predictions.csv', index=False)


In [7]:
# Relies on `predicted_labels` having been produced by the prediction cell.
import pandas as pd

# Submission table: a zero-based row ID next to each predicted label.
row_ids = range(len(predicted_labels))
submission_columns = {
    'ID': row_ids,
    'Label': predicted_labels,
}
submission_df = pd.DataFrame(submission_columns)

# Write the table without the DataFrame index so only ID and Label appear.
submission_df.to_csv('predictions.csv', index=False)
