In [None]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import ast
from torch.cuda.amp import autocast, GradScaler

In [None]:
# 1) Load the preprocessed train and test splits
train_df = pd.read_csv('/kaggle/input/processed-subtask1/subtask1-train_preprocessed.csv')
test_df = pd.read_csv('/kaggle/input/processed-subtask1/subtask1-test_preprocessed.csv')


# 2) Parse the 'techniques' column from its string form into Python lists
def _parse_techniques(raw):
    """Turn one raw 'techniques' cell into a Python object listing technique names.

    Args:
        raw: The cell value as read from the CSV. Normally the string form of a
            Python list literal, e.g. "['fud', 'cliche']".

    Returns:
        The literal parsed with ``ast.literal_eval``; an empty list for missing
        (NaN/None) or blank cells; a list copy if the value is already a
        list/tuple/set.

    Raises:
        ValueError / SyntaxError: if a non-blank string is not a valid literal,
            or the value is of an unsupported type.
    """
    if isinstance(raw, str):
        # Blank strings carry no labels; anything else must be a valid literal.
        return ast.literal_eval(raw) if raw.strip() else []
    if isinstance(raw, (list, tuple, set)):
        return list(raw)
    if pd.isna(raw):
        # read_csv yields NaN for empty cells; treat that as "no techniques"
        # instead of letting literal_eval raise on a float.
        return []
    raise ValueError(f"Unparseable 'techniques' value: {raw!r}")


train_df['techniques'] = train_df['techniques'].apply(_parse_techniques)

In [None]:

# 3) Define all technique labels (the order here fixes the label column order)
all_techniques = [
    'straw_man', 'appeal_to_fear', 'fud', 'bandwagon',
    'whataboutism', 'loaded_language',
    'glittering_generalities', 'euphoria',
    'cherry_picking', 'cliche'
]

# 4) Build a multi-hot label matrix
# Rows are filled by *position* so the result does not depend on train_df having
# a 0..n-1 index; the frame index is attached afterwards.
technique_to_column = {tech: col for col, tech in enumerate(all_techniques)}
multi_hot = np.zeros((len(train_df), len(all_techniques)), dtype=np.int64)
for row_pos, techniques in enumerate(train_df['techniques']):
    for tech in techniques:
        col = technique_to_column.get(tech)
        if col is not None:  # techniques outside the known list are ignored
            multi_hot[row_pos, col] = 1
label_matrix = pd.DataFrame(multi_hot, index=train_df.index, columns=all_techniques)

# Drop label columns left over from a previous run of this cell so that
# re-running it never produces duplicated label columns.
train_df = pd.concat(
    [train_df.drop(columns=all_techniques, errors='ignore'), label_matrix],
    axis=1,
)

# 5) Define technique descriptions (the label name with underscores as spaces)
technique_descriptions = {tech: tech.replace('_', ' ') for tech in all_techniques}

In [None]:
# 6) Choose the compute device: GPU when one is visible, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 7) Load the XLM-RoBERTa Large tokenizer and a single-logit sequence
#    classifier (used as a reranker), placing the model on the chosen device
model_name = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
reranker_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
).to(device)

# 8) Optimizer, loss function and AMP gradient scaler
optimizer = AdamW(
    reranker_model.parameters(),
    lr=2e-5,
    weight_decay=0.01,
)
loss_fn = nn.BCEWithLogitsLoss()
scaler = GradScaler()

In [None]:
# 9) Dataset wrapper
class TextDataset(Dataset):
    """Map-style dataset yielding raw text and, optionally, multi-hot labels.

    Args:
        df: DataFrame with a 'content' column and, when ``has_labels`` is true,
            one numeric column per label.
        has_labels: Whether items should carry a 'labels' tensor.
        label_columns: Names of the label columns, in output order. Defaults to
            the module-level ``all_techniques`` list when omitted.

    Each item is a dict with:
        'content': the text as ``str`` (missing values become '').
        'labels': float32 tensor of shape (len(label_columns),), only present
            when ``has_labels`` is true.
    """

    def __init__(self, df, has_labels=True, label_columns=None):
        self.df = df
        self.has_labels = has_labels

        # Extract the text once. Missing values (NaN after a CSV round-trip)
        # become '' so every item handed to the tokenizer is a str.
        self._contents = df['content'].fillna('').astype(str).tolist()

        # Extract the labels once instead of building a row Series per item.
        self._labels = None
        if has_labels:
            if label_columns is None:
                label_columns = all_techniques
            self.label_columns = list(label_columns)
            self._labels = torch.tensor(
                df[self.label_columns].to_numpy(dtype='float32'),
                dtype=torch.float,
            )

    def __getitem__(self, idx):
        item = {'content': self._contents[idx]}
        if self._labels is not None:
            item['labels'] = self._labels[idx]
        return item

    def __len__(self):
        return len(self._contents)

# 10) Wrap both splits in TextDataset and build their loaders.
#     Only the training loader shuffles; the test loader keeps row order.
batch_size = 8

train_dataset = TextDataset(train_df, has_labels=True)
test_dataset = TextDataset(test_df, has_labels=False)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=True,
)


In [None]:
# 11) Training loop with early stopping
# NOTE(review): early stopping and checkpointing below track the *training*
# loss; no validation split is created anywhere in this notebook, so the saved
# checkpoint is the best-fitting one, not necessarily the best-generalizing one.
num_epochs = 100  # upper bound; early stopping may end training sooner
patience = 4
best_loss = float('inf')
patience_counter = 0
accumulation_steps = 2

N = len(all_techniques)
num_batches = len(train_loader)

for epoch in range(num_epochs):
    reranker_model.train()
    total_loss = 0.0
    optimizer.zero_grad()

    for i, batch in enumerate(train_loader):
        batch_texts = batch['content']
        batch_labels = batch['labels'].to(device)  # (B, N)

        B = len(batch_texts)
        # Every (technique, text) pair is scored. Both lists are laid out
        # technique-major: the pair for technique t and text b sits at t * B + b.
        queries = [technique_descriptions[tech] for tech in all_techniques for _ in range(B)]
        passages = [text for _ in range(N) for text in batch_texts]

        # Tokenize
        inputs = tokenizer(queries, passages, padding=True, truncation=True, return_tensors='pt', max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Get scores
        with autocast():
            outputs = reranker_model(**inputs)
            # Logits arrive as (N * B, 1) in technique-major order, so they must
            # be viewed as (N, B) and transposed to line up with the (B, N)
            # labels. A direct view(B, N) would pair scores with wrong labels.
            scores = outputs.logits.view(N, B).t()  # (B, N)
            loss = loss_fn(scores, batch_labels) / accumulation_steps

        scaler.scale(loss).backward()

        # Step every `accumulation_steps` batches, and also on the final batch
        # so a trailing partial group is applied rather than being wiped by the
        # zero_grad() at the start of the next epoch.
        is_last_batch = (i + 1) == num_batches
        if (i + 1) % accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # Undo the accumulation scaling so the logged value is the per-batch loss.
        total_loss += loss.item() * accumulation_steps

        if i % 10 == 0:
            print(f"Epoch {epoch+1}, Batch {i}, Loss {total_loss / (i+1):.4f}")

        # Free up memory
        del inputs, outputs, scores, loss
        torch.cuda.empty_cache()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} completed — Avg Loss: {avg_loss:.4f}")

    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0
        torch.save(reranker_model.state_dict(), 'best_model.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break


In [None]:

# 12) Inference
# map_location lets a checkpoint saved on GPU be restored on whatever device is
# active now (e.g. a CPU-only session).
reranker_model.load_state_dict(torch.load('best_model.pt', map_location=device))
reranker_model.eval()
predictions = []
N = len(all_techniques)

with torch.no_grad():
    for batch in test_loader:
        batch_texts = batch['content']

        B = len(batch_texts)
        # Technique-major pair layout: technique t with text b sits at t * B + b.
        queries = [technique_descriptions[tech] for tech in all_techniques for _ in range(B)]
        passages = [text for _ in range(N) for text in batch_texts]

        inputs = tokenizer(queries, passages, padding=True, truncation=True, return_tensors='pt', max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with autocast():
            outputs = reranker_model(**inputs)
            # Logits are (N * B, 1) in technique-major order: view as (N, B) and
            # transpose so row b holds the N technique scores of text b.
            scores = outputs.logits.view(N, B).t()  # (B, N)
            probs = torch.sigmoid(scores)
            preds = (probs > 0.5).float()
            predictions.extend(preds.cpu().numpy())

        del inputs, outputs, scores, probs, preds
        torch.cuda.empty_cache()

# 13) Assemble the submission table: the test ids followed by one 0/1 column
#     per technique, then write it to disk without the index.
submission_columns = {'id': test_df['id']}
for position, technique in enumerate(all_techniques):
    submission_columns[technique] = [int(row[position]) for row in predictions]

submission_df = pd.DataFrame(submission_columns, columns=['id'] + all_techniques)

submission_df.to_csv('submission.csv', index=False)
print("Submission file created successfully!")