In [None]:
!pip install torch
!pip install transformers




In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel
import numpy as np
import ast
import matplotlib.pyplot as plt
from torch.cuda.amp import autocast, GradScaler
import pandas as pd

In [None]:
# Read the combined (augmented + dev) training CSV and the test CSV from Drive.
# Point the second path at your own test file if it lives elsewhere.
combined_df = pd.read_csv('/content/drive/MyDrive/Sharedtask/combined_augmented_dataset_with_dev.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Sharedtask/test.csv')

# Work on a copy so the frame as loaded from disk stays available unchanged.
train_df = combined_df.copy()

# The 'techniques' column holds Python-literal text; parse each cell into
# the corresponding Python object.
train_df['techniques'] = train_df['techniques'].map(ast.literal_eval)

In [None]:
# Define techniques columns
all_techniques = ['straw_man', 'appeal_to_fear', 'fud', 'bandwagon',
                  'whataboutism', 'loaded_language',
                  'glittering_generalities', 'euphoria',
                  'cherry_picking', 'cliche']


def build_label_matrix(technique_lists, label_names):
    """Build a 0/1 indicator DataFrame from a Series of technique collections.

    Args:
        technique_lists: pandas Series whose values are iterables of
            technique names (one iterable per row).
        label_names: ordered label names; they become the output columns.

    Returns:
        An int64 DataFrame sharing the index of ``technique_lists`` with one
        column per label; a cell is 1 when that row lists the label.
        Names not present in ``label_names`` are ignored.
    """
    label_names = list(label_names)
    column_of = {name: position for position, name in enumerate(label_names)}
    matrix = np.zeros((len(technique_lists), len(label_names)), dtype=np.int64)
    # Rows are filled by position, and the original index is attached at the
    # end, so the result stays aligned even when the index is not 0..n-1.
    for row, techniques in enumerate(technique_lists):
        for technique in techniques:
            column = column_of.get(technique)
            if column is not None:
                matrix[row, column] = 1
    return pd.DataFrame(matrix, index=technique_lists.index, columns=label_names)


# Create label matrix for training data
label_matrix = build_label_matrix(train_df['techniques'], all_techniques)

# Drop label columns left over from a previous run of this cell first;
# otherwise concat would duplicate them and train_df[all_techniques] would
# return twice as many columns as there are labels.
train_df = pd.concat(
    [train_df.drop(columns=all_techniques, errors='ignore'), label_matrix],
    axis=1,
)

In [None]:
# Tokenization
# Hugging Face Hub checkpoint id; the same name is passed to
# MultiLabelClassifier in the training cell to load the encoder weights.
model_name = "Alibaba-NLP/gte-multilingual-reranker-base"
# Tokenizer matching the checkpoint above (fetched from the Hub or local cache).
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_data(df, tokenizer, max_length=128, is_test=False, label_columns=None):
    """Tokenize the 'content' column of ``df`` and optionally build label tensors.

    Args:
        df: DataFrame with a 'content' column; unless ``is_test`` is set it
            must also contain the label columns.
        tokenizer: callable invoked as ``tokenizer(texts, truncation=...,
            padding=..., max_length=..., return_tensors='pt')``.
        max_length: truncation length forwarded to the tokenizer.
        is_test: when True, only the encodings are returned.
        label_columns: names of the label columns; defaults to the
            module-level ``all_techniques`` list.

    Returns:
        ``encodings`` when ``is_test`` is True, otherwise the tuple
        ``(encodings, labels)`` where ``labels`` is a float tensor with one
        row per DataFrame row and one column per label.
    """
    print("Tokenizing data...")
    # Missing or non-string cells would make the tokenizer raise, so missing
    # values become empty strings and everything else is coerced to str.
    texts = ["" if pd.isna(text) else str(text) for text in df['content']]
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )
    if is_test:
        return encodings
    if label_columns is None:
        label_columns = all_techniques
    labels = torch.tensor(df[list(label_columns)].to_numpy(), dtype=torch.float)
    return encodings, labels

# Encode both splits: the training call also yields the label tensor,
# the test call (is_test=True) returns encodings only.
train_encodings, train_labels = tokenize_data(df=train_df, tokenizer=tokenizer)
test_encodings = tokenize_data(df=test_df, tokenizer=tokenizer, is_test=True)

In [None]:
# Dataset Class
class TextDataset(Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    ``encodings`` is a mapping that contains an 'input_ids' entry and whose
    values can be indexed by example; ``labels`` (if given) is indexed the
    same way and exposed under the 'labels' key of each item.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is taken from the 'input_ids' entry.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Slice every encoding field at the requested position.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        if self.labels is None:
            return sample
        sample['labels'] = self.labels[idx]
        return sample

# Wrap the encodings as datasets; only the training split carries labels.
train_dataset = TextDataset(encodings=train_encodings, labels=train_labels)
test_dataset = TextDataset(encodings=test_encodings)

In [None]:
# Model Definition
class MultiLabelClassifier(nn.Module):
    """Transformer encoder followed by a BiLSTM and a linear multi-label head.

    Args:
        transformer_model_name: checkpoint name passed to
            ``AutoModel.from_pretrained``.
        num_labels: number of independent binary labels (output logits).
        hidden_dim: hidden size of each LSTM direction.
        lstm_layers: number of stacked BiLSTM layers.
        trust_remote_code: forwarded to ``AutoModel.from_pretrained``.
            Checkpoints that ship custom modelling code otherwise block on
            an interactive y/N prompt, which breaks "Run All". Setting this
            executes code from the checkpoint repository, so only use it
            with repositories you trust.
    """

    def __init__(self, transformer_model_name, num_labels, hidden_dim=256, lstm_layers=2,
                 trust_remote_code=True):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(
            transformer_model_name, trust_remote_code=trust_remote_code
        )
        transformer_hidden_dim = self.transformer.config.hidden_size

        # BiLSTM Layer
        self.bilstm = nn.LSTM(input_size=transformer_hidden_dim,
                              hidden_size=hidden_dim,
                              num_layers=lstm_layers,
                              batch_first=True,
                              bidirectional=True)

        # Classifier
        self.classifier = nn.Linear(hidden_dim * 2, num_labels)  # *2 for bidirectional

        # Dropout for regularization
        self.dropout = nn.Dropout(0.3)

        # Built once instead of on every forward call; it holds no parameters
        # or buffers, so the state_dict layout is unchanged.
        self.loss_fn = nn.BCEWithLogitsLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the model and optionally compute the BCE-with-logits loss.

        Args:
            input_ids: (batch_size, seq_len) token ids.
            attention_mask: (batch_size, seq_len) mask, 1 for real tokens and
                0 for padding. Padding is assumed to be on the right.
            labels: optional (batch_size, num_labels) float targets.

        Returns:
            ``(loss, logits)`` where ``loss`` is None when ``labels`` is None
            and ``logits`` has shape (batch_size, num_labels).
        """
        transformer_outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = transformer_outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)

        # Taking lstm_output[:, -1, :] reads the state at the last *padded*
        # position: the forward direction has run over padding and the
        # backward direction has only seen a single (pad) token there.
        # Packing with the true lengths makes the LSTM stop at the last real
        # token in both directions. Lengths must live on the CPU, and are
        # clamped to >= 1 because packing rejects empty sequences.
        lengths = attention_mask.sum(dim=1).clamp(min=1).to(torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            sequence_output, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.bilstm(packed)

        # h_n is (num_layers * 2, batch_size, hidden_dim); the last two entries
        # are the final forward and backward states of the top layer.
        lstm_last_output = torch.cat((h_n[-2], h_n[-1]), dim=1)  # (batch_size, hidden_dim*2)

        # Apply dropout
        lstm_last_output = self.dropout(lstm_last_output)

        # Get logits
        logits = self.classifier(lstm_last_output)

        # Compute loss if labels are provided
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return loss, logits

In [None]:
# Training Setup
SEED = 42
# Seed before the model is built and the loader shuffles, so the randomly
# initialised head and the batch order are repeatable across runs.
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Mixed precision and pinned memory only make sense on CUDA.
use_amp = device.type == 'cuda'

model = MultiLabelClassifier(
    transformer_model_name=model_name,
    num_labels=len(all_techniques),
    hidden_dim=256,
    lstm_layers=2
)
model.to(device)

# Optimizer and Learning Rate
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
# torch.cuda.amp.GradScaler/autocast are deprecated; torch.amp takes the device
# type explicitly and turns into a no-op when `enabled` is False.
scaler = torch.amp.GradScaler(device.type, enabled=use_amp)

# Training Parameters
batch_size = 8  # Reduced from 16 to manage memory
num_epochs = 100
patience = 2
best_loss = float('inf')
patience_counter = 0
accumulation_steps = 2  # Gradient accumulation to simulate larger batch size

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=use_amp)
test_loader = DataLoader(test_dataset, batch_size=batch_size, pin_memory=use_amp)


def train_one_epoch(model, loader, optimizer, scaler, device,
                    accumulation_steps=1, use_amp=False, log_every=10):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    Gradients are accumulated over ``accumulation_steps`` batches before each
    optimizer step. A trailing group shorter than ``accumulation_steps`` is
    also stepped, so its gradients are not discarded at the end of the epoch.

    Args:
        model: module called as ``model(input_ids, attention_mask, labels)``
            and returning ``(loss, logits)``.
        loader: iterable of batches with 'input_ids', 'attention_mask' and
            'labels' tensors; must support ``len``.
        optimizer: optimizer over the model parameters.
        scaler: gradient scaler used for the backward pass and step.
        device: torch.device the batches are moved to.
        accumulation_steps: number of batches per optimizer step.
        use_amp: whether to run the forward pass under autocast.
        log_every: print the batch loss every this many batches.

    Returns:
        Mean of the (unscaled) per-batch losses as a float.
    """
    model.train()
    num_batches = len(loader)
    running_loss = 0.0
    optimizer.zero_grad()  # Zero gradients at the start

    for batch_idx, batch in enumerate(loader):
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['labels'].to(device, non_blocking=True)

        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            loss, _ = model(input_ids, attention_mask, labels)

        # Scale loss for accumulation so the summed gradient matches a mean.
        scaler.scale(loss / accumulation_steps).backward()

        group_complete = (batch_idx + 1) % accumulation_steps == 0
        last_batch = (batch_idx + 1) == num_batches
        if group_complete or last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        batch_loss = loss.item()
        running_loss += batch_loss

        if batch_idx % log_every == 0:  # Print every `log_every` batches
            print(f"Batch {batch_idx}/{num_batches} - Loss: {batch_loss:.4f}")

        # torch.cuda.empty_cache() is intentionally not called per batch: it
        # forces the allocator to release and re-request memory every step,
        # which slows training without giving PyTorch more usable memory.

    return running_loss / max(num_batches, 1)


# Training Loop with Early Stopping
# NOTE(review): early stopping monitors the *training* loss because no
# validation split exists in this notebook; a held-out set would be a more
# reliable stopping signal.
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs} started")
    avg_loss = train_one_epoch(
        model, train_loader, optimizer, scaler, device,
        accumulation_steps=accumulation_steps, use_amp=use_amp,
    )
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    # Early stopping check
    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')  # Save best model
    else:
        patience_counter += 1
        print(f"Patience counter: {patience_counter}/{patience}")

    if patience_counter >= patience:
        print(f"Early stopping triggered after epoch {epoch+1}")
        break


In [None]:
# Load best model for prediction
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime;
# weights_only restricts unpickling to tensors, which is all a state_dict holds.
best_state = torch.load('best_model.pt', map_location=device, weights_only=True)
model.load_state_dict(best_state)
model.eval()
predictions = []

# Mixed precision is only enabled on CUDA; on CPU autocast is a no-op here.
use_amp = device.type == 'cuda'

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            _, logits = model(input_ids, attention_mask)
        probs = torch.sigmoid(logits)
        preds = (probs > 0.5).float()
        # One 1-D array of 0.0/1.0 values per test example.
        predictions.extend(preds.cpu().numpy())

# Stack the per-example rows into one (num_examples, num_labels) matrix;
# an empty test set yields an empty matrix with the right number of columns.
if predictions:
    prediction_matrix = np.vstack(predictions)
else:
    prediction_matrix = np.zeros((0, len(all_techniques)), dtype=np.float32)

# Create Submission DataFrame
# Columns follow the order of all_techniques, which is the order of the
# model's output logits; 'id' is placed first.
submission_df = pd.DataFrame(prediction_matrix, columns=all_techniques)
submission_df.insert(0, 'id', test_df['id'].to_numpy())

# Save Submission
submission_df.to_csv('submission.csv', index=False)
print("Submission file created successfully!")

Tokenizing data...
Tokenizing data...
The repository for Alibaba-NLP/gte-multilingual-reranker-base contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/Alibaba-NLP/gte-multilingual-reranker-base.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


  scaler = GradScaler()
  with autocast():  # Mixed precision to reduce memory usage


Epoch 1/100 started
Batch 0/1952 - Loss: 0.6987
Batch 10/1952 - Loss: 0.6847
Batch 20/1952 - Loss: 0.6662
Batch 30/1952 - Loss: 0.6352
Batch 40/1952 - Loss: 0.6128
Batch 50/1952 - Loss: 0.5789
Batch 60/1952 - Loss: 0.5683
Batch 70/1952 - Loss: 0.5363
Batch 80/1952 - Loss: 0.5420
Batch 90/1952 - Loss: 0.5278
Batch 100/1952 - Loss: 0.4808
Batch 110/1952 - Loss: 0.4358
Batch 120/1952 - Loss: 0.4840
Batch 130/1952 - Loss: 0.4220
Batch 140/1952 - Loss: 0.3979
Batch 150/1952 - Loss: 0.3691
Batch 160/1952 - Loss: 0.4060
Batch 170/1952 - Loss: 0.4351
Batch 180/1952 - Loss: 0.3204
Batch 190/1952 - Loss: 0.3049
Batch 200/1952 - Loss: 0.3567
Batch 210/1952 - Loss: 0.3681
Batch 220/1952 - Loss: 0.4328
Batch 230/1952 - Loss: 0.4574
Batch 240/1952 - Loss: 0.3497
Batch 250/1952 - Loss: 0.4491
Batch 260/1952 - Loss: 0.3152
Batch 270/1952 - Loss: 0.4779
Batch 280/1952 - Loss: 0.4115
Batch 290/1952 - Loss: 0.3580
Batch 300/1952 - Loss: 0.3936
Batch 310/1952 - Loss: 0.2841
Batch 320/1952 - Loss: 0.3048
B