In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/new-dataset/combined_augmented_dataset.csv
/kaggle/input/new-dataset/test_df.csv


In [1]:
!pip install transformers torch pandas scikit-learn numpy



In [2]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW
import numpy as np
import ast

In [3]:
# Load the datasets
train_df = pd.read_csv('/kaggle/input/new-dataset/combined_augmented_dataset.csv')
test_df = pd.read_csv('/kaggle/input/new-dataset/test_df.csv')

# Preprocess techniques
train_df['techniques'] = train_df['techniques'].apply(ast.literal_eval)

# Define techniques columns
all_techniques = ['straw_man', 'appeal_to_fear', 'fud', 'bandwagon',
                  'whataboutism', 'loaded_language',
                  'glittering_generalities', 'euphoria',
                  'cherry_picking', 'cliche']

In [4]:
# Create label matrix for training data
label_matrix = pd.DataFrame(0, index=train_df.index, columns=all_techniques)
for idx, techniques in enumerate(train_df['techniques']):
    for technique in techniques:
        if technique in all_techniques:
            label_matrix.at[idx, technique] = 1
train_df = pd.concat([train_df, label_matrix], axis=1)

# Tokenization
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_data(df, tokenizer, max_length=512, is_test=False):
    print("tokenize data")
    encodings = tokenizer(
        df['content'].tolist(),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )
    if is_test:
        return encodings
    labels = torch.tensor(df[all_techniques].values, dtype=torch.float)
    return encodings, labels

# Tokenize training and test data
train_encodings, train_labels = tokenize_data(train_df, tokenizer)
test_encodings = tokenize_data(test_df, tokenizer, is_test=True)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

tokenize data
tokenize data


In [5]:
# Dataset Class
class TextDataset(Dataset):
    print("in class dataset")
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets
train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings)

in class dataset


In [6]:
# Model Definition
class MultiLabelClassifier(nn.Module):
    def __init__(self, transformer_model_name, num_labels):
        print("init")
        super(MultiLabelClassifier, self).__init__()
        self.transformer = AutoModel.from_pretrained(transformer_model_name)
        transformer_hidden_dim = self.transformer.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Sequential(
            nn.Linear(transformer_hidden_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, labels=None):
        transformer_outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = transformer_outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            loss_fn = nn.BCEWithLogitsLoss()
            loss = loss_fn(logits, labels)
        return loss, logits

In [7]:
# Training Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultiLabelClassifier(
    transformer_model_name=model_name,
    num_labels=len(all_techniques)
)
model.to(device)

# Optimizer and Learning Rate
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Training Parameters
batch_size = 8
num_epochs = 100  # Changed to 100 epochs
patience = 2  # Number of epochs to wait for improvement
best_loss = float('inf')
patience_counter = 0

init


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]



In [8]:
# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [9]:
# Training Loop with Early Stopping
for epoch in range(num_epochs):
    print("epoch started")
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        loss, logits = model(input_ids, attention_mask, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Calculate average loss
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    # Early stopping check
    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0
        # Optional: Save the best model
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        patience_counter += 1
        print(f"Patience counter: {patience_counter}/{patience}")
        
    if patience_counter >= patience:
        print(f"Early stopping triggered after epoch {epoch+1}")
        break

# Load best model for prediction
model.load_state_dict(torch.load('best_model.pt'))
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        _, logits = model(input_ids, attention_mask)
        probs = torch.sigmoid(logits)
        preds = (probs > 0.5).float()
        predictions.extend(preds.cpu().numpy())

# Create Submission DataFrame
submission_df = pd.DataFrame(columns=['id'] + all_techniques)
submission_df['id'] = test_df['id']

for technique in all_techniques:
    submission_df[technique] = [pred[all_techniques.index(technique)] for pred in predictions]

# Save Submission
submission_df.to_csv('submission.csv', index=False)
print("Submission file created successfully!")

epoch started
Epoch 1/100, Loss: 0.2858
epoch started
Epoch 2/100, Loss: 0.2025
epoch started
Epoch 3/100, Loss: 0.1322
epoch started
Epoch 4/100, Loss: 0.0805
epoch started
Epoch 5/100, Loss: 0.0509
epoch started
Epoch 6/100, Loss: 0.0355
epoch started
Epoch 7/100, Loss: 0.0282
epoch started
Epoch 8/100, Loss: 0.0240
epoch started
Epoch 9/100, Loss: 0.0219
epoch started
Epoch 10/100, Loss: 0.0194
epoch started
Epoch 11/100, Loss: 0.0182
epoch started
Epoch 12/100, Loss: 0.0160
epoch started
Epoch 13/100, Loss: 0.0164
Patience counter: 1/2
epoch started
Epoch 14/100, Loss: 0.0141
epoch started
Epoch 15/100, Loss: 0.0142
Patience counter: 1/2
epoch started
Epoch 16/100, Loss: 0.0113
epoch started
Epoch 17/100, Loss: 0.0135
Patience counter: 1/2
epoch started
Epoch 18/100, Loss: 0.0122
Patience counter: 2/2
Early stopping triggered after epoch 18


  model.load_state_dict(torch.load('best_model.pt'))


Submission file created successfully!
