In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mahedersubtask1/validation.csv
/kaggle/input/mahedersubtask1/train.csv
/kaggle/input/cleanedval/cleaned_validation_dataset.csv
/kaggle/input/processed-aug-data/preprocessed_augmented_dataset.csv
/kaggle/input/cleaned/cleaned_trained_dataset.csv
/kaggle/input/testdataa/cleaned_test_dataset.csv
/kaggle/input/testdataa/test.csv


In [2]:
import pandas as pd
df=pd.read_csv('/kaggle/input/cleaned/cleaned_trained_dataset.csv')
valdf=pd.read_csv('/kaggle/input/testdataa/test.csv')

In [3]:
import pandas as pd

# Assume you already have the DataFrame `df` with a column 'label'

# Desired sample size per class
target_count = 1301

# Undersample each class
balanced_df = (
    df.groupby('label')
    .apply(lambda x: x.sample(n=target_count, random_state=42))
    .reset_index(drop=True)
)

# Check the new distribution
print(balanced_df['label'].value_counts())
df=balanced_df

label
hate              1301
hope              1301
not_applicable    1301
Name: count, dtype: int64


  .apply(lambda x: x.sample(n=target_count, random_state=42))


In [4]:
df['label'].value_counts()

label
hate              1301
hope              1301
not_applicable    1301
Name: count, dtype: int64

In [5]:
# df = df[:1000]


In [6]:
from sklearn.model_selection import train_test_split

# Split df into train and validation (80% train, 20% val)
train_df=df

In [7]:
train_df['label'].value_counts()

label
hate              1301
hope              1301
not_applicable    1301
Name: count, dtype: int64

In [8]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
from torch.cuda.amp import autocast, GradScaler
from sklearn.preprocessing import LabelEncoder

# 1) Load the datasets
# id, text, label
test_df = valdf    # id, text (assuming test has no labels)

# 2) Define all technique labels
all_techniques = ['Hope', 'Hate', 'not_applicable']

# 3) Create label encoder for single-label classification
label_encoder = LabelEncoder()
train_df['label_encoded'] = label_encoder.fit_transform(train_df['label'])

# Print label mapping for reference
print("Label mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"{i}: {label}")

# 4) Load tokenizer and model - using XLM-RoBERTa for Arabic support
model_name = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# For single-label classification, num_labels = number of classes
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(all_techniques))

# 5) Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# 6) Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
# For single-label classification, use CrossEntropyLoss
loss_fn = nn.CrossEntropyLoss()
scaler = GradScaler()

# 7) Dataset wrapper for single-label classification
class TextDataset(Dataset):
    def __init__(self, df, has_labels=True):
        self.df = df
        self.has_labels = has_labels
    
    def __getitem__(self, idx):
        item = {'content': self.df.iloc[idx]['text']}
        if self.has_labels:
            # Single label as integer
            label = self.df.iloc[idx]['label_encoded']
            item['labels'] = torch.tensor(label, dtype=torch.long)
        return item
    
    def __len__(self):
        return len(self.df)

# 8) Create datasets and dataloaders
train_dataset = TextDataset(train_df, has_labels=True)
test_dataset = TextDataset(test_df, has_labels=False)

batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, pin_memory=True)

# 9) Training loop with early stopping
num_epochs = 100  # Adjusted for single-label task
patience = 3
best_loss = float('inf')
patience_counter = 0
accumulation_steps = 2

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    optimizer.zero_grad()
    
    for i, batch in enumerate(train_loader):
        batch_texts = batch['content']
        batch_labels = batch['labels'].to(device)  # (B,) - single label per sample
        
        # Tokenize the text directly (no query-passage pairs for single-label)
        inputs = tokenizer(batch_texts, padding=True, truncation=True, 
                          return_tensors='pt', max_length=512)  # Increased max_length for Arabic
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Get predictions
        with autocast():
            outputs = model(**inputs)
            logits = outputs.logits  # (B, num_classes)
            loss = loss_fn(logits, batch_labels) / accumulation_steps
        
        scaler.scale(loss).backward()
        
        if (i + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        
        total_loss += loss.item() * accumulation_steps
        
        if i % 10 == 0:
            print(f"Epoch {epoch+1}, Batch {i}, Loss {total_loss / (i+1):.4f}")
        
        # Free up memory
        del inputs, outputs, logits, loss
        torch.cuda.empty_cache()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} completed — Avg Loss: {avg_loss:.4f}")
    
    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
        print(f"New best model saved with loss: {best_loss:.4f}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

# 10) Inference
model.load_state_dict(torch.load('best_model.pt'))
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        batch_texts = batch['content']
        
        # Tokenize the text
        inputs = tokenizer(batch_texts, padding=True, truncation=True, 
                          return_tensors='pt', max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with autocast():
            outputs = model(**inputs)
            logits = outputs.logits  # (B, num_classes)
            # Get predicted class (argmax)
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
        
        del inputs, outputs, logits, preds
        torch.cuda.empty_cache()

# 11) Convert predictions back to original labels
predicted_labels = label_encoder.inverse_transform(predictions)
print(predicted_labels)
# 12) Build submission
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'label': predicted_labels
})

# submission_df.to_csv('submission.csv', index=False)
print("Submission file created successfully!")
print(f"Predictions distribution:")
print(pd.Series(predicted_labels).value_counts())

Label mapping:
0: hate
1: hope
2: not_applicable


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2025-07-21 07:27:06.082121: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753082826.325590      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753082826.392953      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()
  with autocast():


Epoch 1, Batch 0, Loss 1.1477
Epoch 1, Batch 10, Loss 1.1406
Epoch 1, Batch 20, Loss 1.1396
Epoch 1, Batch 30, Loss 1.1321
Epoch 1, Batch 40, Loss 1.1385
Epoch 1, Batch 50, Loss 1.1338
Epoch 1, Batch 60, Loss 1.1359
Epoch 1, Batch 70, Loss 1.1334
Epoch 1, Batch 80, Loss 1.1297
Epoch 1, Batch 90, Loss 1.1253
Epoch 1, Batch 100, Loss 1.1230
Epoch 1, Batch 110, Loss 1.1189
Epoch 1, Batch 120, Loss 1.1186
Epoch 1, Batch 130, Loss 1.1202
Epoch 1, Batch 140, Loss 1.1198
Epoch 1, Batch 150, Loss 1.1195
Epoch 1, Batch 160, Loss 1.1189
Epoch 1, Batch 170, Loss 1.1172
Epoch 1, Batch 180, Loss 1.1173
Epoch 1, Batch 190, Loss 1.1171
Epoch 1, Batch 200, Loss 1.1168
Epoch 1, Batch 210, Loss 1.1147
Epoch 1, Batch 220, Loss 1.1171
Epoch 1, Batch 230, Loss 1.1165
Epoch 1, Batch 240, Loss 1.1158
Epoch 1, Batch 250, Loss 1.1149
Epoch 1, Batch 260, Loss 1.1160
Epoch 1, Batch 270, Loss 1.1153
Epoch 1, Batch 280, Loss 1.1148
Epoch 1, Batch 290, Loss 1.1143
Epoch 1, Batch 300, Loss 1.1141
Epoch 1, Batch 310,

  with autocast():


['not_applicable' 'not_applicable' 'not_applicable' ... 'not_applicable'
 'not_applicable' 'not_applicable']
Submission file created successfully!
Predictions distribution:
not_applicable    1477
Name: count, dtype: int64


In [9]:
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# # 1. Ensure your test_df has the true labels
# # If not already encoded, encode them using the same label_encoder
# test_df['label_encoded'] = label_encoder.transform(test_df['label'])

# # 2. Compute accuracy
# true_labels = test_df['label_encoded'].values
# accuracy = accuracy_score(true_labels, predictions)

# # 3. Detailed metrics
# report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)
# conf_matrix = confusion_matrix(true_labels, predictions)

# print(f"\n✅ Accuracy on test set: {accuracy:.4f}")
# print("\n🔍 Classification Report:")
# print(report)

# print("📊 Confusion Matrix:")
# print(conf_matrix)


In [10]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.cuda.amp import autocast
from sklearn.preprocessing import LabelEncoder
import numpy as np

def predict_labels(test_df, model_path='best_model.pt', model_name="xlm-roberta-large", 
                   all_techniques=['hope', 'hate', 'not_applicable'], batch_size=8):
    """
    Function to predict labels on a new test dataset using the trained model.
    
    Parameters:
    - test_df: DataFrame with columns ['id', 'text']
    - model_path: Path to the saved model weights
    - model_name: Name of the transformer model used
    - all_techniques: List of label classes (should match training)
    - batch_size: Batch size for inference
    
    Returns:
    - submission_df: DataFrame with columns ['id', 'label'] containing predictions
    """
    
    # Data preprocessing and validation
    print("Preprocessing data...")
    test_df = test_df.copy()
    test_df['text'] = test_df['text'].fillna("").astype(str)
    
    # Identify empty texts
    empty_mask = test_df['text'].str.strip() == ""
    empty_indices = test_df[empty_mask].index.tolist()
    non_empty_mask = ~empty_mask
    
    print(f"Total samples: {len(test_df)}")
    print(f"Empty text samples: {sum(empty_mask)} (will be labeled as 'not_applicable')")
    print(f"Non-empty samples for model prediction: {sum(non_empty_mask)}")
    
    # Initialize predictions array
    all_predictions = ['not_applicable'] * len(test_df)
    
    # If there are non-empty texts, process them with the model
    if sum(non_empty_mask) > 0:
        # Set device
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Load tokenizer and model
        print("Loading tokenizer and model...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(all_techniques))
        
        # Load trained weights
        model.load_state_dict(torch.load(model_path, map_location=device))
        model = model.to(device)
        model.eval()
        
        # Create label encoder (should match the one used during training)
        label_encoder = LabelEncoder()
        label_encoder.fit(all_techniques)
        
        # Filter non-empty data for model processing
        non_empty_df = test_df[non_empty_mask].reset_index(drop=True)
        
        # Dataset class for inference
        class TestDataset(Dataset):
            def __init__(self, df):
                self.df = df
            
            def __getitem__(self, idx):
                text = str(self.df.iloc[idx]['text']).strip()
                return {'content': text if text else "empty"}  # fallback for any remaining empty
            
            def __len__(self):
                return len(self.df)
        
        # Create dataset and dataloader for non-empty texts
        test_dataset = TestDataset(non_empty_df)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, pin_memory=True)
        
        # Inference
        print("Running model inference on non-empty texts...")
        model_predictions = []
        
        with torch.no_grad():
            for i, batch in enumerate(test_loader):
                batch_texts = batch['content']
                
                # Tokenize the text
                inputs = tokenizer(batch_texts, padding=True, truncation=True, 
                                  return_tensors='pt', max_length=512)
                inputs = {k: v.to(device) for k, v in inputs.items()}
                
                with autocast():
                    outputs = model(**inputs)
                    logits = outputs.logits  # (B, num_classes)
                    # Get predicted class (argmax)
                    preds = torch.argmax(logits, dim=-1)
                    model_predictions.extend(preds.cpu().numpy())
                
                # Free up memory
                del inputs, outputs, logits, preds
                torch.cuda.empty_cache()
                
                if i % 10 == 0:
                    print(f"Processed batch {i+1}/{len(test_loader)}")
        
        # Convert model predictions back to original labels
        predicted_labels = label_encoder.inverse_transform(model_predictions)
        
        # Map model predictions back to original dataframe positions
        non_empty_original_indices = test_df[non_empty_mask].index.tolist()
        for i, orig_idx in enumerate(non_empty_original_indices):
            all_predictions[orig_idx] = predicted_labels[i]
    
    # Create submission DataFrame
    submission_df = pd.DataFrame({
        'id': test_df['id'],
        'prediction': all_predictions
    })
    
    print("Prediction completed!")
    print(f"Predictions distribution:")
    print(pd.Series(all_predictions).value_counts())
    
    return submission_df

# Example usage:
# new_test_df = pd.read_csv('test_data.csv')  # Should have 'id' and 'text' columns
# submission = predict_labels(new_test_df)
# submission.to_csv('submission.csv', index=False)

In [11]:
new_test_df = pd.read_csv('/kaggle/input/cleanedval/cleaned_validation_dataset.csv')  # should have columns 'id' and 'text'
submission = predict_labels(new_test_df)
submission.to_csv('prediction_new.csv', index=False)


Preprocessing data...
Total samples: 1476
Empty text samples: 1 (will be labeled as 'not_applicable')
Non-empty samples for model prediction: 1475
Loading tokenizer and model...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running model inference on non-empty texts...
Processed batch 1/185


  with autocast():


Processed batch 11/185
Processed batch 21/185
Processed batch 31/185
Processed batch 41/185
Processed batch 51/185
Processed batch 61/185
Processed batch 71/185
Processed batch 81/185
Processed batch 91/185
Processed batch 101/185
Processed batch 111/185
Processed batch 121/185
Processed batch 131/185
Processed batch 141/185
Processed batch 151/185
Processed batch 161/185
Processed batch 171/185
Processed batch 181/185
Prediction completed!
Predictions distribution:
not_applicable    1476
Name: count, dtype: int64


In [12]:
import pandas as pd

def prepare_submission(df, output_path='prediction.csv'):
    """
    Converts a DataFrame with columns [id, text, label] to the required submission format.

    Parameters:
    - df: pd.DataFrame with columns ['id', 'text', 'label']
    - output_path: str, where to save the prediction CSV

    Output:
    - Saves a CSV with columns ['id', 'prediction'] in UTF-8 encoding
    """
    # Validate required columns
    if not {'id', 'label'}.issubset(df.columns):
        raise ValueError("DataFrame must contain 'id' and 'label' columns")

    # Strip and enforce correct labels
    allowed_labels = {'hate', 'hope', 'not_applicable'}
    df['label'] = df['label'].astype(str).str.strip()

    # Ensure labels are valid
    invalid_labels = df[~df['label'].isin(allowed_labels)]
    if not invalid_labels.empty:
        raise ValueError(f"Invalid labels found: {invalid_labels['label'].unique()}")

    # Prepare final dataframe
    submission_df = df[['id', 'label']].rename(columns={'label': 'prediction'})

    # Save as UTF-8 CSV
    submission_df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"Submission file saved to '{output_path}'. Zip it as 'prediction.zip'.")

# Example usage:
prepare_submission(submission_df)


Submission file saved to 'prediction.csv'. Zip it as 'prediction.zip'.
