In [None]:
# Turn off Weights & Biases logging. This runs in the first cell, before
# `transformers` is imported further down.
# NOTE(review): presumably read by the Hugging Face Trainer's W&B integration - confirm
# against the installed transformers version.
import os
os.environ['WANDB_DISABLED'] = 'true'

In [None]:
!pip install transformers>=4.40.0 torch>=2.0.0 accelerate scikit-learn pandas numpy torch-lr-finder

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import (
    AutoTokenizer,
    AutoModel,
    BertModel,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    get_cosine_schedule_with_warmup,
    EarlyStoppingCallback
)
from google.colab import drive
from torch.utils.data import Dataset
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
import random

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator (default 42).

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed_all(seed)

# Seed every RNG once, up front, so later cells start from a known state.
set_seed(42)

# Mount Google Drive (Colab only) under /content/gdrive.
drive.mount('/content/gdrive')
# Drive folder for this experiment's model artefacts.
# NOTE(review): not used in the visible part of the notebook - presumably the
# training cell saves checkpoints here; confirm.
DRIVE_MODEL_DIR = '/content/gdrive/MyDrive/SemevalModels/bitnet_polarization_improved'

# Define device: use the GPU when CUDA is available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print(f"CUDA Available: {torch.cuda.is_available()}")


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Using device: cuda


In [None]:
def load_multilingual_data(data_dir, languages=None, split='train'):
    """
    Load and concatenate the per-language CSV files found in one directory.

    Each known language code maps to ``<code>.csv`` inside ``data_dir``. Unknown
    codes and missing files are reported and skipped. Every loaded frame gets a
    ``language`` column holding its code before the frames are concatenated.

    Args:
        data_dir: Path to directory (e.g., '/content/gdrive/MyDrive/subtask1/train')
        languages: List of language codes (e.g., ['eng', 'arb', 'deu']) or None for all
        split: 'train' or 'dev'; only used to label the printed banner

    Returns:
        combined_df: Combined DataFrame with all languages
        language_counts: Dict with counts per language

    Raises:
        ValueError: If none of the requested languages could be loaded.
    """
    # Language code mapping
    lang_files = {
        'amh': 'amh.csv',  # Amharic
        'arb': 'arb.csv',  # Arabic
        'deu': 'deu.csv',  # German
        'eng': 'eng.csv',  # English
        'hau': 'hau.csv',  # Hausa
        'ita': 'ita.csv',  # Italian
        'spa': 'spa.csv',  # Spanish
        'urd': 'urd.csv',  # Urdu
        'zho': 'zho.csv',  # Chinese
    }

    # If no languages specified, use all
    if languages is None:
        languages = list(lang_files.keys())

    print(f"{'='*70}")
    print(f"LOADING {split.upper()} DATA - MULTILINGUAL")
    print(f"{'='*70}")
    print(f"Languages requested: {', '.join(languages)}")
    print(f"Data directory: {data_dir}")
    print()

    all_dataframes = []
    language_counts = {}

    for lang_code in languages:
        file_name = lang_files.get(lang_code)
        if file_name is None:
            print(f"⚠️  Warning: Unknown language code '{lang_code}', skipping...")
            continue

        file_path = os.path.join(data_dir, file_name)

        if not os.path.exists(file_path):
            print(f"⚠️  Warning: File not found: {file_path}, skipping...")
            continue

        # Load CSV
        df = pd.read_csv(file_path)
        df['language'] = lang_code  # Add language identifier

        all_dataframes.append(df)
        language_counts[lang_code] = len(df)

        print(f"✓ Loaded {lang_code}: {len(df)} samples from {file_name}")

    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" when every language was skipped.
    if not all_dataframes:
        raise ValueError(
            f"No data files could be loaded from {data_dir} "
            f"for languages: {', '.join(languages)}"
        )

    # Combine all dataframes
    combined_df = pd.concat(all_dataframes, ignore_index=True)

    print(f"\n{'='*70}")
    print(f"TOTAL: {len(combined_df)} samples across {len(language_counts)} languages")
    print(f"{'='*70}")

    # Show class distribution (only when the label column is present)
    if 'polarization' in combined_df.columns:
        print("\nClass Distribution:")
        for lang_code in language_counts:
            lang_labels = combined_df.loc[combined_df['language'] == lang_code, 'polarization']
            polarized = (lang_labels == 1).sum()
            non_polarized = (lang_labels == 0).sum()
            print(f"  {lang_code}: Polarized={polarized}, Non-Polarized={non_polarized}")

    return combined_df, language_counts


def generate_multilingual_predictions(
    model,
    tokenizer,
    dev_dir,
    output_dir,
    languages=None,
    threshold=0.48,
    max_length=128,
    batch_size=32
):
    """
    Generate predictions for all languages in dev folder

    For every requested language whose ``<code>.csv`` exists in ``dev_dir``, the
    texts are tokenized, scored in batches, thresholded on the class-1 softmax
    probability, and written to ``pred_<code>.csv`` in ``output_dir``.

    Args:
        model: Trained model; called with ``input_ids`` and ``attention_mask`` and
            expected to return an object exposing ``.logits``
        tokenizer: Tokenizer
        dev_dir: Path to dev folder
        output_dir: Where to save predictions (created if missing)
        languages: List of language codes or None for all
        threshold: Classification threshold applied to the class-1 probability
        max_length: Truncation length used when tokenizing. Should match the value
            used for training (the training entry point in this notebook defaults
            to 192); the default of 128 is kept for backward compatibility.
        batch_size: Number of samples scored per forward pass

    Returns:
        all_predictions: Dict with predictions per language

    Side effects:
        Moves ``model`` to the selected device and leaves it in eval mode.
        Input CSVs must contain ``id`` and ``text`` columns.
    """
    from torch.utils.data import DataLoader

    # Language files
    lang_files = {
        'amh': 'amh.csv',
        'arb': 'arb.csv',
        'deu': 'deu.csv',
        'eng': 'eng.csv',
        'hau': 'hau.csv',
        'ita': 'ita.csv',
        'spa': 'spa.csv',
        'urd': 'urd.csv',
        'zho': 'zho.csv',
    }

    if languages is None:
        languages = list(lang_files.keys())

    print(f"\n{'='*70}")
    print("GENERATING MULTILINGUAL PREDICTIONS")
    print(f"{'='*70}")
    print(f"Languages: {', '.join(languages)}")
    print(f"Dev directory: {dev_dir}")
    print(f"Output directory: {output_dir}")
    print(f"Threshold: {threshold}")
    print()

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Loop-invariant setup: device placement, eval mode and the padding collator
    # do not depend on the language, so do them once.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    all_predictions = {}

    for lang_code in languages:
        file_name = lang_files.get(lang_code)
        if file_name is None:
            # Same message as the loader so typos in language codes are visible.
            print(f"⚠️  Warning: Unknown language code '{lang_code}', skipping...")
            continue

        input_path = os.path.join(dev_dir, file_name)

        if not os.path.exists(input_path):
            print(f"⚠️  Skipping {lang_code}: File not found")
            continue

        # Output filename: pred_<lang>.csv
        output_filename = f"pred_{file_name}"
        output_path = os.path.join(output_dir, output_filename)

        print(f"Processing {lang_code}...")

        # Load test data
        test_df = pd.read_csv(input_path)

        # Create dataset
        test_dataset = PolarizationDataset(
            test_df['text'].tolist(),
            [0] * len(test_df),  # Dummy labels for test
            tokenizer,
            max_length=max_length
        )

        test_loader = DataLoader(
            test_dataset,
            batch_size=batch_size,
            shuffle=False,  # keep row order aligned with test_df
            collate_fn=data_collator
        )

        predictions = []
        probabilities = []

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                # Probability of the positive (polarized) class
                probs = torch.softmax(outputs.logits, dim=-1)[:, 1].cpu().numpy()
                preds = (probs >= threshold).astype(int)

                predictions.extend(preds)
                probabilities.extend(probs)

        # Create submission DataFrame
        submission_df = pd.DataFrame({
            'id': test_df['id'],
            'text': test_df['text'],
            'polarization': predictions,
            'probability': probabilities
        })

        # Save predictions
        submission_df.to_csv(output_path, index=False)

        # Store results
        all_predictions[lang_code] = submission_df

        print(f"✓ Saved {lang_code}: {len(submission_df)} predictions to {output_filename}")
        print(f"  Polarized: {submission_df['polarization'].sum()}, Non-Polarized: {(submission_df['polarization']==0).sum()}")

    print(f"\n{'='*70}")
    print("ALL PREDICTIONS COMPLETED!")
    print(f"{'='*70}")
    print(f"Output directory: {output_dir}")

    return all_predictions


In [None]:
class BitLinear(nn.Module):
    """
    1.58-bit Quantized Linear Layer (BitNet)

    Key Features:
    - Weights: Ternary quantization {-1, 0, +1} (times a per-tensor scale)
    - Activations: 8-bit absmax quantization, per row of the last dimension
    - Straight-Through Estimator (STE) for gradient flow
    - Lambda warmup for gradual quantization

    The ``lambda_val`` buffer controls how much of the quantization error is
    injected during training: 0 means full precision, 1 means fully quantized.
    In eval mode the layer is always fully quantized.
    """
    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Initialize weights with Xavier uniform
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        if bias:
            self.bias = nn.Parameter(torch.zeros(out_features))
        else:
            self.register_parameter('bias', None)

        # Layer normalization applied to the input before quantization
        self.layer_norm = nn.LayerNorm(in_features)

        # Lambda for gradual quantization warmup (starts at 0, goes to 1).
        # Registered as a buffer so it follows the module across devices and is
        # stored in the state dict.
        self.register_buffer('lambda_val', torch.tensor(0.0))

    def weight_quant(self, w):
        """
        Quantize weights to ternary values {-1, 0, +1}, then rescale.

        The scale is the reciprocal of the mean absolute weight (floored at 1e-5
        to avoid division by zero). The result has the same shape as ``w``.
        """
        scale = 1.0 / w.abs().mean().clamp(min=1e-5)
        return (w * scale).round().clamp(-1, 1) / scale

    def activation_quant(self, x):
        """
        Quantize activations to 8-bit using absmax quantization, then rescale.

        Each row along the last dimension is scaled so its largest magnitude maps
        to 127; the clamp to [-128, 127] is a safeguard on the integer range.
        """
        scale = 127.0 / x.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5)
        return (x * scale).round().clamp(-128, 127) / scale

    def forward(self, x):
        """Apply LayerNorm, (partially) quantize input and weights, then F.linear."""
        x_norm = self.layer_norm(x)

        # Training follows the warmup schedule; inference is always fully quantized.
        lam = self.lambda_val.item() if self.training else 1.0

        # Straight-Through Estimator with lambda warmup:
        #   forward value = full_precision + lam * (quantized - full_precision)
        #   backward      = identity w.r.t. the full-precision tensor
        # The quantization error is detached, so the forward value moves smoothly
        # from full precision (lam=0) to fully quantized (lam=1) while gradients
        # keep flowing to x_norm and self.weight at every lam. The previous
        # formulation always evaluated to the fully quantized value and scaled
        # the gradient by (1 - lam).
        x_final = x_norm + lam * (self.activation_quant(x_norm) - x_norm).detach()
        w_final = self.weight + lam * (self.weight_quant(self.weight) - self.weight).detach()

        # Standard linear transformation
        return F.linear(x_final, w_final, self.bias)

In [None]:
class BitNetBinaryClassifier(nn.Module):
    """
    Sequence classifier: pretrained encoder followed by a two-layer BitLinear head.

    Pipeline: encoder -> first-token vector -> dropout -> BitLinear -> GELU ->
    dropout -> BitLinear -> logits.
    """
    def __init__(self, model_name='bert-base-uncased', num_labels=2, dropout_prob=0.1):
        """
        Args:
            model_name: Checkpoint name handed to ``AutoModel.from_pretrained``
            num_labels: Number of output classes
            dropout_prob: Dropout probability used before and inside the head
        """
        super().__init__()

        # Pretrained encoder (stored as ``self.bert`` whatever the architecture)
        print(f"Loading BERT model: {model_name}")
        self.bert = AutoModel.from_pretrained(model_name)
        self.num_labels = num_labels
        hidden_size = self.bert.config.hidden_size

        # Optional: freeze the first 8 encoder layers to speed up training.
        # NOTE(review): assumes the encoder exposes `encoder.layer` - confirm for
        # the chosen checkpoint before enabling.
        # for layer in self.bert.encoder.layer[:8]:
        #     for param in layer.parameters():
        #         param.requires_grad = False

        # Classification head: hidden -> hidden/2 -> num_labels
        self.dropout = nn.Dropout(dropout_prob)
        self.bit_fc1 = BitLinear(hidden_size, hidden_size // 2)
        self.activation = nn.GELU()
        self.bit_fc2 = BitLinear(hidden_size // 2, num_labels)

        print(f"Model initialized with {sum(p.numel() for p in self.parameters()):,} parameters")

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """
        Run the encoder and the BitLinear head.

        Returns a ``SequenceClassifierOutput`` whose ``logits`` come from the head
        and whose ``loss`` is the cross-entropy against ``labels`` (``None`` when
        no labels are given).
        """
        encoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )

        # First-token ([CLS] position) vector of the last hidden state
        cls_vector = self.dropout(encoder_out.last_hidden_state[:, 0, :])

        # Two-layer head
        hidden = self.dropout(self.activation(self.bit_fc1(cls_vector)))
        logits = self.bit_fc2(hidden)

        if labels is None:
            loss = None
        else:
            loss = F.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None
        )

In [None]:
class PolarizationDataset(Dataset):
    """
    Dataset class for polarization detection.

    Tokenizes one text per item (no padding; batches are padded by the collator)
    and returns the tokenizer's fields plus a ``labels`` tensor.
    """
    def __init__(self, texts, labels, tokenizer, max_length=128):
        """
        Args:
            texts: Indexable collection of input texts
            labels: Indexable collection of integer labels, aligned with ``texts``
            tokenizer: Callable tokenizer returning batched ``'pt'`` tensors
            max_length: Truncation length in tokens
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize with proper truncation
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,  # Handled by DataCollator
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Drop only the leading batch axis. A bare squeeze() would also collapse
        # the sequence axis of a one-token encoding and yield a 0-dim tensor.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        item['labels'] = torch.tensor(label, dtype=torch.long)

        return item

In [None]:
def find_optimal_learning_rate(
    model,
    train_dataset,
    val_dataset,
    tokenizer,
    start_lr=1e-10,
    end_lr=1e-1,
    num_iter=2000,
    plot=True
):
    """
    Learning Rate Finder compatible with HuggingFace transformers

    Trains for up to ``num_iter`` steps while sweeping the learning rate
    log-uniformly from ``start_lr`` to ``end_lr``, records the focal loss at each
    step, and suggests the learning rate where the loss drops fastest.

    The model's weights are snapshotted before the sweep and restored afterwards,
    so the model handed in is not left in the state the sweep trained it into.

    Args:
        model: Model returning an object with ``.logits``; moved to the active device
        train_dataset: Dataset yielding ``input_ids``, ``attention_mask``, ``labels``
        val_dataset: Unused; kept for interface compatibility
        tokenizer: Tokenizer used to build the padding collator
        start_lr: Lowest learning rate tested
        end_lr: Highest learning rate tested
        num_iter: Maximum number of optimisation steps
        plot: Whether to draw and save the diagnostic plots

    Returns:
        (suggested_lr, lr_results) where ``lr_results`` holds the tested learning
        rates, the recorded losses and the suggestion.

    Raises:
        ValueError: If ``num_iter`` < 1 or ``train_dataset`` is empty.
        RuntimeError: If the very first loss is already NaN/inf.

    Side effects:
        Writes ``lr_finder_results.csv`` (and ``lr_finder_results.png`` when
        plotting) to the working directory; leaves the model in train mode.
    """
    from torch.utils.data import DataLoader

    if num_iter < 1:
        raise ValueError("num_iter must be at least 1")
    if len(train_dataset) == 0:
        raise ValueError("train_dataset is empty")

    print("="*70)
    print("LEARNING RATE RANGE TEST")
    print("="*70)
    print(f"Testing learning rates from {start_lr} to {end_lr}")
    print(f"Number of iterations: {num_iter}")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Snapshot the weights on CPU so the sweep (which ends at very high learning
    # rates) can be undone afterwards.
    initial_state = {
        name: tensor.detach().cpu().clone()
        for name, tensor in model.state_dict().items()
    }
    model.train()

    # Create optimizer
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=start_lr,
        weight_decay=0.02
    )

    # Loss function
    criterion = FocalLoss(alpha=0.65, gamma=2.0)

    # Create data loader with HuggingFace collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_loader = DataLoader(
        train_dataset,
        batch_size=16,
        shuffle=True,
        collate_fn=data_collator
    )

    # Generate LR values (exponential spacing)
    lrs = np.logspace(np.log10(start_lr), np.log10(end_lr), num_iter)

    # Storage
    losses = []
    learning_rates = []

    print(f"\nRunning LR Range Test...")

    data_iter = iter(train_loader)

    try:
        for i, lr in enumerate(lrs):
            # Update learning rate
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            try:
                batch = next(data_iter)
            except StopIteration:
                # Dataset exhausted: start another pass
                data_iter = iter(train_loader)
                batch = next(data_iter)

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            loss_value = loss.item()

            # NaN/inf never compares greater than 100, so test for it explicitly
            # and stop before it contaminates the recorded curve.
            if not np.isfinite(loss_value):
                print(f"\nStopped early at step {i+1}: Loss exploded")
                break

            # Backward pass
            loss.backward()
            optimizer.step()

            # Store results
            losses.append(loss_value)
            learning_rates.append(lr)

            # Print progress
            if (i + 1) % 20 == 0:
                print(f"  Step {i+1}/{num_iter}: LR = {lr:.2e}, Loss = {loss_value:.4f}")

            # Stop if loss explodes
            if loss_value > 100:
                print(f"\nStopped early at step {i+1}: Loss exploded")
                break
    finally:
        # Undo the sweep even if it was interrupted.
        optimizer.zero_grad(set_to_none=True)
        model.load_state_dict(initial_state)

    if not losses:
        raise RuntimeError("LR range test produced no finite loss values")

    # Find best LR (steepest negative gradient). np.gradient needs two points.
    if len(losses) >= 2:
        loss_gradients = np.gradient(losses)
        best_idx = int(np.argmin(loss_gradients))
    else:
        loss_gradients = np.zeros(len(losses))
        best_idx = 0
    suggested_lr = learning_rates[best_idx]

    print(f"\n{'='*70}")
    print(f"SUGGESTED LEARNING RATE: {suggested_lr:.2e}")
    print(f"{'='*70}")

    # Plot if requested
    if plot:
        try:
            fig, axes = plt.subplots(1, 2, figsize=(14, 5))

            # Plot 1: Loss vs LR
            axes[0].plot(learning_rates, losses, 'b-')
            axes[0].axvline(x=suggested_lr, color='red', linestyle='--',
                           label=f'Suggested: {suggested_lr:.2e}')
            axes[0].set_xlabel('Learning Rate')
            axes[0].set_ylabel('Loss')
            axes[0].set_title('Loss vs Learning Rate')
            axes[0].set_xscale('log')
            axes[0].legend()
            axes[0].grid(True, alpha=0.3)

            # Plot 2: Loss gradient
            axes[1].plot(learning_rates, loss_gradients, 'g-')
            axes[1].axvline(x=suggested_lr, color='red', linestyle='--',
                           label=f'Suggested: {suggested_lr:.2e}')
            axes[1].set_xlabel('Learning Rate')
            axes[1].set_ylabel('Loss Gradient')
            axes[1].set_title('Loss Gradient')
            axes[1].set_xscale('log')
            axes[1].legend()
            axes[1].grid(True, alpha=0.3)

            plt.tight_layout()
            plt.savefig('lr_finder_results.png', dpi=150)
            print(f"\nPlot saved to: lr_finder_results.png")
            plt.show()
        except Exception as e:
            # Plotting is best-effort; the numeric result is still returned.
            print(f"Could not create plot: {e}")

    # Store results
    lr_results = {
        'learning_rates': learning_rates,
        'losses': losses,
        'suggested_lr': suggested_lr
    }

    # Save to CSV
    results_df = pd.DataFrame({
        'learning_rate': learning_rates,
        'loss': losses
    })
    results_df.to_csv('lr_finder_results.csv', index=False)
    print(f"Results saved to: lr_finder_results.csv")

    return suggested_lr, lr_results


In [None]:
class FocalLoss(nn.Module):
    """
    Focal Loss for handling class imbalance
    Reference: https://arxiv.org/abs/1708.02002

    Computes ``alpha_t * (1 - p_t) ** gamma * CE`` per sample, where ``p_t`` is the
    predicted probability of the true class. The ``(1 - p_t) ** gamma`` factor
    down-weights easy examples and focuses training on hard ones.

    Args:
        alpha: Either a single float, applied as the same scale to every sample
            (this does not re-balance classes by itself), or a per-class sequence /
            1-D tensor indexed by the target label, e.g. ``[0.35, 0.65]``.
        gamma: Focusing exponent; 0 reduces to (scaled) cross-entropy.
        reduction: 'mean', 'sum', or anything else for the unreduced per-sample loss.
    """
    def __init__(self, alpha=0.65, gamma=2.0, reduction='mean'):
        super().__init__()
        # Accept plain Python sequences for per-class weights
        if isinstance(alpha, (list, tuple)):
            alpha = torch.tensor(alpha, dtype=torch.float32)
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """
        Args:
            inputs: Logits of shape (batch_size, num_classes)
            targets: Integer class labels of shape (batch_size,)
        """
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)  # probability assigned to the true class

        if torch.is_tensor(self.alpha) and self.alpha.dim() > 0:
            # Per-class weights: pick each sample's weight by its target label
            alpha_t = self.alpha.to(device=inputs.device, dtype=ce_loss.dtype)[targets]
        else:
            alpha_t = self.alpha

        focal_loss = alpha_t * (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss


class BitNetTrainer(Trainer):
    """
    Custom trainer with:
    - Gradual quantization warmup (lambda scheduling)
    - Option for Weighted CE or Focal Loss for class imbalance

    Args:
        warmup_steps: Number of optimisation steps over which lambda rises linearly
            from 0 to 1. A value <= 0 disables the warmup (lambda is always 1).
        class_weight: Optional 1-D tensor of per-class weights for cross-entropy;
            ignored when ``use_focal_loss`` is True.
        use_focal_loss: Use ``FocalLoss(alpha=0.65, gamma=2.0)`` instead of CE.
        *args, **kwargs: Forwarded to ``Trainer``.
    """
    def __init__(self, warmup_steps=1000, class_weight=None, use_focal_loss=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.warmup_steps = warmup_steps
        self.use_focal_loss = use_focal_loss

        # Keep the class weights on the training device
        if class_weight is not None:
            self.class_weight = class_weight.to(self.args.device)
        else:
            self.class_weight = None

        if warmup_steps > 0:
            print(f"Lambda warmup enabled: 0 -> 1 over {warmup_steps} steps")
        else:
            print("Lambda warmup disabled: full quantization from step 0")

        # Initialize loss function
        if self.use_focal_loss:
            self.focal_loss = FocalLoss(alpha=0.65, gamma=2.0)
            print(f"Using Focal Loss (alpha=0.65, gamma=2.0)")
        elif self.class_weight is not None:
            print(f"Using Weighted CE Loss with weights: {self.class_weight}")
        else:
            print(f"Using standard Cross-Entropy Loss")

    def _lambda_for_step(self, step):
        """Linear warmup value in [0, 1]; 1.0 when warmup is disabled (<= 0 steps)."""
        if self.warmup_steps <= 0:
            # Avoids a ZeroDivisionError for warmup_steps=0
            return 1.0
        return min(1.0, step / self.warmup_steps)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Update lambda on every module that has a ``lambda_val`` buffer, run the
        model without the labels, and compute the configured loss on its logits.

        Extra keyword arguments are accepted and ignored.
        """
        lambda_val = self._lambda_for_step(self.state.global_step)

        # Set lambda for all BitLinear layers (matched by attribute so wrapped
        # models are covered by model.modules() as well)
        for module in model.modules():
            if hasattr(module, 'lambda_val'):
                module.lambda_val.fill_(lambda_val)

        # Get labels and perform forward pass; labels are removed so the model
        # does not compute its own loss
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Compute loss based on selected method
        if self.use_focal_loss:
            loss = self.focal_loss(logits, labels)
        else:
            # weight=None gives the standard, unweighted cross-entropy
            loss = F.cross_entropy(logits, labels, weight=self.class_weight)

        return (loss, outputs) if return_outputs else loss


In [None]:
def compute_metrics(eval_pred):
    """
    Score binary-classification predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        Dict with ``f1_macro``, ``f1_binary`` and ``accuracy``.
    """
    scores_matrix, gold = eval_pred
    predicted = np.argmax(scores_matrix, axis=1)

    metrics = {
        f'f1_{averaging}': f1_score(gold, predicted, average=averaging)
        for averaging in ('macro', 'binary')
    }
    metrics['accuracy'] = accuracy_score(gold, predicted)
    return metrics

In [None]:
# ==========================================
# DATA AUGMENTATION FOR LOW-RESOURCE LANGUAGES
# ==========================================
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
from nlpaug.util import Action

def augment_data_with_eda(texts, labels, languages, augmentation_factor=1):
    """
    Augment samples by WordNet synonym replacement and append them to the data.

    Only synonym replacement is applied (about 10% of eligible words per text).

    Args:
        texts: List of text samples
        labels: List of labels
        languages: List of language codes, aligned with ``texts``
        augmentation_factor: How many augmented copies per sample

    Returns:
        augmented_texts, augmented_labels, augmented_languages: the originals
        followed by every augmented sample that is longer than 5 characters.

    Failed augmentation attempts are skipped; their number and the first error are
    printed at the end so a missing WordNet resource does not go unnoticed.
    """
    # WordNet is addressed with three-letter codes. The dataset already uses
    # them; two-letter aliases are accepted too. Anything else falls back to
    # English, as before.
    # NOTE(review): which codes actually work depends on the WordNet data
    # installed for NLTK - confirm 'deu' in particular.
    wordnet_codes = {
        'en': 'eng', 'eng': 'eng',
        'de': 'deu', 'deu': 'deu',
        'es': 'spa', 'spa': 'spa',
        'it': 'ita', 'ita': 'ita',
    }
    augmenters = {}  # one augmenter per WordNet language, built on first use

    augmented_texts = list(texts)
    augmented_labels = list(labels)
    augmented_languages = list(languages)

    failures = 0
    first_error = None

    for text, label, language in zip(texts, labels, languages):
        wordnet_lang = wordnet_codes.get(language, 'eng')

        for _ in range(augmentation_factor):
            try:
                if wordnet_lang not in augmenters:
                    # Synonym replacement (p=0.1 means 10% of non-stopwords are replaced)
                    augmenters[wordnet_lang] = naw.SynonymAug(aug_p=0.1, lang=wordnet_lang)
                result = augmenters[wordnet_lang].augment(text)
            except Exception as exc:
                failures += 1
                if first_error is None:
                    first_error = exc
                continue

            # Depending on the nlpaug version, augment() returns a str or a list of str
            candidates = [result] if isinstance(result, str) else list(result or [])
            for candidate in candidates:
                if candidate and len(candidate) > 5:
                    augmented_texts.append(candidate)
                    augmented_labels.append(label)
                    augmented_languages.append(language)

    if failures:
        print(f"⚠️  Warning: {failures} augmentation attempts failed (first error: {first_error!r})")

    return augmented_texts, augmented_labels, augmented_languages


def augment_minority_class(
    train_df,
    target_languages=None,
    augmentation_factor=2
):
    """
    Augment minority class in specified languages to balance data

    For each target language the less frequent ``polarization`` value is found and
    every sample of that class is augmented by synonym replacement. Augmented
    texts that are empty, at most 5 characters long, or identical to the source
    text are dropped.

    Args:
        train_df: Training dataframe with 'text', 'polarization', 'language' columns
        target_languages: List of languages to augment (None = every language in
            ``train_df``)
        augmentation_factor: Number of augmented copies per minority sample

    Returns:
        augmented_df: ``train_df`` followed by the augmented rows. Augmented rows
        only carry 'text', 'polarization' and 'language'; other columns are NaN.

    Failed augmentation attempts are skipped; their number and the first error are
    printed at the end.
    """
    augmented_rows = []

    if target_languages is None:
        target_languages = train_df['language'].unique()

    augmenter = None  # built once, on first use (default WordNet language)
    failures = 0
    first_error = None

    for lang in target_languages:
        lang_data = train_df[train_df['language'] == lang]
        if lang_data.empty:
            # idxmin() on an empty distribution would raise
            print(f"\n{lang.upper()}: No samples found, skipping")
            continue

        # Find minority class
        class_dist = lang_data['polarization'].value_counts()
        minority_class = class_dist.idxmin()
        minority_data = lang_data[lang_data['polarization'] == minority_class]

        print(f"\n{lang.upper()}: Augmenting minority class {minority_class}")
        print(f"  Original minority samples: {len(minority_data)}")

        generated = 0
        for text in minority_data['text']:
            for _ in range(augmentation_factor):
                try:
                    if augmenter is None:
                        # Random synonym replacement
                        augmenter = naw.SynonymAug(aug_p=0.15)
                    result = augmenter.augment(text)
                except Exception as exc:
                    failures += 1
                    if first_error is None:
                        first_error = exc
                    continue

                # Depending on the nlpaug version, augment() returns a str or a list of str
                candidates = [result] if isinstance(result, str) else list(result or [])
                for candidate in candidates:
                    if candidate and len(candidate) > 5 and candidate != text:
                        augmented_rows.append({
                            'text': candidate,
                            'polarization': minority_class,
                            'language': lang
                        })
                        generated += 1

        print(f"  Generated augmented samples: {generated}")

    if failures:
        print(f"⚠️  Warning: {failures} augmentation attempts failed (first error: {first_error!r})")

    # Combine with original data
    augmented_df = pd.concat(
        [train_df, pd.DataFrame(augmented_rows)],
        ignore_index=True
    )

    print(f"\nOriginal dataset size: {len(train_df)}")
    print(f"Augmented dataset size: {len(augmented_df)}")
    print(f"Samples added: {len(augmented_df) - len(train_df)}")

    return augmented_df


In [None]:
# ==========================================
# LANGUAGE-SPECIFIC CLASS WEIGHTS
# ==========================================

def compute_language_class_weights(train_df):
    """
    Derive normalised inverse-frequency class weights, one pair per language.

    Args:
        train_df: Training dataframe with 'polarization' and 'language' columns

    Returns:
        lang_weights: Dict mapping each language to a float32 tensor
        ``[weight_class_0, weight_class_1]`` whose entries sum to 1.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("COMPUTING LANGUAGE-SPECIFIC CLASS WEIGHTS")
    print(rule)

    lang_weights = {}

    for lang in sorted(train_df['language'].unique()):
        subset = train_df[train_df['language'] == lang]
        class_counts = subset['polarization'].value_counts().sort_index()
        total = len(subset)

        # Raw inverse-frequency weight per class: total / (2 * count).
        # A class with no rows is treated as if it had exactly one.
        raw = {
            cls: total / (2 * max(class_counts.get(cls, 1), 1))
            for cls in (0, 1)
        }

        # Rescale so the two weights sum to 1
        norm = sum(raw.values())
        weights = {cls: value / norm for cls, value in raw.items()}

        lang_weights[lang] = torch.tensor(
            [weights[0], weights[1]],
            dtype=torch.float32
        )

        print(f"\n{lang.upper()}:")
        print(f"  Non-polarized ({class_counts.get(0, 0)}): {weights[0]:.4f}")
        print(f"  Polarized ({class_counts.get(1, 0)}): {weights[1]:.4f}")
        print(f"  Imbalance ratio: {max(weights.values()) / min(weights.values()):.2f}x")

    return lang_weights


In [None]:
# ==========================================
# IMPROVED FOCAL LOSS WITH LANGUAGE AWARENESS
# ==========================================

class LanguageAwareFocalLoss(nn.Module):
    """
    Focal Loss with language-specific alpha and gamma parameters

    Each sample's loss is ``alpha * (1 - p_t) ** gamma * CE`` where alpha and gamma
    are looked up by the sample's language and fall back to 0.65 / 2.0.
    """
    def __init__(self, lang_alphas=None, lang_gammas=None, reduction='mean', id_to_language=None):
        """
        Args:
            lang_alphas: Dict mapping language to alpha value (e.g., {'eng': 0.6, 'arb': 0.65})
            lang_gammas: Dict mapping language to gamma value
            reduction: 'mean' or 'sum'; anything else returns the per-sample loss
            id_to_language: Optional dict translating the entries of
                ``language_ids`` (e.g. integer ids) into the keys used by
                ``lang_alphas`` / ``lang_gammas``. Entries without a translation
                are looked up unchanged.
        """
        super().__init__()
        self.lang_alphas = lang_alphas or {}
        self.lang_gammas = lang_gammas or {}
        self.id_to_language = id_to_language or {}
        self.reduction = reduction
        self.default_alpha = 0.65
        self.default_gamma = 2.0

    def _lookup_keys(self, language_ids):
        """Turn ``language_ids`` into a list of hashable dict keys, one per sample."""
        if isinstance(language_ids, dict):
            raw = list(language_ids.values())
        elif torch.is_tensor(language_ids):
            # Tensor elements hash by identity and would never match a dict key,
            # so convert them to plain Python numbers first.
            raw = language_ids.detach().reshape(-1).tolist()
        else:
            raw = [entry.item() if torch.is_tensor(entry) else entry for entry in language_ids]
        return [self.id_to_language.get(entry, entry) for entry in raw]

    def forward(self, inputs, targets, language_ids=None):
        """
        Args:
            inputs: Model logits (batch_size, num_classes)
            targets: Target labels (batch_size,)
            language_ids: Per-sample language identifiers - a tensor of ids, a
                sequence of ids or language codes, or a dict whose values are
                used in order. ``None`` applies the defaults to every sample.
        """
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)

        if language_ids is not None:
            # Apply language-specific parameters
            keys = self._lookup_keys(language_ids)
            alpha = torch.tensor(
                [self.lang_alphas.get(key, self.default_alpha) for key in keys],
                device=inputs.device,
                dtype=ce_loss.dtype,
            )
            gamma = torch.tensor(
                [self.lang_gammas.get(key, self.default_gamma) for key in keys],
                device=inputs.device,
                dtype=ce_loss.dtype,
            )
        else:
            alpha = self.default_alpha
            gamma = self.default_gamma

        focal_loss = alpha * (1 - pt) ** gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss


In [None]:
# ==========================================
# UPDATED POLARIZATION DATASET WITH LANGUAGE INFO
# ==========================================

class PolarizationDatasetV2(Dataset):
    """
    Enhanced dataset class for polarization detection with language tracking

    Besides the tokenizer's fields and ``labels``, every item carries the sample's
    language both as an integer ``language_id`` tensor and as the raw ``language``
    value.
    """
    def __init__(self, texts, labels, tokenizer, max_length=128, languages=None, language_to_id=None):
        """
        Args:
            texts: Indexable collection of input texts
            labels: Indexable collection of integer labels, aligned with ``texts``
            tokenizer: Callable tokenizer returning batched ``'pt'`` tensors
            max_length: Truncation length in tokens
            languages: Per-sample language codes; defaults to 'eng' for every sample
            language_to_id: Optional mapping from language code to integer id. When
                omitted, ids are assigned in sorted order of the codes present.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.languages = languages if languages is not None else ['eng'] * len(texts)

        # Create language to ID mapping if not provided
        if language_to_id is None:
            unique_langs = sorted(set(self.languages))
            self.language_to_id = {lang: idx for idx, lang in enumerate(unique_langs)}
        else:
            self.language_to_id = language_to_id

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        language = self.languages[idx]
        # Languages missing from the mapping fall back to id 0
        language_id = self.language_to_id.get(language, 0)

        # Tokenize with proper truncation
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,  # Handled by DataCollator
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Drop only the leading batch axis. A bare squeeze() would also collapse
        # the sequence axis of a one-token encoding and yield a 0-dim tensor.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        item['language_id'] = torch.tensor(language_id, dtype=torch.long)
        # NOTE(review): this entry is a raw (non-tensor) value; a collator that
        # converts every field to a tensor may reject it - confirm against the
        # collator used with this dataset.
        item['language'] = language

        return item


In [None]:
# ==========================================
# LANGUAGE-SPECIFIC ADAPTERS (OPTIONAL)
# ==========================================
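# The adapter-based variant below is disabled by wrapping it in a string literal,
# so it is never executed. To use language-specific adapters, remove the
# surrounding triple quotes.
# This requires: pip install -U adapters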

"""
from adapters import AutoAdapterModel, AdapterConfig

def add_language_adapters(model, languages):
    '''Add lightweight adapter layers for each language'''
    
    adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=16)
    
    for lang in languages:
        adapter_name = f"lang_{lang}"
        model.add_adapter(adapter_name, config=adapter_config)
        print(f"✓ Added adapter for {lang}")
    
    return model
"""

# Alternative: Lightweight LoRA adapters (more compatible)
def add_language_lora_adapters(model, languages, r=8, lora_alpha=16,
                               target_modules=None, modules_to_save=None):
    """
    Wrap ``model`` with a single LoRA (Low-Rank Adaptation) adapter via PEFT.
    Requires: pip install peft

    Note that one shared adapter is created; ``languages`` is accepted for
    interface compatibility but is not used to create per-language adapters.

    Args:
        model: The model to wrap (expected to be a ``torch.nn.Module``).
        languages: Unused (see note above).
        r: LoRA rank.
        lora_alpha: LoRA scaling factor.
        target_modules: Names of the layers to adapt. When None, the attention
            query/value projections are detected from the model's module names
            ("query"/"value" as in BERT, "query_proj"/"value_proj" as in DeBERTa-v2/v3).
        modules_to_save: Extra modules kept fully trainable next to the LoRA
            weights. When None, the ``bit_fc1`` / ``bit_fc2`` head layers are used
            if the model has them, so the classification head is not frozen.

    Returns:
        The PEFT-wrapped model, or the unchanged ``model`` if peft is not installed.

    Raises:
        ValueError: If no target module could be detected automatically.
    """
    try:
        from peft import LoraConfig, get_peft_model
    except ImportError:
        print("peft not installed. Install with: pip install peft")
        return model

    if target_modules is None:
        # Attention projection names differ between encoder families, so look
        # them up instead of hard-coding the BERT names.
        leaf_names = {name.rsplit('.', 1)[-1] for name, _ in model.named_modules()}
        candidates = ("query", "value", "query_proj", "value_proj")
        target_modules = [name for name in candidates if name in leaf_names]
        if not target_modules:
            raise ValueError(
                "Could not find query/value projection layers to adapt; "
                "pass target_modules explicitly."
            )

    if modules_to_save is None:
        # NOTE(review): only the two head layers referenced elsewhere in this
        # notebook are kept trainable; confirm the head has no other trainable parts.
        modules_to_save = [name for name in ("bit_fc1", "bit_fc2") if hasattr(model, name)] or None

    # No task_type: the classifier is a custom wrapper module, so the generic
    # PEFT wrapper is used instead of a task-specific one (the previous
    # SEQ_2_SEQ_LM setting is meant for encoder-decoder generation models).
    lora_config = LoraConfig(
        r=r,
        lora_alpha=lora_alpha,
        target_modules=target_modules,
        modules_to_save=modules_to_save,
        lora_dropout=0.1,
        bias="none",
    )

    # Apply LoRA to model
    model = get_peft_model(model, lora_config)

    print(f"✓ Added LoRA adapters (r={r})")
    # print_trainable_parameters() prints its own report and returns None,
    # so it must not be embedded in an f-string.
    model.print_trainable_parameters()

    return model


In [None]:
def train_multilingual_polarization_detector_improved(
    train_dir='/content/gdrive/MyDrive/subtask1/train',
    languages=None,
    model_name='microsoft/mdeberta-v3-base',
    use_lr_finder=False,
    use_data_augmentation=True,
    use_language_specific_weights=True,
    max_length=192,  # Increased from 128
    num_epochs=6,  # Increased from 3
    learning_rate=5e-5,  # Increased from 3e-5
    use_adapters=False,
    use_standard_layers=False  # If True, uses standard Linear instead of BitNet
):
    """
    IMPROVED: Train multilingual polarization detector with all enhancements

    Pipeline: load data -> stratified 80/20 split -> optional augmentation of the
    training part only -> build datasets and model -> optional LR finder ->
    train with ``BitNetTrainer`` (focal loss, cosine schedule, early stopping).

    Args:
        train_dir: Path to training data folder
        languages: List of language codes or None for all
        model_name: Model to use (mDeBERTa recommended)
        use_lr_finder: Whether to run LR finder
        use_data_augmentation: Augment underrepresented languages in the training split
        use_language_specific_weights: Compute language-aware class weights
        max_length: Maximum token sequence length
        num_epochs: Number of training epochs
        learning_rate: Initial learning rate (ignored when the LR finder is used)
        use_adapters: Add LoRA adapters
        use_standard_layers: Replace BitNet with standard Linear layers

    Returns:
        Tuple ``(model, tokenizer, trainer, results, train, val, language_to_id)``
        where ``results`` is the value returned by ``trainer.train()``, ``train`` /
        ``val`` are the DataFrames used for training / validation and
        ``language_to_id`` maps language codes to integer ids.
    """
    import importlib.util

    set_seed(42)

    print("\n" + "="*70)
    print("IMPROVED MULTILINGUAL POLARIZATION DETECTION TRAINING")
    print("="*70)
    print(f"Enhancements enabled:")
    print(f"  • Data Augmentation: {use_data_augmentation}")
    print(f"  • Language-specific Weights: {use_language_specific_weights}")
    print(f"  • Language-specific Adapters: {use_adapters}")
    print(f"  • Max Sequence Length: {max_length}")
    print(f"  • Training Epochs: {num_epochs}")
    print(f"  • Learning Rate: {learning_rate:.2e}")
    print("="*70 + "\n")

    # STEP 1: Load multilingual data
    print("STEP 1: LOADING MULTILINGUAL DATA")
    print("="*70)

    train_full, lang_counts = load_multilingual_data(
        data_dir=train_dir,
        languages=languages,
        split='train'
    )

    # Stratified split preserving language distribution.
    # The split happens BEFORE augmentation: augmenting first would put
    # augmented variants of validation samples into the training set (leakage)
    # and inflate the validation metrics used for model selection.
    train, val = train_test_split(
        train_full,
        test_size=0.2,
        stratify=train_full[['polarization', 'language']],
        random_state=42
    )

    # STEP 1B: Data Augmentation for minority classes (training split only)
    if use_data_augmentation:
        print("\nSTEP 1B: APPLYING DATA AUGMENTATION")
        print("="*70)

        # Identify underrepresented languages (below the median language size)
        lang_sizes = train['language'].value_counts()
        underrepresented = lang_sizes[lang_sizes < lang_sizes.median()].index.tolist()

        print(f"\nLanguages marked for augmentation: {underrepresented}")
        train = augment_minority_class(
            train.reset_index(drop=True),  # hand over a clean 0..n-1 index
            target_languages=underrepresented,
            augmentation_factor=2
        )

    print(f"\nTrain samples: {len(train)}")
    print(f"Val samples: {len(val)}")

    # STEP 2: Initialize tokenizer and model
    print(f"\n{'='*70}")
    print("STEP 2: INITIALIZING MODEL AND TOKENIZER")
    print(f"{'='*70}")
    print(f"Model: {model_name}")
    print(f"Max Length: {max_length}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Create language to ID mapping
    unique_languages = sorted(train['language'].unique())
    language_to_id = {lang: idx for idx, lang in enumerate(unique_languages)}
    print(f"\nLanguage mapping: {language_to_id}")

    # Create datasets with improved version
    train_dataset = PolarizationDatasetV2(
        train['text'].tolist(),
        train['polarization'].tolist(),
        tokenizer,
        max_length=max_length,
        languages=train['language'].tolist(),
        language_to_id=language_to_id
    )

    val_dataset = PolarizationDatasetV2(
        val['text'].tolist(),
        val['polarization'].tolist(),
        tokenizer,
        max_length=max_length,
        languages=val['language'].tolist(),
        language_to_id=language_to_id
    )

    # Initialize model (same constructor call for both variants)
    if use_standard_layers:
        print("\nUsing standard Linear layers (no BitNet quantization)")
    model = BitNetBinaryClassifier(
        model_name=model_name,
        num_labels=2,
        dropout_prob=0.3  # Increased from 0.2
    )
    if use_standard_layers:
        # Replace BitLinear with standard Linear
        config = model.bert.config
        model.bit_fc1 = nn.Linear(config.hidden_size, config.hidden_size // 2)
        model.bit_fc2 = nn.Linear(config.hidden_size // 2, 2)

    # Add adapters if requested
    if use_adapters:
        print("\nAdding language-specific LoRA adapters...")
        model = add_language_lora_adapters(model, unique_languages, r=8)

    # STEP 3: Compute language-specific class weights
    # NOTE(review): lang_weights is computed and printed but never handed to the
    # trainer below (class_weight=None) — wire it in once BitNetTrainer's
    # expected argument for per-language weights is confirmed.
    lang_weights = None
    if use_language_specific_weights:
        print(f"\n{'='*70}")
        print("STEP 3: COMPUTING LANGUAGE-SPECIFIC CLASS WEIGHTS")
        print(f"{'='*70}")
        lang_weights = compute_language_class_weights(train)

    # STEP 4: Learning Rate Finder (optional)
    if use_lr_finder:
        print(f"\n{'='*70}")
        print("STEP 4: LEARNING RATE FINDER")
        print(f"{'='*70}")

        suggested_lr, lr_results = find_optimal_learning_rate(
            model=model,
            train_dataset=train_dataset,
            val_dataset=val_dataset,
            tokenizer=tokenizer,
            start_lr=1e-10,
            end_lr=1e-1,
            num_iter=2000,
            plot=True
        )

        final_lr = suggested_lr
    else:
        final_lr = learning_rate
    print(f"\nUsing Learning Rate: {final_lr:.2e}")

    # STEP 5: Train model with improved configuration
    print(f"\n{'='*70}")
    print("STEP 5: TRAINING MODEL (IMPROVED)")
    print(f"{'='*70}")

    # Calculate total training steps for proper scheduling
    num_train_samples = len(train_dataset)
    batch_size = 32
    num_steps_per_epoch = (num_train_samples + batch_size - 1) // batch_size  # ceil division
    total_steps = num_steps_per_epoch * num_epochs
    warmup_steps = int(0.1 * total_steps)  # 10% warmup

    print(f"\nTraining Configuration:")
    print(f"  Total training steps: {total_steps}")
    print(f"  Warmup steps: {warmup_steps}")
    print(f"  Batch size: {batch_size}")
    print(f"  Epochs: {num_epochs}")

    # The 8-bit AdamW optimizer is provided by bitsandbytes; only select it when
    # that package is importable, otherwise fall back to the regular optimizer.
    cuda_available = torch.cuda.is_available()
    use_8bit_optimizer = cuda_available and importlib.util.find_spec('bitsandbytes') is not None
    if cuda_available and not use_8bit_optimizer:
        print("  bitsandbytes not installed; using adamw_torch instead of adamw_8bit")

    # Improved training arguments with cosine scheduling
    training_args = TrainingArguments(
        output_dir='./results_multilingual_improved',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,  # keep in sync with the step computation above
        per_device_eval_batch_size=64,
        gradient_accumulation_steps=1,  # Can set to 2-4 for effective batch doubling
        warmup_steps=warmup_steps,
        learning_rate=final_lr,
        weight_decay=0.01,  # Reduced from 0.02
        lr_scheduler_type='cosine',  # IMPROVED: Cosine annealing
        logging_dir='./logs_multilingual_improved',
        logging_steps=50,
        eval_strategy='steps',
        eval_steps=100,  # More frequent evaluation
        save_strategy='steps',
        save_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model='f1_macro',
        greater_is_better=True,
        save_total_limit=3,  # Keep at most 3 checkpoints on disk
        report_to='none',
        seed=42,
        fp16=cuda_available,
        dataloader_pin_memory=True,
        optim='adamw_8bit' if use_8bit_optimizer else 'adamw_torch',  # Memory efficient
    )

    def compute_metrics_improved(eval_pred):
        """Compute overall (not per-language) classification metrics from logits."""
        predictions, labels = eval_pred
        # Some models return extra outputs next to the logits; the logits come first.
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        predictions = np.argmax(predictions, axis=1)

        # Overall metrics
        metrics = {
            'f1_macro': f1_score(labels, predictions, average='macro', zero_division=0),
            'f1_binary': f1_score(labels, predictions, average='binary', zero_division=0),
            'accuracy': accuracy_score(labels, predictions),
            'precision': precision_score(labels, predictions, average='macro', zero_division=0),
            'recall': recall_score(labels, predictions, average='macro', zero_division=0),
        }

        return metrics

    trainer = BitNetTrainer(
        warmup_steps=warmup_steps,
        class_weight=None,  # see NOTE(review) at STEP 3: language weights are not applied yet
        use_focal_loss=True,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_improved,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        callbacks=[
            EarlyStoppingCallback(
                early_stopping_patience=5,
                early_stopping_threshold=0.001
            )
        ]
    )

    print("\nStarting training...")
    results = trainer.train()

    return model, tokenizer, trainer, results, train, val, language_to_id


In [None]:
def save_model_to_drive(model, tokenizer, save_dir, model_config, threshold=None):
    """
    Save complete model to Google Drive for later inference

    Writes four artefacts into ``save_dir``: the model ``state_dict``
    (``pytorch_model.bin``), the tokenizer files, ``model_config.json``
    (``model_config`` plus threshold and timestamp) and a small
    ``training_metrics.txt`` summary.

    Args:
        model: Trained BitNetBinaryClassifier
        tokenizer: AutoTokenizer
        save_dir: Path in Google Drive (created if missing)
        model_config: Dict with model configuration; must be JSON-serialisable
        threshold: Optimal threshold (optional); NumPy scalars are accepted
    """
    import os
    import json

    # Create directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    print(f"Saving model to {save_dir}...")

    # Thresholds usually come out of NumPy/pandas; convert to a plain float so
    # json.dump does not fail on types such as numpy.float32.
    if threshold is not None:
        threshold = float(threshold)

    # 1. Save PyTorch model state dict
    torch.save(
        model.state_dict(),
        os.path.join(save_dir, 'pytorch_model.bin')
    )
    print("✓ Saved PyTorch model weights")

    # 2. Save tokenizer (HuggingFace format)
    tokenizer.save_pretrained(save_dir)
    print("✓ Saved tokenizer")

    # 3. Save model configuration
    config_with_threshold = {
        **model_config,
        'optimal_threshold': threshold,
        'saved_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    with open(os.path.join(save_dir, 'model_config.json'), 'w', encoding='utf-8') as f:
        json.dump(config_with_threshold, f, indent=2)
    print("✓ Saved model configuration")

    # 4. Save a human-readable summary of the configuration
    metrics_file = os.path.join(save_dir, 'training_metrics.txt')
    with open(metrics_file, 'w', encoding='utf-8') as f:
        f.write(f"Model Configuration:\n")
        # .get(): a missing optional key must not abort after the weights were written
        f.write(f"Model: {model_config.get('model_name', 'unknown')}\n")
        f.write(f"Dropout: {model_config.get('dropout_prob', 'unknown')}\n")
        f.write(f"Optimal Threshold: {threshold}\n")
        f.write(f"Saved: {config_with_threshold['saved_date']}\n")
    print("✓ Saved training metrics")

    print(f"\n{'='*60}")
    print(f"MODEL SUCCESSFULLY SAVED TO GOOGLE DRIVE!")
    print(f"{'='*60}")
    print(f"Location: {save_dir}")
    print(f"Files saved:")
    print(f"  - pytorch_model.bin (model weights)")
    print(f"  - tokenizer files (tokenizer_config.json, vocab.txt, etc.)")
    print(f"  - model_config.json (configuration)")
    print(f"  - training_metrics.txt (metadata)")


In [None]:
def load_model_from_drive(save_dir):
    """
    Load trained model from Google Drive for inference

    Reads ``model_config.json``, rebuilds a ``BitNetBinaryClassifier`` from the
    stored ``model_name`` / ``num_labels`` / ``dropout_prob``, loads the weights
    from ``pytorch_model.bin`` and the tokenizer from the same directory, then
    moves the model to CUDA when available (CPU otherwise).

    Args:
        save_dir: Path where model was saved in Google Drive

    Returns:
        model: Loaded BitNetBinaryClassifier (in eval mode)
        tokenizer: Loaded tokenizer
        config: Model configuration dict
    """
    import os
    import json

    print(f"Loading model from {save_dir}...")

    # 1. Load model configuration
    config_path = os.path.join(save_dir, 'model_config.json')
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    print(f"✓ Loaded configuration")

    # 2. Initialize model with same architecture
    model = BitNetBinaryClassifier(
        model_name=config['model_name'],
        num_labels=config['num_labels'],
        dropout_prob=config['dropout_prob']
    )
    print(f"✓ Initialized model architecture")

    # 3. Load model weights
    # weights_only=True restricts unpickling to tensors and plain containers,
    # so a tampered checkpoint file cannot execute arbitrary code on load.
    model_path = os.path.join(save_dir, 'pytorch_model.bin')
    state_dict = torch.load(model_path, map_location='cpu', weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()  # Set to evaluation mode
    print(f"✓ Loaded model weights")

    # 4. Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(save_dir)
    print(f"✓ Loaded tokenizer")

    # Move model to appropriate device (local name avoids shadowing the
    # notebook-level `device` variable)
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(target_device)
    print(f"✓ Model moved to {target_device}")

    print(f"\n{'='*60}")
    print(f"MODEL SUCCESSFULLY LOADED!")
    print(f"{'='*60}")
    print(f"Model: {config['model_name']}")
    print(f"Optimal Threshold: {config.get('optimal_threshold', 'Not saved')}")
    print(f"Saved Date: {config.get('saved_date', 'Unknown')}")

    return model, tokenizer, config


In [None]:
def predict_polarization(text, model, tokenizer, return_probabilities=True, max_length=128):
    """
    Make predictions on new text

    Args:
        text: Input text string
        model: Trained BitNet model; its output must expose ``.logits``
        tokenizer: Tokenizer matching the model
        return_probabilities: If True, return probabilities along with prediction
        max_length: Truncation length in tokens; use the value the model was
            trained with (default 128)

    Returns:
        prediction: 0 (Not Polarized) or 1 (Polarized)
        confidence: Probability of being polarized (if return_probabilities=True)
    """
    model.eval()

    # Run on whatever device the model lives on instead of relying on a
    # notebook-level `device` variable that may be undefined or out of date.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        # Tokenize input
        inputs = tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=max_length
        )
        inputs = {k: v.to(model_device) for k, v in inputs.items()}

        # Get model predictions
        outputs = model(**inputs)
        logits = outputs.logits

        # Convert to probabilities
        probs = torch.softmax(logits, dim=-1)
        pred = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][1].item()  # Probability of being polarized

    if return_probabilities:
        return pred, confidence
    return pred

In [None]:
def inference_only_mode(
    model_dir,
    test_file='dev_eng.csv',
    output_file='dev_predictions_inference.csv',
    threshold=None
):
    """
    Run inference without training - load model from Drive

    Args:
        model_dir: Path to saved model in Google Drive
        test_file: Test data CSV file
        output_file: Output predictions CSV
        threshold: Custom threshold (uses saved optimal if None, and 0.48 if
            no threshold was saved either)

    Returns:
        submission: DataFrame returned by ``create_submission``
    """
    default_threshold = 0.48

    print("="*70)
    print("INFERENCE-ONLY MODE")
    print("="*70)

    # Load model from Google Drive
    model, tokenizer, config = load_model_from_drive(model_dir)

    # Use saved optimal threshold if not provided
    if threshold is None:
        # The key can be present with a null value (model saved without a
        # threshold), in which case dict.get's default is NOT applied.
        threshold = config.get('optimal_threshold')
        if threshold is None:
            threshold = default_threshold
            print(f"No saved threshold found; using default threshold: {threshold}")
        else:
            print(f"Using saved optimal threshold: {threshold}")
    else:
        print(f"Using custom threshold: {threshold}")

    # Generate predictions
    print(f"\nGenerating predictions for {test_file}...")
    submission = create_submission(
        model,
        tokenizer,
        test_file=test_file,
        output_file=output_file,
        threshold=threshold
    )

    print("="*70)
    print("INFERENCE COMPLETE!")
    print("="*70)
    print(f"Predictions saved to: {output_file}")

    return submission


In [None]:
def create_submission(model, tokenizer, test_file='dev_eng.csv', output_file='dev_predictions.csv',
                      threshold=0.48, max_length=128):
    """
    Create predictions for dev/test dataset with custom threshold

    Args:
        model: Trained model
        tokenizer: Tokenizer
        test_file: Path to test CSV; must contain ``id`` and ``text`` columns
        output_file: Output CSV file
        threshold: Decision threshold on the polarized-class probability
            (default 0.48; None is treated as 0.48)
        max_length: Truncation length in tokens; use the value the model was
            trained with (default 128)

    Returns:
        submission: DataFrame with columns ``id``, ``text``,
            ``predicted_polarization`` and ``polarization_probability``
    """
    if threshold is None:
        # A config saved without a threshold yields None; comparing
        # probabilities against None would raise a TypeError below.
        threshold = 0.48

    print(f"\nCreating predictions from {test_file}...")
    print(f"Using threshold: {threshold}")

    # Load test data
    test = pd.read_csv(test_file)
    print(f"Test samples: {len(test)}")

    # Create dataset with dummy labels
    test_dataset = PolarizationDataset(
        test['text'].tolist(),
        [0] * len(test),  # Dummy labels
        tokenizer,
        max_length=max_length
    )

    # Create trainer for prediction
    trainer = Trainer(
        model=model,
        data_collator=DataCollatorWithPadding(tokenizer)
    )

    # Get predictions
    print("Generating predictions...")
    prediction_output = trainer.predict(test_dataset)
    raw_predictions = prediction_output.predictions

    # Models that return extra outputs yield a tuple; the logits come first.
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]

    # Convert to tensor and handle shape
    if not isinstance(raw_predictions, torch.Tensor):
        raw_predictions = torch.tensor(raw_predictions)

    # Ensure correct shape (num_samples, num_labels)
    if raw_predictions.dim() == 1:
        # If 1D, reshape to (num_samples, num_labels)
        raw_predictions = raw_predictions.reshape(-1, 2)
    elif raw_predictions.shape[1] != 2:
        # If shape is unexpected, try to fix it
        raw_predictions = raw_predictions.reshape(len(test), -1)[:, :2]

    # Apply softmax to get probabilities
    probs = F.softmax(raw_predictions, dim=1)
    pred_probs = probs[:, 1].numpy()  # Probability of polarized class

    # Apply threshold
    pred_labels = (pred_probs >= threshold).astype(int)

    # Create submission dataframe
    submission = pd.DataFrame({
        'id': test['id'],
        'text': test['text'],
        'predicted_polarization': pred_labels,
        'polarization_probability': pred_probs
    })

    # Save to CSV
    submission.to_csv(output_file, index=False)
    print(f"\nPredictions saved to: {output_file}")

    # Print statistics
    print(f"\nPrediction Statistics:")
    print(f"  Non-polarized (0): {(pred_labels == 0).sum()} ({(pred_labels == 0).sum() / len(pred_labels) * 100:.2f}%)")
    print(f"  Polarized (1): {(pred_labels == 1).sum()} ({(pred_labels == 1).sum() / len(pred_labels) * 100:.2f}%)")
    print(f"  Mean probability: {pred_probs.mean():.4f}")
    print(f"  Median probability: {np.median(pred_probs):.4f}")

    return submission


In [None]:
def find_optimal_threshold(model, tokenizer, val_file='train_eng.csv', val_df=None, max_length=128):
    """
    Find optimal threshold for F1 Macro maximization

    Sweeps decision thresholds from 0.30 to 0.70 (step 0.01) over the
    polarized-class probability and reports the one with the best macro F1.
    The full sweep is also written to ``threshold_optimization_results.csv``.

    Validation data is taken from ``val_df`` when given. Otherwise ``val_file``
    is read and a 20% split (stratified on ``polarization``, random_state=42) is
    used. That fallback only yields held-out data if the model was trained on
    the other 80% of exactly this file with exactly this split; the multilingual
    training function in this notebook splits differently, so pass its
    validation DataFrame via ``val_df`` instead.

    Args:
        model: Trained model
        tokenizer: Tokenizer
        val_file: CSV with ``text`` and ``polarization`` columns (ignored when
            ``val_df`` is given)
        val_df: Optional DataFrame with ``text`` and ``polarization`` columns
            that is used as the validation set as-is
        max_length: Truncation length in tokens; use the value the model was
            trained with (default 128)

    Returns:
        best_threshold: Optimal threshold
        best_f1_macro: Best F1 Macro achieved
        results_df: Full results DataFrame
    """
    print("\n" + "="*70)
    print("FINDING OPTIMAL THRESHOLD FOR F1 MACRO")
    print("="*70)

    if val_df is not None:
        val_data = val_df
    else:
        # Load the file and carve out a 20% validation split
        full_data = pd.read_csv(val_file)
        _, val_data = train_test_split(
            full_data,
            test_size=0.2,
            stratify=full_data['polarization'],
            random_state=42
        )

    print(f"Validation samples: {len(val_data)}")

    # Create dataset
    val_dataset = PolarizationDataset(
        val_data['text'].tolist(),
        val_data['polarization'].tolist(),
        tokenizer,
        max_length=max_length
    )

    # Get predictions
    trainer = Trainer(
        model=model,
        data_collator=DataCollatorWithPadding(tokenizer)
    )

    print("Generating predictions...")
    predictions = trainer.predict(val_dataset)
    raw_predictions = predictions.predictions

    # Models that return extra outputs yield a tuple; the logits come first.
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]

    # Convert to probabilities
    if not isinstance(raw_predictions, torch.Tensor):
        raw_predictions = torch.tensor(raw_predictions)

    if raw_predictions.dim() == 1:
        raw_predictions = raw_predictions.reshape(-1, 2)

    probs = F.softmax(raw_predictions, dim=1)[:, 1].numpy()
    true_labels = val_data['polarization'].values

    # Test thresholds
    print("\nTesting thresholds from 0.30 to 0.70...")
    thresholds = np.arange(0.30, 0.71, 0.01)

    actual_positive = true_labels == 1
    results = []
    for thresh in thresholds:
        pred_labels = (probs >= thresh).astype(int)
        predicted_positive = pred_labels == 1
        true_positives = (predicted_positive & actual_positive).sum()

        f1_macro = f1_score(true_labels, pred_labels, average='macro', zero_division=0)
        f1_binary = f1_score(true_labels, pred_labels, average='binary', zero_division=0)
        # max(..., 1) keeps precision/recall at 0 instead of dividing by zero
        precision = true_positives / max(predicted_positive.sum(), 1)
        recall = true_positives / max(actual_positive.sum(), 1)
        accuracy = (pred_labels == true_labels).sum() / len(true_labels)

        results.append({
            'threshold': thresh,
            'f1_macro': f1_macro,
            'f1_binary': f1_binary,
            'precision': precision,
            'recall': recall,
            'accuracy': accuracy
        })

    # Create results DataFrame
    results_df = pd.DataFrame(results)

    # Find best threshold
    best_idx = results_df['f1_macro'].idxmax()
    best_result = results_df.loc[best_idx]

    # Print top 5 thresholds
    print("\nTop 5 thresholds by F1 Macro:")
    print("-"*70)
    print(f"{'Threshold':<12} {'F1 Macro':<12} {'F1 Binary':<12} {'Precision':<12} {'Recall':<12}")
    print("-"*70)

    top_5 = results_df.nlargest(5, 'f1_macro')
    for _, row in top_5.iterrows():
        print(f"{row['threshold']:.2f}         {row['f1_macro']:.4f}       "
              f"{row['f1_binary']:.4f}       {row['precision']:.4f}       {row['recall']:.4f}")

    print("\n" + "="*70)
    print("OPTIMAL THRESHOLD FOUND")
    print("="*70)
    print(f"Best Threshold: {best_result['threshold']:.2f}")
    print(f"F1 Macro: {best_result['f1_macro']:.4f}")
    print(f"F1 Binary: {best_result['f1_binary']:.4f}")
    print(f"Precision: {best_result['precision']:.4f}")
    print(f"Recall: {best_result['recall']:.4f}")
    print(f"Accuracy: {best_result['accuracy']:.4f}")

    # Save full results
    results_df.to_csv('threshold_optimization_results.csv', index=False)
    print("\nFull results saved to: threshold_optimization_results.csv")

    return best_result['threshold'], best_result['f1_macro'], results_df


In [None]:
def test_inference_examples(model, tokenizer):
    """Print the model's verdict for a fixed set of hand-written sentences.

    Each sentence is passed to ``predict_polarization``; the predicted class
    name and the polarized-class probability are printed, numbered from 1.
    """
    sample_sentences = (
        "This politician is destroying our country with terrible policies!",
        "I believe we need better education and healthcare systems.",
        "Those people are all criminals and should be deported immediately!",
        "Research shows that renewable energy can reduce carbon emissions.",
        "They're trying to take away our rights and freedoms!",
        "The weather forecast predicts rain tomorrow afternoon.",
    )
    verdict_names = {True: "Polarized", False: "Not Polarized"}
    banner = "=" * 60

    print("\n" + banner)
    print("INFERENCE EXAMPLES")
    print(banner)

    position = 0
    for sentence in sample_sentences:
        position += 1
        predicted_class, polarized_prob = predict_polarization(sentence, model, tokenizer)
        print(f"\n{position}. Text: {sentence}")
        print(f"   Prediction: {verdict_names[predicted_class == 1]}")
        print(f"   Confidence: {polarized_prob:.3f}")

In [None]:
if __name__ == "__main__":
    # Entry point: either load a saved model and predict (INFERENCE_MODE), or
    # train, tune decision thresholds on the held-out split, save and predict.
    #
    # NOTE: `generate_multilingual_predictions` is defined in a later cell of
    # this notebook; that cell has to be executed before this one.

    # Configuration
    TRAIN_DIR = '/content/gdrive/MyDrive/subtask1/train'
    DEV_DIR = '/content/gdrive/MyDrive/subtask1/dev'
    OUTPUT_DIR = '/content/gdrive/MyDrive/subtask1/predictions'
    MODEL_SAVE_DIR = '/content/gdrive/MyDrive/SemevalModels/bitnet_multilingual_improved'
    MODEL_NAME = 'microsoft/mdeberta-v3-base'

    INFERENCE_MODE = False  # Set to True to skip training
    USE_LR_FINDER = False   # Set to True to find optimal LR

    # Language selection (None = all languages)
    LANGUAGES = None  # Or specify: ['eng', 'arb', 'deu', 'spa']

    # ========== IMPROVED PARAMETERS ==========
    USE_DATA_AUGMENTATION = True  # Enable EDA for minority classes
    USE_LANGUAGE_SPECIFIC_WEIGHTS = True  # Enable language-aware class weights
    USE_ADAPTERS = False  # Set to True to use LoRA adapters
    USE_STANDARD_LAYERS = False  # Set to True to remove BitNet quantization
    MAX_LENGTH = 192  # Increased from 128
    NUM_EPOCHS = 6  # Increased from 3
    LEARNING_RATE = 5e-5  # Increased from 3e-5
    # ==========================================

    # ========== THRESHOLD SEARCH ==========
    DEFAULT_THRESHOLD = 0.48  # Used when no tuned threshold is available
    THRESHOLD_GRID = np.arange(0.30, 0.71, 0.01)
    MIN_SAMPLES_FOR_LANG_THRESHOLD = 50  # Languages with fewer validation rows keep the global threshold
    # ======================================

    if INFERENCE_MODE:
        # ==========================================
        # INFERENCE ONLY
        # ==========================================
        print("\n" + "="*70)
        print("INFERENCE-ONLY MODE - MULTILINGUAL (IMPROVED)")
        print("="*70 + "\n")

        # Load model
        model, tokenizer, config = load_model_from_drive(MODEL_SAVE_DIR)

        # The key may be stored as null, in which case dict.get's default is not applied
        threshold = config.get('optimal_threshold')
        if threshold is None:
            threshold = DEFAULT_THRESHOLD

        # Generate predictions for all languages
        predictions = generate_multilingual_predictions(
            model=model,
            tokenizer=tokenizer,
            dev_dir=DEV_DIR,
            output_dir=OUTPUT_DIR,
            languages=LANGUAGES,
            threshold=threshold,
            lang_thresholds=config.get('language_thresholds'),
            max_length=config.get('max_length', MAX_LENGTH)
        )

    else:
        # ==========================================
        # FULL TRAINING WORKFLOW (IMPROVED)
        # ==========================================
        print("\n" + "="*70)
        print("FULL MULTILINGUAL TRAINING WORKFLOW (WITH ALL IMPROVEMENTS)")
        print("="*70 + "\n")

        # STEP 1: Train model with all improvements
        model, tokenizer, trainer, train_results, train_data, val_data, lang_to_id = train_multilingual_polarization_detector_improved(
            train_dir=TRAIN_DIR,
            languages=LANGUAGES,
            model_name=MODEL_NAME,
            use_lr_finder=USE_LR_FINDER,
            use_data_augmentation=USE_DATA_AUGMENTATION,
            use_language_specific_weights=USE_LANGUAGE_SPECIFIC_WEIGHTS,
            max_length=MAX_LENGTH,
            num_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            use_adapters=USE_ADAPTERS,
            use_standard_layers=USE_STANDARD_LAYERS
        )

        # STEP 2: Find optimal threshold
        print("\n" + "="*70)
        print("FINDING OPTIMAL THRESHOLD")
        print("="*70)

        # Thresholds are tuned on the validation split returned by training.
        # (Previously TRAIN_DIR — a directory — was passed where a CSV file was
        # expected, which cannot be read.) The model is run once over the whole
        # split; the global and per-language sweeps reuse those probabilities.
        val_dataset = PolarizationDatasetV2(
            val_data['text'].tolist(),
            val_data['polarization'].tolist(),
            tokenizer,
            max_length=MAX_LENGTH,
            languages=val_data['language'].tolist(),
            language_to_id=lang_to_id
        )
        val_output = Trainer(
            model=model,
            data_collator=DataCollatorWithPadding(tokenizer)
        ).predict(val_dataset)

        val_logits = val_output.predictions
        if isinstance(val_logits, tuple):  # extra model outputs: logits come first
            val_logits = val_logits[0]
        val_probs = F.softmax(torch.as_tensor(val_logits, dtype=torch.float32), dim=1)[:, 1].numpy()
        val_labels = val_data['polarization'].values
        val_langs = val_data['language'].values

        def sweep_thresholds(probs, labels):
            """Return a DataFrame with the macro F1 reached at every threshold of THRESHOLD_GRID."""
            return pd.DataFrame([
                {
                    'threshold': float(thresh),
                    'f1_macro': float(f1_score(labels, (probs >= thresh).astype(int),
                                               average='macro', zero_division=0)),
                }
                for thresh in THRESHOLD_GRID
            ])

        threshold_results = sweep_thresholds(val_probs, val_labels)
        best_overall = threshold_results.loc[threshold_results['f1_macro'].idxmax()]
        best_threshold = float(best_overall['threshold'])
        best_f1_macro = float(best_overall['f1_macro'])

        print(f"Validation samples: {len(val_data)}")
        print(f"Best Threshold: {best_threshold:.2f}")
        print(f"F1 Macro: {best_f1_macro:.4f}")

        # STEP 3: Find per-language optimal thresholds
        print("\n" + "="*70)
        print("FINDING PER-LANGUAGE OPTIMAL THRESHOLDS")
        print("="*70)

        lang_thresholds = {}
        for lang in val_data['language'].unique():
            lang_mask = val_langs == lang
            lang_samples = int(lang_mask.sum())

            if lang_samples <= MIN_SAMPLES_FOR_LANG_THRESHOLD:  # Only if enough samples
                continue

            lang_results = sweep_thresholds(val_probs[lang_mask], val_labels[lang_mask])
            best_lang = lang_results.loc[lang_results['f1_macro'].idxmax()]
            best_lang_thresh = float(best_lang['threshold'])
            best_lang_f1 = float(best_lang['f1_macro'])

            # Plain Python numbers so the dict can be stored in model_config.json
            lang_thresholds[lang] = {
                'threshold': best_lang_thresh,
                'f1_macro': best_lang_f1,
                'samples': lang_samples
            }

            print(f"\n{lang.upper()}:")
            print(f"  Optimal Threshold: {best_lang_thresh:.3f}")
            print(f"  F1 Macro: {best_lang_f1:.4f}")
            print(f"  Validation Samples: {lang_samples}")

        # STEP 4: Save model
        print("\n" + "="*70)
        print("SAVING MODEL")
        print("="*70)

        model_config = {
            'model_name': MODEL_NAME,
            'num_labels': 2,
            'dropout_prob': 0.3,
            'max_length': MAX_LENGTH,
            # Persisted so inference-only runs can apply the same per-language thresholds
            'language_thresholds': lang_thresholds,
            'improvements': {
                'data_augmentation': USE_DATA_AUGMENTATION,
                'language_specific_weights': USE_LANGUAGE_SPECIFIC_WEIGHTS,
                'cosine_scheduling': True,
                'extended_epochs': NUM_EPOCHS,
                'learning_rate': LEARNING_RATE,
            }
        }

        save_model_to_drive(
            model,
            tokenizer,
            MODEL_SAVE_DIR,
            model_config,
            threshold=best_threshold
        )

        # STEP 5: Generate predictions on dev set
        print("\n" + "="*70)
        print("GENERATING PREDICTIONS ON DEV SET")
        print("="*70)

        # Create submission for all languages
        submissions = generate_multilingual_predictions(
            model=model,
            tokenizer=tokenizer,
            dev_dir=DEV_DIR,
            output_dir=OUTPUT_DIR,
            languages=LANGUAGES,
            threshold=best_threshold,
            lang_thresholds=lang_thresholds,
            max_length=MAX_LENGTH
        )

        # STEP 6: Test on example texts
        print("\n" + "="*70)
        print("TESTING ON EXAMPLE TEXTS")
        print("="*70)

        test_inference_examples(model, tokenizer)

        print("\n" + "="*70)
        print("TRAINING COMPLETE!")
        print("="*70)
        print(f"\nModel saved to: {MODEL_SAVE_DIR}")
        print(f"Predictions saved to: {OUTPUT_DIR}")
        print(f"\nBest Overall Threshold: {best_threshold:.3f}")
        print(f"Best Overall F1 Macro: {best_f1_macro:.4f}")


FULL MULTILINGUAL TRAINING WORKFLOW


MULTILINGUAL POLARIZATION DETECTION TRAINING

STEP 1: LOADING MULTILINGUAL DATA
LOADING TRAIN DATA - MULTILINGUAL
Languages requested: amh, arb, deu, eng, hau, ita, spa, urd, zho
Data directory: /content/gdrive/MyDrive/subtask1/train

✓ Loaded amh: 3332 samples from amh.csv
✓ Loaded arb: 3380 samples from arb.csv
✓ Loaded deu: 3180 samples from deu.csv
✓ Loaded eng: 2676 samples from eng.csv
✓ Loaded hau: 3651 samples from hau.csv
✓ Loaded ita: 3334 samples from ita.csv
✓ Loaded spa: 3305 samples from spa.csv
✓ Loaded urd: 2849 samples from urd.csv
✓ Loaded zho: 4280 samples from zho.csv

TOTAL: 29987 samples across 9 languages

Class Distribution:
  amh: Polarized=2518, Non-Polarized=814
  arb: Polarized=1512, Non-Polarized=1868
  deu: Polarized=1512, Non-Polarized=1668
  eng: Polarized=1002, Non-Polarized=1674
  hau: Polarized=392, Non-Polarized=3259
  ita: Polarized=1368, Non-Polarized=1966
  spa: Polarized=1660, Non-Polarized=1645
  urd: Polar

Step,Training Loss,Validation Loss,F1 Macro,F1 Binary,Accuracy
150,0.1729,0.096298,0.69858,0.686469,0.699066
300,0.109,0.087824,0.735098,0.705463,0.738413
450,0.095,0.087933,0.738109,0.696411,0.744748
600,0.09,0.080017,0.75686,0.734484,0.75892
750,0.0847,0.081627,0.757374,0.747787,0.757753
900,0.0842,0.080192,0.760538,0.757104,0.760587
1050,0.0843,0.080116,0.762089,0.755822,0.762254
1200,0.0827,0.080534,0.761587,0.761189,0.761587
1350,0.0811,0.080275,0.75992,0.76,0.75992
1500,0.0812,0.080199,0.760585,0.759866,0.760587


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



FINDING OPTIMAL THRESHOLD

FINDING OPTIMAL THRESHOLD FOR F1 MACRO
Validation samples: 1200
Generating predictions...



Testing thresholds from 0.30 to 0.70...

Top 5 thresholds by F1 Macro:
----------------------------------------------------------------------
Threshold    F1 Macro     F1 Binary    Precision    Recall      
----------------------------------------------------------------------
0.49         0.7667       0.7678       0.7189       0.8238
0.50         0.7640       0.7587       0.7283       0.7918
0.53         0.7640       0.7429       0.7646       0.7224
0.52         0.7625       0.7482       0.7456       0.7509
0.51         0.7622       0.7532       0.7336       0.7740

OPTIMAL THRESHOLD FOUND
Best Threshold: 0.49
F1 Macro: 0.7667
F1 Binary: 0.7678
Precision: 0.7189
Recall: 0.8238
Accuracy: 0.7667

Full results saved to: threshold_optimization_results.csv

Optimal threshold: 0.49
Expected F1 Macro: 0.7667

SAVING MODEL TO GOOGLE DRIVE
Saving model to /content/gdrive/MyDrive/SemevalModels/bitnet_multilingual...
✓ Saved PyTorch model weights
✓ Saved tokenizer
✓ Saved model configuration
✓ 

In [None]:
# ==========================================
# UTILITIES FOR PER-LANGUAGE EVALUATION
# ==========================================

def generate_multilingual_predictions(
    model,
    tokenizer,
    dev_dir,
    output_dir,
    languages=None,
    threshold=0.49,
    lang_thresholds=None,
    max_length=192,
    language_to_id=None
):
    """
    Generate predictions for all language dev files with optional language-specific thresholds

    For every language, ``<dev_dir>/dev_<lang>.csv`` is read, the probability of
    class 1 is computed from the softmax over the model outputs, it is binarised
    with the applicable threshold, and the result is written to
    ``<output_dir>/dev_predictions_<lang>.csv`` with columns ``id`` and
    ``polarization``.

    Args:
        model: Trained model
        tokenizer: Tokenizer
        dev_dir: Directory with dev CSV files named ``dev_<lang>.csv``; each file
            must have ``id`` and ``text`` columns
        output_dir: Output directory for predictions (created if missing)
        languages: List of language codes (None to discover every
            ``dev_<lang>.csv`` in ``dev_dir``)
        threshold: Default threshold applied to the class-1 probability
        lang_thresholds: Dict keyed by language code. Each value is either a
            dict with a ``'threshold'`` entry or a plain number
        max_length: Max sequence length
        language_to_id: Optional language-to-ID mapping forwarded to
            ``PolarizationDatasetV2`` as ``language_to_id``. When None the
            keyword is not passed at all (original behaviour).
            NOTE(review): presumably this should be the mapping used during
            training - confirm against ``PolarizationDatasetV2``.

    Returns:
        Dict mapping language code -> submission DataFrame. Languages whose dev
        file is missing or has no rows are skipped with a warning.
    """
    import glob
    import os

    os.makedirs(output_dir, exist_ok=True)

    file_prefix, file_suffix = 'dev_', '.csv'

    # If languages not specified, find all dev files.
    # Only `dev_*.csv` is considered because that is the only name pattern the
    # loop below can load; stripping the fixed prefix/suffix (instead of taking
    # the last `_` token) keeps codes that themselves contain an underscore.
    if languages is None:
        dev_files = sorted(glob.glob(os.path.join(dev_dir, f'{file_prefix}*{file_suffix}')))
        languages = [
            os.path.basename(path)[len(file_prefix):-len(file_suffix)]
            for path in dev_files
        ]

    # Drop repeated codes (order preserved) so no file is predicted twice.
    languages = list(dict.fromkeys(languages))

    all_submissions = {}
    lang_thresholds = lang_thresholds or {}

    print(f"\nGenerating predictions for {len(languages)} languages...\n")

    # Built once on first use and reused for every language; nothing about it
    # depends on the language being processed.
    trainer = None

    for lang in languages:
        dev_file = os.path.join(dev_dir, f'dev_{lang}.csv')

        if not os.path.exists(dev_file):
            print(f"⚠ File not found: {dev_file}")
            continue

        # Load dev data
        dev_data = pd.read_csv(dev_file)

        if dev_data.empty:
            print(f"⚠ No rows in {dev_file}, skipping")
            continue

        # Empty CSV cells are read as NaN (float); replace them with empty
        # strings so every item handed to the dataset is a str.
        texts = dev_data['text'].fillna('').astype(str).tolist()

        dataset_kwargs = {
            'max_length': max_length,
            'languages': [lang] * len(texts),
        }
        if language_to_id is not None:
            dataset_kwargs['language_to_id'] = language_to_id

        # Create dataset
        dev_dataset = PolarizationDatasetV2(
            texts,
            [0] * len(texts),  # Dummy labels
            tokenizer,
            **dataset_kwargs
        )

        # Get predictions
        if trainer is None:
            trainer = Trainer(
                model=model,
                data_collator=DataCollatorWithPadding(tokenizer)
            )

        predictions = trainer.predict(dev_dataset)
        probs = F.softmax(torch.tensor(predictions.predictions), dim=1)[:, 1].numpy()

        # Use language-specific threshold if available, else use default
        if lang in lang_thresholds:
            entry = lang_thresholds[lang]
            thresh = entry['threshold'] if isinstance(entry, dict) else entry
            print(f"{lang.upper()}: Using language-specific threshold {thresh:.3f}")
        else:
            thresh = threshold
            print(f"{lang.upper()}: Using default threshold {thresh:.3f}")

        pred_labels = (probs >= thresh).astype(int)

        # Create submission
        submission = pd.DataFrame({
            'id': dev_data['id'],
            'polarization': pred_labels
        })

        output_file = os.path.join(output_dir, f'dev_predictions_{lang}.csv')
        submission.to_csv(output_file, index=False)
        all_submissions[lang] = submission

        print(f"  ✓ Saved to {output_file}")
        print(f"    Polarized: {(pred_labels == 1).sum()}/{len(pred_labels)}")

    return all_submissions


def evaluate_per_language(model, tokenizer, val_data, lang_to_id, max_length=192, threshold=None):
    """
    Evaluate model performance per language

    The validation frame is split on its ``language`` column; each subset is
    run through the model and scored against its ``polarization`` labels.
    Per-language metrics and their unweighted mean macro-F1 are printed.

    Args:
        model: Trained model
        tokenizer: Tokenizer
        val_data: Validation dataframe with ``language``, ``text`` and
            ``polarization`` columns
        lang_to_id: Language to ID mapping (passed to the dataset as
            ``language_to_id``)
        max_length: Max sequence length
        threshold: Optional cut-off on the class-1 probability. When None
            (default) the predicted label is the argmax over the softmax
            probabilities; otherwise label 1 is assigned when the class-1
            probability is >= ``threshold``

    Returns:
        per_lang_metrics: Dict with metrics per language (keys: ``f1_macro``,
        ``f1_binary``, ``accuracy``, ``precision``, ``recall``, ``samples``).
        Empty when ``val_data`` contains no labelled language.
    """
    per_lang_metrics = {}

    print(f"\n{'='*70}")
    print("PER-LANGUAGE EVALUATION")
    print(f"{'='*70}\n")

    # Rows with a missing language are dropped up front: NaN cannot be sorted
    # alongside string codes and would never match an equality filter anyway.
    lang_codes = sorted(val_data['language'].dropna().unique())

    # Built once on first use and shared by all languages.
    trainer = None

    for lang in lang_codes:
        lang_val = val_data[val_data['language'] == lang]

        # Create dataset
        lang_dataset = PolarizationDatasetV2(
            lang_val['text'].tolist(),
            lang_val['polarization'].tolist(),
            tokenizer,
            max_length=max_length,
            languages=lang_val['language'].tolist(),
            language_to_id=lang_to_id
        )

        # Get predictions
        if trainer is None:
            trainer = Trainer(
                model=model,
                data_collator=DataCollatorWithPadding(tokenizer)
            )

        predictions = trainer.predict(lang_dataset)
        probs = F.softmax(torch.tensor(predictions.predictions), dim=1)
        if threshold is None:
            pred_labels = torch.argmax(probs, dim=1).numpy()
        else:
            pred_labels = (probs[:, 1].numpy() >= threshold).astype(int)
        true_labels = lang_val['polarization'].values

        # Compute metrics
        metrics = {
            'f1_macro': f1_score(true_labels, pred_labels, average='macro', zero_division=0),
            'f1_binary': f1_score(true_labels, pred_labels, average='binary', zero_division=0),
            'accuracy': accuracy_score(true_labels, pred_labels),
            'precision': precision_score(true_labels, pred_labels, average='macro', zero_division=0),
            'recall': recall_score(true_labels, pred_labels, average='macro', zero_division=0),
            'samples': len(lang_val)
        }

        per_lang_metrics[lang] = metrics

        print(f"{lang.upper()}:")
        print(f"  Samples: {metrics['samples']}")
        print(f"  F1 Macro: {metrics['f1_macro']:.4f}")
        print(f"  F1 Binary: {metrics['f1_binary']:.4f}")
        print(f"  Accuracy: {metrics['accuracy']:.4f}")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}\n")

    # Nothing was scored: skip the average instead of taking the mean of an
    # empty list (which yields nan and a RuntimeWarning).
    if not per_lang_metrics:
        print("No languages to evaluate.")
        print("=" * 70)
        return per_lang_metrics

    # Compute average across languages
    avg_f1_macro = np.mean([m['f1_macro'] for m in per_lang_metrics.values()])
    print(f"\nAverage F1 Macro (across all languages): {avg_f1_macro:.4f}")
    print("=" * 70)

    return per_lang_metrics
