# Feature Extraction: Separate Models

This notebook extracts 19 Context Tree features for each model separately.

**Models:**
- BERT (bert-base-uncased)
- RoBERTa (roberta-base)
- DeBERTa (microsoft/deberta-v3-base)
- XLNet (xlnet-base-cased)

**Tasks:**
- Clarity (3-class)
- Evasion (9-class)

**Output:** Train and dev features are saved to Drive for each model/task combination; the test split is loaded but not featurized in this notebook.


In [None]:
# Setup (run 00_setup.ipynb and 01_data_split.ipynb first)
import sys
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModel

# Project checkout and data folder; both are later passed to StorageManager.
# (The /content/... layout is presumably Google Colab with Drive mounted — confirm.)
BASE_PATH = Path('/content/semeval-context-tree-modular')
DATA_PATH = Path('/content/drive/MyDrive/semeval_data')

# Put the project checkout on the import path so the `src.*` imports resolve.
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if str(BASE_PATH) not in sys.path:
    sys.path.insert(0, str(BASE_PATH))

from src.storage.manager import StorageManager
from src.features.extraction import featurize_hf_dataset_in_batches_v2

# Build the storage manager. The project checkout is passed as both
# `base_path` and `github_path`; every location is handed over as a plain string.
_repo_root = str(BASE_PATH)
storage = StorageManager(
    base_path=_repo_root,
    data_path=str(DATA_PATH),
    github_path=_repo_root,
)

# Load the three splits in train -> dev -> test order.
# The test split is only meant for the final evaluation.
train_ds, dev_ds, test_ds = (
    storage.load_split(split_name) for split_name in ('train', 'dev', 'test')
)

# Report how many samples each split holds
print(f"✅ Loaded splits:")
print(f"   Train: {len(train_ds)} samples")
print(f"   Dev: {len(dev_ds)} samples")
print(f"   Test: {len(test_ds)} samples")


In [None]:
# Encoders to process: short key -> checkpoint name ('name') and
# human-readable label used in log output ('display').
MODELS = {
    'bert': {'name': 'bert-base-uncased', 'display': 'BERT'},
    'roberta': {'name': 'roberta-base', 'display': 'RoBERTa'},
    'deberta': {'name': 'microsoft/deberta-v3-base', 'display': 'DeBERTa'},
    'xlnet': {'name': 'xlnet-base-cased', 'display': 'XLNet'},
}

# Tasks for which features are stored
TASKS = ['clarity', 'evasion']

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"✅ Using device: {device}")


In [None]:
# Extract features for each model and save them for every task.
#
# The featurizer call takes no task or label argument, so the feature matrix
# depends only on the (model, split) pair. Each split is therefore featurized
# once per model and the same matrix is saved under every task in TASKS,
# instead of re-running the model once per task.
# NOTE(review): this assumes the featurizer is deterministic for a model in
# eval mode — confirm against src.features.extraction.
FEATURE_BATCH_SIZE = 8
FEATURE_MAX_SEQUENCE_LENGTH = 256

# Only train and dev are featurized here; the test split is not processed.
FEATURE_SPLITS = [('train', train_ds), ('dev', dev_ds)]

for model_key, model_info in MODELS.items():
    print(f"\n{'='*60}")
    print(f"Processing {model_info['display']} ({model_info['name']})")
    print(f"{'='*60}")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_info['name'])
    model = AutoModel.from_pretrained(model_info['name'])

    try:
        model.to(device)
        model.eval()

        for split_name, split_ds in FEATURE_SPLITS:
            print(f"  Extracting {split_name} features...")

            # One pass per split; the third return value is not used here
            X, feature_names, _ = featurize_hf_dataset_in_batches_v2(
                split_ds,
                tokenizer,
                model,
                device,
                batch_size=FEATURE_BATCH_SIZE,
                max_sequence_length=FEATURE_MAX_SEQUENCE_LENGTH,
                question_key='question',
                answer_key='answer',
                show_progress=True
            )

            # Store the same matrix under each task
            for task in TASKS:
                storage.save_features(
                    X, model_key, task, split_name, feature_names
                )
                print(f"    ✅ Saved [{task}]: {X.shape[0]} samples, {X.shape[1]} features")
    finally:
        # Drop the references even if extraction failed, so the next model
        # (or a re-run of this cell) does not compete with a stale one for memory.
        del model, tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print(f"\n{'='*60}")
print("✅ Feature extraction complete for all models!")
print(f"{'='*60}")
