In [7]:
import json
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, hamming_loss, jaccard_score

# --- 1. Load and Parse Data ---


# Read the txt file
with open('../data/TRDataChallenge2023.txt', 'r', encoding='utf-8') as file:
    content = file.read()

print(f"File size: {len(content)} characters")

# Parse JSON dictionaries from the file
data = []
lines = content.strip().split('\n')

for i, line in enumerate(lines):
    if line.strip():
        try:
            json_obj = json.loads(line)
            data.append(json_obj)
        except json.JSONDecodeError as e:
            print(f"Error parsing line {i+1}: {e}")
            print(f"Line content: {line[:100]}...")

print(f"\nSuccessfully parsed {len(data)} JSON objects")




df = pd.DataFrame(data) # In your case, df = pd.DataFrame(data)

# --- 2. Feature Extraction: Combine headtext and paragraphs ---
def combine_sections_text(sections):
    full_text = []
    for section in sections:
        # Prioritize headtext, then paragraphs. Join paragraphs with spaces.
        section_text = []
        if section.get('headtext'):
            section_text.append(section['headtext'])
        if section.get('paragraphs'):
            section_text.extend(section['paragraphs'])
        
        # Join paragraphs within a section with a space, then sections with a newline
        full_text.append(" ".join(section_text))
    return "\n\n".join(full_text)

df['full_text'] = df['sections'].apply(combine_sections_text)

# --- 3. Handle Documents with 0 Postures ---
# Documents with empty 'postures' lists cannot be used for supervised training.
# Filter them out before label binarization and splitting.
df_trainable = df[df['postures'].apply(lambda x: len(x) > 0)].copy()

# --- 4. Label Binarization (Multi-Hot Encoding) ---
mlb = MultiLabelBinarizer()
labels_encoded = mlb.fit_transform(df_trainable['postures'])
labels_df = pd.DataFrame(labels_encoded, columns=mlb.classes_, index=df_trainable.index)

# Define NUM_CLASSES here
NUM_CLASSES = len(mlb.classes_) # Add this line

# Merge texts with their encoded labels
X = df_trainable['full_text'].values
y = labels_df.values

print(f"Total documents for training (with postures): {len(X)}")
print(f"Number of unique postures (classes): {len(mlb.classes_)}")
print(f"Shape of label matrix (documents x classes): {y.shape}")

# --- 5. Train-Validation-Test Split with Iterative Stratification ---
# Iterative stratification is crucial for multi-label datasets to preserve label distributions.
# This ensures that each split (train, val, test) contains a representative distribution of labels,
# especially important for rare labels.

# First, split into train and temp (validation + test)
X_train, y_train, X_temp, y_temp = iterative_train_test_split(X.reshape(-1, 1), y, test_size=0.3) # 70% train, 30% temp
X_train = X_train.flatten() # Flatten X_train back to 1D array

# Then, split temp into validation and test
X_val, y_val, X_test, y_test = iterative_train_test_split(X_temp.reshape(-1, 1), y_temp, test_size=0.5) # 15% val, 15% test
X_val = X_val.flatten()
X_test = X_test.flatten()

print(f"\nTrain set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

# --- Common Evaluation Function ---
def evaluate_model(y_true, y_pred, y_pred_proba=None, threshold=0.5, class_names=None):
    """
    Evaluates multi-label classification model performance.
    y_pred should be binary predictions (0 or 1).
    y_pred_proba should be probabilities (if available) for thresholding.
    """
    if y_pred_proba is not None:
        y_pred = (y_pred_proba >= threshold).astype(int)

    metrics = {
        "Accuracy (Exact Match Ratio)": accuracy_score(y_true, y_pred),
        "Hamming Loss": hamming_loss(y_true, y_pred),
        "Jaccard Score (Micro)": jaccard_score(y_true, y_pred, average='micro'),
        "Jaccard Score (Macro)": jaccard_score(y_true, y_pred, average='macro'),
        "F1 Score (Micro)": f1_score(y_true, y_pred, average='micro'),
        "F1 Score (Macro)": f1_score(y_true, y_pred, average='macro'),
        "F1 Score (Weighted)": f1_score(y_true, y_pred, average='weighted'),
        "Precision (Micro)": precision_score(y_true, y_pred, average='micro'),
        "Recall (Micro)": recall_score(y_true, y_pred, average='micro'),
    }

    print("\n--- Evaluation Metrics ---")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value:.4f}")

    # Optional: Per-class metrics (useful for understanding imbalance)
    if class_names and y_true.shape[1] < 50: # Avoid printing too many classes
        print("\n--- Per-Class F1 Score ---")
        f1_per_class = f1_score(y_true, y_pred, average=None)
        for i, class_name in enumerate(class_names):
            print(f"  {class_name}: {f1_per_class[i]:.4f}")
    
    return metrics

File size: 331936506 characters

Successfully parsed 18000 JSON objects
Total documents for training (with postures): 17077
Number of unique postures (classes): 224
Shape of label matrix (documents x classes): (17077, 224)

Train set size: 11962
Validation set size: 2558
Test set size: 2557


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import evaluate # Hugging Face evaluate library

print("\n--- Transformer-based Model: DistilBERT ---")

# Model name (DistilBERT is a good lightweight option for demonstration)
# For long documents, consider 'allenai/longformer-base-4096' or 'google/bigbird-roberta-base'
MODEL_NAME = "distilbert/distilbert-base-uncased" # or 'bert-base-uncased', 'roberta-base'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# --- Tokenization Function ---
# max_length is the crucial parameter for handling document length.
# For standard BERT/DistilBERT, max_length is usually 512.
# For Longformer, it can be 4096.
# If documents are longer than max_length, 'truncation=True' will cut them off.
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512) # Adjust max_length

# Prepare the data for Hugging Face datasets format
train_df = pd.DataFrame({'text': X_train.tolist(), 'labels': y_train.tolist()})
val_df = pd.DataFrame({'text': X_val.tolist(), 'labels': y_val.tolist()})
test_df = pd.DataFrame({'text': X_test.tolist(), 'labels': y_test.tolist()})

train_dataset_hf = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
val_dataset_hf = Dataset.from_pandas(val_df).map(tokenize_function, batched=True)
test_dataset_hf = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)


# Ensure labels are in the correct format (list of floats for multi-label)
train_dataset_hf.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset_hf.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset_hf.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# --- Model Initialization ---
# For multi-label classification, set problem_type="multi_label_classification"
# This automatically configures the final layer with sigmoid activation and BCE loss.
model_hf = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_CLASSES, problem_type="multi_label_classification")

# --- Custom Compute Metrics Function ---
# The Trainer needs a function to compute metrics during evaluation.
# We'll use our `evaluate_model` function for consistency.
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.sigmoid(torch.from_numpy(logits)).numpy() # Apply sigmoid to logits
    
    # Calculate pos_weight for F1 score, etc., as it's typically for loss, but
    # some metrics libraries might use it if averaged. Here, we pass it to our func
    # Note: Hugging Face Trainer doesn't directly use pos_weight for metrics,
    # but BCEWithLogitsLoss will use it for training loss.
    
    # Threshold predictions to binary (0 or 1)
    threshold = 0.5 # Can be tuned
    y_pred_binary = (predictions >= threshold).astype(int)

    # Use the evaluate_model function
    metrics_results = evaluate_model(labels, y_pred_binary, y_pred_proba=predictions)
    
    # Trainer expects a dictionary of metric_name: value
    # We'll return micro F1, Hamming Loss, and Micro Jaccard for simplicity for the Trainer.
    return {
        "f1_micro": metrics_results["F1 Score (Micro)"],
        "hamming_loss": metrics_results["Hamming Loss"],
        "jaccard_micro": metrics_results["Jaccard Score (Micro)"],
    }


# --- Training Arguments ---
# Set up training arguments. Adjust per your computational resources.
# For demonstration, small values.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3, # Reduce for faster demo, increase for performance
    per_device_train_batch_size=8, # Adjust based on GPU memory
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch", # Evaluate at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True, # Loads the best model based on eval_loss
    metric_for_best_model="loss", # Or 'f1_micro' if you define it in compute_metrics
    greater_is_better=False, # For loss, smaller is better
    learning_rate=2e-5, # Common for fine-tuning Transformers
    fp16=torch.cuda.is_available(), # Use mixed precision if GPU is available
)

# --- Trainer Initialization and Training ---
trainer = Trainer(
    model=model_hf,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=val_dataset_hf,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer, # Pass tokenizer for padding/truncation during training
)

trainer.train()

# --- Evaluation on Test Set ---
print("\nResults for DistilBERT on Test Set:")
predictions_output = trainer.predict(test_dataset_hf)
y_pred_hf_proba = torch.sigmoid(torch.from_numpy(predictions_output.predictions)).numpy()
y_pred_hf = (y_pred_hf_proba >= 0.5).astype(int)

evaluate_model(y_test, y_pred_hf, y_pred_proba=y_pred_hf_proba, class_names=mlb.classes_)


--- Transformer-based Model: DistilBERT ---


Map:   0%|          | 0/11958 [00:00<?, ? examples/s]

Map:   0%|          | 0/2559 [00:00<?, ? examples/s]

Map:   0%|          | 0/2560 [00:00<?, ? examples/s]

NameError: name 'NUM_CLASSES' is not defined