In [None]:
# !pip install huggingface datasets scikit_learn seaborn spacy tensorflow tf-keras
!pip install numpy==1.26.4
!pip install --upgrade transformers datasets torch scikit-learn matplotlib seaborn optimum bitsandbytes accelerate peft gdown transformers[torch]
# !pip install transformers==4.45.1 datasets torch scikit-learn numpy==1.26.4 matplotlib seaborn optimum bitsandbytes accelerate peft gdown transformers[torch]

In [None]:
import os
import json
import numpy as np
import zipfile
import gdown
from kaggle_secrets import UserSecretsClient
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import BitsAndBytesConfig
import huggingface_hub

In [None]:
def login_huggingface(secret_name="HF LLama3.2 Token"):
    """Log in to the Hugging Face Hub with a token read from Kaggle Secrets.

    This is best-effort: any failure (missing secret, rejected token, ...)
    is reported on stdout instead of being raised.

    Args:
        secret_name: Label of the Kaggle secret holding the access token.
            The default is the label this notebook has always requested.

    Returns:
        True if the login call completed without raising, False otherwise.
    """
    try:
        hf_token = UserSecretsClient().get_secret(secret_name)
        huggingface_hub.login(token=hf_token)
    except Exception as e:
        print(f"Failed to log in to Hugging Face: {e}")
        # Point at the secret that was actually requested; the previous hint
        # named a different label ("HF_TOKEN") than the one being looked up.
        print(f"Ensure the secret '{secret_name}' is set in Kaggle Secrets")
        return False
    print("Successfully logged in to Hugging Face")
    return True

In [None]:
def load_dataset(base_path):
    """Load every level/split of the corpus found under ``base_path``.

    The directory layout read is ``<base_path>/<level>/<split>/`` for the
    levels easy/medium/hard and the splits train/validation. Inside each
    folder, every ``problem-*.txt`` file is paired with the JSON file named
    ``truth-<problem id>.json`` in the same folder.

    Args:
        base_path: Root directory of the dataset.

    Returns:
        A nested dict ``result[level][split]`` whose values are lists of
        ``(sentences, changes, problem_id)`` tuples, ordered by sorted file
        name. ``sentences`` are the stripped non-empty lines of the text file
        and ``changes`` is the ``"changes"`` entry of the truth JSON.
    """

    def read_problem(folder, txt_name):
        # The problem id is everything before the first dot of the file name.
        stem = txt_name.split('.')[0]
        with open(os.path.join(folder, txt_name), 'r', encoding='utf-8') as txt_file:
            stripped = (raw.strip() for raw in txt_file.readlines())
            sentences = [text for text in stripped if text]
        with open(os.path.join(folder, f'truth-{stem}.json'), 'r', encoding='utf-8') as truth_file:
            changes = json.load(truth_file)['changes']
        return (sentences, changes, stem)

    result = {}
    for level in ('easy', 'medium', 'hard'):
        per_split = {}
        for split in ('train', 'validation'):
            folder = os.path.join(base_path, level, split)
            per_split[split] = [
                read_problem(folder, name)
                for name in sorted(os.listdir(folder))
                if name.startswith('problem-') and name.endswith('.txt')
            ]
        result[level] = per_split
    return result

In [None]:
def prepare_llama_data(documents, tokenizer, max_length=512):
    """Turn documents into a tokenized dataset of adjacent-sentence pairs.

    For every document, one example is produced per entry of ``changes``:
    sentence ``i`` and sentence ``i + 1`` joined by the literal text
    ``" [SEP] "``, labelled with ``changes[i]``.

    Args:
        documents: Iterable of ``(sentences, changes, problem_id)`` tuples.
        tokenizer: Callable tokenizer; it is invoked once on the full list of
            pair texts with truncation and ``max_length`` padding, returning
            PyTorch tensors.
        max_length: Sequence length every example is truncated/padded to.

    Returns:
        ``(dataset, problem_ids_with_offsets)`` where ``dataset`` is a
        ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels``
        columns, and ``problem_ids_with_offsets`` holds one
        ``(problem_id, index_in_document, global_index)`` tuple per example.
    """
    pair_texts, pair_labels, problem_ids_with_offsets = [], [], []
    emitted_so_far = 0  # pairs contributed by the documents already processed

    for sentences, changes, problem_id in documents:
        pair_count = len(changes)
        for pair_idx in range(pair_count):
            left, right = sentences[pair_idx], sentences[pair_idx + 1]
            pair_texts.append(" [SEP] ".join((left, right)))
            pair_labels.append(changes[pair_idx])
            problem_ids_with_offsets.append(
                (problem_id, pair_idx, emitted_so_far + pair_idx)
            )
        emitted_so_far += pair_count

    tokenized = tokenizer(
        pair_texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )

    columns = {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
        'labels': pair_labels,
    }
    return Dataset.from_dict(columns), problem_ids_with_offsets

In [None]:
def plot_confusion_matrix(y_true, y_pred, title):
    """Draw and return the binary confusion matrix for the given predictions.

    Args:
        y_true: Ground-truth labels (0 = no change, 1 = change).
        y_pred: Predicted labels, same length as ``y_true``.
        title: Title shown above the heatmap.

    Returns:
        The 2x2 confusion matrix (rows = true class, columns = predicted
        class) in the fixed class order ``[0, 1]``.
    """
    class_names = ['No Change (0)', 'Change (1)']
    # Pin the class order so the matrix is always 2x2. Without `labels`, a
    # split containing only one class produces a 1x1 matrix, which matches
    # neither the two tick labels below nor matrices from other splits.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    plt.show()
    return cm

In [None]:
def save_predictions_to_json(predictions, problem_ids_with_offsets, output_base_path, level, split):
    """Write one ``solution-<problem id>.json`` file per problem.

    Predictions are matched positionally with ``problem_ids_with_offsets``
    and grouped by problem id, keeping their original order. Each file
    contains ``{"changes": [...]}`` with the predictions cast to ``int``.

    Args:
        predictions: Iterable of per-pair predictions.
        problem_ids_with_offsets: Iterable of
            ``(problem_id, index_in_document, global_index)`` tuples aligned
            with ``predictions``.
        output_base_path: Root output directory.
        level: Sub-directory for the difficulty level.
        split: Sub-directory for the data split.
    """
    target_dir = os.path.join(output_base_path, level, split)
    os.makedirs(target_dir, exist_ok=True)

    # Group first, write afterwards, so every file gets its complete list.
    grouped = {}
    for value, (pid, _idx, _offset) in zip(predictions, problem_ids_with_offsets):
        grouped.setdefault(pid, []).append(int(value))

    for pid, changes in grouped.items():
        destination = os.path.join(target_dir, f'solution-{pid}.json')
        with open(destination, 'w', encoding='utf-8') as out_file:
            json.dump({"changes": changes}, out_file, indent=4)
        print(f"Saved: {destination}")

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and F1 from raw model outputs.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` must support
            element-wise comparison plus ``astype``/``flatten``.

    Returns:
        Dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    raw_scores, gold = eval_pred
    # A positive logit is predicted as class 1, anything else as class 0.
    predicted = (raw_scores > 0).astype(int).flatten()
    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': f1_score(gold, predicted),
    }

In [None]:
def _positive_class_weight(labels):
    """Return the negatives/positives ratio used as BCE ``pos_weight``.

    Falls back to 1.0 (no re-weighting) when either class is absent: with no
    positives the ratio is a division by zero, and with no negatives it is 0,
    which would zero out the loss of every example.
    """
    positives = sum(labels)
    negatives = len(labels) - positives
    if positives == 0 or negatives == 0:
        return 1.0
    return float(negatives / positives)


def train_and_evaluate_llama(train_dataset, val_dataset, val_problem_ids_with_offsets, level, output_base_path, model_name):
    """Fine-tune a 4-bit LoRA classifier for one level and evaluate it.

    The model is loaded with a single output logit and trained with a
    class-weighted binary cross-entropy loss. After training, the validation
    set is scored, a classification report and confusion matrix are shown,
    and per-problem predictions are written as JSON.

    Args:
        train_dataset: Dataset with ``input_ids``, ``attention_mask`` and
            ``labels`` columns used for training.
        val_dataset: Dataset with the same columns used for evaluation.
        val_problem_ids_with_offsets: Per-example id tuples aligned with
            ``val_dataset``; forwarded to ``save_predictions_to_json``.
        level: Name of the level; used in output paths and printed headers.
        output_base_path: Root directory for the prediction JSON files.
        model_name: Hugging Face model identifier to load.

    Returns:
        ``(model, tokenizer, pred_labels, cm)``: the trained PEFT model, the
        tokenizer, the 0/1 validation predictions and the confusion matrix.
    """
    # Local import keeps this block self-contained; peft is already a
    # dependency of the notebook.
    from peft import prepare_model_for_kbit_training

    # Tokenizer (returned to the caller); reuse EOS as padding if none is set.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # 4-bit NF4 quantization with double quantization, FP16 compute.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # One output logit; the loss is supplied by the custom trainer below.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=1,
        problem_type="regression",
        quantization_config=quantization_config,
        device_map="auto"
    )
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.use_cache = False  # the KV cache is incompatible with gradient checkpointing

    # Prepare the quantized model *before* adding adapters: this enables
    # gradient checkpointing and makes the embedding outputs require grad so
    # gradients can reach the LoRA weights inside checkpointed blocks.
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="SEQ_CLS"
    )
    model = get_peft_model(model, lora_config)

    # Class weight for imbalanced data, built once instead of on every step.
    pos_weight_tensor = torch.tensor(_positive_class_weight(train_dataset['labels']))

    class WeightedBCELossTrainer(Trainer):
        """Trainer whose loss is BCE-with-logits weighted by ``pos_weight``."""

        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            # Flatten to (batch,) explicitly. A bare squeeze() would turn a
            # final batch of size 1 into a 0-d tensor that no longer matches
            # the (1,) label tensor.
            logits = outputs.logits.reshape(-1).float()
            targets = labels.reshape(-1).float().to(logits.device)
            loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor.to(logits.device))
            loss = loss_fn(logits, targets)
            return (loss, outputs) if return_outputs else loss

    # Training arguments optimized for Kaggle
    training_args = TrainingArguments(
        output_dir=f"/kaggle/working/results/{level}",
        num_train_epochs=3,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=500,
        weight_decay=0.01,
        learning_rate=2e-5,
        logging_dir=f"/kaggle/working/logs/{level}",
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        gradient_checkpointing=True,
        report_to="none",
        save_total_limit=1,
        optim="adamw_torch"
    )

    trainer = WeightedBCELossTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Validation metrics as computed by compute_metrics.
    eval_results = trainer.evaluate()
    print(f"\n{level} Level Metrics:")
    print(f"Accuracy: {eval_results['eval_accuracy']:.4f}, F1-Score: {eval_results['eval_f1']:.4f}")

    # Hard 0/1 predictions: a positive logit means "change".
    predictions = trainer.predict(val_dataset)
    pred_labels = (predictions.predictions > 0).astype(int).flatten()

    print("Classification Report:")
    print(classification_report(val_dataset['labels'], pred_labels, target_names=['No Change (0)', 'Change (1)']))

    cm = plot_confusion_matrix(val_dataset['labels'], pred_labels, f'Confusion Matrix - {level} Level')

    save_predictions_to_json(pred_labels, val_problem_ids_with_offsets, output_base_path, level, 'validation')

    return model, tokenizer, pred_labels, cm

In [None]:
# Root folder containing the <level>/<split> sub-directories read by load_dataset.
dataset_dir = "../dataset"
print("Loading Dataset...")
# dataset[level][split] is a list of (sentences, changes, problem_id) tuples.
dataset = load_dataset(dataset_dir)
print("Loaded Dataset")

In [None]:
# Root folder for the per-level prediction JSON files.
output_base_path = "/kaggle/working/outputs-llama3.2"
# os.makedirs creates the folder and any missing parents; `os.mkdirs` does not
# exist and raised AttributeError.
os.makedirs(output_base_path, exist_ok=True)
# Base checkpoint to fine-tune and the token length each sentence pair is
# truncated/padded to.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
max_length = 128

In [None]:
# Accumulators filled by the per-level loop: pooled validation labels and
# predictions, plus the running confusion-matrix total (None until the first
# level has finished).
all_val_labels, all_val_pred = [], []
all_cm = None

# Difficulty levels, in processing order.
levels = ['easy', 'medium', 'hard']

In [None]:
# Start from clean accumulators so that re-running this cell does not
# double-count the results of a previous run.
all_val_labels = []
all_val_pred = []
all_cm = None

# The tokenizer does not depend on the level, so load it once rather than on
# every iteration. Reuse EOS as padding if the model defines no pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

for level in levels:
    print(f"\nProcessing {level} level...")

    train_docs = dataset[level]['train']
    val_docs = dataset[level]['validation']

    # Only the dataset is needed for training; index [0] avoids assigning to
    # `_`, which IPython uses for the last cell output.
    train_dataset = prepare_llama_data(train_docs, tokenizer, max_length)[0]
    val_dataset, val_problem_ids_with_offsets = prepare_llama_data(val_docs, tokenizer, max_length)

    # `model` and `tokenizer` are rebound on every iteration; after the loop
    # they refer to the objects returned for the last level.
    model, tokenizer, val_pred, cm = train_and_evaluate_llama(
        train_dataset, val_dataset, val_problem_ids_with_offsets, level, output_base_path, model_name
    )

    all_val_labels.extend(val_dataset['labels'])
    all_val_pred.extend(val_pred)
    # Build a new array each time so the running total never aliases (and
    # mutates in place) the matrix returned for an individual level.
    all_cm = cm.copy() if all_cm is None else all_cm + cm

In [None]:
# Report over the validation pairs of all levels pooled together.
print("\nCombined Metrics Across All Levels:", "Classification Report:", sep="\n")
print(
    classification_report(
        all_val_labels,
        all_val_pred,
        target_names=['No Change (0)', 'Change (1)'],
    )
)
plot_confusion_matrix(all_val_labels, all_val_pred, 'Combined Confusion Matrix - All Levels')

In [None]:
# Save the model and tokenizer to disk.
# NOTE(review): `model` and `tokenizer` are whatever the per-level loop
# assigned last, so only the model trained on the final level is written here;
# models from earlier levels are not saved by this cell. Confirm this is intended.
model_path = "../models"
os.makedirs(model_path, exist_ok=True)
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
print("Training and evaluation completed.")