In [None]:
# Replace this with your GitHub Personal Access Token (PAT)
personal_access_token = "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"

# Clone the private repo using the token
!git clone https://{personal_access_token}@github.com/AliMuhammadAsad/PAN2025-MultiAuthor-LLMs-Project.git

In [None]:
# Install the `transformers` and `torch` packages into the notebook runtime;
# both are imported by the training cell.
!pip install transformers torch


In [None]:
# Install or upgrade (-U) bitsandbytes, optimum and peft.
# `--break-system-packages` lets pip install into an externally managed Python environment.
# NOTE(review): none of these three packages is imported in the cells visible here —
# confirm they are actually needed.
!pip install -U bitsandbytes optimum peft --break-system-packages

In [None]:
import inspect
import json
import os

import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    LlamaForSequenceClassification,
    LlamaTokenizer,
    Trainer,
    TrainingArguments,
)

# Select the GPU when CUDA is available, otherwise fall back to the CPU.
# This module-level `device` is read by train_and_evaluate_llama when it moves the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset Loading Function (unchanged)
def load_dataset(base_path):
    """Read every problem under ``base_path/<level>/<split>`` into a nested dict.

    The levels are 'easy', 'medium', 'hard' and the splits 'train', 'validation';
    each of those directories must exist. Inside a split directory every file named
    ``problem-*.txt`` is paired with ``truth-<problem id>.json`` from the same directory.

    Returns:
        ``{level: {split: [(sentences, changes, problem_id), ...]}}`` where
        ``sentences`` are the stripped, non-empty lines of the text file,
        ``changes`` is the ``'changes'`` entry of the truth JSON and ``problem_id``
        is the file name up to its first dot. Documents are ordered by file name.
    """
    corpus = {}

    for difficulty in ('easy', 'medium', 'hard'):
        per_split = {}

        for part in ('train', 'validation'):
            folder = os.path.join(base_path, difficulty, part)
            problem_files = [
                name for name in sorted(os.listdir(folder))
                if name.startswith('problem-') and name.endswith('.txt')
            ]

            records = []
            for name in problem_files:
                problem_id = name.split('.')[0]

                # One sentence per line; blank lines are dropped.
                with open(os.path.join(folder, name), 'r', encoding='utf-8') as text_file:
                    stripped = (raw.strip() for raw in text_file.readlines())
                    sentences = [sentence for sentence in stripped if sentence]

                truth_path = os.path.join(folder, f'truth-{problem_id}.json')
                with open(truth_path, 'r', encoding='utf-8') as truth_file:
                    changes = json.load(truth_file)['changes']

                records.append((sentences, changes, problem_id))

            per_split[part] = records

        corpus[difficulty] = per_split

    return corpus

# Prepare Llama data function
def prepare_llama_data(documents, tokenizer, max_length=512):
    """
    Turn each pair of consecutive sentences into one tokenized prompt.

    For every document ``(sentences, changes, problem_id)`` and every index ``i``
    of ``changes``, sentences ``i`` and ``i + 1`` are embedded in a fixed
    instruction prompt, which is tokenized with truncation and padding to
    ``max_length``.

    Returns a tuple ``(inputs, labels, problem_ids_with_offsets)``:
      * ``inputs``: dict with lists ``'input_ids'`` and ``'attention_mask'``
        holding the squeezed tensors returned by the tokenizer,
      * ``labels``: the matching entries of ``changes``,
      * ``problem_ids_with_offsets``: ``(problem_id, i, global_index)`` per pair,
        where ``global_index`` is the pair's position in ``labels``.
    """
    instruction = "Given two consecutive sentences, determine if there is a change in authorship between them.\n"
    question = "Is there a change in authorship? Answer with 0 (no change) or 1 (change)."

    input_id_rows = []
    mask_rows = []
    labels = []
    problem_ids_with_offsets = []

    for sentences, changes, problem_id in documents:
        for pair_idx, label in enumerate(changes):
            first = sentences[pair_idx]
            second = sentences[pair_idx + 1]
            prompt = f"{instruction}Sentence 1: {first}\nSentence 2: {second}\n{question}"

            encoded = tokenizer(
                prompt,
                truncation=True,
                padding='max_length',
                max_length=max_length,
                return_tensors='pt',
            )

            # The tokenizer returns a batch of one; squeeze drops that batch axis.
            input_id_rows.append(encoded['input_ids'].squeeze())
            mask_rows.append(encoded['attention_mask'].squeeze())

            # len(labels) before the append is this pair's global index.
            problem_ids_with_offsets.append((problem_id, pair_idx, len(labels)))
            labels.append(label)

    inputs = {'input_ids': input_id_rows, 'attention_mask': mask_rows}
    return inputs, labels, problem_ids_with_offsets

# Custom Dataset class for Trainer compatibility (unchanged)
class CustomDataset(Dataset):
    """Map-style dataset pairing per-sample model inputs with their labels.

    ``inputs`` is a dict of equally indexed sequences (one entry per sample for
    each key) and ``labels`` a sequence with one label per sample.
    """

    def __init__(self, inputs, labels):
        self.labels = labels
        self.inputs = inputs

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx``: every input field plus ``'labels'`` as a tensor."""
        sample = {}
        for field, column in self.inputs.items():
            sample[field] = column[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Function to plot confusion matrix (unchanged)
def plot_confusion_matrix(y_true, y_pred, title):
    """Compute, display and return the binary confusion matrix.

    Args:
        y_true: Ground-truth labels (0 = no change, 1 = change).
        y_pred: Predicted labels, aligned with ``y_true``.
        title: Title shown above the heatmap.

    Returns:
        The confusion matrix computed by ``sklearn.metrics.confusion_matrix``
        for the labels ``[0, 1]`` (rows = true class, columns = predicted class).
    """
    class_names = ['No Change (0)', 'Change (1)']

    # Fix the label set explicitly: without it, data that contains a single class
    # yields a 1x1 matrix that no longer matches the two tick labels (or a 2x2
    # matrix from another call when the caller sums them).
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()
    return cm

# Save predictions to JSON (unchanged)
def save_predictions_to_json(predictions, problem_ids_with_offsets, output_base_path, level, split):
    """Write one ``solution-<problem_id>.json`` file per problem.

    Predictions are grouped by problem id in the order they arrive and stored as
    ``{"changes": [...]}`` under ``output_base_path/level`` (created if missing).
    The path of every written file is printed.

    Args:
        predictions: Iterable of predicted labels, converted with ``int``.
        problem_ids_with_offsets: ``(problem_id, index, offset)`` triples aligned
            with ``predictions``; only the problem id is used.
        output_base_path: Root directory for the output files.
        level: Sub-directory name below ``output_base_path``.
        split: Not used by this function.
    """
    target_dir = os.path.join(output_base_path, level)
    os.makedirs(target_dir, exist_ok=True)

    # Dicts keep insertion order, so files are written in first-seen problem order.
    grouped = {}
    for pred, (problem_id, _pair_idx, _offset) in zip(predictions, problem_ids_with_offsets):
        grouped.setdefault(problem_id, []).append(int(pred))

    for problem_id, changes in grouped.items():
        output_path = os.path.join(target_dir, f'solution-{problem_id}.json')
        serialized = json.dumps({"changes": changes}, indent=4)
        with open(output_path, 'w', encoding='utf-8') as handle:
            handle.write(serialized)
        print(f"Saved: {output_path}")

# Function to train and evaluate Llama model
def train_and_evaluate_llama(train_inputs, train_labels, val_inputs, val_labels, val_problem_ids_with_offsets, level, output_base_path):
    """Fine-tune Llama-3.2-1B-Instruct as a binary classifier and evaluate it.

    The model is trained for one epoch on the training pairs, then used to
    predict the validation pairs. Accuracy, F1, a classification report and a
    confusion matrix are printed/plotted, and the validation predictions are
    written to JSON via ``save_predictions_to_json``.

    Args:
        train_inputs: Dict with 'input_ids' and 'attention_mask' lists (training).
        train_labels: Labels aligned with ``train_inputs``.
        val_inputs: Dict with 'input_ids' and 'attention_mask' lists (validation).
        val_labels: Labels aligned with ``val_inputs``.
        val_problem_ids_with_offsets: ``(problem_id, index, offset)`` triples
            aligned with the validation samples.
        level: Difficulty level name, used in printed output and the output path.
        output_base_path: Root directory for the prediction JSON files.

    Returns:
        ``(model, tokenizer, val_pred, cm)``: the fine-tuned model, its tokenizer,
        the predicted validation labels and the confusion matrix.
    """
    model_name = "meta-llama/Llama-3.2-1B-Instruct"
    hf_token = os.environ.get("HF_TOKEN")  # Required for gated model access

    # AutoTokenizer resolves the tokenizer class from the checkpoint itself.
    # LlamaTokenizer is the SentencePiece-based class and is not the right loader
    # for the Llama 3.x tokenizer. The token is passed here as well because the
    # tokenizer files live in the same gated repository as the weights.
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

    # Ensure tokenizer has a padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = LlamaForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,  # Binary classification
        token=hf_token,
    )

    # The classification head pools the last non-padding token, which it locates
    # through config.pad_token_id; without it, batches larger than 1 are rejected.
    # NOTE(review): this assumes the inputs were padded with the same pad token
    # (the driver script also falls back to eos) — confirm for other callers.
    model.config.pad_token_id = tokenizer.pad_token_id

    # Move model to the correct device
    model.to(device)

    # Convert to torch datasets
    train_dataset = CustomDataset(train_inputs, train_labels)
    val_dataset = CustomDataset(val_inputs, val_labels)

    # Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
    # use whichever name the installed version accepts.
    accepted_args = inspect.signature(TrainingArguments).parameters
    strategy_arg = "eval_strategy" if "eval_strategy" in accepted_args else "evaluation_strategy"

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        logging_dir='./logs',
        # Mixed precision needs a CUDA device; fall back to full precision on CPU.
        fp16=torch.cuda.is_available(),
        **{strategy_arg: "epoch"},
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=lambda p: {'accuracy': accuracy_score(p.label_ids, p.predictions.argmax(axis=-1))}
    )

    # Start training
    trainer.train()

    # Predictions: pick the class with the highest logit for each validation pair
    val_pred = trainer.predict(val_dataset).predictions.argmax(axis=-1)

    # Metrics
    accuracy = accuracy_score(val_labels, val_pred)
    f1 = f1_score(val_labels, val_pred)

    print(f"\n{level} Level Metrics:")
    print(f"Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}")
    print("Classification Report:")
    # labels=[0, 1] keeps the report aligned with the two target names even when
    # one class is absent from both the labels and the predictions.
    print(classification_report(val_labels, val_pred, labels=[0, 1],
                                target_names=['No Change (0)', 'Change (1)']))

    # Plot confusion matrix
    cm = plot_confusion_matrix(val_labels, val_pred, f'Confusion Matrix - {level} Level')

    # Save predictions to JSON files
    save_predictions_to_json(val_pred, val_problem_ids_with_offsets, output_base_path, level, 'validation')

    return model, tokenizer, val_pred, cm

# Driver: load the corpus, fine-tune and evaluate one model per difficulty level,
# then report metrics over the validation pairs of all levels combined.

# Set up the dataset path
dataset_dir = "PAN2025-MultiAuthor-LLMs-Project/dataset"
print("Loading dataset...")
dataset = load_dataset(dataset_dir)

output_path = "../outputs_llama"

all_val_y = []
all_val_pred = []
all_cm = None
levels = ["easy", "medium", "hard"]

# The tokenizer does not depend on the level, so load it once instead of once per
# iteration. AutoTokenizer resolves the correct tokenizer class for the Llama 3.x
# checkpoint; the HF token is needed because the repository is gated.
tokenizer = AutoTokenizer.from_pretrained(
    'meta-llama/Llama-3.2-1B-Instruct',
    token=os.environ.get("HF_TOKEN"),
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

for level in levels:
    print(f"Processing {level} level...")

    train_docs = dataset[level]['train']
    val_docs = dataset[level]['validation']

    train_inputs, train_labels, _ = prepare_llama_data(train_docs, tokenizer)
    val_inputs, val_labels, val_problem_ids_with_offsets = prepare_llama_data(val_docs, tokenizer)

    # The returned tokenizer is built from the same checkpoint with the same
    # pad-token fallback, so reusing it for the next level is equivalent.
    model, tokenizer, val_pred, cm = train_and_evaluate_llama(
        train_inputs, train_labels, val_inputs, val_labels, val_problem_ids_with_offsets, level, output_path
    )

    all_val_y.extend(val_labels)
    all_val_pred.extend(val_pred)
    if all_cm is None:
        # Copy so that the in-place additions below do not modify the first
        # level's own confusion matrix.
        all_cm = cm.copy()
    else:
        all_cm += cm

print("\nCombined Metrics Across All Levels:")
print("Classification Report:")
print(classification_report(all_val_y, all_val_pred, labels=[0, 1],
                            target_names=['No Change (0)', 'Change (1)']))
plot_confusion_matrix(all_val_y, all_val_pred, 'Combined Confusion Matrix - All Levels')

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Run the repository's verifier script on the saved predictions.
# NOTE(review): the training cell writes its predictions to ../outputs_llama, while
# this command reads ../outputs_bert; `--input verifier/dataset` also differs from
# the dataset path used for training — confirm both paths.
!python PAN2025-MultiAuthor-LLMs-Project/verifier/verifier.py --output ../outputs_bert --input verifier/dataset

In [None]:
# Run the repository's evaluator script, comparing predictions with the dataset's
# truth files and writing its output to the current directory.
# NOTE(review): the training cell writes its predictions to ../outputs_llama, while
# this command reads ../outputs_bert — confirm which directory is intended.
!python PAN2025-MultiAuthor-LLMs-Project/evaluate/evaluator.py --predictions ../outputs_bert --truth PAN2025-MultiAuthor-LLMs-Project/dataset --output ./

In [None]:
# Recursively archive the cloned project directory into /content/file.zip.
!zip -r /content/file.zip /content/PAN2025-MultiAuthor-LLMs-Project/

In [None]:
# Send /content/file.zip to the browser as a download.
from google.colab import files
files.download("/content/file.zip")

In [None]:
# Recursively archive the prediction directory into /content/file.zip.
# NOTE(review): this reuses the archive name of the project zip above, so an existing
# /content/file.zip is presumably updated rather than replaced; the training cell also
# writes to ../outputs_llama, not ../outputs_bert — confirm both.
!zip -r /content/file.zip ../outputs_bert/

In [None]:
# Send /content/file.zip to the browser as a download.
from google.colab import files
files.download("/content/file.zip")