### Installing Required Packages

In [None]:
# Install necessary libraries
# The first command passes --no-deps, so pip installs only the packages named on
# that line and does not pull in their dependencies.
# NOTE(review): `unsloth` is imported by the Configuration cell, and `nltk` and
# `sklearn` by the Evaluation Metrics cell, but none of them is installed here.
# Confirm the runtime already provides them.
!pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
# Metric libraries imported by the inference and evaluation cells.
!pip install evaluate
!pip install rouge-score
!pip install bert-score

### Configuration

In [None]:
# Import necessary libraries
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from trl import SFTTrainer

# Define configuration for the base model, LoRA, and training
config = {
    "model_config": {
        "base_model": "unsloth/llama-2-7b-chat-bnb-4bit",
        "finetuned_model": "abhilash2599/llama-2-7b-medlm-2k",
        "max_seq_length": 2048,
        "dtype": torch.float16,
        "load_in_4bit": True,
    },
    "lora_config": {
          "r": 16,
          "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj"],
          "lora_alpha":16,
          "lora_dropout":0.25,
          "bias":"none",
          "use_gradient_checkpointing":True,
          "use_rslora":False,
          "use_dora":False,
          "loftq_config":None
        },
    "training_dataset": {
        "name": "Shekswess/medical_llama2_instruct_dataset_short",
        "split": "train",
        "input_field": "prompt",
    },
    "training_config": {
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 4,
        "warmup_steps": 5,
        "max_steps": 0,
        "num_train_epochs": 1,
        "learning_rate": 2e-4,
        "fp16": not torch.cuda.is_bf16_supported(),
        "bf16": torch.cuda.is_bf16_supported(),
        "logging_steps": 1,
        "optim": "adamw_8bit",
        "weight_decay": 0.01,
        "lr_scheduler_type": "linear",
        "seed": 42,
        "output_dir": "outputs",
    },
}


### Data Preparation

In [None]:
# Load the dataset and hold out a small test portion
def prepare_data(config):
    """Load the configured dataset split and divide it into train and test parts.

    Args:
        config (dict): Notebook configuration. Reads ``training_dataset.name``,
            ``training_dataset.split`` and ``training_config.seed``.

    Returns:
        The object produced by ``train_test_split`` with 5% of the rows held out
        (``test_size=0.05``), seeded with the configured seed.
    """
    dataset_settings = config["training_dataset"]
    loaded_split = load_dataset(
        dataset_settings["name"],
        split=dataset_settings["split"],
    )
    return loaded_split.train_test_split(
        test_size=0.05,
        seed=config["training_config"]["seed"],
    )

# Prepare train and test datasets
# prepare_data passes the configured seed to the split; the two parts are read
# back from the result under the "train" and "test" keys.
data_splits = prepare_data(config)
dataset_train = data_splits["train"]
# dataset_test is used again by the "Inference on Test Data" cell.
dataset_test = data_splits["test"]

# Quick sanity check of the split sizes.
print(f"Training size: {len(dataset_train)}, Test size: {len(dataset_test)}")


### Model Loading

In [None]:
# Load the base model together with its tokenizer
def load_model(config):
    """Instantiate the base model and tokenizer described by ``config["model_config"]``.

    Args:
        config (dict): Notebook configuration. Reads ``base_model``,
            ``max_seq_length``, ``dtype`` and ``load_in_4bit`` from ``model_config``.

    Returns:
        tuple: ``(model, tokenizer)`` as returned by
        ``FastLanguageModel.from_pretrained``.
    """
    model_settings = config["model_config"]
    loaded = FastLanguageModel.from_pretrained(
        model_name=model_settings["base_model"],
        max_seq_length=model_settings["max_seq_length"],
        dtype=model_settings["dtype"],
        load_in_4bit=model_settings["load_in_4bit"],
    )
    model, tokenizer = loaded
    return model, tokenizer

# Bind the base model and tokenizer at notebook level; later cells rebind both names.
model, tokenizer = load_model(config)


### Configure LoRA

In [None]:
# Wrap the base model with LoRA adapters
def configure_lora(model, config):
    """Attach LoRA adapters to ``model`` using the settings in ``config``.

    Args:
        model: The model returned by ``load_model``.
        config (dict): Notebook configuration. Every ``lora_config`` entry listed
            below is forwarded under its own name; ``random_state`` is taken from
            ``training_config.seed``.

    Returns:
        The value returned by ``FastLanguageModel.get_peft_model``.
    """
    lora_settings = config["lora_config"]
    forwarded_keys = (
        "r",
        "target_modules",
        "lora_alpha",
        "lora_dropout",
        "bias",
        "use_gradient_checkpointing",
        "use_rslora",
        "use_dora",
        "loftq_config",
    )
    peft_kwargs = {key: lora_settings[key] for key in forwarded_keys}
    peft_kwargs["random_state"] = config["training_config"]["seed"]
    return FastLanguageModel.get_peft_model(model, **peft_kwargs)

# Apply LoRA configuration to the model
# `model` is rebound to the value returned by configure_lora, so re-running this
# cell without re-running the Model Loading cell feeds that returned model back in.
model = configure_lora(model, config)


### Training Setup

In [None]:
# Build the supervised fine-tuning trainer
def setup_trainer(model, tokenizer, train_dataset, config):
    """Create the ``SFTTrainer`` used for fine-tuning.

    Args:
        model: Model to train.
        tokenizer: Tokenizer paired with ``model``.
        train_dataset: Dataset to train on.
        config (dict): Notebook configuration. Reads ``training_dataset.input_field``,
            ``model_config.max_seq_length`` and the ``training_config`` entries
            listed below.

    Returns:
        SFTTrainer: Trainer configured with packing disabled and two dataset
        preprocessing workers.
    """
    text_field = config["training_dataset"]["input_field"]
    sequence_limit = config["model_config"]["max_seq_length"]

    # Each TrainingArguments keyword is read from the training_config entry of
    # the same name.
    training_settings = config["training_config"]
    forwarded_names = (
        "per_device_train_batch_size",
        "gradient_accumulation_steps",
        "warmup_steps",
        "max_steps",
        "num_train_epochs",
        "learning_rate",
        "fp16",
        "bf16",
        "logging_steps",
        "optim",
        "weight_decay",
        "lr_scheduler_type",
        "seed",
        "output_dir",
    )
    training_args = TrainingArguments(
        **{name: training_settings[name] for name in forwarded_names}
    )

    return SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        dataset_text_field=text_field,
        max_seq_length=sequence_limit,
        dataset_num_proc=2,
        packing=False,
        args=training_args,
    )

# Build the trainer on the training split only; the test split is kept for evaluation.
trainer = setup_trainer(model, tokenizer, dataset_train, config)


### Model Training

In [None]:
# Train the model
# trainer_stats holds whatever trainer.train() returns; the next cell writes it
# to trainer_stats.json.
trainer_stats = trainer.train()
print("Training complete. Stats:", trainer_stats)

In [None]:
# Saving the trainer stats
import json

# When the training result is a named tuple, convert it to a dict first: json
# would otherwise serialise it as a bare array and drop the field names.
stats_payload = trainer_stats._asdict() if hasattr(trainer_stats, "_asdict") else trainer_stats
with open("trainer_stats.json", "w") as f:
    json.dump(stats_payload, f, indent=4)

### Save and Publish Model

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub (with a token); the next cell pushes the model to the Hub.
notebook_login()


In [None]:
# Save and push the trained model
# The configured name is used for both the local directory and the Hub
# repository, so the two cannot drift apart.
finetuned_repo = config["model_config"]["finetuned_model"]

# Save the model and the tokenizer side by side in the local directory.
model.save_pretrained(finetuned_repo)
tokenizer.save_pretrained(finetuned_repo)

# push_to_hub has no documented `tokenizer` parameter, so the tokenizer is
# uploaded with its own call instead of being passed as a keyword.
model.push_to_hub(finetuned_repo)
tokenizer.push_to_hub(finetuned_repo)

### Sample Inference

In [None]:
# Loading the fine-tuned model and the tokenizer for inference
# This rebinds the notebook-level `model` and `tokenizer` to the fine-tuned
# checkpoint named in config["model_config"]["finetuned_model"].
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = config.get("model_config").get("finetuned_model"),
        max_seq_length = config.get("model_config").get("max_seq_length"),
        dtype = config.get("model_config").get("dtype"),
        load_in_4bit = config.get("model_config").get("load_in_4bit"),
    )

# Using FastLanguageModel for fast inference
FastLanguageModel.for_inference(model)

import time

# The timer starts after the model is loaded, so the reported time covers
# tokenization, generation, decoding and printing only.
start_time = time.time()

# Tokenizing the input and generating the output
# The prompt is a single-element batch written in the "[INST] ... [/INST]" format.
inputs = tokenizer(
[
    "[INST] Answer the question truthfully, you are a medical professional. This is the question: What is (are) Bloom syndrome ? [/INST]"
], return_tensors = "pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)
# The whole generated sequence is decoded with special tokens removed.
print(tokenizer.batch_decode(outputs, skip_special_tokens = True))

end_time = time.time()

print(f"Execution time: {end_time - start_time:.2f} seconds")

### Inference on Test Data

In [None]:
from evaluate import load
import logging

# Load the fine-tuned model and tokenizer for batch evaluation.
# This is the same call as in the Sample Inference cell and rebinds the
# notebook-level `model` and `tokenizer` again.
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = config.get("model_config").get("finetuned_model"),
        max_seq_length = config.get("model_config").get("max_seq_length"),
        dtype = config.get("model_config").get("dtype"),
        load_in_4bit = config.get("model_config").get("load_in_4bit"),
    )

def extract_relevant_input(example):
    """
    Extracts the model input from a dataset example: the ``[INST] ... [/INST]``
    span of its prompt, markers included.

    The returned slice ends at the closing ``[/INST]`` marker, so whatever follows
    it in the prompt is never part of the result. The instruction is returned
    whole even when it contains a literal backslash-n sequence; splitting on that
    sequence would cut the instruction short and drop the closing marker.

    Args:
        example (dict): A single data example with a 'prompt' field.
    Returns:
        str: The text from ``[INST]`` through ``[/INST]``.
    Raises:
        ValueError: If the prompt lacks ``[INST]`` or a following ``[/INST]``.
    """
    prompt = example["prompt"]

    # Define the markers for the format
    inst_start_marker = "[INST]"
    inst_end_marker = "[/INST]"

    # Locate the opening marker, then the first closing marker after it
    inst_start = prompt.find(inst_start_marker)
    inst_end = prompt.find(inst_end_marker, inst_start)

    if inst_start == -1 or inst_end == -1:
        raise ValueError(f"Invalid prompt format: {prompt}")

    # Keep everything from [INST] up to and including [/INST]
    return prompt[inst_start:inst_end + len(inst_end_marker)]


def extract_relevant_text(prediction):
    """
    Pull the response part out of a decoded model prediction.

    The response is taken to be everything after the first occurrence of the
    separator: ``[/INST]``, one space, then a literal backslash followed by ``n``.
    Args:
        prediction (str): Decoded prediction string from the model.
    Returns:
        str: The text after the separator with surrounding whitespace removed,
        or the whole stripped prediction when the separator is absent.
    """
    _before, separator, response = prediction.partition("[/INST] \\n")
    # An empty separator means the marker never occurred: fall back to the full text.
    selected = response if separator else prediction
    return selected.strip()

def generate_predictions_batch(prompts, batch_size=16):
    """
    Generate one cleaned response per prompt, running the model batch by batch.

    Uses the notebook-level ``model`` and ``tokenizer``. Each batch is tokenized
    with padding (prompts truncated to 512 tokens), passed to ``model.generate``
    with up to 256 new tokens, decoded without special tokens and reduced to the
    response text with ``extract_relevant_text``.

    Args:
        prompts (list[str]): Prompt strings to complete.
        batch_size (int): Number of prompts per generation call.
    Returns:
        list[str]: One response per prompt, in input order.
    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    FastLanguageModel.for_inference(model)  # Ensure the model is in inference mode
    all_predictions = []
    total_batches = -(-len(prompts) // batch_size)  # ceiling division

    eos_token_id = tokenizer.eos_token_id

    # A decoder-only model continues from the last position of every row, so
    # batched prompts have to be padded on the left; with right padding the
    # shorter prompts would be continued from pad tokens. The previous setting
    # is restored afterwards because the tokenizer is shared notebook state.
    original_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        for i in range(0, len(prompts), batch_size):
            batch_prompts = prompts[i:i + batch_size]
            inputs = tokenizer(
                batch_prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to("cuda")

            # Adjusting generation parameters to avoid repetition
            # NOTE(review): top_p / top_k / temperature only take effect when
            # sampling is enabled (do_sample=True here or in the model's
            # generation config) - confirm which applies.
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,         # Limit the output length to avoid long sequences
                eos_token_id=eos_token_id,  # Ensure generation stops at EOS token
                use_cache=True,
                no_repeat_ngram_size=2,    # Avoid repeating n-grams (e.g., repeated phrases)
                top_p=0.92,                # Use nucleus sampling for diversity
                top_k=50,                  # Limit the pool of potential tokens
                temperature=0.7            # Control randomness
            )

            batch_predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # Post-process predictions to remove redundant content
            all_predictions.extend(
                extract_relevant_text(prediction) for prediction in batch_predictions
            )

            logging.info(f"Processed batch {i // batch_size + 1}/{total_batches}")
    finally:
        tokenizer.padding_side = original_padding_side

    return all_predictions


# Build the evaluation prompts with extract_relevant_input, one per test example.
# NOTE: this rebinds `inputs`, which the Sample Inference cell used for its
# tokenized batch.
inputs = [extract_relevant_input(example) for example in dataset_test]
# Reference answers are read from each example's "output" field.
references = [example["output"] for example in dataset_test]


# Generate predictions in batches
predictions = generate_predictions_batch(inputs, batch_size=16)

In [None]:
# Display the first generated prediction for a quick visual check.
predictions[0]

In [None]:
# Display the first reference answer for comparison with the first prediction.
references[0]

### Evaluation Metrics

In [None]:
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from sklearn.metrics import precision_recall_fscore_support
import bert_score

def calculate_metrics(predictions, references):
    """
    Calculate Exact Match, Precision, Recall, F1 Score, BLEU, ROUGE, and BERT scores.

    Tokens are produced by whitespace splitting. The token-level precision, recall
    and F1 are computed from corpus-wide totals of unique tokens per text (set
    intersection), not by averaging per-example scores.

    Args:
        predictions: List of generated predictions.
        references: List of ground-truth references, aligned with ``predictions``.
    Returns:
        A dictionary containing all metrics with their average and max values.
    Raises:
        ValueError: If the two inputs differ in length or are empty. Without this
            check a length mismatch would be silently truncated by ``zip`` and an
            empty input would fail later with a division by zero.
    """
    predictions = list(predictions)
    references = list(references)
    if len(predictions) != len(references):
        raise ValueError(
            "predictions and references must have the same length "
            f"(got {len(predictions)} and {len(references)})"
        )
    if not predictions:
        raise ValueError("predictions and references must not be empty")

    pairs = list(zip(predictions, references))
    rouge_metrics = ['rouge1', 'rouge2', 'rougeL']

    # Calculate BLEU (corpus level, one reference per prediction)
    tokenized_refs_for_bleu = [[ref.split()] for ref in references]
    tokenized_preds_for_bleu = [pred.split() for pred in predictions]
    bleu = corpus_bleu(tokenized_refs_for_bleu, tokenized_preds_for_bleu)

    # Calculate ROUGE (F-measure per example, then average and max)
    rouge = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)
    rouge_scores = [rouge.score(ref, pred) for pred, ref in pairs]
    rouge_averages = {
        metric: sum(score[metric].fmeasure for score in rouge_scores) / len(rouge_scores)
        for metric in rouge_metrics
    }
    rouge_max = {
        metric: max(score[metric].fmeasure for score in rouge_scores)
        for metric in rouge_metrics
    }

    # Calculate BERT Score
    bert_precision, bert_recall, bert_f1 = bert_score.score(predictions, references, lang="en", verbose=False)

    # Calculate Precision, Recall, and F1 at token level (intersection-based)
    tokenized_predictions = [set(pred.split()) for pred in predictions]
    tokenized_references = [set(ref.split()) for ref in references]
    true_positives = sum(
        len(pred & ref) for pred, ref in zip(tokenized_predictions, tokenized_references)
    )
    predicted_tokens = sum(len(pred) for pred in tokenized_predictions)
    reference_tokens = sum(len(ref) for ref in tokenized_references)

    precision = true_positives / predicted_tokens if predicted_tokens > 0 else 0
    recall = true_positives / reference_tokens if reference_tokens > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

    # Calculate Exact Match (plain string equality, no normalisation)
    exact_matches = [1 if pred == ref else 0 for pred, ref in pairs]
    exact_match_average = sum(exact_matches) / len(exact_matches)
    exact_match_max = max(exact_matches)

    return {
        "Exact Match (Average)": exact_match_average,
        "Exact Match (Max)": exact_match_max,
        "BLEU": bleu,
        "ROUGE (Average)": rouge_averages,
        "ROUGE (Max)": rouge_max,
        "BERT Precision (Average)": bert_precision.mean().item(),
        "BERT Recall (Average)": bert_recall.mean().item(),
        "BERT F1 (Average)": bert_f1.mean().item(),
        "BERT Precision (Max)": bert_precision.max().item(),
        "BERT Recall (Max)": bert_recall.max().item(),
        "BERT F1 (Max)": bert_f1.max().item(),
        "Precision (Average)": precision,
        "Recall (Average)": recall,
        "F1 (Average)": f1,
    }

# Example usage
# Scores the test-set predictions against their reference answers.
metrics = calculate_metrics(predictions, references)

# Display results
# Each metric is printed on its own line; the ROUGE entries are dictionaries
# keyed by ROUGE variant.
for metric, value in metrics.items():
    print(f"{metric}: {value}")
