In [None]:
# Training stack. --no-deps installs exactly the listed packages without resolving their dependencies.
!pip install --no-deps packaging ninja einops xformers flash-attn trl peft accelerate bitsandbytes
# bert-score is imported by the BERTScore evaluation cell further down.
!pip install bert-score
# NOTE(review): bitsandbytes and peft are already in the --no-deps list above; this second
# install is the one that also pulls in their dependencies. Versions are unpinned.
!pip install bitsandbytes peft

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login; a later cell calls push_to_hub on the fine-tuned model.
notebook_login()

In [None]:
# Sanity check: confirm bitsandbytes imports and show its version before the
# 8-bit model load (BitsAndBytesConfig(load_in_8bit=True)) further down.
import bitsandbytes as bnb
print(bnb.__version__)

In [None]:
# wandb is needed because TrainingArguments below is created with report_to="wandb".
!pip install wandb

In [None]:
from transformers import (
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from datasets import load_dataset, DatasetDict
from transformers import DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig, TaskType
import torch

In [None]:
# Central configuration for the fine-tuning run. Later statements read these
# values when loading the model, tokenizing the dataset and building the
# TrainingArguments.
config = {
    "hugging_face_username":"Apurva3509",
    "model_config": {
        "base_model": "distilgpt2",  
        "finetuned_model":"distilgpt2-medical-finetuned",  # used as local save directory and as Hub repo name
        "max_seq_length": 512  # tokenizer truncation length (max_length in preprocess_function)
    },
    "training_dataset": {
        "name": "Shekswess/medical_llama3_instruct_dataset_short",
        "split": "train",
        "input_field": "prompt"  # dataset column that gets tokenized
    },
    "training_config": {
        "per_device_train_batch_size": 4,
        "gradient_accumulation_steps": 4,
        "warmup_steps": 500,
        "num_train_epochs": 5,
        "learning_rate": 2e-4,
        "fp16": True,
        "logging_steps": 1,
        "optim" :"adamw_8bit", # optimizer name
        "weight_decay" : 0.01,  # weight decay coefficient
        "lr_scheduler_type": "linear", # learning rate scheduler
        "seed" : 42, # random seed
        "output_dir": "./medical_model_output"  # Trainer output directory; the model is also saved here after training
    }
}

# Checkpoint id comes from the shared config so model and tokenizer always match.
model_name = config["model_config"]["base_model"]

# Request int8 weights from bitsandbytes when the checkpoint is loaded.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Batched tokenization below pads, so a pad token must exist; fall back to EOS
# when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    """Tokenize the configured text column for a batch of dataset rows.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(..., batched=True)``;
            must contain the column named by
            ``config["training_dataset"]["input_field"]``.

    Returns:
        The tokenizer output for that column, padded within the batch and
        truncated to ``config["model_config"]["max_seq_length"]`` tokens.
    """
    text_column = config["training_dataset"]["input_field"]
    token_limit = config["model_config"]["max_seq_length"]
    encoded = tokenizer(
        examples[text_column],
        padding=True,
        truncation=True,
        max_length=token_limit,
    )
    return encoded

# Download the dataset named in the config.
dataset = load_dataset(config["training_dataset"]["name"])

# Shuffle the configured split ONCE with the configured seed and carve both
# subsets out of the same permutation. The original shuffled twice with a
# hard-coded seed; that only stayed disjoint because both shuffles happened to
# use the same seed. With seed 42 / split "train" the rows are unchanged.
shuffled_split = dataset[config["training_dataset"]["split"]].shuffle(
    seed=config["training_config"]["seed"]
)

dataset = DatasetDict({
    "train": shuffled_split.select(range(1800)),             # first 1800 shuffled rows
    "validation": shuffled_split.select(range(1800, 2000))   # next 200 shuffled rows
})

# Tokenize both subsets in batches.
tokenized_train = dataset["train"].map(preprocess_function, batched=True)
tokenized_eval = dataset["validation"].map(preprocess_function, batched=True)

# Collator that batches the tokenized features for the Trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# LoRA adapter configuration.
# target_modules is matched by exact name or by name suffix, so "c_attn"
# selects the attention projection of EVERY transformer block. The previous
# value "transformer.h.0.attn.c_attn" matched block 0 only, leaving all other
# blocks without an adapter.
lora_config = LoraConfig(
    r=8,  # rank of the LoRA update matrices
    lora_alpha=32,  # scaling factor applied to the LoRA update
    target_modules=["c_attn"],  # attention projection in all DistilGPT2 blocks
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM
)

# Wrap the base model with the adapters. Note that `model` is rebound, so
# re-running this statement would wrap an already wrapped model.
model = get_peft_model(model, lora_config)

class CustomTrainer(Trainer):
    """Trainer that builds causal-LM labels from ``input_ids`` on the fly.

    The tokenized dataset carries no ``labels`` column, so the labels are
    derived from the inputs inside ``compute_loss``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the language-modeling loss for one batch.

        Args:
            model: Model to run the forward pass on.
            inputs: Batch mapping containing ``input_ids`` and, normally,
                ``attention_mask``. A ``labels`` entry is written into it.
            return_outputs: When True, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for Trainer API compatibility; unused.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        # Clone so that masking the labels never modifies the model inputs.
        labels = inputs["input_ids"].clone()

        # The pad token is the EOS token and sequences are padded at
        # tokenization time, so padding cannot be recognised by token id.
        # Use the attention mask and set padded positions to -100, the label
        # value the loss ignores; otherwise the model is trained to emit padding.
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            labels = labels.masked_fill(attention_mask == 0, -100)

        inputs["labels"] = labels
        outputs = model(**inputs)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

# Trainer hyper-parameters. Everything declared under config["training_config"]
# is forwarded; optim, weight_decay, lr_scheduler_type and seed used to be
# declared there but silently dropped, so training ran with library defaults.
training_args = TrainingArguments(
    output_dir=config["training_config"]["output_dir"],
    per_device_train_batch_size=config["training_config"]["per_device_train_batch_size"],
    gradient_accumulation_steps=config["training_config"]["gradient_accumulation_steps"],
    warmup_steps=config["training_config"]["warmup_steps"],
    num_train_epochs=config["training_config"]["num_train_epochs"],
    learning_rate=config["training_config"]["learning_rate"],
    fp16=config["training_config"]["fp16"],
    optim=config["training_config"]["optim"],
    weight_decay=config["training_config"]["weight_decay"],
    lr_scheduler_type=config["training_config"]["lr_scheduler_type"],
    seed=config["training_config"]["seed"],
    logging_dir="./logs",
    logging_steps=config["training_config"]["logging_steps"],
    save_steps=100,  # checkpoint every 100 optimizer steps
    save_total_limit=2,  # keep only the two most recent checkpoints
    eval_strategy="steps",
    eval_steps=100,  # evaluate on the same cadence as checkpointing
    push_to_hub=False,  # uploading is done explicitly in a later cell
    run_name="medical_model_finetuning",
    report_to="wandb"
)

# Wire the LoRA-wrapped model, the tokenized splits and the collator into the
# custom trainer defined above.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)

# Run fine-tuning with the arguments configured above.
trainer.train()

# Save the trained model into the same directory the Trainer uses as output_dir.
model.save_pretrained(config["training_config"]["output_dir"])
print(f"Model saved to {config['training_config']['output_dir']}")

In [None]:
# Save and publish the fine-tuned model together with its tokenizer.
# push_to_hub() on the model has no `tokenizer` parameter, so the previous
# `tokenizer=tokenizer` keyword did not upload the tokenizer; it is now saved
# and pushed explicitly so the Hub repo can be loaded on its own.
model.save_pretrained(config["model_config"]["finetuned_model"])
tokenizer.save_pretrained(config["model_config"]["finetuned_model"])
model.push_to_hub(config["model_config"]["finetuned_model"])
tokenizer.push_to_hub(config["model_config"]["finetuned_model"])
print(f"Model saved and pushed to Hugging Face Hub: {config['model_config']['finetuned_model']}")


In [None]:
# Free-text clinical vignette used as the prompt for every generation and
# benchmark call in the cells below.
complex_input = """
Patient Information:
A 45-year-old male presents with persistent cough, weight loss, fatigue, and night sweats.
History of smoking 20 pack-years. No fever reported.

Additional Context:
Recently returned from a region with high tuberculosis prevalence.
Basic blood work shows elevated ESR but normal WBC count.

Question:
What are the potential diagnoses, reasons for them, and suggested next steps?
"""

# trainer.train() leaves the model in training mode, which keeps the LoRA
# dropout active; switch to eval mode so generation is not perturbed by it.
model.eval()

# Put the prompt on whatever device the model lives on instead of assuming "cuda".
inputs = tokenizer([complex_input], return_tensors="pt").to(model.device)

response = model.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=True,  # sampling, not greedy/beam search
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,  # discourage repeated tokens
    no_repeat_ngram_size=3,  # forbid repeating any 3-gram
    # early_stopping was removed: it is a beam-search option and does not
    # apply to single-beam sampling.
    pad_token_id=tokenizer.pad_token_id,  # explicit pad id (set to EOS earlier when missing)
)
# NOTE(review): response[0] is decoded as a whole; if generate() returns the
# prompt tokens followed by the continuation, decoded_response contains the
# prompt as well, which the metric cells below then score too. Confirm.
decoded_response = tokenizer.decode(response[0], skip_special_tokens=True)
print(f"Generated Response:\n{decoded_response}")


In [None]:
from bert_score import score

# Single hand-written reference sentence for the vignette.
reference_responses = ["The symptoms suggest tuberculosis due to the patient's travel history and elevated ESR."]
# Score the one generated answer against the one reference; P, R, F1 each hold one value per pair.
# NOTE(review): decoded_response is the full decoded generate() output; if it still
# contains the prompt text, the prompt is being scored too. Confirm.
P, R, F1 = score([decoded_response], reference_responses, lang="en", verbose=True)

print(f"BERTScore - Precision: {P.mean().item()}, Recall: {R.mean().item()}, F1: {F1.mean().item()}")


In [None]:
# Install the packages used for the ROUGE and BLEU metrics below.
!pip install rouge-score nltk
import nltk
# NOTE(review): the BLEU code in this cell tokenizes with str.split(), so the
# 'punkt' tokenizer data does not appear to be used by the visible code. Confirm.
nltk.download('punkt')

# Import required libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import numpy as np

def evaluate_model_output(model_output, reference_text):
    """Score a generated answer against a reference text with ROUGE and BLEU.

    Args:
        model_output: Generated text (the candidate).
        reference_text: Reference text to compare against.

    Returns:
        dict with the keys 'rouge1_f', 'rouge2_f', 'rougeL_f' (ROUGE
        F-measures, computed with stemming) and 'bleu' (sentence-level BLEU
        over whitespace-separated tokens).
    """
    # Imported here so the function does not depend on an extra top-of-cell import.
    from nltk.translate.bleu_score import SmoothingFunction

    # ROUGE: score(target, prediction)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_text, model_output)

    # BLEU on whitespace tokens. Without smoothing, a candidate that shares no
    # 4-gram (or 3-/2-gram) with the reference scores ~0 regardless of its
    # unigram overlap, which makes free-text answers indistinguishable.
    # method1 only adds a small epsilon to n-gram orders with zero matches, so
    # candidates with full n-gram overlap score exactly as before.
    reference_tokens = [reference_text.split()]
    candidate_tokens = model_output.split()
    bleu = sentence_bleu(
        reference_tokens,
        candidate_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

    return {
        'rouge1_f': scores['rouge1'].fmeasure,
        'rouge2_f': scores['rouge2'].fmeasure,
        'rougeL_f': scores['rougeL'].fmeasure,
        'bleu': bleu
    }

# Hand-written reference answer for the vignette in `complex_input`; the
# ROUGE/BLEU comparisons below score model outputs against this text.
reference_answer = """
Based on the presented symptoms and context, here are the potential diagnoses:
1. Pulmonary Tuberculosis (TB):
- Persistent cough
- Weight loss
- Night sweats
- Recent travel to TB-endemic area
- Elevated ESR

2. Lung Cancer:
- Smoking history
- Persistent cough
- Weight loss
- Fatigue

Next steps should include:
1. Chest X-ray
2. Sputum culture for TB
3. Quantiferon TB Gold test
4. CT chest if needed
5. Referral to pulmonologist
"""

# Metrics for the fine-tuned DistilGPT2 answer generated earlier.
distilgpt2_metrics = evaluate_model_output(decoded_response, reference_answer)

# Fetch the Llama checkpoint (half-precision compute, automatic device placement)
# together with its tokenizer.
llama_model = AutoModelForCausalLM.from_pretrained(
    "Apurva3509/llama-3-8b-Instruct-bnb-4bit-medical",
    torch_dtype=torch.float16,
    device_map="auto",
)
llama_tokenizer = AutoTokenizer.from_pretrained("Apurva3509/llama-3-8b-Instruct-bnb-4bit-medical")

# Same prompt, sampled decoding.
llama_inputs = llama_tokenizer([complex_input], return_tensors="pt").to("cuda")
llama_response = llama_model.generate(
    **llama_inputs,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
)
llama_output = llama_tokenizer.decode(llama_response[0], skip_special_tokens=True)

# Metrics for the Llama answer, computed the same way.
llama_metrics = evaluate_model_output(llama_output, reference_answer)

# Side-by-side metric table in markdown syntax.
print("\n## Model Comparison Results\n")
print("| Metric | DistilGPT2 | Llama |\n|---------|------------|--------|")
for metric in ('rouge1_f', 'rouge2_f', 'rougeL_f', 'bleu'):
    distil_value = distilgpt2_metrics[metric]
    llama_value = llama_metrics[metric]
    print(f"| {metric} | {distil_value:.4f} | {llama_value:.4f} |")

# Raw generations, so the numbers can be read against the actual text.
print("\n## Model Outputs\n")
print("### DistilGPT2 Output:")
print(decoded_response)
print("\n### Llama Output:")
print(llama_output)

In [None]:
import torch
import time
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List, Dict
import psutil
import gc

class ModelEvaluator:
    """Benchmark helper for a set of causal-LM ``(model, tokenizer)`` pairs.

    Measures generation latency and memory usage at different ``max_length``
    settings and compares generated text against a reference with ROUGE.
    """

    def __init__(self, models: Dict[str, tuple]):
        """Store the models to evaluate.

        Args:
            models: Mapping of display name -> ``(model, tokenizer)``.
        """
        self.models = models  # Dictionary of (model, tokenizer) pairs
        self.results = {}

    @staticmethod
    def _input_device(model):
        """Return the device prompt tensors should be moved to for ``model``."""
        # Follow the model's own placement; fall back to the previous
        # hard-coded "cuda" for objects that expose no `device` attribute.
        return getattr(model, "device", "cuda")

    @staticmethod
    def _reset_gpu_peak():
        """Reset the CUDA peak-memory counter (no-op without CUDA)."""
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()

    def measure_memory_usage(self, model_name: str):
        """Measure GPU and CPU memory usage.

        Args:
            model_name: Accepted for API symmetry; not used in the measurement.

        Returns:
            dict with 'gpu_memory_mb' (peak CUDA memory allocated since the
            last peak reset, 0.0 when CUDA is unavailable) and 'cpu_memory_mb'
            (resident set size of the current process), both in MB.
        """
        gc.collect()

        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gpu_memory = torch.cuda.max_memory_allocated() / 1024**2  # MB
        else:
            gpu_memory = 0.0
        cpu_memory = psutil.Process().memory_info().rss / 1024**2  # MB

        return {
            'gpu_memory_mb': gpu_memory,
            'cpu_memory_mb': cpu_memory
        }

    def measure_inference_time(self, model, tokenizer, input_text: str, max_length: int):
        """Time one sampled ``generate`` call.

        Args:
            model: Model exposing ``generate``.
            tokenizer: Matching tokenizer (callable, with ``decode``).
            input_text: Prompt text.
            max_length: Passed to ``generate`` as ``max_length``.

        Returns:
            dict with 'inference_time' (seconds) and 'output' (decoded text of
            the first returned sequence).
        """
        inputs = tokenizer([input_text], return_tensors="pt").to(self._input_device(model))
        cuda_ready = torch.cuda.is_available()

        # CUDA kernels run asynchronously: synchronize on both sides of the
        # timed region so the interval covers exactly the generate() work.
        if cuda_ready:
            torch.cuda.synchronize()
        start_time = time.perf_counter()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                do_sample=True,
                temperature=0.7
            )
        if cuda_ready:
            torch.cuda.synchronize()
        end_time = time.perf_counter()

        return {
            'inference_time': end_time - start_time,
            'output': tokenizer.decode(outputs[0], skip_special_tokens=True)
        }

    def evaluate_sequence_length_impact(self, input_text: str, lengths: List[int]):
        """Test impact of different sequence lengths.

        Args:
            input_text: Prompt text used for every run.
            lengths: ``max_length`` values to test.

        Returns:
            Nested dict ``{model_name: {length: metrics}}`` where ``metrics``
            merges the results of ``measure_inference_time`` and
            ``measure_memory_usage``.
        """
        results = {}

        for model_name, (model, tokenizer) in self.models.items():
            results[model_name] = {}

            for length in lengths:
                # Start each run from a fresh peak counter; without the reset
                # every row would report the same process-wide maximum.
                self._reset_gpu_peak()
                inference_results = self.measure_inference_time(model, tokenizer, input_text, length)
                memory_results = self.measure_memory_usage(model_name)

                results[model_name][length] = {
                    **inference_results,
                    **memory_results
                }

        return results

    def compare_model_outputs(self, input_text: str, reference_text: str):
        """Compare model outputs with reference text.

        Args:
            input_text: Prompt text given to every model.
            reference_text: Reference answer for ROUGE scoring.

        Returns:
            dict ``{model_name: {'rouge1_f', 'rouge2_f', 'rougeL_f'}}``.
        """
        from rouge_score import rouge_scorer
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

        results = {}
        for model_name, (model, tokenizer) in self.models.items():
            # Generate output on the device the model actually lives on
            inputs = tokenizer([input_text], return_tensors="pt").to(self._input_device(model))
            outputs = model.generate(
                **inputs,
                max_length=512,
                do_sample=True,
                temperature=0.7
            )
            output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Calculate ROUGE scores
            scores = scorer.score(reference_text, output_text)

            results[model_name] = {
                'rouge1_f': scores['rouge1'].fmeasure,
                'rouge2_f': scores['rouge2'].fmeasure,
                'rougeL_f': scores['rougeL'].fmeasure
            }

        return results

def run_evaluation(llama=None):
    """Benchmark the fine-tuned DistilGPT2 model against the Llama model.

    Runs the sequence-length benchmark and the ROUGE comparison with
    ``ModelEvaluator`` and prints both result tables in markdown syntax.
    Reads ``model``, ``tokenizer``, ``complex_input`` and ``reference_answer``
    from the notebook namespace.

    Args:
        llama: Optional ``(model, tokenizer)`` pair for the Llama model. When
            omitted, the checkpoint is loaded here. Passing an already loaded
            pair avoids keeping a second copy of the 8B model in memory.
    """
    if llama is None:
        llama = (
            AutoModelForCausalLM.from_pretrained(
                "Apurva3509/llama-3-8b-Instruct-bnb-4bit-medical",
                device_map="auto",
                torch_dtype=torch.float16
            ),
            AutoTokenizer.from_pretrained(
                "Apurva3509/llama-3-8b-Instruct-bnb-4bit-medical"
            )
        )

    models = {
        'distilgpt2': (model, tokenizer),  # fine-tuned model from the training cell
        'llama': llama
    }

    evaluator = ModelEvaluator(models)

    sequence_lengths = [128, 256, 512]
    seq_length_results = evaluator.evaluate_sequence_length_impact(
        complex_input,  # medical prompt defined earlier
        sequence_lengths
    )

    output_comparison = evaluator.compare_model_outputs(
        complex_input,
        reference_answer  # reference text defined earlier
    )

    print("\n## Sequence Length Impact Results")
    print("\n| Model | Length | Inference Time (s) | GPU Memory (MB) | CPU Memory (MB) |")
    print("|--------|---------|-------------------|-----------------|-----------------|")
    for model_name, lengths in seq_length_results.items():
        for length, metrics in lengths.items():
            print(f"| {model_name} | {length} | {metrics['inference_time']:.3f} | {metrics['gpu_memory_mb']:.1f} | {metrics['cpu_memory_mb']:.1f} |")

    print("\n## Model Output Comparison")
    print("\n| Model | ROUGE-1 F1 | ROUGE-2 F1 | ROUGE-L F1 |")
    print("|--------|------------|------------|------------|")
    for model_name, metrics in output_comparison.items():
        print(f"| {model_name} | {metrics['rouge1_f']:.3f} | {metrics['rouge2_f']:.3f} | {metrics['rougeL_f']:.3f} |")

# Reuse the Llama model/tokenizer loaded in the previous cell instead of
# loading the checkpoint a second time.
run_evaluation(llama=(llama_model, llama_tokenizer))

In [None]:
def load_models(llama_repo="Apurva3509/llama-3-8b-Instruct-bnb-4bit-medical", max_memory=None):
    """Load models with proper quantization and device mapping.

    The fine-tuned DistilGPT2 pair is taken from the notebook namespace
    (``model``, ``tokenizer``). The Llama model is loaded in 4-bit with CPU
    offload enabled; if that fails, the error is printed and only DistilGPT2
    is returned.

    Args:
        llama_repo: Hub repository id of the Llama checkpoint.
        max_memory: Per-device memory budget passed to ``from_pretrained``.
            Defaults to ``{0: "4GiB", "cpu": "16GiB"}``; adjust to the
            available GPU memory and RAM.

    Returns:
        dict mapping model name -> ``(model, tokenizer)``; always contains
        'distilgpt2', and 'llama' when loading succeeded.
    """
    models = {}

    models['distilgpt2'] = (model, tokenizer)  # existing fine-tuned model

    # The unused `init_empty_weights` import was dropped.
    from transformers import BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        llm_int8_enable_fp32_cpu_offload=True  # Enable CPU offload
    )

    if max_memory is None:
        max_memory = {
            0: "4GiB",  # GPU 0 budget
            "cpu": "16GiB"  # host RAM budget
        }

    try:
        llama_model = AutoModelForCausalLM.from_pretrained(
            llama_repo,
            quantization_config=bnb_config,
            device_map="auto",
            max_memory=max_memory,
            torch_dtype=torch.float16,
            offload_folder="offload_folder"  # Temporary directory for weight offloading
        )

        llama_tokenizer = AutoTokenizer.from_pretrained(llama_repo)

        models['llama'] = (llama_model, llama_tokenizer)

    except Exception as e:
        # Deliberate best-effort: the evaluation can still run on DistilGPT2 alone.
        print(f"Error loading Llama model: {str(e)}")
        print("Falling back to DistilGPT2 only")

    return models

def main():
    """Load the models, run the evaluator on two test vignettes and plot the results.

    Prints the CUDA memory allocated before and after the evaluation and the
    process RSS afterwards. Any exception is reported with a short message and
    then re-raised.

    NOTE(review): `MedicalModelEvaluator` and `reference_answers` are not defined
    anywhere in the visible notebook (the visible class is `ModelEvaluator`, the
    visible variable is `reference_answer`), and the visible class has no
    `evaluate_comprehensive` / `plot_results` methods. Unless they come from a
    cell that is not shown, this function fails with NameError on a fresh
    kernel. Confirm where they are defined.
    """
    try:
        models = load_models()
        # NOTE(review): MedicalModelEvaluator is not defined in the visible cells.
        evaluator = MedicalModelEvaluator(models)

        # Two free-text vignettes used as evaluation prompts.
        test_cases = [
            """
            Patient Information:
            45-year-old male with persistent cough, weight loss, and night sweats.
            History of smoking 20 pack-years. No fever reported.
            Question: What are the potential diagnoses and next steps?
            """,
            """
            Patient Information:
            32-year-old female with sudden onset chest pain, shortness of breath.
            No prior medical history. Pain worse with deep breathing.
            Question: What is the differential diagnosis?
            """
        ]

        import psutil
        import gc

        # Free unreferenced objects and cached CUDA blocks before taking the baseline reading.
        gc.collect()
        torch.cuda.empty_cache()

        print(f"Initial GPU memory allocated: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
        # NOTE(review): reference_answers (plural) is not defined in the visible cells.
        results = evaluator.evaluate_comprehensive(test_cases, reference_answers)

        print(f"Final GPU memory allocated: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
        print(f"CPU Memory usage: {psutil.Process().memory_info().rss/1024**2:.2f} MB")

        evaluator.plot_results(results)

    except Exception as e:
        # Report, then re-raise so the failing cell still shows the traceback.
        print(f"Evaluation failed: {str(e)}")
        raise

# In a notebook kernel __name__ is "__main__", so this runs when the cell executes.
if __name__ == "__main__":
    main()