In [None]:
!pip install --upgrade transformers datasets accelerate bitsandbytes peft trl wandb

In [None]:
import os
import wandb

# Read the Weights & Biases API key from the environment rather than embedding it
# in the notebook: anything committed here is visible to every reader of the file.
# Any key that was previously stored in this cell should be treated as leaked and
# rotated in the W&B account settings.
api_key = os.getenv('WANDB_API_KEY')
if not api_key:
    raise RuntimeError(
        "WANDB_API_KEY is not set. Export it (or expose it through your notebook "
        "platform's secret store) before running this cell."
    )

# Log in to wandb using the API key
wandb.login(key=api_key)

# If failure, train on core corpus...

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
import warnings

warnings.filterwarnings("ignore")

# =====================================================================================
# Configuration for Stage 1
# =====================================================================================
base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Tokenizer used for training. The stock TinyLlama tokenizer is used here; to switch
# to the custom medical tokenizer, point this at its folder instead:
# tokenizer_path = "/kaggle/input/tokenizer-data/new_tk/new_tk"
tokenizer_path = base_model_id

# Glob matching all of the medical textbook files. The '*' is a wildcard.
corpus_data_path = "/kaggle/input/tokenizer-data/textbooks_en_jsonl/textbooks/en/*.jsonl"
# Where to save the pre-trained adapters from this stage
output_dir_stage1 = "/kaggle/working/tinyllama-medical-pretrained"

# =====================================================================================
# Step 1.1: Load Model and Tokenizer
# =====================================================================================
print("--- STAGE 1: Loading Model and Tokenizer ---")

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
# The collator needs a padding token. Reuse EOS (the same convention Stage 2 uses)
# instead of adding a new '[PAD]' token: a new token would grow the vocabulary by one,
# and the later cells reload the tokenizer without it, so the sizes would disagree.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model with 4-bit quantization for memory efficiency.
# The bare `load_in_4bit=True` keyword of from_pretrained is deprecated; the
# quantization settings are passed through a BitsAndBytesConfig instead.
from transformers import BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# Keep the embedding matrix the same size as the tokenizer vocabulary. This only
# changes anything when tokenizer_path points at a tokenizer whose size differs
# from the base model's.
model.resize_token_embeddings(len(tokenizer))
print(f"Model embedding resized to: {len(tokenizer)}")

# Configure LoRA for efficient pre-training: only the attention projections get
# trainable low-rank adapters.
lora_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()




In [None]:
# =====================================================================================
# Step 1.2: Load and Prepare the Corpus
# =====================================================================================
print("\n--- STAGE 1: Loading and Preparing the Corpus ---")

# Every .jsonl file matched by the glob is read into a single "train" split.
raw_dataset = load_dataset("json", data_files=corpus_data_path, split="train")


def _tokenize_batch(examples):
    """Tokenize the "text" column of one batch of corpus records."""
    return tokenizer(examples["text"])


# Tokenize in batches across 4 worker processes (this will take a few minutes).
# The raw "text" and "source" columns are dropped so only tokenizer outputs remain.
tokenized_dataset = raw_dataset.map(
    _tokenize_batch,
    batched=True,
    num_proc=4,
    remove_columns=["text", "source"],
)

# Group texts into blocks for efficient language model training
block_size = 1024
def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name (it must include "input_ids") to a list
            of per-document lists, as produced by a batched tokenizer call.

    Returns:
        A dict with the same keys, where each value is a list of chunks of exactly
        ``block_size`` items, plus a "labels" entry that is a copy of the
        "input_ids" list. Trailing items that do not fill a whole block are dropped.
    """
    # Imported here so the function stays self-contained when it is shipped to
    # the worker processes used by Dataset.map(num_proc=...).
    from itertools import chain

    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # growing accumulator on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of blocks.
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Re-chunk the tokenized corpus into fixed-length training blocks, batch by batch,
# using 4 worker processes.
lm_dataset = tokenized_dataset.map(
    group_texts,
    num_proc=4,
    batched=True,
)
print(f"Processed dataset with {len(lm_dataset)} blocks of size {block_size}.")


In [None]:

# =====================================================================================
# Step 1.3: Run the Pre-training
# =====================================================================================
print("\n--- STAGE 1: Starting Continued Pre-training ---")

# mlm=False selects the causal (next-token) objective rather than masked-LM.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir=output_dir_stage1,
    # --- optimisation ---
    # A lower learning rate is crucial for stable pre-training.
    learning_rate=2e-5,
    # One full pass over the large corpus is usually sufficient.
    num_train_epochs=1,
    # 2 sequences x 4 accumulation steps = 8 sequences per device per optimizer step.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    fp16=True,
    # --- book-keeping ---
    logging_steps=100,
    save_strategy="epoch",
)

trainer = Trainer(
    data_collator=data_collator,
    train_dataset=lm_dataset,
    args=training_args,
    model=model,
)

In [None]:
import time

# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments during a long run (time.time() can jump).
start_time = time.perf_counter()

# Start the training
trainer.train()

end_time = time.perf_counter()
print(f"Execution time: {end_time - start_time} seconds")

In [None]:

# Persist the Stage 1 result to disk. The trainer wraps a PEFT model, so this
# writes the "knowledge" adapters into output_dir_stage1.
trainer.save_model(output_dir=output_dir_stage1)
print(f"\n✅ STAGE 1 COMPLETE! Medical knowledge adapters saved to {output_dir_stage1}")

In [None]:
import torch
from transformers import AutoModelForCausalLM, pipeline, LlamaTokenizerFast
from peft import PeftModel
import warnings

warnings.filterwarnings("ignore")

print("--- INTERMEDIATE TEST: Loading Stage 1 Model ---")

# =====================================================================================
# Configuration for the Test
# =====================================================================================
base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Tokenizer to load. The stock TinyLlama tokenizer is used; to test with the custom
# tokenizer instead, switch to:
# tokenizer_path = "/kaggle/input/tokenizer-data/new_tk/new_tk"
tokenizer_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Path to the adapters you just created in Stage 1
stage1_adapters_path = "/kaggle/working/tinyllama-medical-pretrained"

# =====================================================================================
# Load and Merge the Stage 1 Model for Inference
# =====================================================================================
# Load the tokenizer
tokenizer = LlamaTokenizerFast.from_pretrained(tokenizer_path)

# Load the base model in 8-bit. The bare `load_in_8bit=True` keyword of
# from_pretrained is deprecated; the setting is passed via BitsAndBytesConfig.
from transformers import BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
# Keep the embedding matrix the same size as the tokenizer vocabulary
model.resize_token_embeddings(len(tokenizer))

# Load the "Knowledge Adapters" from Stage 1
model = PeftModel.from_pretrained(model, stage1_adapters_path)

# Merge the adapters into the base model to make it a standalone model
model = model.merge_and_unload()
print("✅ Stage 1 model loaded and merged successfully.")

# =====================================================================================
# Create Pipeline and Run Test
# =====================================================================================
# Text-generation pipeline around the merged Stage 1 model; each call produces at
# most 100 new tokens.
stage1_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,
)

# Sentence openers (not questions): the model is asked to continue them.
test_questions = [
    "The common symptoms of diabetic ketoacidosis include",
    "The pharmacokinetics of methotrexate are characterized by",
]

print("\n--- Testing Stage 1 Model (Medical Knowledge) ---")
divider = "-" * 50
for question_start in test_questions:
    print(divider)
    print(f"❓ PROMPT: {question_start}...")

    # This model was trained to complete text, not answer questions, so the check
    # here is whether its continuation is medically coherent.
    result = stage1_pipe(question_start)
    print("\n📚 MODEL COMPLETION:")
    print(result[0]['generated_text'])
    print(divider)

print("\nIntermediate test complete. You can now proceed to Stage 2.")

In [None]:
import torch
from datasets import load_dataset
# IMPORTANT: Import the specific LlamaTokenizerFast class
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, LlamaTokenizerFast
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer
import warnings

warnings.filterwarnings("ignore")

# =====================================================================================
# Configuration for Stage 2
# =====================================================================================
base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 🚨 DOUBLE-CHECK THIS PATH! 🚨
# If you are in a NEW session, this MUST point to your Kaggle INPUT dataset, like:
# stage1_adapters_path = "/kaggle/input/tinyllama-medical-knowledge-adapters/tinyllama-medical-pretrained"
stage1_adapters_path = "/kaggle/working/tinyllama-medical-pretrained" # Only correct if in the SAME session as Stage 1

# Tokenizer to load. The stock TinyLlama tokenizer is used; the custom one would be:
# tokenizer_path = "/kaggle/input/tokenizer-data/new_tk/new_tk"
tokenizer_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

qa_dataset_path = "/kaggle/input/tokenizer-data/sleep_stress_dataset.json"
output_dir_stage2 = "/kaggle/working/tinyllama-medical-assistant-final"

# =====================================================================================
# Step 2.1: Load, MERGE, and Prepare the Model
# =====================================================================================
print("--- STAGE 2: Loading, MERGING, and Preparing ---")

# The tokenizer is loaded with the explicit LlamaTokenizerFast class.
tokenizer = LlamaTokenizerFast.from_pretrained(tokenizer_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model in 8-bit for the merging process. The bare `load_in_8bit=True`
# keyword of from_pretrained is deprecated; the setting is passed through the
# BitsAndBytesConfig that this cell already imports.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

# Keep the embedding matrix the same size as the tokenizer vocabulary
model.resize_token_embeddings(len(tokenizer))

# Load the "Knowledge Adapters" from Stage 1
model = PeftModel.from_pretrained(model, stage1_adapters_path)
print("Successfully loaded medical knowledge adapters.")

# Fold the Stage 1 adapters into the weights so a fresh LoRA can be attached to a
# plain model (the original author notes that skipping the merge caused a RuntimeError).
model = model.merge_and_unload()
print("Adapters successfully merged into the base model.")

# Now, we apply a NEW LoRA config for the final fine-tuning stage
lora_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# =====================================================================================
# Step 2.2: Load Dataset and Configure Trainer
# =====================================================================================
print("\n--- STAGE 2: Configuring Trainer and Loading Q&A Dataset ---")
# The Q&A file is read with the JSON loader into a single "train" split.
dataset = load_dataset(
    "json",
    split="train",
    data_files=qa_dataset_path,
)

def format_prompt(sample):
    """Render one record as an "### Instruction / ### Response" training prompt.

    Args:
        sample: Mapping with an 'instruction' entry and an 'output' entry.

    Returns:
        The instruction and the expected response joined into a single string.
    """
    template = "### Instruction:\n{}\n\n### Response:\n{}"
    return template.format(sample['instruction'], sample['output'])

# Hyper-parameters for instruction fine-tuning:
# 4 sequences x 2 accumulation steps = 8 sequences per device per optimizer step.
training_args = TrainingArguments(
    output_dir=output_dir_stage2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    num_train_epochs=5,
    logging_steps=5,
    save_strategy="epoch",
    fp16=True,
)

# Create the SFTTrainer.
# `model` has already been wrapped with get_peft_model(model, lora_config), so
# `peft_config` is deliberately NOT passed again: handing the trainer a PeftModel
# together with a peft_config is redundant, and depending on the installed TRL
# version it is either silently ignored or rejected.
# NOTE(review): no tokenizer is passed to the trainer, so it resolves one itself —
# confirm that it matches `tokenizer_path` if a custom tokenizer is ever used.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    formatting_func=format_prompt,
    args=training_args,
    # max_seq_length=1024, # Re-added for best practice and memory management
)

# =====================================================================================
# Step 2.3: Run the Final Training
# =====================================================================================
print("\n--- STAGE 2: Starting Final Instruction Fine-tuning ---")
trainer.train()
# Writes the Stage 2 adapters to output_dir_stage2.
trainer.save_model(output_dir_stage2)
print(f"\n✅ STAGE 2 COMPLETE! Final medical assistant saved to {output_dir_stage2}")

In [None]:
import torch
from transformers import AutoModelForCausalLM, pipeline, LlamaTokenizerFast
from peft import PeftModel
import warnings

warnings.filterwarnings("ignore")

# =====================================================================================
# Configuration: 🚨 DOUBLE-CHECK THESE PATHS! 🚨
# =====================================================================================
# AutoTokenizer is used below but is not among this cell's imports, so a fresh
# kernel would hit a NameError. BitsAndBytesConfig replaces the deprecated bare
# `load_in_8bit=True` keyword of from_pretrained.
from transformers import AutoTokenizer, BitsAndBytesConfig

base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Tokenizer to load. The stock TinyLlama tokenizer is used; the custom one would be:
# tokenizer_path = "/kaggle/input/tokenizer-data/new_tk/new_tk"
tokenizer_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Path to the adapters from your "Continued Pre-training" run (Stage 1)
# This might be in /kaggle/working/ or /kaggle/input/ depending on your session
stage1_adapters_path = "/kaggle/working/tinyllama-medical-pretrained"

# Path to the final adapters from your "Instruction Fine-tuning" run (Stage 2)
# This might be in /kaggle/working/ or /kaggle/input/
stage2_adapters_path = "/kaggle/working/tinyllama-medical-assistant-final"


# =====================================================================================
# Function to load and merge a PEFT model for inference
# =====================================================================================
def load_and_merge_model(model_id, tokenizer_path, adapter_path, prior_adapter_paths=()):
    """Load the base model, fold LoRA adapters into it, and return it with its tokenizer.

    Args:
        model_id: Hub id or local path of the base model.
        tokenizer_path: Hub id or local path of the tokenizer; the model's embedding
            matrix is resized to this tokenizer's length.
        adapter_path: Folder of the adapter to merge last.
        prior_adapter_paths: Adapter folders to merge first, in order. Use this for
            adapters that the adapter at ``adapter_path`` was trained on top of.
            Defaults to none, in which case only ``adapter_path`` is merged.

    Returns:
        A ``(model, tokenizer)`` tuple, where ``model`` has every adapter merged in.
    """
    print(f"Loading and merging model from: {adapter_path}")
    tokenizer = LlamaTokenizerFast.from_pretrained(tokenizer_path)

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
    )
    model.resize_token_embeddings(len(tokenizer))

    # Apply and merge each adapter in training order, so every adapter sees the
    # same underlying weights it was trained against.
    for path in (*prior_adapter_paths, adapter_path):
        model = PeftModel.from_pretrained(model, path)
        model = model.merge_and_unload()

    print("✅ Model loaded and merged successfully.")
    return model, tokenizer

# =====================================================================================
# Load All Three Models
# =====================================================================================

# --- 1. Base Model ---
print("Loading the original base model...")
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, device_map="auto", torch_dtype=torch.float16)
base_tokenizer = AutoTokenizer.from_pretrained(base_model_id)
print("✅ Base model loaded.")

# --- 2. Stage 1 Model (Medical Knowledge) ---
stage1_model, stage1_tokenizer = load_and_merge_model(base_model_id, tokenizer_path, stage1_adapters_path)

# --- 3. Stage 2 Model (Medical Assistant Skill) ---
# The Stage 2 adapters were trained on top of the base model with the Stage 1
# adapters already merged in, so Stage 1 has to be merged first here as well.
# Applying the Stage 2 adapters to the raw base model would drop Stage 1 entirely.
stage2_model, stage2_tokenizer = load_and_merge_model(
    base_model_id,
    tokenizer_path,
    stage2_adapters_path,
    prior_adapter_paths=[stage1_adapters_path],
)


# =====================================================================================
# Create Text Generation Pipelines
# =====================================================================================
print("\nCreating generation pipelines...")
# One pipeline per model; each call produces at most 150 new tokens.
base_pipe = pipeline(
    "text-generation", model=base_model, tokenizer=base_tokenizer, max_new_tokens=150
)
stage1_pipe = pipeline(
    "text-generation", model=stage1_model, tokenizer=stage1_tokenizer, max_new_tokens=150
)
stage2_pipe = pipeline(
    "text-generation", model=stage2_model, tokenizer=stage2_tokenizer, max_new_tokens=150
)
print("✅ Pipelines created.")

# =====================================================================================
# Define Questions and Generate Responses
# =====================================================================================
questions = [
    "What are the common symptoms of diabetic ketoacidosis?",
    "I'm feeling very stressed and can't sleep. What steps can I take?",
    # A non-medical question to test for catastrophic forgetting
    "What is the capital of France?",
]

banner = "=" * 80
for question in questions:
    print("\n" + banner)
    print(f"❓ QUESTION: {question}")
    print(banner)

    # --- 1. Base Model Response ---
    # The base chat model is prompted with its system/user/assistant chat markup;
    # only the text after the assistant marker is shown.
    prompt_base = f"<|system|>\nYou are a helpful assistant.</s>\n<|user|>\n{question}</s>\n<|assistant|>\n"
    result_base = base_pipe(prompt_base)
    base_text = result_base[0]['generated_text']
    print("📢 BASE MODEL RESPONSE:")
    print(base_text.split("<|assistant|>")[1].strip())

    # --- 2. Stage 1 Model Response ---
    # Stage 1 learned to continue text rather than follow instructions, so the
    # question is wrapped in a textbook-style lead-in and the full output is shown.
    prompt_stage1 = f"A patient asks: {question}. A medical textbook would state:"
    result_stage1 = stage1_pipe(prompt_stage1)
    stage1_text = result_stage1[0]['generated_text']
    print("\n\n📚 STAGE 1 (MEDICAL KNOWLEDGE) RESPONSE:")
    print(stage1_text)

    # --- 3. Stage 2 Model Response ---
    # Stage 2 was trained on the Alpaca-style instruction format; only the text
    # after the response marker is shown.
    prompt_stage2 = f"### Instruction:\n{question}\n\n### Response:"
    result_stage2 = stage2_pipe(prompt_stage2)
    stage2_text = result_stage2[0]['generated_text']
    print("\n\n🩺 STAGE 2 (FINAL ASSISTANT) RESPONSE:")
    print(stage2_text.split("### Response:")[1].strip())