In [1]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import json


In [2]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("‚úÖ Using device: CUDA (GPU NVIDIA)")
else:
    device = torch.device("cpu")
    print("‚ö†Ô∏è Using device: CPU ‚Äî GPU not detected")

device


‚úÖ Using device: CUDA (GPU NVIDIA)


device(type='cuda')

In [3]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"


In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Use FP16 for memory efficiency
    device_map={"": device},
)

print(f"‚úÖ Model loaded: {model_name}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


‚úÖ Model loaded: TinyLlama/TinyLlama-1.1B-Chat-v1.0


In [5]:
print("\n‚öôÔ∏è Configuring LoRA...")

lora_config = LoraConfig(
    r=16,                         # rank
    lora_alpha=32,                # scaling
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



‚öôÔ∏è Configuring LoRA...
trainable params: 12,615,680 || all params: 1,112,664,064 || trainable%: 1.1338


In [16]:
def format_prompt(example):
    question = example["question"]

    choices = {
        "A": example["opa"],
        "B": example["opb"],
        "C": example["opc"],
        "D": example["opd"]
    }

    # Convert numeric or string answer to letter
    correct_raw = example["cop"]
    if isinstance(correct_raw, int):
        index_to_letter = {0: "A", 1: "B", 2: "C", 3: "D"}
        correct = index_to_letter.get(correct_raw, None)
    else:
        correct = str(correct_raw).upper()

    # Skip skipped or bad entries
    if correct not in ["A", "B", "C", "D"]:
        return {"text": None}

    explanation = example["exp"] if example["exp"] else ""

    # Build the user instruction
    user_prompt = (
        "You are a medical assistant. "
        "Answer the following multiple-choice medical question.\n\n"
        f"Question: {question}\n"
        f"A: {choices['A']}\n"
        f"B: {choices['B']}\n"
        f"C: {choices['C']}\n"
        f"D: {choices['D']}\n\n"
        "Provide the correct answer and a brief explanation."
    )

    # Build assistant output
    assistant_output = (
        f"The correct answer is: {correct}.\n"
        f"Explanation: {explanation}"
    )

    # TinyLlama chat format
    text = f"<s>[INST] {user_prompt} [/INST] {assistant_output}</s>"

    return {"text": text}


In [17]:

print("\nüìä Loading dataset...")
dataset = load_dataset("openlifescienceai/medmcqa")



üìä Loading dataset...


In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 182822
    })
    test: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 6150
    })
    validation: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 4183
    })
})

In [19]:
formatted_dataset = dataset["train"].map(format_prompt)

formatted_dataset = formatted_dataset.filter(
    lambda x: x["text"] is not None
)

formatted_dataset[0]


Map:   0%|          | 0/182822 [00:00<?, ? examples/s]

Filter:   0%|          | 0/182822 [00:00<?, ? examples/s]

{'id': 'e9ad821a-c438-4965-9f77-760819dfa155',
 'question': 'Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma',
 'opa': 'Hyperplasia',
 'opb': 'Hyperophy',
 'opc': 'Atrophy',
 'opd': 'Dyplasia',
 'cop': 2,
 'choice_type': 'single',
 'exp': 'Chronic urethral obstruction because of urinary calculi, prostatic hyperophy, tumors, normal pregnancy, tumors, uterine prolapse or functional disorders cause hydronephrosis which by definition is used to describe dilatation of renal pelvis and calculus associated with progressive atrophy of the kidney due to obstruction to the outflow of urine Refer Robbins 7yh/9,1012,9/e. P950',
 'subject_name': 'Anatomy',
 'topic_name': 'Urinary tract',
 'text': '<s>[INST] You are a medical assistant. Answer the following multiple-choice medical question.\n\nQuestion: Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma\nA: Hyper

In [20]:
train_dataset = formatted_dataset.select(range(500))


In [21]:
print(f"Training on {len(train_dataset)} examples")


Training on 500 examples


In [22]:
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    # Labels = copy of input_ids for causal LM
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized


In [23]:
train_dataset.column_names


['id',
 'question',
 'opa',
 'opb',
 'opc',
 'opd',
 'cop',
 'choice_type',
 'exp',
 'subject_name',
 'topic_name',
 'text']

In [24]:
print("üî§ Tokenizing...")

tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

tokenized_train


üî§ Tokenizing...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})

In [25]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results-tinyllama-medmcqa",
    num_train_epochs=2,                       # Enough for 500 samples
    per_device_train_batch_size=4,            # Works on T4/L4
    gradient_accumulation_steps=4,            # Effective batch size = 16
    learning_rate=2e-4,                       # Standard for LoRA
    warmup_steps=10,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    fp16=True,                                # IMPORTANT on CUDA
    logging_dir="./logs",
    report_to="none",                         # No wandb
)


In [27]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


In [28]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    data_collator=data_collator,
)


In [29]:
print("\nüöÄ Starting training...")
print("="*60)
trainer.train()
print("="*60)
print("‚úÖ Training complete!")


Step,Training Loss
10,2.2392
20,1.7146
30,1.6127
40,1.61
50,1.5618
60,1.4812


TrainOutput(global_step=64, training_loss=1.6908948570489883, metrics={'train_runtime': 221.8612, 'train_samples_per_second': 4.507, 'train_steps_per_second': 0.288, 'total_flos': 3216777412608000.0, 'train_loss': 1.6908948570489883, 'epoch': 2.0})

In [30]:
print("\nüíæ Saving model...")

model.save_pretrained("./tinyllama_medmcqa_lora")
tokenizer.save_pretrained("./tinyllama_medmcqa_lora")

print("‚úÖ Model saved to ./tinyllama_medmcqa_lora")



üíæ Saving model...
‚úÖ Model saved to ./tinyllama_medmcqa_lora


##Step 1

In [35]:
from datasets import load_dataset

print("üì• Loading full MedMCQA dataset...")

dataset = load_dataset("openlifescienceai/medmcqa")["train"]

print("Total dataset size:", len(dataset))


üì• Loading full MedMCQA dataset...
Total dataset size: 182822


In [36]:
train_set = dataset.select(range(0, 1000))       # Used for fine-tuning
test_set  = dataset.select(range(1000, len(dataset)))  # Unseen examples


In [37]:
print("Train set size:", len(train_set))
print("Test set size:", len(test_set))

print("\nüîé First train example:")
print(train_set[0])

print("\nüîé First test example:")
print(test_set[0])


Train set size: 1000
Test set size: 181822

üîé First train example:
{'id': 'e9ad821a-c438-4965-9f77-760819dfa155', 'question': 'Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma', 'opa': 'Hyperplasia', 'opb': 'Hyperophy', 'opc': 'Atrophy', 'opd': 'Dyplasia', 'cop': 2, 'choice_type': 'single', 'exp': 'Chronic urethral obstruction because of urinary calculi, prostatic hyperophy, tumors, normal pregnancy, tumors, uterine prolapse or functional disorders cause hydronephrosis which by definition is used to describe dilatation of renal pelvis and calculus associated with progressive atrophy of the kidney due to obstruction to the outflow of urine Refer Robbins 7yh/9,1012,9/e. P950', 'subject_name': 'Anatomy', 'topic_name': 'Urinary tract'}

üîé First test example:
{'id': 'e0c2cf58-aa5c-4516-b250-79b76d99a2cc', 'question': 'Drug of choice for OCD is?', 'opa': 'Clomipramine', 'opb': 'Fluoxetine', 'opc': 'Carbamezapine', 'o

##Step 2

In [41]:
import random

# 1. Set seed for reproducibility
random.seed(42)

# 2. Randomly select 20 indices from the test set
num_test_samples = 20
selected_indices = random.sample(range(len(test_set)), num_test_samples)

# 3. Record indices and extract examples
print("üß™ Selected test indices:", selected_indices)

test_samples = [test_set[i] for i in selected_indices]

print(f"Sampled {len(test_samples)} test examples successfully.")


üß™ Selected test indices: [167621, 29184, 6556, 72097, 64196, 58513, 36579, 26868, 177392, 142964, 22790, 154794, 110604, 8331, 7811, 24561, 57314, 60990, 132475, 157815]
Sampled 20 test examples successfully.


##Step 3

In [42]:
def get_prediction(example):
    # 1. Format prompt using TinyLlama chat template
    prompt = (
        "<s>[INST] You are a medical assistant.\n\n"
        f"Question: {example['question']}\n"
        f"A: {example['opa']}\n"
        f"B: {example['opb']}\n"
        f"C: {example['opc']}\n"
        f"D: {example['opd']}\n\n"
        "Provide the correct answer and a brief explanation. [/INST]"
    )

    # 2. Tokenize and move to device (GPU)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # 3. Generate model prediction
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=50,
            temperature=0.3,
            top_p=0.9,
        )

    # 4. Decode and return only the assistant response
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)

    # Remove everything before the assistant reply
    response = decoded.split("[/INST]")[-1].strip()

    return response


##Step 5

In [43]:
import re

# Common stopwords to ignore in partial matching
STOPWORDS = {
    "the","a","and","of","to","in","for","on","with","that","is","it","as","are",
    "an","by","from","this","these","those","be","or","at","into","its","was"
}

def normalize(text):
    """Lowercase, remove punctuation, split into words."""
    return re.sub(r"[^a-zA-Z0-9 ]", "", text.lower()).split()

def partial_match(pred, truth, threshold=0.7):
    """Return True if ‚â•70% of non-stopword truth terms appear in the prediction."""
    pred_words  = [w for w in normalize(pred)  if w not in STOPWORDS]
    truth_words = [w for w in normalize(truth) if w not in STOPWORDS]

    if not truth_words:
        return False

    overlap = sum(1 for w in truth_words if w in pred_words)
    ratio = overlap / len(truth_words)

    return ratio >= threshold


def check_accuracy(prediction, example):
    """
    Compare model prediction to ground truth using exact and partial matching.
    Returns: "exact", "partial", or "wrong"
    """
    # 1. Find correct answer (A/B/C/D)
    index_to_letter = {0: "A", 1: "B", 2: "C", 3: "D"}
    correct_letter  = index_to_letter[example["cop"]]

    correct_text = example[f"op{correct_letter.lower()}"]

    # 2. Exact match : if correct letter OR exact text appears in prediction
    if correct_letter in prediction or correct_text.lower() in prediction.lower():
        return "exact"

    # 3. Partial match : semantic similarity on medical keywords
    if partial_match(prediction, correct_text):
        return "partial"

    # 4. Else ‚Üí incorrect
    return "wrong"


##Step 5

In [44]:
import time

print("\nüöÄ Starting evaluation on 20 samples...\n")
start_time = time.time()

exact = 0
partial = 0
wrong = 0

results = []

for idx, example in zip(selected_indices, test_samples):

    print(f"\n--- üß™ Test Example (index {idx}) ---")

    # 1. Extract question (truncate if long)
    question_preview = example["question"][:200]
    print(f"Question: {question_preview}...")

    # 2. Generate prediction
    prediction = get_prediction(example)
    print(f"\nüß† Model prediction:\n{prediction}\n")

    # 3. Evaluate accuracy
    acc_type = check_accuracy(prediction, example)

    if acc_type == "exact":
        print("‚úÖ Exact match")
        exact += 1
    elif acc_type == "partial":
        print("üü° Partial match")
        partial += 1
    else:
        print("‚ùå Incorrect")
        wrong += 1

    # 4. Save detailed result
    index_to_letter = {0:"A",1:"B",2:"C",3:"D"}
    correct_letter = index_to_letter[example["cop"]]
    correct_text = example[f"op{correct_letter.lower()}"]

    results.append({
        "index": idx,
        "question": example["question"],
        "prediction": prediction,
        "correct_letter": correct_letter,
        "correct_text": correct_text,
        "accuracy_type": acc_type
    })

elapsed = time.time() - start_time


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



üöÄ Starting evaluation on 20 samples...


--- üß™ Test Example (index 167621) ---
Question: Which of the following is found in the respiratory zone of the lung?...

üß† Model prediction:
The correct answer is: A.
Explanation: Ans. is 'a' i.e., Goblet cells. Ref: 10th edition, Page no. 1022. Ref: 10th

‚ùå Incorrect

--- üß™ Test Example (index 29184) ---
Question: Which of the following does not occur in starvation?...

üß† Model prediction:
The correct answer is: A.
Explanation: Ans. is 'a' i.e., Hypoglycemia. Ref: 10th edition, Page no. 1028, 1029

‚ùå Incorrect

--- üß™ Test Example (index 6556) ---
Question: A 20 month old female child is brought for routine check-up. Complete blood count (CBC) shows moderate neutropenia. Child looks healthy, eats well and within expected parameters for age and sex. Other...

üß† Model prediction:
The correct answer is: A.
Explanation: Ans. is 'a' i.e., Corticosteroid administration. Ref: 10th edition, Page 1032. Ref: 10th

‚úÖ Exact match

##Step 6

In [45]:
# Number of evaluated examples
total = len(results)

print("\n===================== üìä FINAL METRICS üìä =====================")

# Exact matches
exact_pct = (exact / total) * 100

# Partial matches
partial_pct = (partial / total) * 100

# Overall accuracy (exact + partial)
overall_accuracy = ((exact + partial) / total) * 100

# Average time per example
avg_time = elapsed / total

print(f"Total examples evaluated: {total}")
print(f"Exact matches: {exact}  ({exact_pct:.1f}%)")
print(f"Partial matches: {partial}  ({partial_pct:.1f}%)")
print(f"Incorrect predictions: {wrong}")
print(f"\nüéØ Overall accuracy: {overall_accuracy:.1f}%")
print(f"\n‚è±Ô∏è Total evaluation time: {elapsed:.2f} seconds")
print(f"‚è±Ô∏è Average time per example: {avg_time:.2f} seconds")

print("===============================================================\n")



Total examples evaluated: 20
Exact matches: 10  (50.0%)
Partial matches: 0  (0.0%)
Incorrect predictions: 10

üéØ Overall accuracy: 50.0%

‚è±Ô∏è Total evaluation time: 67.62 seconds
‚è±Ô∏è Average time per example: 3.38 seconds



##Step 7

In [46]:
print("\n‚ùå‚ùå‚ùå INCORRECT EXAMPLES ‚ùå‚ùå‚ùå")

wrong_examples = [r for r in results if r["accuracy_type"] == "wrong"]

print(f"\nTotal incorrect: {len(wrong_examples)}")

for r in wrong_examples:
    print("\n--------------------------------------------------")
    print(f"Index: {r['index']}")
    print(f"Question: {r['question']}")
    print(f"Correct answer: {r['correct_letter']} ‚Äî {r['correct_text']}")
    print(f"Model prediction:\n{r['prediction']}")



‚ùå‚ùå‚ùå INCORRECT EXAMPLES ‚ùå‚ùå‚ùå

Total incorrect: 10

--------------------------------------------------
Index: 167621
Question: Which of the following is found in the respiratory zone of the lung?
Correct answer: D ‚Äî Type I epithelial cells
Model prediction:
The correct answer is: A.
Explanation: Ans. is 'a' i.e., Goblet cells. Ref: 10th edition, Page no. 1022. Ref: 10th

--------------------------------------------------
Index: 29184
Question: Which of the following does not occur in starvation?
Correct answer: B ‚Äî Hypercholesterolemia
Model prediction:
The correct answer is: A.
Explanation: Ans. is 'a' i.e., Hypoglycemia. Ref: 10th edition, Page no. 1028, 1029

--------------------------------------------------
Index: 72097
Question: Urgent treatment of procainamide toxicity is:
Correct answer: D ‚Äî Sodium lactate
Model prediction:
The correct answer is: B.
Explanation: Procainamide is a potent inhibitor of the Na+/K+ ATPase pump. It is used in the treatment of ventricu

In [47]:
print("\n\n‚úÖ‚úÖ‚úÖ SAMPLE OF CORRECT EXAMPLES (first 5) ‚úÖ‚úÖ‚úÖ")

correct_examples = [r for r in results if r["accuracy_type"] in ["exact", "partial"]]

print(f"\nTotal correct: {len(correct_examples)}")
print("\nShowing first 5 correct examples:\n")

for r in correct_examples[:5]:
    print("\n--------------------------------------------------")
    print(f"Index: {r['index']}")
    print(f"Question: {r['question']}")
    print(f"Correct answer: {r['correct_letter']} ‚Äî {r['correct_text']}")
    print(f"Model prediction:\n{r['prediction']}")
    print(f"Match type: {r['accuracy_type']}")




‚úÖ‚úÖ‚úÖ SAMPLE OF CORRECT EXAMPLES (first 5) ‚úÖ‚úÖ‚úÖ

Total correct: 10

Showing first 5 correct examples:


--------------------------------------------------
Index: 6556
Question: A 20 month old female child is brought for routine check-up. Complete blood count (CBC) shows moderate neutropenia. Child looks healthy, eats well and within expected parameters for age and sex. Other parameters of blood count are within normal range expected for age. Family history is unremarkable. CBC after 1 and 2 weeks shows same results. Bone marrow examination is normal. Next step
Correct answer: C ‚Äî Watch and wait strategy
Model prediction:
The correct answer is: A.
Explanation: Ans. is 'a' i.e., Corticosteroid administration. Ref: 10th edition, Page 1032. Ref: 10th
Match type: exact

--------------------------------------------------
Index: 64196
Question: Which of the following agents is not used in the treatment of Diabetic Macular Edema Retinopathy-
Correct answer: D ‚Äî Tamoxifen
Model p

##Step 8

In [48]:
print("\n===================== üß† PERFORMANCE ASSESSMENT üß† =====================")

acc = overall_accuracy  # from step 6

if acc >= 80:
    assessment = (
        "üåü EXCELLENT PERFORMANCE\n"
        "Your model achieved ‚â•80% accuracy.\n"
        "‚Üí Fine-tuning was highly successful.\n"
        "‚Üí The model generalizes very well to unseen medical questions.\n"
        "‚Üí You likely chose good hyperparameters and data formatting."
    )

elif acc >= 60:
    assessment = (
        "‚úÖ GOOD PERFORMANCE\n"
        "Your model achieved between 60‚Äì79% accuracy.\n"
        "‚Üí The model learned successfully.\n"
        "‚Üí Minor improvements (more training data or more epochs) could push it higher."
    )

elif acc >= 40:
    assessment = (
        "üü° MODERATE PERFORMANCE\n"
        "Your model achieved between 40‚Äì59% accuracy.\n"
        "‚Üí This is okay, but the model may struggle with nuance.\n"
        "‚Üí Consider: training longer, using more high-quality samples, improving formatting."
    )

elif acc >= 20:
    assessment = (
        "‚ö†Ô∏è POOR PERFORMANCE\n"
        "Your model achieved between 20‚Äì39% accuracy.\n"
        "‚Üí Something is off: the dataset, LoRA config, or training duration.\n"
        "‚Üí Investigate formatting, cleaning, or using a larger dataset."
    )

else:
    assessment = (
        "‚ùå VERY POOR PERFORMANCE\n"
        "Your model achieved <20% accuracy.\n"
        "‚Üí The model likely didn't learn anything meaningful.\n"
        "‚Üí Verify: training loop, dataset format, prompt template, LoRA parameters."
    )

print(assessment)
print("=======================================================================")



üü° MODERATE PERFORMANCE
Your model achieved between 40‚Äì59% accuracy.
‚Üí This is okay, but the model may struggle with nuance.
‚Üí Consider: training longer, using more high-quality samples, improving formatting.


##Step 9

In [49]:
import json

# Build results dictionary
evaluation_output = {
    "metrics": {
        "total_evaluated": total,
        "exact_matches": exact,
        "partial_matches": partial,
        "incorrect": wrong,
        "exact_match_percentage": exact_pct,
        "partial_match_percentage": partial_pct,
        "overall_accuracy_percentage": overall_accuracy,
        "total_time_seconds": elapsed,
        "average_time_per_example_seconds": avg_time,
    },
    "selected_test_indices": selected_indices,
    "detailed_results": results
}

# Save to JSON
output_filename = "evaluation_results.json"

with open(output_filename, "w") as f:
    json.dump(evaluation_output, f, indent=4)

print(f"\nüíæ Results successfully saved to {output_filename}")



üíæ Results successfully saved to evaluation_results.json


#Part A : Model Improvement Strategies

##Question 1: Improving Model Performance
1. Increase Training Data

Change: Train on more than 1000 samples (e.g., 5k‚Äì20k).
Why: More examples ‚Üí better generalization and fewer random errors.
Trade-off: Longer training time and higher GPU usage.

2. Improve Prompt Format

Change: Use a clearer instruction template (force model to pick A/B/C/D).
Why: Reduces hallucinations and increases exact matches.
Trade-off: Must retrain the model with the new format for consistency.

3. Strengthen LoRA Fine-Tuning

Change: Increase LoRA rank or use QLoRA.
Why: Gives the model more capacity to learn medical patterns.
Trade-off: Slightly slower training and higher risk of overfitting.

##Question 2: Analyzing Failure Patterns

By looking at the incorrect predictions, several patterns appear:

1. Confusion Between Similar Options

The model often fails when two or more choices are very close (e.g. two similar drugs or diagnoses).
It seems to pick a plausible answer, but not always the most specific or guideline-consistent one.

2. Weak Multi-Step Reasoning

Errors are frequent on questions that require several reasoning steps (symptoms ‚Üí mechanism ‚Üí treatment).
The model tends to rely on surface associations (keywords) instead of fully chaining the reasoning.

3. Sensitivity to Wording and Context

When the question is long, complex, or includes subtle clinical details, the model sometimes ignores key modifiers (e.g. ‚Äúacute vs chronic‚Äù, ‚Äúchild vs adult‚Äù), leading to a wrong but superficially reasonable answer.

4. Hallucinated or Over-Explained Answers

In some failures, the model gives a confident explanation that does not match any option correctly.
This suggests that it sometimes hallucinates a generic medical answer instead of strictly choosing among A/B/C/D.

##Question 3: Data Quality vs. Quantity

Between 2000 standard-quality examples and 500 curated high-quality examples, the better choice depends on the task ‚Äî but for medical question-answering, 500 high-quality examples are usually more valuable.

Why high-quality data is better

High-quality samples have clear structure, consistent formatting, and accurate explanations, which the model can learn from reliably.

Medical reasoning is sensitive to noise; low-quality samples can introduce incorrect associations that harm performance.

Curated examples cover concepts more deliberately, improving generalization.

When quantity helps more

If the model needs broad coverage of many topics

If the dataset is clean enough and not too noisy

#Part B : Resource-Constrained Inference

##Question 4: Optimizing for Limited Resources

When deploying a model in constrained environments (low memory, low latency, edge devices), the goal is to reduce inference time and memory while keeping accuracy acceptable. A good strategy combines several techniques:

1. Quantization (4-bit or 8-bit)

Idea: Convert model weights from float16/32 to int8 or int4.
Why it helps:

Shrinks model size by 50‚Äì75%

Speeds up inference significantly
Trade-off:

Slight loss in precision, especially for complex tasks

2. Use a Smaller Base Model

Idea: Deploy TinyLlama (1.1B) or even 0.5B versions instead of larger LLaMA models.
Why it helps:

Less memory, faster inference
Trade-off:

Lower reasoning ability compared to larger models

3. Distillation

Idea: Train a smaller ‚Äústudent‚Äù model to mimic a larger ‚Äúteacher‚Äù model.
Why it helps:

Maintains much of the big model‚Äôs performance

Runs far faster on edge devices
Trade-off:

Requires extra training

4. Optimize the Prompt

Idea: Shorter, more direct prompts ‚Üí fewer tokens processed.
Why it helps:

Less computation per inference

Lower latency
Trade-off:

Model may need fine-tuning to adapt to the shorter format

5. Limit max_new_tokens

Idea: Reduce output length (e.g., 20‚Äì40 tokens).
Why it helps:

Direct reduction in compute
Trade-off:

Answers may become too short

##Question 5: Speed vs. Accuracy Trade-offs
How generation parameters affect speed, quality, and consistency

Generation parameters like temperature, top-p, top-k, and max_new_tokens directly impact both the speed of inference and the quality/consistency of model outputs. Here is how each parameter creates trade-offs:

1. max_new_tokens

Lower value ‚Üí faster inference
(the model generates fewer tokens)

Higher value ‚Üí more complete explanations
but slower and sometimes more verbose.

Trade-off:
Speed ‚ü∑ completeness. Short answers are fast but may miss details.

2. Temperature

Low temperature (0.1‚Äì0.3)

More deterministic

Higher consistency

Less creative

Fewer hallucinations

High temperature (0.7+)

More diverse answers

Higher chance of mistakes or option drift

Trade-off:
Consistency ‚ü∑ diversity.
Lower temperature improves accuracy but removes flexibility.

3. Top-p (nucleus sampling)

Low top-p (0.5‚Äì0.9)

Restricts generation to most probable tokens

Improves correctness + stability

Slight speed boost

High top-p (0.95‚Äì1.0)

More creative but less predictable

Trade-off:
Controlled reasoning ‚ü∑ broader exploration.

4. Top-k

Low top-k (10‚Äì50)

Model considers fewer tokens

Faster, more deterministic

High top-k (100‚Äì200)

More variety but slower + potentially noisier

Trade-off:
Speed ‚ü∑ linguistic richness.

#Part C : Evaluation Methodology

##Question 7: Improving Evaluation Metrics
Limitations of exact / partial match

Our current evaluation is very crude:

Exact match

Checks if the correct letter or exact text appears in the prediction.

‚ùó Can miss good answers phrased differently ‚Üí false negatives.

Partial match (70% word overlap)

Based on token overlap after removing stopwords.

‚ùó Counts ‚Äúkeyword soup‚Äù as correct even if reasoning is wrong ‚Üí false positives.

‚ùó Fails when the model is right conceptually but uses different wording (synonyms, paraphrases).

So yes, we almost certainly have both:

False negatives: answer is medically correct, but overlap < 70% or no exact letter mention.

False positives: text repeats the right words but chooses the wrong option or wrong conclusion.

What could we do better?

Score on the choice (A/B/C/D) first

Treat the correct letter as the primary signal.

Explanation quality can be evaluated separately.

‚Üí Reduces false positives where the explanation sounds right but the option is wrong.

Use semantic similarity instead of raw word overlap

Compare embeddings (e.g., with cosine similarity) between prediction and ground truth.

More robust to paraphrasing and synonyms.

‚Üí Fewer false negatives.

Separate ‚Äúanswer correctness‚Äù and ‚Äúexplanation quality‚Äù

Metric 1: did the model choose the correct option? (0/1)

Metric 2: is the explanation coherent and medically aligned? (via partial match or human review).

‚Üí Gives a more realistic view of model usefulness.

Manual review on a small subset

For 20‚Äì50 examples, check predictions by hand (or by a medical student).

Helps calibrate whether automatic metrics are too harsh or too lenient.

##Question 8: Test Set Size and Confidence
1. Small test sets give unreliable accuracy

Accuracy fluctuates a lot when testing on only 5‚Äì20 samples.
A few easy or hard questions can completely distort the result.

2. Larger test sets give more stable performance

When testing on 100+ samples, accuracy becomes more consistent and representative of the model‚Äôs true ability.

3. Why performance may drop when test size increases

A small sample may be accidentally easy.
A larger sample includes harder, more diverse questions ‚Üí accuracy decreases but becomes more realistic.

**How to improve evaluation confidence**

Use a larger test set (100‚Äì500 samples minimum)

Use stratified sampling to cover all medical topics

Average results over multiple random seeds

#Part D : Real-World deployment scenario

##Question 9: Production Considerations (Medical Assistance App)
1. Safety & Reliability

Never use the model alone ‚Üí always behind a human-in-the-loop (doctor/pharmacist validates outputs).

Add strong disclaimers: ‚ÄúNot a medical diagnosis. Always consult a physician.‚Äù

Block obvious dangerous outputs (e.g. self-medication dosages, stopping treatment) with safety filters and rules.

2. Handling Updates

Regularly retrain or refresh the model on updated guidelines (e.g. new protocols, drugs withdrawn).

Version your models (v1, v2‚Ä¶) and log which version answered which query.

Maintain an update process: new data ‚Üí validation ‚Üí staged deployment.

3. Edge Cases & Uncertainty

Detect low-confidence situations (e.g. conflicting info, rare diseases) and answer:

‚ÄúI‚Äôm not confident enough to answer. Please consult a specialist.‚Äù

Force the model to refuse out-of-scope questions (legal, financial, etc.).

Log all edge-case queries and review them regularly to improve prompts and safeguards.

4. Monitoring & Auditing

Log queries + responses (with anonymization) to detect harmful patterns.

Set up monitoring dashboards (error rates, refusal rates, flagged cases).

Allow clinicians to report incorrect or dangerous answers and feed that back into improvement.