# LoRA and QLoRA

In [1]:

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    

)
import pandas as pd

from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer


2025-08-20 22:23:47.323440: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-20 22:23:47.333630: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755721427.346016  175389 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755721427.349601  175389 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755721427.358936  175389 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

### Loading gpt2 for LoRA

In [17]:
%%time
# Load the GPT-2 checkpoint and its tokenizer for the LoRA experiments.
model_name_lora = "gpt2"
tokenizer_gpt2 = AutoTokenizer.from_pretrained(model_name_lora)

# GPT-2 doesn't have a default pad token, so we reuse its own eos_token.
# This must read from `tokenizer_gpt2`: the other `tokenizer` variable is only
# created in a later cell, so referencing it here fails on a fresh kernel.
tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token

model_lora = AutoModelForCausalLM.from_pretrained(model_name_lora)

CPU times: user 300 ms, sys: 15.1 ms, total: 315 ms
Wall time: 1.65 s


### Loading EleutherAI/pythia-160m-deduped for QLoRA

In [20]:
%%time
model_name = "EleutherAI/pythia-160m-deduped"

# Quantization settings used to load the base model in 4-bit precision.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # 4-bit NormalFloat quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # run the matmuls in bfloat16
    bnb_4bit_use_double_quant=True,  # quantize the quantization constants as well
)

# Load the base model with the quantization configuration above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # let accelerate place layers on the available devices
)
# The generation KV cache is not needed while fine-tuning.
model.config.use_cache = False
# NOTE: the former `model.config.pretraining_tp = 1` line was dropped. That
# field belongs to other architectures' configs (e.g. Llama); the GPT-NeoX
# config used by Pythia does not define it, so setting it had no effect.

# Load the matching tokenizer. `trust_remote_code` is intentionally not set:
# this checkpoint uses a stock tokenizer class, so there is no reason to opt
# in to executing code downloaded from the Hub.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token for padding as well.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # pad on the right side of the input


CPU times: user 8.08 s, sys: 3.93 s, total: 12 s
Wall time: 4.48 s


### Chat bot 

In [7]:
def chat_model(model, tokenizer, question, max_length=100):
    """
    Sample a completion for ``question``, print it, and return it.

    Args:
        model: Causal language model exposing ``eval()``, ``device`` and
            ``generate()``.
        tokenizer: The tokenizer that belongs to ``model``.
        question (str): The prompt to complete.
        max_length (int): Value forwarded to ``generate(max_length=...)``.
            Defaults to 100, the value that was previously hard-coded.

    Returns:
        str: The decoded output sequence. For the decoder-only models used in
        this notebook this is the prompt followed by the sampled continuation.
    """
    # Inference only: switch off dropout and other train-time behaviour.
    model.eval()

    # 1. Tokenize the prompt. Calling the tokenizer (rather than `.encode`)
    #    also yields the attention mask, which `generate` cannot reliably
    #    infer when the pad token is the same as the EOS token.
    encoded = tokenizer(question, return_tensors='pt')

    # Move the tensors to whichever device the model lives on.
    input_ids = encoded['input_ids'].to(model.device)
    attention_mask = encoded['attention_mask'].to(model.device)

    # 2. Generate without tracking gradients.
    with torch.no_grad():
        output_sequences = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,  # pad with EOS during generation
            do_sample=True,  # sample instead of greedy decoding
            top_p=0.95,
            top_k=50,
        )

    # 3. Decode the single returned sequence, echo it, and hand it back.
    generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    print(generated_text)
    return generated_text

In [21]:
%%time
# Open-ended completion prompt for the 4-bit Pythia model.
user_question = "The capital of France is  "
answer = chat_model(model=model, tokenizer=tokenizer, question=user_question)

The capital of France is  and has been used in the Netherlands.

The French company H. L. and their workers in the Netherlands have been developing oil extraction, for which they started the Dutch market. With this development, the company managed to diversify their services to the Dutch market while keeping the market value of their products in the Dutch. To avoid a global melt-down and increase their production, H. O. (H. J.), was sold to Bredend Energy
CPU times: user 2min 6s, sys: 216 ms, total: 2min 7s
Wall time: 31.8 s


In [18]:
%%time
# Ask the untuned GPT-2 model a general-knowledge question.
user_question = "What is the capital of France?"
answer = chat_model(model=model_lora, tokenizer=tokenizer_gpt2, question=user_question)

What is the capital of France?

The capital of France, Le Grand, is the capital of France.

So, if you want the capital of France to be the capital of France, we must ask for it in the most specific way. We need to ask that in the form of a general agreement to do so. We must make it happen.

Let's see the form that we propose to make: that we will work in a certain way with France, and that
CPU times: user 10.1 s, sys: 201 ms, total: 10.3 s
Wall time: 3.08 s


In [22]:
%%time
# Ask the untuned GPT-2 model a medical question.
user_question = "What should i do if snake bites me ?"
answer = chat_model(model=model_lora, tokenizer=tokenizer_gpt2, question=user_question)

What should i do if snake bites me ?

Do you wish to keep my house in quarantine ?

Are you not allowed to eat the snakes when you visit the quarantine ?

I'm not your friend :-( )


Anonymous said...

I don't understand why people do not think about a place to go or do the necessary things to protect it from an unknown hazard. I know your place and I do know when and why people go there or not go there.
CPU times: user 9.81 s, sys: 5.41 ms, total: 9.82 s
Wall time: 2.46 s


### Loading medical data 

In [10]:
from datasets import load_dataset

# Fetch the MedMCQA training split, then keep only the rows whose
# `topic_name` is exactly "Poisoning".
dataset = load_dataset(path="openlifescienceai/medmcqa", split="train")
poison_dataset = dataset.filter(
    lambda row: row["topic_name"] == "Poisoning"
)

In [11]:
# Materialize the filtered dataset as a DataFrame and preview the first rows.
df = pd.DataFrame(data=poison_dataset)
df.head(n=5)

Unnamed: 0,id,question,opa,opb,opc,opd,cop,choice_type,exp,subject_name,topic_name
0,99ae24e6-10c3-48b5-8c69-a8784ce4a4fc,A 6hours old snake bite patient comes to emerg...,Incision and suction,Wait and watch,Local subcutaneous antisnake venom,Intravenous antisnake venom,1,single,All patients with a history of snake bite shou...,Forensic Medicine,Poisoning
1,0ee1e184-ebb7-4f8d-8029-fc644944ce06,Garlicky odour in the gastric contents seen in...,Sulphur,Phosphorus,Iodine,Chlorine,1,multi,Postmoem appearance Esophagus and stomach show...,Anatomy,Poisoning
2,96952c56-c9f6-43f2-a638-122711492aa0,Which is not an aryl phosphate -,Parathion,TIK-20,Malathion,Paraoxon,2,single,Organophosphate poisoning is the most common p...,Forensic Medicine,Poisoning
3,ee16afe4-8042-451b-bcc8-b12f0dcf583d,Father of modern toxicology -,Paracelsus,Galton,Orfila,Guftason,2,single,orfila is known as father of modern toxicology...,Forensic Medicine,Poisoning
4,8341990b-1d42-4707-bb74-b154e0cde6d9,Copper sulfate poisoning manifests with,High anion gap acidosis,Rhabdomyolysis,Acute hemolysis,Peripheral neuropathy,2,single,COPPER Poisonous Compounds: (1) Copper sulphat...,Forensic Medicine,Poisoning


In [12]:
# Show the first question with its four options and the explanation.
print(
    f"Question:\n {df['question'].iloc[0]}"
    f"\n A. {df['opa'].iloc[0]}"
    f"\n B. {df['opb'].iloc[0]}"
    f"\n C. {df['opc'].iloc[0]}"
    f"\n D. {df['opd'].iloc[0]}"
    f"\n Explanation: {df['exp'].iloc[0]} "
)

Question:
 A 6hours old snake bite patient comes to emergency with mild local edema at the injury site. On examination no abnormalities detected and lab repos are normal. Most appropriate management is
 A. Incision and suction
 B. Wait and watch
 C. Local subcutaneous antisnake venom
 D. Intravenous antisnake venom
 Explanation: All patients with a history of snake bite should be observed for 8-12 h after the bite, if the skin is broken and the offending snake cannot be positively identified as non-poisonous. Ref: Krishnan vij ; 5th ed; Page no: 484 


In [13]:
def format_prompt(sample):
    """
    Build the instruction-style training prompt for one multiple-choice row.

    Args:
        sample: Mapping with the keys 'question', 'opa', 'opb', 'opc', 'opd'
            and 'cop'. 'cop' is the 0-based index of the correct option
            (0 -> A, 1 -> B, 2 -> C, 3 -> D), as the previewed rows show:
            e.g. cop=1 pairs with option B "Wait and watch".

    Returns:
        str: The prompt, ending with the correct letter and answer text.

    Raises:
        ValueError: If 'cop' is outside 0..3 (for example an unlabeled row),
            instead of silently wrapping around to another option.
    """
    option_keys = ('opa', 'opb', 'opc', 'opd')
    option_letters = ('A', 'B', 'C', 'D')

    # 'cop' is already 0-based; subtracting 1 would label the wrong option
    # and turn cop=0 into index -1 (option D).
    correct_option_index = sample['cop']
    if not 0 <= correct_option_index < len(option_keys):
        raise ValueError(
            f"'cop' must be between 0 and {len(option_keys) - 1}, got {correct_option_index!r}"
        )

    # The options are opa, opb, opc, opd
    options = f"A) {sample['opa']}\nB) {sample['opb']}\nC) {sample['opc']}\nD) {sample['opd']}"
    correct_option_letter = option_letters[correct_option_index]
    correct_answer = sample[option_keys[correct_option_index]]

    # Create a structured prompt
    prompt = f"""### INSTRUCTION:
You are a medical expert. Answer the following multiple-choice question by providing the letter of the correct option and the full text of the correct answer.

### QUESTION:
{sample['question']}

### OPTIONS:
{options}

### RESPONSE:
The correct answer is {correct_option_letter}) {correct_answer}"""
    return prompt

def get_text(sample):
    """Wrap the formatted prompt for ``sample`` in a dict under the "text" key."""
    formatted = format_prompt(sample)
    return dict(text=formatted)

In [14]:
# Add a "text" column holding the formatted prompt for every poisoning row.
formatted_dataset = poison_dataset.map(function=get_text)

### Pre QLoRA evaluation

### QLoRA

### Asking the QLoRA-tuned LLM the raw medical question again