In [None]:
!pip install -q transformers datasets peft accelerate bitsandbytes

In [None]:
!huggingface-cli login #use hugging face api

In [None]:
import inspect
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

In [None]:
# Read the gene-disease JSONL file with the generic "json" loader.
dataset = load_dataset(
    "json",
    data_files="/content/gene_disease_rich.jsonl",
)

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Tokenizer of the base checkpoint (fast implementation requested).
base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    use_fast=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [None]:
from transformers import BitsAndBytesConfig, AutoModelForCausalLM

# 4-bit quantization settings handed to the model loader below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the base model with the quantization config, then pass it through
# PEFT's k-bit training preparation in a single expression.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        base_model_id,
        device_map="auto",
        quantization_config=bnb_config,
    )
)


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# LoRA settings: rank 8, alpha 16, applied to the `q_proj` and `v_proj` modules.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
# Wrap the prepared model with the adapter configuration.
model = get_peft_model(model, lora_config)

In [None]:
#Preprocess: convert to instruction-style prompt
def format_instruction(example):
    """Build the training text for one record.

    Reads the ``instruction``, ``input`` and ``output`` fields of ``example``
    and returns ``{"text": ...}`` where the three labelled sections are joined
    by single newlines.
    """
    sections = (
        f"### Instruction:\n{example['instruction']}",
        f"### Input:\n{example['input']}",
        f"### Response:\n{example['output']}",
    )
    return {"text": "\n".join(sections)}

# Add the "text" column under a new name so the raw `dataset` is left as
# loaded; re-running this cell then never re-splits an already-split dataset.
formatted_dataset = dataset.map(format_instruction)

# A fixed seed gives the same 90/10 train/eval partition on every run.
SPLIT_SEED = 42
dataset_splits = formatted_dataset["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_data = dataset_splits["train"]
eval_data = dataset_splits["test"]

Map:   0%|          | 0/15582 [00:00<?, ? examples/s]

In [None]:
#Tokenization
def tokenize_function(example):
    """Tokenize the ``text`` column, truncating to at most 512 tokens.

    No padding is applied here. Padding every row to 512 tokens forces each
    training and evaluation step to process full-length sequences regardless
    of the real text length. The ``DataCollatorForLanguageModeling`` given to
    the trainer pads each batch to its own longest sequence instead.
    """
    return tokenizer(example["text"], truncation=True, max_length=512)

# Tokenize both splits in batched mode (train first, then eval).
train_data, eval_data = (
    split.map(tokenize_function, batched=True)
    for split in (train_data, eval_data)
)

Map:   0%|          | 0/14023 [00:00<?, ? examples/s]

Map:   0%|          | 0/1559 [00:00<?, ? examples/s]

In [None]:
#Training arguments
# `evaluation_strategy` is the older name of `eval_strategy`. The packages are
# installed unpinned, so use whichever keyword the installed transformers
# version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./tinyllama-gene-disease",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-4,
    num_train_epochs=5,
    # Evaluate and checkpoint once per epoch.
    **{_eval_strategy_key: "epoch"},
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=250,
    fp16=True,
    # Keep only the most recent checkpoint on disk.
    save_total_limit=1,
    report_to="none",
)



In [None]:
# Data Collator
# mlm=False: batches are built for causal language modeling, not masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


In [None]:
#Train!
# Newer transformers releases accept the tokenizer as `processing_class` and
# warn when it is passed as `tokenizer=`. Use whichever parameter name the
# installed Trainer exposes.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,0.943,0.882674
2,0.8452,0.821481
3,0.7919,0.786514
4,0.7717,0.767553
5,0.7658,0.760781


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=4385, training_loss=0.8411354769594721, metrics={'train_runtime': 13782.9243, 'train_samples_per_second': 5.087, 'train_steps_per_second': 0.318, 'total_flos': 2.2306963458097152e+17, 'train_loss': 0.8411354769594721, 'epoch': 5.0})

In [None]:
#upload the fine tuned model in hf
from huggingface_hub import HfApi

# Build the final checkpoint path from the trainer state instead of
# hard-coding the step number, which changes with dataset size, batch size
# and number of epochs.
final_checkpoint_dir = os.path.join(
    training_args.output_dir, f"checkpoint-{trainer.state.global_step}"
)

api = HfApi(token=os.getenv("HF_TOKEN"))
api.upload_folder(
    folder_path=final_checkpoint_dir,
    repo_id="ftkd99/gen-dis_tinyllama",
    repo_type="model",
    # Optimizer/scheduler/scaler/RNG state is only needed to resume training,
    # not to load the adapter, so it is not published.
    ignore_patterns=["optimizer.pt", "scheduler.pt", "scaler.pt", "rng_state.pth"],
)


rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/4.52M [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

Upload 7 LFS files:   0%|          | 0/7 [00:00<?, ?it/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ftkd99/gen-dis_tinyllama/commit/a95bb12fa2c8606c3b7986dd132a10517d1bd86c', commit_message='Upload folder using huggingface_hub', commit_description='', oid='a95bb12fa2c8606c3b7986dd132a10517d1bd86c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ftkd99/gen-dis_tinyllama', endpoint='https://huggingface.co', repo_type='model', repo_id='ftkd99/gen-dis_tinyllama'), pr_revision=None, pr_num=None)

## Testing

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

#Load base and fine-tuned model
base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
peft_model_path = "ftkd99/gen-dis_tinyllama"  # Replace with your HF model path if needed

# Tokenizer is taken from the base checkpoint.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Base weights in fp16; device placement is left to device_map="auto".
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Load the fine-tuned adapter on top of the base model, then switch to eval mode.
print("Loading fine-tuned model...")
model = PeftModel.from_pretrained(base_model, peft_model_path)
model.eval()

In [None]:
# Generation function
def generate_response(instruction, max_new_tokens=100, input_text=""):
    """Sample an answer from the fine-tuned model for one instruction.

    The prompt uses the same section layout as the text the adapter was
    fine-tuned on (``### Instruction`` / ``### Input`` / ``### Response``
    separated by single newlines), so inference prompts match training prompts.

    Args:
        instruction: Question or task placed in the Instruction section.
        max_new_tokens: Upper bound on the number of generated tokens.
        input_text: Optional content for the Input section; empty by default.

    Returns:
        The generated continuation only, stripped of surrounding whitespace.
    """
    prompt = f"### Instruction:\n{instruction}\n### Input:\n{input_text}\n### Response:\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens, used below to cut the prompt off the output.
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "### Response:" would drop part of the answer whenever the model
    # itself emits that header again.
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


In [None]:
#Model Testing
if __name__ == "__main__":
    print("Model ready. Ask about genes or diseases.\nType 'exit' to quit.\n")
    while True:
        try:
            query = input("🧬 You: ").strip()  #emojiterra for emojis
        except (EOFError, KeyboardInterrupt):
            # Interrupting the cell (or a closed stdin) ends the chat cleanly
            # instead of leaving a traceback in the notebook output.
            break
        if not query:
            # Nothing typed: ask again rather than sending an empty instruction.
            continue
        if query.lower() in ("exit", "quit"):
            break
        response = generate_response(query)
        print(f"🧠 Model: {response}\n")


Loading tokenizer...
Loading base model...
Loading fine-tuned model...


adapter_config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/4.52M [00:00<?, ?B/s]

Model ready. Ask about genes or diseases.
Type 'exit' to quit.

🧬 You: Which gene causes Parkinson's disease?
🧠 Model: 1. PARK7
   Associated disease: Parkinson's disease
   Description: A neurodegenerative disease that is characterized by the progressive loss of dopaminergic neurons in the substantia nigra, striatum, and/or raphe nuclei and that has_material_basis_in mutations in the PARK7 gene.
2. PARK2
   Associated disease: Parkinson's

🧬 You: Which gene causes Psoriasis?
🧠 Model: Name of gene: SORL1

### Input:
## Data:

### Description:

### Associated disease: psoriasis
## Description: A chronic, inflammatory skin disease that is characterized by skin redness, thick, itchy, silvery scales, and a rough, roughened surface. It is caused by an overproduction of a type of white blood cell called T-cells.

###

🧬 You: give all the genes causing Psoriasis
🧠 Model: 1. RFXAP (ID: HP:0002666)
   associated condition: psoriasis severe (ID: HP:0001562)
   related condition: psoriasis suscep

KeyboardInterrupt: Interrupted by user

## Trying RAG on the same data to improve answer generation

In [None]:
import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Load model and tokenizer
base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
peft_model_id = "ftkd99/gen-dis_tinyllama"

# Tokenizer is taken from the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Base weights in fp16; device placement is left to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Load the fine-tuned adapter on top of the base model, then switch to eval mode.
model = PeftModel.from_pretrained(base_model, peft_model_id)
model.eval()

In [None]:
#Load JSONL as a list of dicts
def load_jsonl(file_path):
    """Read a JSON Lines file into a list.

    Args:
        file_path: Path of the file to read.

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing empty line) are skipped; passing them
        # to json.loads would raise.
        return [json.loads(line) for line in f if line.strip()]

data = load_jsonl("gene_disease_rich.jsonl") # same JSONL file name the model was fine-tuned on; path is relative to the working directory

In [None]:
#Lookup helper
def get_related_facts(query, records=None):
    """Find dataset entries that mention ``query`` (case-insensitive substring).

    For each record, if the query occurs in the instruction, the record's
    answer is collected; otherwise, if it occurs in the answer, the record's
    instruction is collected. The answer is read from the ``output`` field
    (the field used to build the training text), falling back to
    ``response`` for records that use that key.

    Args:
        query: Text to search for.
        records: Iterable of record dicts. Defaults to the module-level
            ``data`` list.

    Returns:
        De-duplicated, non-empty matches in first-seen order, with their
        original casing. Empty list for a blank query or when nothing matches.
    """
    if records is None:
        records = data

    query_lower = query.strip().lower()
    if not query_lower:
        # A blank query is a substring of every string; return nothing
        # instead of the whole dataset.
        return []

    # dict keys keep insertion order, so results are de-duplicated and stable.
    related = {}
    for item in records:
        instruction = item.get("instruction") or ""
        answer = item.get("output") or item.get("response") or ""

        if query_lower in instruction.lower():
            fact = answer
        elif query_lower in answer.lower():
            fact = instruction
        else:
            continue

        if fact.strip():
            related[fact] = None

    return list(related)

In [None]:
#Generator function with lookup enrichment
def generate_response(query, max_facts=5, max_new_tokens=150):
    """Answer ``query`` with the fine-tuned model, adding looked-up facts to the prompt.

    Args:
        query: User question; also used as the lookup key for
            ``get_related_facts``.
        max_facts: Maximum number of looked-up facts placed in the prompt, so
            a query with many matches cannot produce an arbitrarily long
            prompt. ``None`` disables the cap.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation only, stripped of surrounding whitespace.
    """
    # Drop empty facts so the prompt never carries an empty knowledge section.
    facts = [fact for fact in get_related_facts(query) if fact and fact.strip()]
    facts = facts[:max_facts]

    if facts:
        hint = ", ".join(facts)
        prompt = f"### Instruction:\n{query}\n\n### Related Knowledge: {hint}\n\n### Response:\n"  #prompt
    else:
        prompt = f"### Instruction:\n{query}\n\n### Response:\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens, used below to cut the prompt off the output.
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "### Response:" would drop part of the answer whenever the model
    # itself emits that header again.
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [None]:
# Answer generation: run one example query through the lookup-enriched generator.
query = "Which genes are associated with Setleis syndrome_Skin fibroblast"
# generate_response here is the definition from the preceding cell (the one that calls get_related_facts).
result = generate_response(query)
print("🧬 Response:\n", result)


🧬 Response:
 1. SETLEIS (ID: OMIM:145700)
   Associated disease: Setleis syndrome_Skin fibroblast
   Description: A syndrome that is characterized by the presence of a large amount of fibrous connective tissue in the skin, subcutaneous tissue, and mucosa lining the gastrointestinal tract, as well as other abnormalities.
2. SETLEIS (ID: OMIM_145700)
   Associated disease: Setleis syndrome_Skin fibroblast
   Description: A syndrome that is characterized by the presence of a large amount of
