In [None]:
# Log in to the Hugging Face Hub up front: later cells download from and push to Hub repositories.
!huggingface-cli login

# ContactDoctor Bio Medical LLM

In [None]:
!pip install -q transformers accelerate bitsandbytes gradio

In [None]:
!pip install peft

In [None]:
!pip install -q datasets

### Define an example training record (the full training set is loaded from a JSON file in the training cell)

In [None]:
from datasets import Dataset

# One example prompt/response pair in the format the model is meant to produce.
# The text is written as adjacent string literals (joined by Python at compile
# time) so that no literal contains a raw line break; every "\n" below is an
# explicit escape that is part of the training text.
data = [
    {
        "input": (
            "i have a data\n"
            "Total Steps: 75000\n"
            "Average Zone Minutes: 90\n"
            "Average Heart Rate: 72 bpm\n"
            "food consumption of the week: varied, but often feel bloated\n"
            "Sleep activity for the week: 7 hours avg, generally good\n"
            "Physical activities performed for the week: walking, some yoga\n\n"
            "Please provide:\n"
            "1. Food Recommendation\n"
            "2. Physical Exercise\n"
            "3. Social Wellbeing Recommendation\n"
            "4. Overall Suggestion"
        ),
        "output": (
            "1. Food Recommendation:\n"
            "- Your **varied** diet is a good foundation, but the symptom of often feeling **bloated** suggests a need to investigate specific food sensitivities or eating habits.\n"
            "- **Keep a Food Diary:** For a week or two, track what you eat and when you experience bloating. This can help identify trigger foods.\n"
            "- **Slow Down Eating:** Eat slowly and mindfully, chewing your food thoroughly. Rushing can lead to swallowing air and bloating.\n"
            "- **Limit Gas-Producing Foods (initially):** Temporarily reduce intake of common culprits like beans, lentils, broccoli, cabbage, and carbonated drinks. Reintroduce them slowly to see what you tolerate.\n"
            "- **Consider Probiotics:** Incorporate fermented foods (yogurt, kefir, sauerkraut) or a probiotic supplement to support gut health.\n"
            "- **Stay Hydrated:** Adequate water intake aids digestion and can prevent constipation, a cause of bloating.\n\n"
            "2. Physical Exercise:\n"
            "- Your **Total Steps: 75000** and **Average Zone Minutes: 90** indicate a good level of moderate activity (walking, some yoga). "
            "Your **Average Heart Rate of 72 bpm** is healthy.\n"
            "- **Increase Intensity Gradually:** While consistent, consider adding short bursts of more intense activity within your walks to boost your cardiovascular fitness further and increase your **Zone Minutes**.\n"
            "- **Incorporate Strength Training:** Adding 2-3 sessions of strength training per week will build muscle, improve metabolism, and support bone health, complementing your current activities.\n"
            "- **Explore Different Yoga Styles:** If you're comfortable, try different yoga styles (e.g., Vinyasa, Power Yoga) to challenge your body in new ways and potentially aid digestion through specific poses.\n\n"
            "3. Social Wellbeing Recommendation:\n"
            "- Your data doesn't directly indicate social wellbeing issues, and your **7 hours avg, generally good** sleep is a positive sign for overall energy levels to engage socially.\n"
            "- **Leverage Your Activities:** Consider joining a yoga studio or a walking group. This is a great way to meet like-minded people and integrate social interaction with your physical activities.\n"
            "- **Plan Regular Social Connections:** Even if you feel generally good, proactively scheduling time with friends and family can enhance your sense of belonging and reduce stress.\n"
            "- **Practice Mindful Socializing:** Be present during interactions and avoid distractions, which can deepen connections.\n\n"
            "4. Overall Suggestion:\n"
            "- Your primary focus should be on addressing the **bloating and digestive discomfort**, as this can significantly impact your daily comfort and nutrient absorption. "
            "Your exercise and sleep habits are generally positive, providing a good foundation.\n"
            "- **Digestive Health First:** Work on identifying and managing the causes of your bloating through dietary adjustments and possibly consulting a healthcare professional or registered dietitian.\n"
            "- **Mindful Eating Practices:** Beyond what you eat, how you eat (slowly, without distractions) is crucial for digestion.\n"
            "- **Holistic Wellness:** Continue your balanced approach to physical activity. The combination of walking and yoga is beneficial for both physical and mental health, which can indirectly support digestive health.\n"
            "- **Listen to Your Body's Cues:** Pay attention not just to major symptoms but also to subtle signals your body gives you regarding food and activity."
        ),
    }
]

# Wrap the in-memory records in a `datasets.Dataset`.
# NOTE: the training cell below rebinds `dataset` with load_dataset(...), so this
# in-memory copy is not the data that ends up being tokenized there.
dataset = Dataset.from_list(data)


In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType

# Release cached CUDA memory left over from earlier cells before loading anything new.
torch.cuda.empty_cache()
# === Step 1: Model & Tokenizer Setup ===
model_id = "AnjaliNV/WellBeing_LLM"  # or base model like "mistralai/Mistral-7B-Instruct-v0.1"

print("🔄 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Reuse EOS as the padding token so padding="max_length" in preprocess has a pad id to use.
tokenizer.pad_token = tokenizer.eos_token  # Required for some models

print("📂 Loading local dataset...")
dataset = load_dataset("json", data_files="/content/fitness-chat-prompt-completion-dataset.json")  # Update path
# NOTE: this rebinds `dataset`, replacing the in-memory example built in the earlier
# cell; the tokenization step below reads dataset["train"] from this JSON file.



def preprocess(batch):
    """Turn a batch of instruction/output pairs into tokenized training rows.

    Args:
        batch: Mapping with parallel sequences under "instruction" and "output".

    Returns:
        The tokenizer's result for the formatted prompts (truncated and padded
        to 512 tokens), with a "labels" entry added as a copy of "input_ids".
    """
    prompts = []
    for question, answer in zip(batch["instruction"], batch["output"]):
        prompts.append(f"### Question: {question}\n### Answer: {answer}")

    encoded = tokenizer(prompts, truncation=True, padding="max_length", max_length=512)
    # Causal-LM objective: the targets are the input ids themselves.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded



print("🔄 Tokenizing dataset...")
# batched=True: preprocess is called with a dict of column lists and tokenizes many rows per call.
tokenized_dataset = dataset["train"].map(preprocess, batched=True)



# === Step 2: Load Quantized Base Model ===
print("🔄 Loading base model...")
# 4-bit NF4 weights with double quantization; compute runs in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)


model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    quantization_config=bnb_config,
    trust_remote_code=True
)

# PEFT's documented preparation step for training on top of a k-bit quantized
# model; it must run before the LoRA adapters are attached.
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

# === Step 3: Inject LoRA ===
print("🧠 Adding LoRA adapters...")
# Rank-16 adapters on the attention query/value projections only.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, peft_config)
# Sanity check: prints how many parameters are trainable versus the total.
model.print_trainable_parameters()

# === Step 4: Set Up Trainer ===
print("⚙️ Preparing trainer...")
# Checkpoints and the final save both go to ./WellBeing_LLM_finetuned.
training_args = TrainingArguments(
    output_dir="./WellBeing_LLM_finetuned",
    per_device_train_batch_size=1,      # one sequence per step to limit GPU memory
    gradient_accumulation_steps=8,      # 1 x 8 = effective batch size of 8 per device
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    save_total_limit=1,                 # keep only the most recent checkpoint
    learning_rate=3e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    report_to=[],                       # no external experiment trackers
)


# mlm=False selects causal-LM collation rather than masked-LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# NOTE(review): newer transformers releases rename the `tokenizer` argument to
# `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# === Step 5: Train ===
print("🚀 Starting training...")
trainer.train()

# === Step 6: Save Final Model + Tokenizer ===
# `model` is the PEFT-wrapped model created by get_peft_model above.
print("💾 Saving the fine-tuned model...")
trainer.save_model("./WellBeing_LLM_finetuned")
tokenizer.save_pretrained("./WellBeing_LLM_finetuned")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Inference setup: load the base model in 4-bit, then attach the adapter stored
# in the Hub repo named by `adapter_path`.
# NOTE: this cell rebinds the notebook-level `model` and `tokenizer`.
base_model_id = "ContactDoctor/Bio-Medical-Llama-3-8B"
adapter_path = "AnjaliNV/WellBeing_LLM"

# Optional: Enable 4-bit quantization to reduce memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load tokenizer (taken from the base model repo, not the adapter repo)
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)

# Load model with quantization and low memory footprint
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    quantization_config=bnb_config,
    trust_remote_code=True
)

# Load the LoRA adapter (AFTER the base model is on device)
model = PeftModel.from_pretrained(model, adapter_path)
# Switch to evaluation mode for inference.
model.eval()


In [None]:
from peft import PeftModel
from transformers import AutoTokenizer
from huggingface_hub import notebook_login

# Optional: authenticate from inside the notebook instead of the CLI login at the top.
# notebook_login()

# Target Hub repository; this is the same id the previous cell reads the adapter from.
repo_name = "AnjaliNV/WellBeing_LLM"  # Choose a name

# Push whatever `model` / `tokenizer` are currently bound in the kernel
# (after the previous cell: the PeftModel wrapper and the base-model tokenizer).
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)


## RAG fine tuning

In [None]:
!pip install langchain faiss-cpu sentence-transformers transformers


In [None]:
!pip install PyPDF2

In [None]:
!pip install -U langchain-community

In [None]:
# Import from langchain_community (installed above with `langchain-community`)
# rather than the legacy `langchain.vectorstores` / `langchain.embeddings` paths.
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Presumably the same embedding model that was used when the index was first
# built — confirm, since query and index embeddings need to match.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

vectorstore = FAISS.load_local(
    "/content/drive/MyDrive/rag_index",
    embeddings=embedding_model,
    # SECURITY: loading the saved index involves unpickling, which can execute
    # arbitrary code. Keep this flag only for an index you created yourself.
    allow_dangerous_deserialization=True,
)


In [None]:
def load_and_split_document(filepath):
    """Read a .txt or .pdf file and split its text into overlapping chunks.

    Args:
        filepath: Path to the document. The extension is matched
            case-insensitively, so "NOTES.TXT" and "scan.PDF" are accepted.

    Returns:
        The list of chunks produced by RecursiveCharacterTextSplitter with
        chunk_size=500 and chunk_overlap=50.

    Raises:
        ValueError: If the file is neither .txt nor .pdf.
    """
    lowered = str(filepath).lower()

    if lowered.endswith(".txt"):
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
    elif lowered.endswith(".pdf"):
        # Imported here so that plain-text ingestion does not require PyPDF2.
        import PyPDF2

        with open(filepath, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # `or ""` guards against a page that yields no text (e.g. a scanned
            # image page) so the concatenation cannot fail on None.
            text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)
    else:
        raise ValueError("Unsupported file format")

    # Prefer the standalone splitter package; fall back to the legacy path
    # exposed by older langchain releases.
    try:
        from langchain_text_splitters import RecursiveCharacterTextSplitter
    except ImportError:
        from langchain.text_splitter import RecursiveCharacterTextSplitter

    # Split text into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return splitter.split_text(text)

# Ingest one document into the existing index and write the index back to the same folder.
# NOTE(review): each run of this cell calls add_texts with the same chunks again,
# so re-running it presumably stores duplicates — confirm before re-executing.
new_chunks = load_and_split_document("/content/Physical_Exercise.txt")  # a .pdf path is also accepted
vectorstore.add_texts(new_chunks)
vectorstore.save_local("/content/drive/MyDrive/rag_index")
print("Vectorstore updated and saved.")


In [None]:
import gradio as gr
import torch

def chat_rag_interface(message, history):
    """Answer one chat message with retrieval-augmented generation.

    Retrieves the 3 most similar chunks from the notebook-level `vectorstore`,
    places them in a fixed prompt together with the user's data, samples a
    completion from the notebook-level `model`, and appends the exchange to
    chat_history.txt.

    Args:
        message: The user's text, inserted under "User Data:" in the prompt.
        history: Previous turns supplied by the chat UI; not used here.

    Returns:
        The generated reply, stripped of surrounding whitespace.
    """
    # Step 1: Retrieve relevant context chunks for the user question
    retrieved_chunks = vectorstore.similarity_search(message, k=3)  # or however you implemented retrieval

    # Step 2: Combine chunks into one context string
    context = "\n".join([doc.page_content for doc in retrieved_chunks])

    # Step 3: Build prompt with context + user message
    prompt_template = f"""
You are a wellbeing expert. Use the provided CONTEXT to answer the USER DATA in the format shown below.

Always respond under the following 4 sections using bullet points where specified:
1. Food Recommendation
2. Physical Exercise
3. Sleep and Recovery
4. Overall Suggestion

Use the following guidelines when analyzing a user's Average Zone Minutes:

- Zone Minutes < 90: Recommend starting with low-impact activities.
- Zone Minutes 90–150: Encourage maintaining current activity with gradual progression.
- Zone Minutes > 150: Recommend advanced workouts and advise on recovery.

Use this logic in every response.

CONTEXT:
{context}

---

Example:
User Data:
Total Steps: 100,000
Average Zone Minutes: 160
Average Heart Rate: 60 bpm
Food Consumption: healthy
Sleep: 7.5 hrs average
Physical Activities: running, gym

Response:
1. Food Recommendation: Your food intack is already healty, try different varities of food
   a) Protein Intake: Include lean meats, fish, tofu, or legumes to support muscle repair.
   b) Healthy Fats: Add avocados, olive oil, almonds, walnuts, and chia seeds.
   c) Vegetable Variety: Eat spinach, broccoli, carrots, and zucchini for essential vitamins.
   d) Hydration: Aim for 3–4 liters of water per day to stay well-hydrated.

2. Physical Exercise:
   You have good cardiovascual fitness from running. Along with walking you can include
   a) Strength training:- squats, deadlifts, bench press, and overhead press.
   b) Yoga or Pilates
   c) Outdoor Activities/Adventure Sports:- Cycling, tracking, swimming.

3. Sleep and Recovery:
   Aim for 7–8 hours of sleep. Avoid screens before bed and follow a consistent schedule.

4. Overall Suggestion:
   Great job! Keep up the consistency. You could also explore group fitness or meditation for balance.

---

User Data:
{message}

Response:
"""

    # Step 4: Tokenize and generate answer from your model
    inputs = tokenizer(prompt_template, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.2,
    )

    # Decode only the tokens produced after the prompt. Slicing the decoded
    # string by len(prompt_template) is unreliable: decoding does not always
    # reproduce the prompt character for character, which shifts the cut.
    prompt_token_count = inputs["input_ids"].shape[-1]
    generated_text = tokenizer.decode(
        outputs[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    # Optional: Save chat history
    with open("chat_history.txt", "a", encoding="utf-8") as f:
        f.write(f"User: {message}\n")
        f.write(f"Model: {generated_text}\n\n")

    return generated_text

In [None]:
# Serve chat_rag_interface through a Gradio chat UI.
# queue() enables request queuing; launch() is called with debug=True and
# share=True (per Gradio's docs, share=True requests a publicly reachable link).
gr.ChatInterface(
    fn=chat_rag_interface,
    title="Well Being Advicer",
    description="Receive holistic health advice grounded in domain knowledge.",
).queue().launch(debug=True, share=True)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the saved model and tokenizer.
# The fine-tuned model is a causal LM, so it is loaded with AutoModelForCausalLM;
# a sequence-classification class would attach a newly initialised
# classification head and push that instead.
# NOTE(review): the training cell saves to "./WellBeing_LLM_finetuned" — confirm
# that "WellBeing_LLM" is the intended local directory / repo id.
model = AutoModelForCausalLM.from_pretrained("WellBeing_LLM")
tokenizer = AutoTokenizer.from_pretrained("WellBeing_LLM")

# Push to HF Hub (the repo is presumably created under the logged-in account).
model.push_to_hub("WellBeing_LLM")
tokenizer.push_to_hub("WellBeing_LLM")
