In [None]:
!pip install -q fastapi nest-asyncio pyngrok uvicorn python-multipart transformers accelerate bitsandbytes

# BioMistral Model

Because this model is already trained on healthcare datasets, it is well suited to our project, and our testing results confirm this.

In [None]:
%%writefile careplan_api.py
import os

import nest_asyncio
import torch
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from pyngrok import ngrok
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

# Configuration
MODEL_NAME = "BioMistral/BioMistral-7B-DARE"
PORT = 8000
# The ngrok auth token is a secret, so it is read from the NGROK_AUTHTOKEN
# environment variable instead of being committed to source control
# (get one from https://dashboard.ngrok.com). Empty string if the variable
# is unset.
# NOTE(review): the token that used to be hard-coded here should be revoked.
NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN", "")

# Cleanup
# Best-effort: ask `fuser` to kill whatever process is using PORT over TCP
# (presumably a server left over from an earlier run). Output is discarded and
# the exit status is ignored, so a missing `fuser` or a free port is not an error.
os.system(f"fuser -k {PORT}/tcp > /dev/null 2>&1")
# Shut down any ngrok process pyngrok still has running before a new tunnel is opened.
ngrok.kill()

# Model Loading
# Runs at import time: loads the tokenizer and the 4-bit-quantized causal LM
# named by MODEL_NAME, letting `device_map="auto"` place the weights.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    # Passing `load_in_4bit=True` straight to from_pretrained is deprecated;
    # the supported spelling is an explicit BitsAndBytesConfig. The settings
    # are otherwise left at their defaults, matching the previous behavior.
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # Reduces memory usage
)

# FastAPI Setup
app = FastAPI(title="BioMistral Care Plan Generator")

# CORS is left fully open: every origin, HTTP method and request header is
# accepted.
_cors_settings = {
    "allow_origins": ["*"],
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)

# Request body accepted by POST /generate. All four fields are required.
# (Documented with comments rather than a docstring so the generated schema
# description is unchanged.)
class PatientRequest(BaseModel):
    # Free-text patient description; inserted after "Patient:" in the prompt.
    profile: str
    # Condition the care plan is for; inserted into the prompt's first line.
    condition: str
    # Subtype of the condition; shown in parentheses after the condition.
    subtype: str
    # Comorbidities, comma-joined in the prompt; an empty list is rendered as "None".
    comorbidities: list[str]

def format_prompt(request: PatientRequest):
    """Build the ``[INST]``-wrapped prompt for a care-plan request.

    Args:
        request: The patient request; its ``condition``, ``subtype``,
            ``profile`` and ``comorbidities`` fields are interpolated.

    Returns:
        The prompt string, starting with ``<s>[INST]`` and ending with
        ``[/INST]``, asking for Monitoring / Medications / Lifestyle sections.
    """
    # An empty comorbidity list is spelled out as the word "None".
    if request.comorbidities:
        comorbidity_text = ", ".join(request.comorbidities)
    else:
        comorbidity_text = "None"

    prompt_lines = [
        f"<s>[INST] Generate comprehensive care plan for {request.condition} ({request.subtype}):",
        f"Patient: {request.profile}",
        f"Comorbidities: {comorbidity_text}",
        "Format response with EXACT sections:",
        "### Monitoring",
        "### Medications",
        "### Lifestyle [/INST]",
    ]
    return "\n".join(prompt_lines)

@app.post("/generate")
async def generate_plan(request: PatientRequest):
    """Generate a care plan for the submitted patient.

    Args:
        request: Patient profile, condition, subtype and comorbidities.

    Returns:
        A JSON object ``{"plan": <generated text>}`` containing only the
        model's continuation (the prompt is not echoed back).

    Raises:
        HTTPException: status 500, with the underlying error message as
            ``detail``, if prompt building, generation or decoding fails.
    """
    # NOTE(review): model.generate() is a blocking call inside an async
    # handler, so other requests presumably wait while it runs — confirm this
    # is acceptable for the deployment.
    try:
        prompt = format_prompt(request)
        # The prompt text already begins with the "<s>" BOS marker, so the
        # tokenizer must not prepend a second one.
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=False,
        ).to(model.device)
        prompt_length = inputs["input_ids"].shape[-1]

        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

        # generate() returns prompt tokens followed by the continuation.
        # Drop the prompt by token count rather than by searching the decoded
        # text for "[/INST]", which breaks if user-supplied fields contain
        # that marker.
        generated_tokens = outputs[0][prompt_length:]
        care_plan = tokenizer.decode(
            generated_tokens, skip_special_tokens=True
        ).strip()

        return {"plan": care_plan}

    except Exception as e:
        # Top-level boundary: report any failure as a 500 and keep the cause
        # chained for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e

@app.get("/health")
async def health_check():
    # Liveness probe: reports a fixed "healthy" status together with the
    # configured model name. It does not exercise the model itself.
    return dict(status="healthy", model=MODEL_NAME)

if __name__ == "__main__":
    # Authenticate with ngrok, then open an HTTP tunnel to the local port and
    # announce the public address.
    ngrok.set_auth_token(NGROK_TOKEN)
    tunnel = ngrok.connect(PORT, "http")
    public_url = tunnel.public_url
    print(f"\nAPI Accessible at: {public_url}\n")

    # Patch asyncio to allow nested event loops before handing control to
    # uvicorn, which serves the app on all interfaces until interrupted.
    nest_asyncio.apply()
    uvicorn.run(app, host="0.0.0.0", port=PORT)

The code below is for our own Mistral model, which we trained on healthcare datasets. When served over the API it gives gibberish output, because there is a problem with how the merging is being done for the Mistral model.
Testing it locally gives excellent output. Due to time constraints, we are instead loading an already-trained BioMistral model from Hugging Face.

In [None]:
# # Install core dependencies
# !pip install -q transformers accelerate bitsandbytes peft

# # Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# import torch
# from peft import PeftModel, PeftConfig
# from transformers import (
#     AutoTokenizer,
#     AutoModelForCausalLM,
#     BitsAndBytesConfig
# )

# # Configuration
# PEFT_MODEL_PATH = "/content/drive/MyDrive/ClinicConnect/trained_models/mistral-clinicconnect"
# BASE_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# # Load model function
# def load_model():
#     # Load PEFT config
#     config = PeftConfig.from_pretrained(PEFT_MODEL_PATH)

#     # Quantization config (MUST match training setup)
#     bnb_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_compute_dtype=torch.float16,
#         bnb_4bit_use_double_quant=True
#     )

#     # Load base model
#     base_model = AutoModelForCausalLM.from_pretrained(
#         BASE_MODEL_NAME,
#         quantization_config=bnb_config,
#         device_map="auto",
#         trust_remote_code=True
#     )

#     # Load PEFT adapter
#     model = PeftModel.from_pretrained(
#         base_model,
#         PEFT_MODEL_PATH,
#         device_map="auto"
#     )
#     model.eval()

#     # Load tokenizer
#     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
#     tokenizer.pad_token = tokenizer.eos_token

#     return model, tokenizer

# # Load the model
# model, tokenizer = load_model()

# # Test generation function
# def test_generation(patient_profile, condition, subtype, comorbidities):
#     prompt = f"""<s>[INST] Generate care plan for {condition} ({subtype}):
# Patient: {patient_profile}
# Comorbidities: {', '.join(comorbidities) if comorbidities else 'None'}
# Format Response With:
# **Monitoring**, **Medications**, **Lifestyle** [/INST]"""

#     inputs = tokenizer(
#         prompt,
#         return_tensors="pt",
#         max_length=2048,
#         truncation=True
#     ).to(model.device)

#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             max_new_tokens=512,
#             temperature=0.7,
#             top_p=0.85,
#             do_sample=True,
#             repetition_penalty=1.15,
#             pad_token_id=tokenizer.eos_token_id
#         )

#     full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

#     # Extract generated text
#     if "[/INST]" in full_response:
#         return full_response.split("[/INST]")[-1].strip()
#     return full_response

# # Test case
# test_data = {
#     "patient_profile": "65yo Male, T2DM, HbA1c 8.5%, CKD Stage 3, Hypertension",
#     "condition": "diabetes",
#     "subtype": "Type 2",
#     "comorbidities": ["CKD Stage 3", "Hypertension"]
# }

# # Run test
# print("Testing model generation...\n")
# result = test_generation(**test_data)

# print("Generated Care Plan:")
# print(result)

# # Validation check
# required_sections = ["Monitoring", "Medications", "Lifestyle"]
# missing = [section for section in required_sections if section not in result]
# print("\nValidation:")
# print(f"Missing sections: {missing if missing else 'None'}")

# # Memory cleanup
# torch.cuda.empty_cache()