In [None]:
# 🚀 Install requirements
# Installs the quantised-LLM stack (bitsandbytes, accelerate, transformers) and the
# API stack (fastapi, uvicorn, nest-asyncio, pyngrok), plus sentence-transformers
# and faiss-cpu.
# NOTE(review): sentence-transformers and faiss-cpu are not imported by any cell
# visible here — presumably intended for a retrieval step; confirm they are needed.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different
# releases than the ones this notebook was last run with.
!pip install bitsandbytes accelerate transformers sentence-transformers faiss-cpu fastapi uvicorn nest-asyncio pyngrok

# 🚀 Mount your Google Drive
# Mounts Drive at /content/drive (google.colab is only importable inside Colab).
# NOTE(review): no visible cell reads from /content/drive — confirm the mount is
# still required.
from google.colab import drive
drive.mount('/content/drive')

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.11-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 k

# API


In [None]:
# Installs the packages needed by the API cells below.
# NOTE(review): every package listed here is also installed by the first setup cell
# (pip treats nest_asyncio and nest-asyncio as the same distribution), so this cell
# looks redundant — confirm before removing it.
!pip install fastapi uvicorn nest_asyncio pyngrok transformers bitsandbytes accelerate




In [None]:
import os
import re
import threading
from getpass import getpass

import nest_asyncio
import torch
from fastapi import FastAPI
from fastapi.concurrency import run_in_threadpool
from pydantic import BaseModel
from pyngrok import ngrok
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Apply nest_asyncio so uvicorn works in Jupyter/Colab
# (the notebook kernel already runs an event loop; this patch allows nesting).
nest_asyncio.apply()

# Load model
# Hugging Face Hub id of the checkpoint served by the API.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
# Load the weights in 4-bit via bitsandbytes; every other quantisation option is
# left at the library default.
# NOTE(review): no compute dtype / quant type is set here — confirm the defaults
# are what is wanted for inference speed and quality.
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

# NOTE(review): trust_remote_code=True allows code shipped in the model repo to be
# executed locally — confirm this checkpoint actually needs it.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# device_map="auto" delegates device placement of the weights to accelerate.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto"
)

# Define API app
# The route handlers in this cell register themselves on this instance, and the
# server cell passes it to uvicorn.
app = FastAPI()

# Request body accepted by POST /generate.
class QuestionRequest(BaseModel):
    # The health question to answer (required).
    question: str
    # True  -> clinical, physician-style answer (the default);
    # False -> simplified, plain-language answer.
    need_doctor: bool = True

def _build_medical_prompt(query: str, need_doctor: bool) -> str:
    """Return the instruction prompt for ``query``; it always ends with ``Answer:``."""
    if need_doctor:
        instructions = (
            "You are a licensed physician. Provide a medically accurate and specific answer using clinical terminology where appropriate. "
            "Include potential diagnoses, lab tests, treatment protocols, and differential considerations if relevant.\n"
        )
    else:
        instructions = (
            "Please explain the following health question in a very simple way. "
            "Avoid complex terms, use everyday language, and assume the person has no medical knowledge.\n"
        )
    return f"{instructions}Question: {query}\nAnswer:"


def generate_medical_answer(query: str, need_doctor: bool = True) -> str:
    """Generate an answer to a health question with the module-level model.

    Args:
        query: The question text; it is inserted verbatim into the prompt.
        need_doctor: When True the prompt asks for a clinical, physician-style
            answer; when False it asks for a simple, jargon-free explanation.

    Returns:
        The first paragraph of the model's continuation, stripped of
        surrounding whitespace. If the model goes on to write a follow-up
        ``Question:`` turn, the text is cut before it. May be an empty string
        if the model produces nothing.

    Notes:
        Uses the module-level ``tokenizer`` and ``model``. Sampling is enabled
        (temperature 0.7, top-p 0.9, top-k 50), so repeated calls with the same
        input can return different text.
    """
    prompt = _build_medical_prompt(query, need_doctor)

    inputs = tokenizer([prompt], return_tensors="pt", truncation=True, padding=True).to(model.device)
    # generate() returns the prompt tokens followed by the new ones; remember
    # where the prompt ends so that only the continuation is decoded.
    prompt_length = inputs["input_ids"].shape[-1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=400,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        # Same value generate() otherwise falls back to; passing it explicitly
        # avoids the "Setting `pad_token_id` to `eos_token_id`" warning that is
        # logged on every request.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the generated tokens. Splitting the full decoded text on
    # "Answer:" and keeping the last piece returned the wrong text whenever the
    # continuation itself contained "Answer:" (e.g. an invented follow-up Q/A).
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Keep the first paragraph, and stop before any follow-up "Question:" line.
    return re.split(r'\n\s*\n|\n\s*Question:', completion, maxsplit=1)[0].strip()

# Generation used to run directly on the event loop, which implicitly handled one
# request at a time. It now runs in a worker thread, so this lock keeps that
# one-at-a-time behaviour for the shared model.
_generation_lock = threading.Lock()


def _generate_serialized(question: str, need_doctor: bool) -> str:
    """Run ``generate_medical_answer`` while holding the generation lock."""
    with _generation_lock:
        return generate_medical_answer(question, need_doctor)


@app.post("/generate")
async def generate(request: QuestionRequest):
    """Answer a health question.

    Args:
        request: JSON body with ``question`` and the optional ``need_doctor`` flag.

    Returns:
        A dict echoing the question, the answer style used ("doctor" or
        "patient") and the generated answer text.
    """
    # model.generate() is blocking; calling it directly inside an ``async def``
    # handler freezes the event loop (and every other request) until it
    # finishes. Off-load it to the thread pool instead.
    answer = await run_in_threadpool(
        _generate_serialized, request.question, request.need_doctor
    )
    return {
        "question": request.question,
        "answer_type": "doctor" if request.need_doctor else "patient",
        "answer": answer
    }

@app.get("/")
def root():
    """Return a short usage hint that points clients at ``POST /generate``."""
    usage_hint = "Use POST /generate with {question, need_doctor}"
    return dict(message=usage_hint)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [None]:
# Register the ngrok authtoken without storing it in the notebook.
# The token is read from the NGROK_AUTHTOKEN environment variable, falling back to
# a hidden interactive prompt; pyngrok then saves it to the ngrok config file.
# NOTE(security): an authtoken was previously hardcoded in this cell in plain
# text — treat it as leaked and revoke it in the ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from uvicorn import Config, Server

# Create public URL with ngrok
public_url = ngrok.connect(8000)
print(f"🚀 Your FastAPI is live at: {public_url}/docs")

# Start FastAPI server
config = Config(app=app, host="0.0.0.0", port=8000, log_level="info")
server = Server(config=config)
await server.serve()


🚀 Your FastAPI is live at: NgrokTunnel: "https://49e3-34-87-9-26.ngrok-free.app" -> "http://localhost:8000"/docs


INFO:     Started server process [183]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     154.177.143.76:0 - "POST /generate HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     154.177.143.76:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     154.177.143.76:0 - "GET / HTTP/1.1" 200 OK
INFO:     154.177.236.104:0 - "POST / HTTP/1.1" 405 Method Not Allowed
INFO:     154.177.236.104:0 - "POST / HTTP/1.1" 405 Method Not Allowed
INFO:     154.177.236.104:0 - "POST / HTTP/1.1" 405 Method Not Allowed
INFO:     154.177.236.104:0 - "POST / HTTP/1.1" 405 Method Not Allowed
INFO:     154.177.236.104:0 - "POST / HTTP/1.1" 405 Method Not Allowed
INFO:     154.177.236.104:0 - "POST / HTTP/1.1" 405 Method Not Allowed
INFO:     41.42.242.214:0 - "POST / HTTP/1.1" 405 Method Not Allowed
INFO:     154.177.236.104:0 - "POST / HTTP/1.1" 405 Method Not Allowed
INFO:     154.177.236.104:0 - "POST / HTTP/1.1" 405 Method Not Allowed
INFO:     154.177.236.104:0 - "POST / HTTP/1.1" 405 Method Not Allowed
INFO:     154.177.236.104:0 - "POST / HTTP/1.1" 405 Method Not Allowed
INFO:     154.177.236.104:0 - "POST / HTTP/1.1" 405 Method Not Allowed


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     154.177.236.104:0 - "POST /generate HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     154.177.236.104:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     154.177.236.104:0 - "GET /generate HTTP/1.1" 405 Method Not Allowed
INFO:     41.42.242.214:0 - "POST / HTTP/1.1" 405 Method Not Allowed


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     41.42.242.214:0 - "POST /generate HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     41.42.242.214:0 - "POST /generate HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     41.42.242.214:0 - "POST /generate HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     41.42.242.214:0 - "POST /generate HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     41.42.242.214:0 - "POST /generate HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     41.42.242.214:0 - "POST /generate HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     41.42.242.214:0 - "POST /generate HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     41.42.242.214:0 - "POST /generate HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     41.42.242.214:0 - "POST /generate HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     41.42.242.214:0 - "POST /generate HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     41.42.242.214:0 - "POST /generate HTTP/1.1" 200 OK


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


INFO:     41.42.242.214:0 - "POST /generate HTTP/1.1" 200 OK
