In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from fastapi import FastAPI
from pydantic import BaseModel
import nest_asyncio, uvicorn


In [9]:
# Hugging Face Hub identifier of the chat checkpoint; the same id is passed to
# the model loader in a later cell so tokenizer and weights stay in sync.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Tokenizer for the checkpoint above; used by the /generate endpoint to encode
# prompts and decode generated token ids.
tok = AutoTokenizer.from_pretrained(model_id)


In [10]:
# Decide once whether a CUDA device is usable; every setting below follows from it.
has_cuda = torch.cuda.is_available()

device = "cuda" if has_cuda else "cpu"
# Half precision on the GPU, full precision on the CPU.
dtype = torch.float16 if has_cuda else torch.float32

# 4-bit NF4 quantization (with double quantization) is configured only when a
# GPU is present; on CPU no quantization config is built.
bnb_config = (
    BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=dtype,
    )
    if has_cuda
    else None
)

print(f"Using device: {device}, dtype: {dtype}")


Using device: cuda, dtype: torch.float16


In [11]:
# Load the checkpoint with the device, dtype and quantization settings chosen by
# the device-selection cell above instead of hard-coding CUDA + 4-bit here.
# Hard-coding them made this cell fail on machines without a CUDA device and
# duplicated the quantization config. When no GPU is available, bnb_config is
# None and the model is loaded unquantized on the CPU.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,                # "cuda" or "cpu", as detected above
    torch_dtype=dtype,                # float16 on GPU, float32 on CPU
    quantization_config=bnb_config,   # 4-bit NF4 on GPU, None on CPU
)

# Report what was actually loaded so the message cannot contradict the setup.
if bnb_config is not None:
    print("✅ Model loaded on CUDA with 4-bit quantization")
else:
    print(f"✅ Model loaded on {device} in {dtype} without quantization")


✅ Model loaded on CUDA with 4-bit quantization


In [12]:
# Environment sanity check for the PyTorch / CUDA installation.
# torch is already imported in the notebook's import cell.
print(torch.__version__)         # should show +cu121
print(torch.version.cuda)        # should say 12.1
print(torch.cuda.is_available()) # should be True

# get_device_name() raises when no CUDA device is present, so only query it
# when CUDA is available; otherwise say so instead of crashing the check.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device available")


2.5.1+cu121
12.1
True
NVIDIA GeForce GTX 1650


In [13]:
# FastAPI application object; the /generate route below is registered on it.
app = FastAPI()

# Request body schema for POST /generate: a single required string field.
class Query(BaseModel):
    prompt: str  # raw user text that the endpoint places into the chat prompt

@app.post("/generate")
def generate_text(q: Query):
    """Generate a chat reply for the prompt carried in the request body.

    The user's prompt is combined with a fixed system message, rendered with
    the tokenizer's chat template, and passed to ``model.generate`` with
    sampling enabled. Only the newly generated text is returned.

    Args:
        q: Request body; ``q.prompt`` is the user's message.

    Returns:
        A dict of the form ``{"response": <generated text>}``.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": q.prompt},
    ]
    # Let the tokenizer render the checkpoint's own chat format (role markers,
    # separators, trailing assistant header) rather than hand-assembling it.
    prompt = tok.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    output_tokens = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

    # generate() returns the prompt tokens followed by the continuation. Slice
    # the prompt off by length instead of splitting decoded text on a marker
    # string, which breaks if the marker appears in the user prompt or reply.
    new_tokens = output_tokens[0][prompt_len:]
    response = tok.decode(new_tokens, skip_special_tokens=True).strip()

    return {"response": response}


In [14]:
# Patch asyncio so uvicorn can be started from inside the notebook
# (allow running inside Jupyter).
nest_asyncio.apply()

# Address and port the HTTP server binds to.
SERVER_HOST = "0.0.0.0"
SERVER_PORT = 8000

# NOTE(review): the /generate route defines no authentication and the server
# binds to 0.0.0.0 — confirm that exposure beyond localhost is intended.
uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)


INFO:     Started server process [17792]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:58054 - "POST /generate HTTP/1.1" 200 OK
INFO:     127.0.0.1:58337 - "POST /generate HTTP/1.1" 200 OK
INFO:     127.0.0.1:58515 - "POST /generate HTTP/1.1" 200 OK
INFO:     127.0.0.1:58650 - "POST /generate HTTP/1.1" 200 OK
INFO:     127.0.0.1:58934 - "POST /generate HTTP/1.1" 200 OK
INFO:     127.0.0.1:59093 - "POST /generate HTTP/1.1" 200 OK
INFO:     127.0.0.1:59240 - "POST /generate HTTP/1.1" 200 OK
INFO:     127.0.0.1:59310 - "POST /generate HTTP/1.1" 200 OK
INFO:     127.0.0.1:59348 - "POST /generate HTTP/1.1" 200 OK
INFO:     127.0.0.1:59420 - "POST /generate HTTP/1.1" 200 OK
INFO:     127.0.0.1:59454 - "POST /generate HTTP/1.1" 200 OK
INFO:     127.0.0.1:59502 - "POST /generate HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [17792]
