<a href="https://colab.research.google.com/github/Ajaycharann/speech-emotion-bot/blob/main/chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab cell: FastAPI LLM backend + ngrok
# Run this in Google Colab. After model loads it will print a public URL like:
# https://abc123.ngrok-free.app/chat
# Copy the printed URL (it already ends in /chat) and paste it into the React app.

# Install required packages
!pip install -q fastapi uvicorn[standard] pyngrok transformers accelerate nest_asyncio torch --upgrade

# Optional: install bitsandbytes if you want 4/8-bit loading (may help for big models)
# !pip install -q bitsandbytes

# --- Backend code ---
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import nest_asyncio
import uvicorn
from pyngrok import ngrok
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM

# ---------- Configuration ----------
# Pick a model that fits the Colab runtime you are on, for example:
#  - "google/flan-t5-small"      (small, CPU/GPU friendly, seq2seq style)
#  - "tiiuae/falcon-7b-instruct" (7B - may not fit on free Colab GPUs)
# Modify MODEL_ID if you want to try another model, and keep USE_SEQ2SEQ in
# sync with it.
MODEL_ID = "google/flan-t5-small"
USE_SEQ2SEQ = True  # flan-t5 is seq2seq; set True for T5-like models

# ngrok token: read from the NGROK_AUTH_TOKEN environment variable when it is
# set, so the secret does not have to be saved inside the notebook. You can
# still paste it as the fallback value below; if it ends up empty, the cell
# asks for it before opening the tunnel.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "")  # <-- or paste your ngrok authtoken as the default

# ---------- Setup device ----------
# With a GPU runtime the device is "cuda", otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}, model: {MODEL_ID}")

# ---------- Load model ----------
print("Loading tokenizer and model (this can take 30-120s)...")
# The tokenizer is loaded the same way for both model families; only the model
# class differs (seq2seq for T5/Flan, causal for GPT-style).
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model_cls = AutoModelForSeq2SeqLM if USE_SEQ2SEQ else AutoModelForCausalLM
model = model_cls.from_pretrained(MODEL_ID)

# Move model to device. The /chat handler moves its inputs to `device`, so the
# model and `device` must agree: if the move fails (e.g. not enough GPU
# memory), fall back to CPU and update `device` rather than carrying on with a
# model that lives somewhere else.
try:
    model = model.to(device)
except Exception as e:
    print("Model to(device) failed:", e)
    if device != "cpu":
        print("Falling back to CPU.")
        device = "cpu"
        model = model.to(device)

# ---------- Create FastAPI app ----------
app = FastAPI(title="Colab LLM Bridge")

# Allow CORS from any origin (the ngrok URL and local dev such as localhost:3000).
# Credentials stay disabled: this API uses no cookies or auth headers, and a
# wildcard origin must not be combined with allow_credentials=True.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # relaxed for dev; restrict for production
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

class Message(BaseModel):
    """Request body accepted by the POST /chat endpoint."""

    # The user's message; /chat strips surrounding whitespace before using it.
    text: str

@app.get("/")
def index():
    """Report that the server is up, plus the configured model id and device."""
    payload = {
        "status": "ok",
        "model": MODEL_ID,
        "device": device,
    }
    return payload

@app.post("/chat")
def chat(msg: Message):
    """Generate a reply to the user's message with the loaded model.

    Args:
        msg: Request body; ``msg.text`` is the user's prompt.

    Returns:
        ``{"reply": <generated text>}``. A blank (whitespace-only) prompt
        returns ``{"reply": ""}`` without calling the model.
    """
    prompt = msg.text.strip()
    if not prompt:
        return {"reply": ""}

    # Tokenization and sampling settings are the same for both model families.
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    sampling = dict(max_new_tokens=200, do_sample=True, top_p=0.95, temperature=0.7)

    # The model type matters: seq2seq (T5/Flan) vs causal (GPT-style).
    if USE_SEQ2SEQ:
        # Encoder-decoder output holds only the generated answer.
        outputs = model.generate(**encoded, **sampling)
        reply_ids = outputs[0]
    else:
        prompt_ids = encoded["input_ids"]
        outputs = model.generate(
            prompt_ids,
            # Pass the mask explicitly instead of leaving generate() to guess it.
            attention_mask=encoded.get("attention_mask"),
            **sampling,
        )
        # Causal output is the prompt tokens followed by the continuation;
        # drop the prompt so the reply does not echo the user's message.
        reply_ids = outputs[0][prompt_ids.shape[1]:]

    reply = tokenizer.decode(reply_ids, skip_special_tokens=True)
    return {"reply": reply}

# ---------- Expose via ngrok ----------
if not NGROK_AUTH_TOKEN:
    import getpass  # local import: only needed for this interactive fallback

    print("🔑 No ngrok auth token provided in the script.")
    # The prompt is a plain label - never put the token itself in the source.
    # getpass also keeps the pasted token out of the saved cell output.
    NGROK_AUTH_TOKEN = getpass.getpass("Paste your ngrok authtoken: ").strip()

if not NGROK_AUTH_TOKEN:
    raise ValueError("An ngrok authtoken is required to open the public tunnel.")

# Configure ngrok
ngrok.set_auth_token(NGROK_AUTH_TOKEN)
print("Starting ngrok tunnel on port 8000...")
public_url = ngrok.connect(8000).public_url
print("🚀 Public API URL (paste this into the React app):", public_url + "/chat")

# ---------- Run server ----------
# Allow nested event loop (Colab)
nest_asyncio.apply()
try:
    uvicorn.run(app, host="0.0.0.0", port=8000)
finally:
    # Close the tunnel once the server stops (or the cell is interrupted) so a
    # re-run does not leave a stale public URL pointing at this runtime.
    ngrok.disconnect(public_url)


Using device: cuda, model: google/flan-t5-small
Loading tokenizer and model (this can take 30-120s)...
🔑 No ngrok auth token provided in the script.
[ngrok authtoken redacted]
Starting ngrok tunnel on port 8000...
🚀 Public API URL (paste this into the React app): https://de3a791eab9b.ngrok-free.app/chat


INFO:     Started server process [560]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     183.82.97.138:0 - "OPTIONS /chat HTTP/1.1" 200 OK
INFO:     183.82.97.138:0 - "POST /chat HTTP/1.1" 200 OK
INFO:     183.82.97.138:0 - "POST /chat HTTP/1.1" 200 OK
INFO:     183.82.97.138:0 - "POST /chat HTTP/1.1" 200 OK
INFO:     183.82.97.138:0 - "POST /chat HTTP/1.1" 200 OK
INFO:     183.82.97.138:0 - "POST /chat HTTP/1.1" 200 OK
INFO:     183.82.97.138:0 - "POST /chat HTTP/1.1" 200 OK
INFO:     183.82.97.138:0 - "POST /chat HTTP/1.1" 200 OK
INFO:     183.82.97.138:0 - "POST /chat HTTP/1.1" 200 OK
