# Marco v3 Colab Inference API

This notebook creates a FastAPI inference service for the Marco v3 Italian Teacher model.

**What it does:**
- Loads your fine-tuned Marco v3 LoRA model with vLLM (4.4x faster inference)
- Exposes a FastAPI endpoint for generating Italian exercises
- Creates a public tunnel via ngrok so the API running on your local machine can reach this Colab-hosted inference service

**Usage:**
1. Run all cells in order
2. Copy the ngrok URL from the output
3. Export it locally: `export INFERENCE_API_URL="https://your-url.ngrok.io"`
4. Start your local API: `./run_api.sh`
5. Your local API will now use Colab GPU for homework generation!

In [None]:
# Cell 1: Install dependencies
# The leading "!" is IPython/Colab shell syntax: the line is executed as a
# shell command (pip), not as Python code. "-q" keeps pip's output quiet.
!pip install fastapi uvicorn pyngrok vllm nest-asyncio -q
# NOTE: this confirmation is printed unconditionally; it does not inspect
# pip's exit status.
print("✅ Dependencies installed")

In [None]:
# Cell 2: Mount Google Drive
# The project files used by the following cells live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

print("✅ Google Drive mounted")

In [None]:
# Cell 3: Setup paths and verify model exists
import os
import sys

# Project folder on the mounted Drive and the merged model location inside it.
PROJECT_ROOT = "/content/drive/MyDrive/Colab Notebooks/italian_teacher"
MODEL_PATH = os.path.join(PROJECT_ROOT, "models/minerva_marco_v3_merged")

# Put the project first on sys.path so its packages can be imported later.
sys.path.insert(0, PROJECT_ROOT)

# Report whether the model path is present. A missing model is only reported
# here; execution continues either way.
if os.path.exists(MODEL_PATH):
    print(
        f"✅ Model found at: {MODEL_PATH}",
        f"✅ Project root: {PROJECT_ROOT}",
        sep="\n",
    )
else:
    print(
        f"❌ Model not found at: {MODEL_PATH}",
        "Please update MODEL_PATH to point to your merged LoRA model",
        sep="\n",
    )

In [None]:
# Cell 4: Load model with vLLM (this takes ~30 seconds)
import torch
from vllm import LLM

print("⏳ Loading Marco v3 model with vLLM...")

# Engine settings gathered in one place before being handed to vLLM:
# one GPU, half precision, a 2048-token context limit, and 85% of GPU
# memory made available to the engine.
engine_options = dict(
    model=MODEL_PATH,
    tensor_parallel_size=1,
    dtype="half",
    max_model_len=2048,
    gpu_memory_utilization=0.85,
    trust_remote_code=True,
)
llm = LLM(**engine_options)

print("✅ Marco v3 model loaded successfully!")

# Name the device in use, falling back to 'CPU' when CUDA is unavailable.
if torch.cuda.is_available():
    device_label = torch.cuda.get_device_name()
else:
    device_label = 'CPU'
print(f"🔥 GPU: {device_label}")

# Bytes reported by torch for the current device, converted to GiB.
allocated_gib = torch.cuda.memory_allocated() / 1024**3
print(f"💾 GPU Memory: {allocated_gib:.2f}GB")

In [None]:
# Cell 5: Create FastAPI application
import nest_asyncio
from src.api.inference import create_inference_app

# Patch asyncio so nested event loops are allowed (required for Colab).
nest_asyncio.apply()

# The Colab service uses 8001 so it does not clash with the local API on 8000.
COLAB_PORT = 8001

# Build the FastAPI app around the already-loaded vLLM engine.
app = create_inference_app(llm, port=COLAB_PORT)

print(
    f"✅ FastAPI application created (port {COLAB_PORT})",
    "📋 Version: 1.0.5",
    "🚀 Ready to start server!",
    sep="\n",
)

In [None]:
# Cell 6: Setup ngrok tunnel
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: the ngrok auth token is a credential and must never be stored in
# the notebook itself (notebooks get shared and committed). It is read from the
# NGROK_AUTH_TOKEN environment variable, falling back to a hidden prompt.
# Get a free token at https://ngrok.com. Any token that was previously pasted
# into this cell should be treated as compromised and rotated in the ngrok
# dashboard.
NGROK_AUTH_TOKEN = (os.environ.get("NGROK_AUTH_TOKEN") or "").strip()
if not NGROK_AUTH_TOKEN:
    NGROK_AUTH_TOKEN = getpass("Enter your ngrok auth token: ").strip()
if not NGROK_AUTH_TOKEN:
    # Fail here with a clear message rather than later inside pyngrok.
    raise ValueError("An ngrok auth token is required to create the tunnel")

# Authenticate ngrok
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Create tunnel to the port the FastAPI server will listen on
tunnel = ngrok.connect(COLAB_PORT)
public_url = str(tunnel.public_url)

print("🌐 ngrok tunnel created!")
print(f"\n📍 Public URL: {public_url}")
print("\n🔗 API Endpoints:")
print(f"   Health: {public_url}/health")
print(f"   Generate: {public_url}/generate")
print("\n✅ Copy the public URL above for use in your local environment")

In [None]:
# Cell 7: Start FastAPI server
import uvicorn
from threading import Thread
import time
import requests
import json

print(f"🚀 Starting FastAPI server on port {COLAB_PORT}...")

# Create uvicorn config
config = uvicorn.Config(
    app=app,
    host="0.0.0.0",
    port=COLAB_PORT,
    log_level="error"
)

# Create server
server = uvicorn.Server(config)


def run_server():
    """Thread target: run the uvicorn server on its own event loop."""
    import asyncio
    asyncio.run(server.serve())


# Run in a background daemon thread so the notebook cell can return while the
# server keeps serving.
server_thread = Thread(target=run_server, daemon=True)
server_thread.start()

# Wait for the server to be ready by polling /health until a deadline, instead
# of sleeping a fixed time and hoping startup has finished.
print("⏳ Waiting for server to start...")
health_url = f"http://localhost:{COLAB_PORT}/health"
startup_deadline = time.monotonic() + 30
response = None
startup_error = None
while response is None and time.monotonic() < startup_deadline:
    if not server_thread.is_alive():
        # The server thread died (e.g. it could not start); retrying is pointless.
        startup_error = RuntimeError("server thread exited before becoming healthy")
        break
    try:
        response = requests.get(health_url, timeout=2)
    except requests.exceptions.RequestException as e:
        # Not accepting connections yet: remember the error and retry shortly.
        startup_error = e
        time.sleep(0.5)

if response is not None and response.status_code == 200:
    print("\n✅ SERVER IS RUNNING!")
    print(f"📡 Listening on http://0.0.0.0:{COLAB_PORT}\n")

    print("🧪 Health check response:")
    try:
        print(json.dumps(response.json(), indent=2))
    except ValueError:
        # Body was not valid JSON; show it verbatim rather than failing.
        print(response.text)

    print("\n" + "="*70)
    print("🌐 YOUR NGROK PUBLIC URL:")
    print("="*70)
    print(f"\n{public_url}\n")
    print("="*70)

    print("\n📋 COPY AND RUN ON YOUR MAC:\n")
    print(f'export INFERENCE_API_URL="{public_url}"')
    print("./run_api.sh")

    print("\n" + "="*70)
    print("\n⚡ Server is running! Keep this notebook open!")
    print("🛑 To stop: Runtime → Interrupt execution")
    print("="*70)
else:
    if response is not None:
        # The server answered, but not with 200: report it instead of staying silent.
        print(f"\n❌ Health check failed: HTTP {response.status_code}")
        print(response.text)
    else:
        print(f"\n❌ Server failed to start: {startup_error}")
    print("\n🔄 Try this:")
    print("   1. Runtime → Restart runtime")
    print("   2. Re-run all cells")

In [None]:
# Cell 8 (OPTIONAL): Test the API locally in Colab
import requests
import json

print("🧪 Testing API endpoints...\n")

# Sample payload for the /generate endpoint.
test_request = dict(
    cefr_level="A2",
    grammar_focus="present_tense",
    topic="daily routines",
    quantity=3,
    exercise_types=["fill_in_blank", "translation", "multiple_choice"],
    temperature=0.7,
    max_tokens=2500,
)

# Call the server directly on localhost; generation gets up to 120 seconds.
generate_url = f"http://localhost:{COLAB_PORT}/generate"
response = requests.post(generate_url, json=test_request, timeout=120)

if response.status_code != 200:
    # Show the status code and raw body so the failure can be diagnosed.
    print(f"❌ Generation failed: {response.status_code}")
    print(response.text)
else:
    result = response.json()
    print("✅ Generation successful!")
    exercises = result['exercises']
    print(f"\nGenerated {len(exercises)} exercises in {result['inference_time']:.2f}s")
    print(f"Tokens generated: {result['generated_tokens']}")
    print(f"Parsing strategy: {result['parsing_strategy']}\n")

    # Pretty-print each exercise, keeping non-ASCII characters readable.
    for number, exercise in enumerate(exercises, start=1):
        print(f"Exercise {number}:")
        print(json.dumps(exercise, indent=2, ensure_ascii=False))
        print()

## 🎯 Next Steps

1. **On your Mac**, export the ngrok URL:
   ```bash
   export INFERENCE_API_URL="https://your-ngrok-url.ngrok.io"
   ```

2. **Start your local API**:
   ```bash
   ./run_api.sh
   ```

3. **Test the integration**:
   ```bash
   python test_exercise_quality.py
   ```

4. **Create assignments** via your local API - they'll use Colab GPU for generation!

## ⚠️ Important Notes

- **Keep this notebook running** while using the API
- Free ngrok tunnels expire after ~2 hours
- Colab disconnects after ~90 min of inactivity (free tier)
- For production, consider Colab Pro ($10/month) or paid ngrok