# 🚀 LLM GPU Worker (Colab)

This notebook runs llama.cpp server on Colab's T4 GPU and exposes it via Cloudflare Tunnel.

## Prerequisites
1. **Google Drive** with:
   - `llama.cpp/` folder containing compiled `llama-server` at `build/bin/llama-server` (Step 2 clones and compiles it if the binary is missing)
   - Your GGUF model file
2. **Cloudflare Account** with a tunnel token
3. **Oracle VM** running the FastAPI gateway

---

## ⚙️ Configuration
Edit these values before running:

In [None]:
# =============================================================================
# CONFIGURATION - Edit these values!
# Every later cell reads these module-level names, so run this cell first.
# =============================================================================

# Path to your GGUF model in Google Drive.
# Checked for existence after the Drive mount and passed to llama-server
# as --model; its basename is reported to the gateway as the model name.
MODEL_PATH = "/content/drive/MyDrive/llm/models/your-model.gguf"

# Path to llama.cpp folder in Google Drive.
# The server binary is expected at <this path>/build/bin/llama-server; if it
# is missing, the setup cell clones and compiles llama.cpp into this folder.
LLAMA_CPP_PATH = "/content/drive/MyDrive/llm/llama.cpp"

# Your Cloudflare Tunnel token (get from Cloudflare Zero Trust dashboard)
# This should be for a tunnel configured to point to localhost:8000
# (i.e. the same port as LLAMA_PORT below). It is handed to
# `cloudflared tunnel run --token ...`, so treat it as a secret and do not
# commit a real value.
CLOUDFLARE_TUNNEL_TOKEN = "your-tunnel-token-here"

# Your Oracle VM Gateway URL (where to register this worker).
# Used as the base for the /register-worker and /unregister-worker requests,
# so leave off any trailing slash.
GATEWAY_URL = "https://your-gateway-domain.com"

# llama-server settings (each is passed as a command-line flag)
LLAMA_PORT = 8000  # --port; also used for the local health check and test request
CONTEXT_SIZE = 4096  # --ctx-size; adjust based on your model and GPU memory
GPU_LAYERS = 99  # --n-gpu-layers. NOTE(review): the earlier note said "-1 for all"; confirm against your llama-server build
THREADS = 4  # --threads

print("‚úÖ Configuration loaded!")

## 📁 Step 1: Mount Google Drive

In [None]:
from google.colab import drive
import os

# Make the user's Google Drive available under /content/drive.
drive.mount("/content/drive")

# Report on the model file: complain if it is missing, otherwise show its
# size in GiB. Nothing is raised either way; this cell is informational only.
if not os.path.exists(MODEL_PATH):
    print(f"‚ùå Model NOT found at: {MODEL_PATH}")
    print("Please update MODEL_PATH in the configuration cell!")
else:
    model_size = os.stat(MODEL_PATH).st_size / 1024**3
    print(f"‚úÖ Model found: {MODEL_PATH} ({model_size:.2f} GB)")

# Report on the llama.cpp folder; a missing folder is only a warning because
# the setup cell can build it.
print(
    f"‚úÖ llama.cpp folder found: {LLAMA_CPP_PATH}"
    if os.path.exists(LLAMA_CPP_PATH)
    else "‚ö†Ô∏è llama.cpp not found. Will compile from source..."
)

## 🔧 Step 2: Setup llama.cpp

In [None]:
import subprocess
import os

LLAMA_SERVER_PATH = os.path.join(LLAMA_CPP_PATH, "build/bin/llama-server")

# Check if llama-server already exists in Drive
if os.path.exists(LLAMA_SERVER_PATH):
    print(f"‚úÖ llama-server already compiled at: {LLAMA_SERVER_PATH}")
    # Make it executable
    os.chmod(LLAMA_SERVER_PATH, 0o755)
else:
    print("üî® Compiling llama.cpp from source...")
    print("This will take a few minutes but only needs to be done once.")
    
    # Clone if not exists
    if not os.path.exists(LLAMA_CPP_PATH):
        !git clone https://github.com/ggerganov/llama.cpp {LLAMA_CPP_PATH}
    
    # Install CUDA toolkit
    !apt-get update && apt-get install -y cmake build-essential
    
    # Build with CUDA support
    os.chdir(LLAMA_CPP_PATH)
    !cmake -B build -DGGML_CUDA=ON
    !cmake --build build --config Release -j$(nproc)
    
    if os.path.exists(LLAMA_SERVER_PATH):
        print("‚úÖ llama.cpp compiled successfully!")
    else:
        print("‚ùå Compilation failed. Check the output above.")

## 🌐 Step 3: Install Cloudflare Tunnel Client

In [None]:
# Install cloudflared
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb
!dpkg -i cloudflared-linux-amd64.deb
!rm cloudflared-linux-amd64.deb

# Verify installation
!cloudflared --version
print("\n‚úÖ Cloudflared installed!")

## 🚀 Step 4: Start LLM Server

In [None]:
import subprocess
import time
import requests
import threading

# Build the llama-server command line from the configuration values.
cmd = [
    LLAMA_SERVER_PATH,
    "--model", MODEL_PATH,
    "--host", "0.0.0.0",
    "--port", str(LLAMA_PORT),
    "--ctx-size", str(CONTEXT_SIZE),
    "--n-gpu-layers", str(GPU_LAYERS),
    "--threads", str(THREADS),
]

print("üöÄ Starting llama-server...")
print(f"   Command: {' '.join(cmd)}")

# Start the server in the background; stderr is merged into stdout so a
# single reader thread can relay everything to the notebook.
llama_process = subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True
)


def print_output():
    """Relay llama-server's combined stdout/stderr to the cell output."""
    for line in llama_process.stdout:
        print(line, end='')


# Daemon thread: it must not keep the kernel alive once the server is gone.
output_thread = threading.Thread(target=print_output, daemon=True)
output_thread.start()

# Poll /health until it answers 200, giving up after 120 attempts
# (roughly one attempt per second).
print("\n‚è≥ Waiting for server to load model...")
for i in range(120):
    # Stop immediately if the process died (bad path, out of memory, ...)
    # instead of polling a port nobody is listening on.
    if llama_process.poll() is not None:
        print(f"\n‚ùå llama-server exited early with code {llama_process.returncode}")
        print("Check the output above for errors.")
        break
    try:
        response = requests.get(f"http://localhost:{LLAMA_PORT}/health", timeout=2)
        if response.status_code == 200:
            print(f"\n‚úÖ llama-server is ready on port {LLAMA_PORT}!")
            break
    except requests.exceptions.RequestException:
        # Expected while the server is still starting (connection refused or
        # timed out); anything else is left to propagate.
        pass
    time.sleep(1)
    waited = i + 1
    if waited % 10 == 0:
        print(f"   Still loading... ({waited}s)")
else:
    print("‚ùå Timeout waiting for server to start")
    print("Check the output above for errors.")

## 🔗 Step 5: Start Cloudflare Tunnel

In [None]:
import subprocess
import time
import threading
import re

# Start cloudflared tunnel
print("üîó Starting Cloudflare Tunnel...")

# Run the tunnel in the background; stderr is merged into stdout so one
# reader thread sees every log line.
tunnel_process = subprocess.Popen(
    ["cloudflared", "tunnel", "run", "--token", CLOUDFLARE_TUNNEL_TOKEN],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True
)

# tunnel_url is kept (set to "connected" once the tunnel is up) for anything
# that already reads it; tunnel_connected is the thread-safe signal this
# cell actually waits on.
tunnel_url = None
tunnel_connected = threading.Event()


def monitor_tunnel():
    """Relay cloudflared output and flag the first successful connection."""
    global tunnel_url
    for line in tunnel_process.stdout:
        print(line, end='')
        # Look for connection established message
        if "Registered tunnel connection" in line or "Connection registered" in line:
            tunnel_url = "connected"
            tunnel_connected.set()


tunnel_thread = threading.Thread(target=monitor_tunnel, daemon=True)
tunnel_thread.start()

# Wait up to ~30 seconds for the connection, waking as soon as it is
# reported and stopping early if cloudflared itself has exited.
print("‚è≥ Waiting for tunnel connection...")
for _ in range(30):
    if tunnel_connected.wait(timeout=1):
        break
    if tunnel_process.poll() is not None:
        break

if tunnel_connected.is_set():
    print("\n" + "="*60)
    print("‚úÖ CLOUDFLARE TUNNEL CONNECTED!")
    print("="*60)
    print("\nYour llama-server is now accessible via your Cloudflare domain.")
elif tunnel_process.poll() is not None:
    # The process is gone, so it is not "still connecting".
    print(f"\n‚ùå cloudflared exited with code {tunnel_process.returncode}.")
    print("Check CLOUDFLARE_TUNNEL_TOKEN and the output above.")
else:
    print("\n‚ö†Ô∏è Tunnel may still be connecting. Check the output above.")

## 📡 Step 6: Register with Gateway

In [None]:
import requests
import os

# The worker URL is your Cloudflare tunnel URL for the worker
# You need to configure this in Cloudflare Zero Trust dashboard
# For example: https://llm-worker.your-domain.com
WORKER_PUBLIC_URL = "https://llm-worker.your-domain.com"  # UPDATE THIS!

# Register with the gateway
print(f"üì° Registering with gateway at {GATEWAY_URL}...")

# Tracks the outcome so the final banner does not claim readiness after a
# failed registration.
registered = False

try:
    response = requests.post(
        f"{GATEWAY_URL}/register-worker",
        json={
            "worker_url": WORKER_PUBLIC_URL,
            "model_name": os.path.basename(MODEL_PATH)
        },
        timeout=10
    )
except requests.exceptions.RequestException as e:
    # Network-level failure: DNS, refused connection, timeout, TLS, ...
    print(f"\n‚ùå Failed to register: {e}")
    print("Make sure your gateway is running and accessible.")
else:
    if response.status_code == 200:
        registered = True
        # The body is only used for a friendly message; a 200 with a missing
        # or non-JSON body still counts as a successful registration.
        try:
            result = response.json()
        except ValueError:
            result = None
        message = result.get("message") if isinstance(result, dict) else None
        print(f"\n‚úÖ {message or 'Registered with gateway'}")
    else:
        print(f"\n‚ö†Ô∏è Registration returned: {response.status_code}")
        print(response.text)

# Print ready signal (or a clear warning when the gateway does not know
# about this worker)
print("\n" + "="*60)
if registered:
    print("üéâ READY - Worker is now serving requests!")
else:
    print("‚ö†Ô∏è Worker is running but NOT registered with the gateway")
print("="*60)
print(f"\nüìä Worker URL: {WORKER_PUBLIC_URL}")
print(f"üè† Gateway URL: {GATEWAY_URL}")
print(f"\n‚ö†Ô∏è Keep this notebook running to serve requests!")

## 🧪 Step 7: Test Local Inference

In [None]:
import requests
import json

# Smoke test: send one completion request straight to the local
# llama-server (bypassing the tunnel and gateway) and print what comes back.
print("üß™ Testing local inference...\n")

# The prompt ends inside an opened code fence, and the closing fence is the
# stop sequence, so generation should end when the model closes the block.
test_prompt = (
    "Write a simple Python function that adds two numbers:\n"
    "\n"
    "```python\n"
)

response = requests.post(
    "http://localhost:{}/completion".format(LLAMA_PORT),
    json=dict(
        prompt=test_prompt,
        n_predict=100,
        temperature=0.7,
        stop=["```"],
    ),
    timeout=60,
)

# Handle the failure case first; only a 200 carries a JSON completion.
if response.status_code != 200:
    print(f"‚ùå Error: {response.status_code}")
    print(response.text)
else:
    result = response.json()
    print("‚úÖ Response received!\n")
    print(f"Generated code:\n```python\n{result['content']}```\n")
    print(f"Tokens generated: {result.get('tokens_predicted', 'N/A')}")

## 🛑 Cleanup (Run before disconnecting)

In [None]:
import requests

import subprocess  # for TimeoutExpired; harmless if an earlier cell already imported it

# Best-effort teardown: every step is attempted even if an earlier one fails
# or the corresponding setup cell was never run.
print("üõë Cleaning up...")

# Unregister from gateway. A non-2xx answer counts as a failure, so success
# is no longer reported when the gateway rejected the request. NameError
# covers the configuration cell not having been run in this session.
try:
    unregister_response = requests.post(f"{GATEWAY_URL}/unregister-worker", timeout=5)
    unregister_response.raise_for_status()
    print("‚úÖ Unregistered from gateway")
except (requests.exceptions.RequestException, NameError):
    print("‚ö†Ô∏è Could not unregister from gateway")


def _stop_process(process, stopped_message):
    """Terminate *process*, wait for it to exit, and kill it if it will not."""
    process.terminate()
    try:
        # Reap the child so it does not linger as a zombie.
        process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        process.kill()
        process.wait()
    print(stopped_message)


# Stop processes. They are looked up by name so a cell that was never run
# (variable undefined) is skipped quietly instead of raising.
for _variable_name, _stopped_message in (
    ("tunnel_process", "‚úÖ Cloudflare tunnel stopped"),
    ("llama_process", "‚úÖ llama-server stopped"),
):
    _process = globals().get(_variable_name)
    if _process is None:
        continue
    try:
        _stop_process(_process, _stopped_message)
    except OSError as error:
        print(f"‚ö†Ô∏è Could not stop {_variable_name}: {error}")

print("\nüëã Cleanup complete!")

print("\nüëã Cleanup complete!")