# GPU-Powered Ollama Server for LLMCache Experiments

This notebook sets up Ollama on Kaggle's Tesla T4 GPU and exposes it through an ngrok tunnel for remote access.

**Usage:**
1. Run all cells in order
2. Copy the ngrok URL from the last cell
3. Use this URL as `base_url` in your experiment config
4. Keep this notebook running during experiments

**Requirements:**
- Enable GPU accelerator in Kaggle notebook settings
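- Have an ngrok account (free tier works); the auth token is read from the `NGROK_AUTH_TOKEN` environment variable, or you are prompted for it in the ngrok setup cell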

In [None]:
# Check GPU availability and environment
import subprocess
import os
import sys

# Suppress debugger warnings by setting pydevd's PYDEVD_DISABLE_FILE_VALIDATION
# switch for this process (presumably silences its file-validation notice in
# notebook kernels - confirm against the debugger docs).
os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'

def detect_environment():
    """Identify which host the notebook is running on.

    Returns:
        'kaggle' when a ``/kaggle`` path exists, 'colab' when the
        ``COLAB_GPU`` environment variable is set, and 'local' otherwise.
    """
    # Kaggle is probed first, so it wins when both markers are present.
    if os.path.exists('/kaggle'):
        return 'kaggle'
    return 'colab' if 'COLAB_GPU' in os.environ else 'local'

# Resolve the host once; the statements below (and later cells) read `env`.
env = detect_environment()
print(f"🔍 Environment: {env.upper()}")

# Off Kaggle/Colab, point the user at the supported ways to run this setup.
if env == 'local':
    print(
        "⚠️  WARNING: This notebook is designed for Kaggle GPU acceleration",
        "   For local testing, use: docker-compose up",
        "   To use Kaggle GPU:",
        "   1. Upload this notebook to Kaggle",
        "   2. Enable GPU accelerator in settings",
        "   3. Run all cells",
        sep="\n",
    )
    
# Probe for an NVIDIA GPU.
#
# `--query-gpu=name --format=csv,noheader` makes nvidia-smi print one bare
# product name per line. Scraping the human-readable table instead kept the
# device-index column in the name ("0 Tesla T4"), cut names to three tokens,
# and only recognised the Tesla/GeForce/RTX product families.
try:
    result = subprocess.run(
        ['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'],
        capture_output=True, text=True, timeout=10,
    )
    if result.returncode == 0:
        # One entry per GPU; blank lines are ignored.
        gpu_names = [line.strip() for line in result.stdout.splitlines() if line.strip()]
        if gpu_names:
            print(f"✅ GPU detected: {', '.join(gpu_names)}")
        else:
            print("✅ GPU detected")

        if env == 'kaggle':
            print("🚀 Perfect! Kaggle GPU environment ready")
    else:
        print("⚠️  nvidia-smi failed but GPU might still be available")

except FileNotFoundError:
    # The nvidia-smi executable itself is missing.
    print("❌ No NVIDIA GPU detected (nvidia-smi not found)")
    if env != 'local':
        print("   Make sure GPU accelerator is enabled in notebook settings")
except Exception as e:
    # Best-effort probe (covers the 10 s timeout as well): report and carry on.
    print(f"⚠️  GPU check failed: {e}")

print(f"📍 Ready to proceed with {env} setup")

In [None]:
# Install Ollama
import subprocess
import sys

print("📦 Installing Ollama...")

# The installer's own output is written here (instead of /dev/null) so that a
# failure can be reported; it still never reaches the notebook output.
INSTALL_LOG = '/tmp/ollama_install.log'

try:
    # Download and run the official install script.
    # `set -o pipefail` makes the pipeline fail when the curl download fails;
    # without it bash returns only the status of `sh`, which exits 0 on empty
    # input, so a failed download was reported as a successful install.
    result = subprocess.run([
        'bash', '-c',
        f'set -o pipefail; curl -fsSL https://ollama.com/install.sh | sh > {INSTALL_LOG} 2>&1'
    ], capture_output=True, text=True, timeout=120)

    if result.returncode == 0:
        print("✅ Ollama installed successfully")

        # Verify installation
        verify = subprocess.run(['which', 'ollama'], capture_output=True, text=True)
        if verify.returncode == 0:
            print(f"📍 Ollama location: {verify.stdout.strip()}")
        else:
            print("⚠️  Installation completed but ollama not found in PATH")
    else:
        print("❌ Ollama installation failed")
        # curl's error arrives on the captured stderr; the installer's error is
        # at the end of the log file. Show both, keeping the log part short.
        try:
            with open(INSTALL_LOG, errors='replace') as log_file:
                log_tail = log_file.read().strip()[-500:]
        except OSError:
            log_tail = ''
        error_text = '\n'.join(part for part in (result.stderr.strip(), log_tail) if part)
        if error_text:
            print(f"Error: {error_text}")

except subprocess.TimeoutExpired:
    print("⏱️  Installation taking longer than expected...")
    print("This is normal for first-time Ollama installation")
except Exception as e:
    print(f"❌ Installation error: {e}")

In [None]:
# Install pyngrok for tunneling
# (the leading `!` is notebook syntax: the rest of the line runs as a shell command)
!pip install pyngrok

In [None]:
# Start Ollama service in background
import subprocess
import time
import os
import requests

# Set environment variables for GPU usage
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['OLLAMA_HOST'] = '0.0.0.0:11434'

# Server output is written to this file. It must not go to subprocess.PIPE:
# nothing reads the pipe while the server runs, so once the OS pipe buffer
# fills up the server would block on its next log write and stop answering.
OLLAMA_LOG_PATH = 'ollama_server.log'

print("🚀 Starting Ollama server...")

# Start Ollama server. The child keeps its own handle on the log file, so the
# notebook's handle can be closed as soon as the process has been spawned.
with open(OLLAMA_LOG_PATH, 'w') as ollama_log:
    ollama_process = subprocess.Popen(['ollama', 'serve'],
                                      stdout=ollama_log,
                                      stderr=subprocess.STDOUT)

# Wait and verify server startup
max_attempts = 10
server_ready = False
for attempt in range(max_attempts):
    time.sleep(2)
    try:
        # Test if server is responding
        response = requests.get('http://localhost:11434/api/tags', timeout=5)
        if response.status_code == 200:
            server_ready = True
            print("✅ Ollama server started successfully")
            print("📡 Server running on port 11434")
            break
    except requests.exceptions.RequestException:
        pass  # not accepting connections yet

    # Checked after the HTTP probe on purpose: if another Ollama instance
    # already owns the port, our process exits but the probe still succeeds.
    if ollama_process.poll() is not None:
        break  # the process died; further waiting cannot help

    if attempt < max_attempts - 1:
        print(f"⏳ Waiting for server startup... ({attempt + 1}/{max_attempts})")

if not server_ready:
    print("❌ Ollama server failed to start properly")
    print("Checking server output...")
    # The cause is normally in the last lines the server logged.
    try:
        with open(OLLAMA_LOG_PATH, errors='replace') as log_file:
            log_tail = log_file.read().strip()[-500:]
    except OSError:
        log_tail = ''
    if log_tail:
        print(f"Server error: ...{log_tail}")

print("📍 Server setup completed")

In [None]:
# Pull the required model (adjust model name as needed)
import subprocess
import time

MODEL_NAME = "gemma2:9b"  # Change this if needed


def parse_model_size(list_output, model_name):
    """Return the SIZE column of `ollama list` for *model_name*.

    The listing has whitespace-separated columns NAME, ID, SIZE, MODIFIED,
    and SIZE is itself two tokens (number and unit, e.g. "5.4 GB").

    Args:
        list_output: Text printed by `ollama list`.
        model_name: Model to look up. A name without a tag also matches the
            row listed as "<name>:latest".

    Returns:
        The size as "<number> <unit>", or None when the model is not listed
        or its row has fewer than four tokens.
    """
    wanted = {model_name, f"{model_name}:latest"}
    for line in list_output.splitlines():
        parts = line.split()
        # Compare the NAME column exactly so that e.g. "gemma2:9b" does not
        # pick up the row of a longer tag that merely contains it.
        if len(parts) >= 4 and parts[0] in wanted:
            return f"{parts[2]} {parts[3]}"
    return None


print(f"📥 Downloading {MODEL_NAME} model...")
print("⏳ This may take several minutes for large models")

start_time = time.time()

try:
    # Output is captured so the pull's progress display stays out of the notebook.
    result = subprocess.run(['ollama', 'pull', MODEL_NAME],
                            capture_output=True, text=True, timeout=600)  # 10 min timeout

    elapsed = time.time() - start_time

    if result.returncode == 0:
        print(f"✅ Model {MODEL_NAME} downloaded successfully in {elapsed:.1f}s")

        # Verify model is available and report its size
        list_result = subprocess.run(['ollama', 'list'], capture_output=True, text=True)
        size = parse_model_size(list_result.stdout, MODEL_NAME)
        if size:
            print(f"📊 Model size: {size}")

    else:
        print(f"❌ Model download failed after {elapsed:.1f}s")
        # The failure reason is printed last, after any progress output, so
        # show only the final non-empty lines to avoid spam.
        error_lines = [line for line in result.stderr.splitlines() if line.strip()][-3:]
        if error_lines:
            print("Error details:")
            for line in error_lines:
                print(f"  {line}")

except subprocess.TimeoutExpired:
    elapsed = time.time() - start_time
    print(f"⏱️  Download timeout after {elapsed:.1f}s")
    print("Large models may take longer - consider using a smaller model for testing")
except Exception as e:
    elapsed = time.time() - start_time
    print(f"❌ Download error after {elapsed:.1f}s: {e}")

In [None]:
# Test local Ollama
# (shell escape: runs `ollama list` and shows its output in the cell)
!ollama list

In [None]:
# Setup ngrok tunnel (you need to get your auth token from ngrok.com)
from pyngrok import ngrok
import os

# Get ngrok token from environment variable or prompt user
import getpass

NGROK_AUTH_TOKEN = os.getenv('NGROK_AUTH_TOKEN')

if not NGROK_AUTH_TOKEN:
    print("🔑 ngrok auth token not found in environment")
    print("📋 To set up ngrok:")
    print("   1. Get your token from: https://dashboard.ngrok.com/get-started/your-authtoken")
    print("   2. Set environment variable: NGROK_AUTH_TOKEN=your_token_here")
    print("   3. Or paste token below (not recommended for shared notebooks)")
    print()

    # Allow manual token entry (with warning). getpass is used instead of
    # input() so the secret is not echoed into the saved cell output.
    manual_token = getpass.getpass("Enter ngrok token (or press Enter to skip): ").strip()
    if manual_token:
        NGROK_AUTH_TOKEN = manual_token
        print("⚠️  WARNING: Token entered manually - avoid sharing this notebook!")
    else:
        print("❌ Skipping ngrok setup - no token provided")

if NGROK_AUTH_TOKEN:
    try:
        ngrok.set_auth_token(NGROK_AUTH_TOKEN)

        # Create tunnel to Ollama port. This is a top-level assignment, so
        # `public_tunnel` is already a global that the later cells can test
        # for with `'public_tunnel' in globals()`.
        public_tunnel = ngrok.connect(11434)

        print("\n🚀 Ollama is now publicly accessible!")
        print(f"📡 Public URL: {public_tunnel.public_url}")
        print("\n📝 Update your experiment.yaml with:")
        print('  model:')
        print(f'    base_url: "{public_tunnel.public_url}"')
        print("\n⚠️  Keep this notebook running during experiments!")

    except Exception as e:
        # Covers a rejected token as well as tunnel creation failures.
        print(f"❌ ngrok setup failed: {e}")
        print("Check your auth token and try again")
else:
    print("⏭️  Ngrok setup skipped")

In [None]:
# Test the public endpoint
import requests
import json

# Seconds to wait for the test generation. The first request has to load the
# model, which (as the timeout message below says) can take 30+ seconds, so
# the limit is set well above that instead of at exactly 30.
GENERATE_TIMEOUT_S = 120

# Check if tunnel was created successfully
if 'public_tunnel' not in globals():
    print("❌ No ngrok tunnel available - run the previous cell first")
else:
    try:
        print("🧪 Testing public Ollama endpoint...")

        # Send one non-streaming generation request through the public URL,
        # so the tunnel and the model are exercised together.
        response = requests.post(
            f"{public_tunnel.public_url}/api/generate",
            json={
                "model": MODEL_NAME,  # set in the model download cell
                "prompt": "What is 2+2?",
                "stream": False
            },
            timeout=GENERATE_TIMEOUT_S
        )

        if response.status_code == 200:
            result = response.json()
            test_response = result.get('response', 'No response field')
            print("✅ Ollama is working correctly!")
            print(f"🤖 Test response: {test_response[:100]}...")

            # Show model info
            if 'model' in result:
                print(f"📊 Model used: {result['model']}")

        else:
            print(f"❌ HTTP Error: {response.status_code}")
            print(f"Response: {response.text[:200]}...")

    except requests.exceptions.Timeout:
        print("⏱️  Request timeout - model might be loading")
        print("Large models can take 30+ seconds for first request")
    except requests.exceptions.ConnectionError:
        print("❌ Connection failed - check if Ollama server is running")
    except Exception as e:
        # Also reached when MODEL_NAME is undefined because its cell was skipped.
        print(f"❌ Test failed: {e}")

In [None]:
# Keep-alive cell - run this to prevent the notebook from timing out
import time
import datetime
import requests

if 'public_tunnel' not in globals():
    print("❌ No ngrok tunnel available - run the ngrok setup cell first")
else:
    print("🔄 Keep-alive loop started. Ollama server is running...")
    print(f"🌐 Public URL: {public_tunnel.public_url}")
    print("\n⚡ Your GPU-powered Ollama is ready for experiments!")
    print("📋 To use: Update base_url in your experiment.yaml and run experiments")
    print("\n🛑 Press interrupt button or Ctrl+C to stop")

    # Keep the server alive until the user interrupts the cell.
    try:
        counter = 0
        while True:
            counter += 1
            try:
                current_time = datetime.datetime.now().strftime("%H:%M:%S")

                # Ping Ollama every 5 minutes to keep it alive
                if counter % 30 == 0:  # Every 30 * 10s = 5 minutes
                    try:
                        response = requests.get("http://localhost:11434/api/tags", timeout=5)
                        status = "🟢 Online" if response.status_code == 200 else "🟡 Warning"
                    except requests.exceptions.RequestException:
                        # Only network failures mean "offline". A bare except
                        # here also swallowed KeyboardInterrupt, so an
                        # interrupt during the ping did not stop the loop.
                        status = "🔴 Offline"

                    print(f"[{current_time}] Ollama: {status} | Keep-alive #{counter//30}")
                elif counter % 6 == 0:  # Every minute show a dot
                    print(".", end="", flush=True)
            except Exception as e:
                # Handled inside the loop so that the loop really continues,
                # as the message says.
                print(f"\n❌ Keep-alive error: {e}")
                print("Restarting keep-alive loop...")

            time.sleep(10)  # Wait 10 seconds between checks

    except KeyboardInterrupt:
        print("\n🛑 Keep-alive stopped. Shutting down...")
        try:
            ngrok.disconnect(public_tunnel.public_url)
            print("✅ Tunnel closed.")
        except Exception:
            print("⚠️  Tunnel cleanup failed")