# Check GPU

In [None]:
# Print the NVIDIA GPU status reported by `nvidia-smi` for this runtime.
!nvidia-smi

# Install Dependencies

In [None]:
!apt update -y -qq
!apt install -y -qq curl lshw libcairo2-dev pkg-config python3-dev
!curl https://ollama.ai/install.sh | sh

!pip install flask -q
!pip install pyngrok -q
!pip install requests -q
!pip install flask-cors -q
!apt update -y -qq
!apt install -y -qq curl lshw libcairo2-dev pkg-config python3-dev zstd
!curl https://ollama.ai/install.sh | sh

# Configuration

Set your ngrok auth token below; it is required to open the public tunnel that exposes this notebook's API.

In [None]:
import os

NGROK_AUTH_TOKEN = '' #@param {type:'string'}
OLLAMA_URL = 'http://127.0.0.1:11434' #@param {type:'string'}

# Avoid having to save a secret inside the notebook: when the form field is
# left empty, fall back to the NGROK_AUTH_TOKEN environment variable.
NGROK_AUTH_TOKEN = NGROK_AUTH_TOKEN.strip() or os.environ.get('NGROK_AUTH_TOKEN', '').strip()

# Endpoints are built as f'{OLLAMA_URL}/api/...'; drop trailing slashes so a
# value such as 'http://host:11434/' does not produce '//api/...'.
OLLAMA_URL = OLLAMA_URL.rstrip('/')

# Start Ollama Service

This cell starts the Ollama service, waits until it responds, and then pulls the models used by this notebook (`gemma2:9b`, `llama3.1:8b`, and `zephyr`).

In [None]:
import subprocess
import time
import requests

ollama_process = subprocess.Popen(
    ["ollama", "serve"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE
)

print("Waiting for Ollama to start...")
max_attempts = 30
for i in range(max_attempts):
    try:
        response = requests.get(f'{OLLAMA_URL}/api/tags', timeout=1)
        if response.status_code == 200:
            print("‚úì Ollama service is ready!")
            break
    except:
        pass
    time.sleep(1)
    if i == max_attempts - 1:
        print("‚ö† Warning: Ollama may not have started properly")

print("Recommended models:\n")
print("  ‚Ä¢ llama3.1:8b      - Fast, good quality (4.7GB)")
print("  ‚Ä¢ mistral:7b       - Fast, efficient (4.1GB)")
print("  ‚Ä¢ gemma2:9b        - Very good quality (5.5GB)")
print("  ‚Ä¢ qwen2.5:7b       - Great for coding (4.7GB)")
print("  ‚Ä¢ llama3.2:3b      - Ultra fast, smaller (2GB)")

!ollama pull gemma2:9b
!ollama pull llama3.1:8b
!ollama pull zephyr

print("\n‚úì Model ready!")
print("\nTo pull additional models later, use:")
print("  !ollama pull <model-name>")

# Main Code

In [None]:
from flask import Flask, request, Response, jsonify
import json
import subprocess
import requests
from pyngrok import ngrok
from flask_cors import CORS
import os
from urllib.parse import urlencode

In [None]:
# Fail fast with an actionable message: the token defaults to an empty string
# in the Configuration cell, and without it the tunnel cannot be authenticated.
if not NGROK_AUTH_TOKEN:
    raise ValueError(
        "NGROK_AUTH_TOKEN is empty. Set it in the Configuration cell before running this cell."
    )

# Flask application that proxies requests to the local Ollama server.
app = Flask(__name__)
# Enable cross-origin requests on all routes (flask-cors defaults).
CORS(app)
# Register the token with the ngrok agent used to open the tunnel.
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

@app.route('/api/generate', methods=['POST'])
def generate_completion():
    """Proxy one non-streaming completion request to Ollama's /api/generate.

    Parameters are read from a JSON object body, falling back to form fields:
      model        Model name; defaults to 'zephyr'.
      prompt       Required prompt text.
      persona      System prompt; defaults to the built-in 2B persona.
      temperature  Sampling temperature; defaults to 0.8. An explicit 0 is honored.

    Returns:
      Ollama's JSON response with Ollama's HTTP status code, or
      400 for a missing prompt or a non-numeric temperature,
      503 if Ollama is unreachable, 504 if it times out,
      500 for any other failure.
    """
    try:
        # silent=True: malformed JSON yields None instead of raising, so it is
        # reported as a missing prompt (400) rather than a 500. Non-object JSON
        # (list, string, null) is treated as "no JSON parameters".
        payload = request.get_json(silent=True) if request.is_json else None
        data = payload if isinstance(payload, dict) else {}

        # JSON first, then form data, then the default.
        model = data.get('model') or request.form.get('model') or 'zephyr'
        prompt = data.get('prompt') or request.form.get('prompt')
        persona = data.get('persona') or request.form.get('persona') or "You are 2B from NieR Automata. Answer as 2B, the assistant, only."

        if not prompt:
            return jsonify({"error": "No prompt provided"}), 400

        # Only None/'' count as "not provided": a plain `or` chain would treat
        # a legitimate temperature of 0 as missing and replace it with 0.8.
        temperature = 0.8
        for candidate in (data.get('temperature'), request.form.get('temperature')):
            if candidate is not None and candidate != '':
                temperature = candidate
                break
        try:
            temperature = float(temperature)
        except (TypeError, ValueError):
            return jsonify({"error": "temperature must be a number"}), 400

        json_data = {
            "model": model,
            "prompt": prompt,
            "system": persona,
            "options": {"temperature": temperature},
            "stream": False
        }

        # json= already sets the Content-Type header.
        response = requests.post(f'{OLLAMA_URL}/api/generate', json=json_data, timeout=120)

        # Forward Ollama's status so upstream errors are not reported as 200.
        return jsonify(response.json()), response.status_code

    except requests.exceptions.ConnectionError:
        return jsonify({"error": "Cannot connect to Ollama service. Make sure it's running."}), 503
    except requests.exceptions.Timeout:
        return jsonify({"error": "Ollama service timed out."}), 504
    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route('/api/pull', methods=['POST'])
def pull_model():
    """Ask Ollama to download a model via its /api/pull endpoint (non-streaming).

    Parameters are read from a JSON object body, falling back to form fields:
      name  Required name of the model to pull.

    Returns:
      Ollama's JSON response with Ollama's HTTP status code, or
      400 if no model name was given, 503 if Ollama is unreachable,
      504 if the pull does not finish within the request timeout,
      500 for any other failure.
    """
    try:
        # silent=True: malformed JSON yields None instead of raising, so it is
        # reported as a missing name (400) rather than a 500. Non-object JSON
        # is treated as "no JSON parameters".
        payload = request.get_json(silent=True) if request.is_json else None
        data = payload if isinstance(payload, dict) else {}
        model_name = data.get('name') or request.form.get('name')

        if not model_name:
            return jsonify({"error": "No model name provided"}), 400

        json_data = {"name": model_name, "stream": False}
        # json= already sets the Content-Type header.
        response = requests.post(f'{OLLAMA_URL}/api/pull', json=json_data, timeout=300)

        # Forward Ollama's status so upstream errors are not reported as 200.
        return jsonify(response.json()), response.status_code

    except requests.exceptions.ConnectionError:
        return jsonify({"error": "Cannot connect to Ollama service. Make sure it's running."}), 503
    except requests.exceptions.Timeout:
        return jsonify({"error": "Ollama service timed out."}), 504
    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route('/api/tags', methods=['GET'])
def list_models():
    """List available models by proxying Ollama's /api/tags endpoint.

    Returns:
      Ollama's JSON response with Ollama's HTTP status code, or 503 when the
      request fails or the reply is not valid JSON.
    """
    try:
        response = requests.get(f'{OLLAMA_URL}/api/tags', timeout=10)
        # Forward Ollama's status so upstream errors are not reported as 200.
        return jsonify(response.json()), response.status_code
    except (requests.exceptions.RequestException, ValueError):
        # Catch only request/decoding failures; a bare `except:` would also
        # swallow KeyboardInterrupt and SystemExit.
        return jsonify({"error": "Cannot connect to Ollama service"}), 503

@app.route('/health', methods=['GET'])
def health_check():
    """Health check endpoint.

    Probes Ollama's /api/tags endpoint.

    Returns:
      200 with status "healthy" when the probe answers with HTTP 200;
      503 with status "unhealthy" when the probe fails or answers with any
      other status.
    """
    try:
        response = requests.get(f'{OLLAMA_URL}/api/tags', timeout=5)
        if response.status_code == 200:
            return jsonify({"status": "healthy", "ollama": "connected"})
    except requests.exceptions.RequestException:
        # Unreachable or timed out: fall through to the unhealthy response.
        # (A bare `except:` would also swallow KeyboardInterrupt/SystemExit.)
        pass
    return jsonify({"status": "unhealthy", "ollama": "disconnected"}), 503

# Port shared by the ngrok tunnel and the Flask server; keeping it in one
# place prevents the two from drifting apart.
FLASK_PORT = 5000

# Start ngrok tunnel
print("\n" + "="*60)
print("STARTING NGROK TUNNEL")
print("="*60)
http_tunnel = ngrok.connect(FLASK_PORT)
public_url = http_tunnel.public_url
print(f"\n🌐 Public URL: {public_url}")
print("\n📝 Add this to your .env file:")
print(f"   OLLAMA_BASE_URL={public_url}")
print("\n" + "="*60)
print("Available endpoints:")
print(f"  • POST {public_url}/api/generate")
print(f"  • POST {public_url}/api/pull")
print(f"  • GET  {public_url}/api/tags")
print(f"  • GET  {public_url}/health")
print("="*60 + "\n")

# Run Flask app (blocks this cell until the server is interrupted).
app.run(host='0.0.0.0', port=FLASK_PORT)