In [None]:
# Environment setup cell: upgrade pip, then install the notebook's Python dependencies.
# NOTE(review): `python.exe` looks Windows-specific — confirm before running on another OS.
# NOTE(review): `requests` is imported by the next cell but is not listed below; presumably it
# is pulled in transitively by one of these packages — confirm.
!python.exe -m pip install --upgrade pip
%pip install numpy pandas pygame matplotlib kagglehub torch torchvision torchaudio transformers sentencepiece accelerate bitsandbytes ollama ipykernel jupyter notebook pyperclip openai llama-cpp-python

In [14]:
import requests
import time
from typing import List, Dict, Optional

# Base URL of the Ollama HTTP server that every request in this file targets.
OLLAMA_HOST = "http://localhost:11434"

# Default performance settings tuned for ~32 GB VRAM.
# Only the batch size differs between models, so the table is generated from
# (model name, num_batch) pairs; context size and generation cap are shared.
_NUM_BATCH_BY_MODEL = (
    ("qwen3:30b", 4096),
    ("deepseek-r1:14b", 6144),
    ("llama3.1:8b", 6144),
)
MODEL_DEFAULTS = {
    model_name: {"num_ctx": 16384, "num_batch": num_batch, "num_predict": 2048}
    for model_name, num_batch in _NUM_BATCH_BY_MODEL
}

def list_models() -> List[str]:
    """List the models installed in the local Ollama server.

    Queries the ``/api/tags`` endpoint under ``OLLAMA_HOST``, prints one
    ``[index] name`` line per model, and returns the names in the order the
    server reported them. The printed index is the value ``run_model`` expects
    as ``model_index``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(f"{OLLAMA_HOST}/api/tags", timeout=30)
    response.raise_for_status()

    installed = response.json().get("models", [])
    names = [entry["name"] for entry in installed]

    for position, model_name in enumerate(names):
        print(f"[{position}] {model_name}")
    return names

def run_model(
    prompt: str,
    model_index: int = 0,
    messages: Optional[List[Dict[str, str]]] = None,
    options: Optional[Dict] = None,
    timeout: int = 600,
) -> str:
    """Send a non-streaming chat request to an installed Ollama model.

    The model is selected by its position in the list returned by
    ``list_models()``, which is called on every invocation and prints the
    available models as a side effect.

    Args:
        prompt: User message. Only used when ``messages`` is None; when an
            explicit conversation is passed, ``prompt`` is ignored.
        model_index: Zero-based index into the installed-model list.
        messages: Complete chat history as ``{"role": ..., "content": ...}``
            dicts. Defaults to a fixed system message followed by ``prompt``
            as the user turn.
        options: Per-call Ollama options. These take precedence over the
            ``MODEL_DEFAULTS`` entry for the model, which in turn takes
            precedence over the built-in base options.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The ``content`` of the assistant message in the response.

    Raises:
        RuntimeError: If Ollama reports no installed models.
        IndexError: If ``model_index`` is outside the installed-model list.
        requests.HTTPError: If the server answers with an error status.
    """
    models = list_models()
    if not models:
        raise RuntimeError("No models installed in Ollama.")

    if not 0 <= model_index < len(models):
        raise IndexError(f"Invalid model index {model_index}, available 0..{len(models)-1}")

    model = models[model_index]
    print(f"\nUsing model [{model_index}] {model}")

    # Build the default two-message conversation only when the caller did not
    # supply one; a caller-provided history is sent unchanged.
    if messages is None:
        messages = [
            {"role": "system", "content": "I respond in form of a python dictionary."},
            {"role": "user", "content": prompt},
        ]

    # Layered options: base settings < per-model defaults < per-call overrides.
    base_opts = {
        "num_gpu": 999,
        "num_thread": 16,
        "temperature": 0.6,
        "top_p": 0.9,
    }
    opts = {**base_opts, **MODEL_DEFAULTS.get(model, {}), **(options or {})}

    payload = {"model": model, "messages": messages, "options": opts, "stream": False}
    url = f"{OLLAMA_HOST}/api/chat"

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments during a long generation.
    t0 = time.perf_counter()
    r = requests.post(url, json=payload, timeout=timeout)
    r.raise_for_status()
    dt = time.perf_counter() - t0

    data = r.json()
    text = data["message"]["content"]

    # Raw response payload, kept for inspection.
    print(data)
    # One-line summary so the measured latency is actually reported.
    print(f"[{model}] Elapsed {dt:.2f}s, {len(text)} chars")
    return text

# --- Example usage ---
# Demo: sends one prompt to a locally installed model and prints the reply.
if __name__ == "__main__":
    # Reminder only: nothing is listed here. The installed models are printed
    # by run_model() below, through its call to list_models().
    print("\nSelect a model by index when calling run_model().")

    # Example: run on index 1 (deepseek-r1:14b in the recorded run; the index
    # depends on which models are installed locally).
    prompt = "Tell me a way to measure your model's performance."
    reply = run_model(prompt, model_index=1)
    print("\nModel reply:\n", reply)



Select a model by index when calling run_model().
[0] qwen3:30b
[1] deepseek-r1:14b
[2] llama3.1:8b

Using model [1] deepseek-r1:14b
{'model': 'deepseek-r1:14b', 'created_at': '2025-09-03T08:11:18.4731303Z', 'message': {'role': 'assistant', 'content': '<think>\nOkay, so I need to figure out how to measure the performance of a machine learning model. Hmm, where do I start? I remember from my studies that there are different types of metrics depending on whether it\'s classification or regression.\n\nLet me think about classification first. There\'s accuracy, which is just the percentage of correct predictions. But wait, isn\'t accuracy not always the best measure? Like, in cases where the classes are imbalanced, accuracy can be misleading. So maybe precision and recall are better. Precision is the ratio of correctly predicted positive observations to the total predicted positives. Recall is the ratio of correctly predicted positive observations to the actual positives. Oh right, and th