In [31]:
import time, json
from statistics import mean, stdev
from typing import Dict, List, Tuple
import pandas as pd
import psutil
import ollama

In [32]:
# ========= Config =========
PROMPT = "Olá, como você está?"
MODELS = ["Gemma3:4b", "deepseek-r1:8b"]  # None => usa todos instalados (ollama.list())
N_RUNS = 3
SAMPLE_INTERVAL_SEC = 0.0005

# CSVs (opcional)
OUT_CSV_SUMMARY_RAM = "resumo_RAM.csv"
OUT_CSV_RUNS_RAM    = "runs_RAM.csv"
OUT_CSV_SUMMARY_VRAM = "resumo_VRAM.csv"
OUT_CSV_RUNS_VRAM    = "runs_VRAM.csv"

In [33]:
# ========= NVML / GPU =========
NVML_OK = False
NVML_MSG = "NVML não inicializado"
try:
    import pynvml
    try:
        pynvml.nvmlInit()
        NVML_OK = True
        NVML_MSG = f"NVML OK (versão {pynvml.nvmlSystemGetNVMLVersion()})"
    except Exception as e:
        NVML_MSG = f"Falha nvmlInit: {e!r}"
except Exception as e:
    NVML_MSG = f"Falha import pynvml: {e!r}"

print(NVML_MSG)
if not NVML_OK:
    raise RuntimeError("NVML indisponível. Verifique driver NVIDIA e instalação do pynvml.")


NVML OK (versão 12.576.88)


In [None]:
def list_ollama_pids() -> List[int]:
    pids = []
    for proc in psutil.process_iter(attrs=["pid","name"]):
        try:
            if "ollama" in (proc.info.get("name") or "").lower():
                pids.append(proc.info["pid"])
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass
    return sorted(set(pids))

def ram_mb_for_pids(pids: List[int]) -> float:
    total = 0
    for pid in pids:
        try:
            total += psutil.Process(pid).memory_info().rss
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass
    return total / (1024**2)

def vram_used_mb_all_gpus() -> Dict[int, float]:
    out: Dict[int, float] = {}
    if not NVML_OK:
        return out
    try:
        n = pynvml.nvmlDeviceGetCount()
        for i in range(n):
            h = pynvml.nvmlDeviceGetHandleByIndex(i)
            mem = pynvml.nvmlDeviceGetMemoryInfo(h)
            out[i] = mem.used / (1024**2)
    except Exception:
        return {}
    return out

def vram_peak_delta_mb_during(stream_iter, sample_interval_sec=0.05):
    baseline = vram_used_mb_all_gpus()
    peak_by_gpu = {k: 0.0 for k in baseline.keys()}
    last_sample = time.perf_counter()

    def sample():
        current = vram_used_mb_all_gpus()
        for gi in set(baseline.keys()) | set(current.keys()):
            b = baseline.get(gi, 0.0)
            c = current.get(gi, 0.0)
            delta = max(0.0, c - b)
            peak_by_gpu[gi] = max(peak_by_gpu.get(gi, 0.0), delta)

    sample()  # <-- amostra inicial (novo)

    for _ in stream_iter:
        now = time.perf_counter()
        if (now - last_sample) >= sample_interval_sec:
            sample()
            last_sample = now

    sample()  # <-- amostra final (já tinha, mantenha)

    peak_global = max(peak_by_gpu.values()) if peak_by_gpu else None
    return peak_global, peak_by_gpu

def get_models() -> List[str]:
    if MODELS:
        return MODELS
    tags = ollama.list()
    ms = [m["name"] for m in tags.get("models", []) if "name" in m]
    if not ms:
        raise RuntimeError("Nenhum modelo em `ollama list`.")
    return ms

In [35]:
# ========= Benchmark RAM =========
def run_once_ram(model: str, prompt: str) -> dict:
    pids = list_ollama_pids()
    peak_ram_mb = 0.0
    start = time.perf_counter()
    last_sample = start

    stream = ollama.generate(model=model, prompt=prompt, stream=True)
    text_chunks = []
    for chunk in stream:
        if "response" in chunk:
            text_chunks.append(chunk["response"])
        now = time.perf_counter()
        if pids and (now - last_sample) >= SAMPLE_INTERVAL_SEC:
            peak_ram_mb = max(peak_ram_mb, ram_mb_for_pids(pids))
            last_sample = now

    if pids:
        peak_ram_mb = max(peak_ram_mb, ram_mb_for_pids(pids))

    elapsed_wall = time.perf_counter() - start
    return {
        "model": model,
        "elapsed_wall_sec": elapsed_wall,
        "peak_ram_mb": round(peak_ram_mb, 2) if peak_ram_mb else None,
        "output_preview": ("".join(text_chunks))[:200]
    }

def best_of_n_ram(model: str, prompt: str, n: int):
    runs = []
    for i in range(1, n+1):
        print(f" - {model}: rodada {i}/{n}…")
        runs.append(run_once_ram(model, prompt))
    ram_vals = [r["peak_ram_mb"] for r in runs if r["peak_ram_mb"] is not None]
    t_vals = [r["elapsed_wall_sec"] for r in runs if r["elapsed_wall_sec"] is not None]
    return runs, {
        "model": model,
        "best_peak_ram_mb": min(ram_vals) if ram_vals else None,
        "mean_peak_ram_mb": mean(ram_vals) if ram_vals else None,
        "std_peak_ram_mb": stdev(ram_vals) if len(ram_vals) > 1 else None,
        "best_elapsed_wall_sec": min(t_vals) if t_vals else None,
        "mean_elapsed_wall_sec": mean(t_vals) if t_vals else None,
        "std_elapsed_wall_sec": stdev(t_vals) if len(t_vals) > 1 else None,
    }

# ========= Benchmark VRAM =========
def run_once_vram(model: str, prompt: str) -> dict:
    start = time.perf_counter()
    stream = ollama.generate(model=model, prompt=prompt, stream=True)
    text_preview = []

    def _iter():
        for chunk in stream:
            if "response" in chunk:
                text_preview.append(chunk["response"])
            yield chunk

    peak_vram_delta_mb, peak_vram_delta_detail = vram_peak_delta_mb_during(_iter(), SAMPLE_INTERVAL_SEC)
    elapsed_wall = time.perf_counter() - start

    return {
        "model": model,
        "elapsed_wall_sec": elapsed_wall,
        "peak_vram_mb": round(peak_vram_delta_mb, 2) if peak_vram_delta_mb else None,
        "peak_vram_detail": json.dumps({k: round(v,2) for k,v in peak_vram_delta_detail.items()}),
        "output_preview": ("".join(text_preview))[:200]
    }

def best_of_n_vram(model: str, prompt: str, n: int):
    runs = []
    for i in range(1, n+1):
        print(f" - {model}: rodada {i}/{n}…")
        runs.append(run_once_vram(model, prompt))
    vram_vals = [r["peak_vram_mb"] for r in runs if r["peak_vram_mb"] is not None]
    t_vals = [r["elapsed_wall_sec"] for r in runs if r["elapsed_wall_sec"] is not None]
    return runs, {
        "model": model,
        "best_peak_vram_mb": min(vram_vals) if vram_vals else None,
        "mean_peak_vram_mb": mean(vram_vals) if vram_vals else None,
        "std_peak_vram_mb": stdev(vram_vals) if len(vram_vals) > 1 else None,
        "best_elapsed_wall_sec": min(t_vals) if t_vals else None,
        "mean_elapsed_wall_sec": mean(t_vals) if t_vals else None,
        "std_elapsed_wall_sec": stdev(t_vals) if len(t_vals) > 1 else None,
    }


In [36]:
# ========= Execução RAM =========
models = get_models()
print("Modelos:", models)

all_runs_rows, summary_rows = [], []
for m in models:
    print(f"\n=== {m} — RAM ===")
    runs, summary = best_of_n_ram(m, PROMPT, N_RUNS)
    for idx, r in enumerate(runs, 1):
        all_runs_rows.append({**r, "run": idx})
    summary_rows.append(summary)

df_ram_runs = pd.DataFrame(all_runs_rows)
df_ram_summary = pd.DataFrame(summary_rows)

if OUT_CSV_RUNS_RAM: df_ram_runs.to_csv(OUT_CSV_RUNS_RAM, index=False)
if OUT_CSV_SUMMARY_RAM: df_ram_summary.to_csv(OUT_CSV_SUMMARY_RAM, index=False)

display(df_ram_summary, df_ram_runs)

Modelos: ['Gemma3:4b', 'deepseek-r1:8b']

=== Gemma3:4b — RAM ===
 - Gemma3:4b: rodada 1/3…
 - Gemma3:4b: rodada 2/3…
 - Gemma3:4b: rodada 3/3…

=== deepseek-r1:8b — RAM ===
 - deepseek-r1:8b: rodada 1/3…
 - deepseek-r1:8b: rodada 2/3…
 - deepseek-r1:8b: rodada 3/3…


Unnamed: 0,model,best_peak_ram_mb,mean_peak_ram_mb,std_peak_ram_mb,best_elapsed_wall_sec,mean_elapsed_wall_sec,std_elapsed_wall_sec
0,Gemma3:4b,263.14,993.84,632.869473,0.284039,0.854702,0.916968
1,deepseek-r1:8b,300.88,835.42,463.274259,3.409997,4.506795,1.185375


Unnamed: 0,model,elapsed_wall_sec,peak_ram_mb,output_preview,run
0,Gemma3:4b,1.912424,263.14,"Olá! Eu estou bem, obrigado por perguntar! Com...",1
1,Gemma3:4b,0.367642,1368.24,"Olá! Eu estou bem, obrigado por perguntar! Com...",2
2,Gemma3:4b,0.284039,1350.14,"Olá! Eu estou bem, obrigado por perguntar! Com...",3
3,deepseek-r1:8b,5.764341,300.88,"<think>\nAh, o usuário começou com um “Olá, co...",1
4,deepseek-r1:8b,3.409997,1120.67,"<think>\nAh, o usuário começou com um cumprime...",2
5,deepseek-r1:8b,4.346047,1084.71,"<think>\nAh, o usuário deu um oi simples em po...",3


In [37]:
# ========= Execução VRAM =========
all_runs_rows, summary_rows = [], []
for m in models:
    print(f"\n=== {m} — VRAM ===")
    runs, summary = best_of_n_vram(m, PROMPT, N_RUNS)
    for idx, r in enumerate(runs, 1):
        all_runs_rows.append({**r, "run": idx})
    summary_rows.append(summary)

df_vram_runs = pd.DataFrame(all_runs_rows)
df_vram_summary = pd.DataFrame(summary_rows)

if OUT_CSV_RUNS_VRAM: df_vram_runs.to_csv(OUT_CSV_RUNS_VRAM, index=False)
if OUT_CSV_SUMMARY_VRAM: df_vram_summary.to_csv(OUT_CSV_SUMMARY_VRAM, index=False)

display(df_vram_summary, df_vram_runs)


=== Gemma3:4b — VRAM ===
 - Gemma3:4b: rodada 1/3…
 - Gemma3:4b: rodada 2/3…
 - Gemma3:4b: rodada 3/3…

=== deepseek-r1:8b — VRAM ===
 - deepseek-r1:8b: rodada 1/3…
 - deepseek-r1:8b: rodada 2/3…
 - deepseek-r1:8b: rodada 3/3…


Unnamed: 0,model,best_peak_vram_mb,mean_peak_vram_mb,std_peak_vram_mb,best_elapsed_wall_sec,mean_elapsed_wall_sec,std_elapsed_wall_sec
0,Gemma3:4b,,,,0.227735,0.80971,0.961347
1,deepseek-r1:8b,1.06,300.273333,516.24482,2.000494,3.526182,1.588496


Unnamed: 0,model,elapsed_wall_sec,peak_vram_mb,peak_vram_detail,output_preview,run
0,Gemma3:4b,1.919334,,"{""0"": 0.0}","Olá! Eu estou bem, obrigado por perguntar! Com...",1
1,Gemma3:4b,0.227735,,"{""0"": 0.0}","Olá! Eu estou bem, obrigado por perguntar! Com...",2
2,Gemma3:4b,0.28206,,"{""0"": 0.0}","Olá! Eu estou bem, obrigado por perguntar! Com...",3
3,deepseek-r1:8b,5.1708,896.38,"{""0"": 896.38}",<think>\nHmm…… o usuário começou com um cumpri...,1
4,deepseek-r1:8b,2.000494,1.06,"{""0"": 1.06}","<think>\nAh, o usuário começou com um “Olá” su...",2
5,deepseek-r1:8b,3.407253,3.38,"{""0"": 3.38}","<think>\nAh, o usuário começou com um cumprime...",3
