In [None]:
%pip install langchain transformers accelerate

: 

In [2]:
import time
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate

In [15]:
# Ollama model tags to benchmark, in the order they will be run.
MODELS = [
    "llama3.1:8b",
    "qwen2.5:latest",
    "gemma2:2b",
]

In [9]:
# Prompts sent to every model; each one is repeated NUM_RUNS times.
PROMPTS = ["What are the differences between CPU and GPU?"]

In [10]:
# How many times each prompt is sent to a model; timings and token counts
# are averaged over len(PROMPTS) * NUM_RUNS invocations.
NUM_RUNS = 3  # number of repetitions per prompt

In [11]:
def _verdict_for_tpm(tpm: float) -> str:
    """Map a tokens-per-minute figure to the human-readable verdict string."""
    if tpm > 2500:
        return "🚀 Excellent performance — suitable for fast local inference"
    if tpm > 1500:
        return "👍 Good performance — suitable for general use"
    if tpm > 1000:
        return "⚠️ Moderate — acceptable but may lag under load"
    return "🐢 Slow — only suitable for light or occasional usage"


def benchmark_model(model_name: str):
    """Benchmark one Ollama model over every prompt in PROMPTS.

    Each prompt is sent NUM_RUNS times with temperature 0.0. The "token"
    count is only a rough proxy: the number of whitespace-separated words in
    the prompt plus the number in the response.

    Args:
        model_name: Model identifier passed to ``Ollama(model=...)``.

    Returns:
        A dict with keys ``model`` (the name given), ``latency`` (average
        seconds per invocation), ``tokens`` (average word count per
        invocation), ``tpm`` (words per minute) and ``verdict`` (a label
        derived from ``tpm``). Numeric values are rounded to 2 decimals.

    Raises:
        ValueError: If PROMPTS is empty or NUM_RUNS is not positive, since
            there would be nothing to average.
    """
    total_tasks = len(PROMPTS) * NUM_RUNS
    if total_tasks <= 0:
        # Fail early with a clear message instead of a ZeroDivisionError
        # after the model has already been set up.
        raise ValueError(
            "Nothing to benchmark: PROMPTS must be non-empty and NUM_RUNS must be positive"
        )

    print(f"\n⏱️ Benchmarking: {model_name}")
    llm = Ollama(model=model_name, temperature=0.0)

    total_time = 0.0
    total_tokens = 0

    for prompt in PROMPTS:
        # The prompt's word count does not change between repetitions.
        prompt_tokens = len(prompt.split())
        for _ in range(NUM_RUNS):
            # perf_counter is monotonic and high-resolution, so the interval
            # cannot be distorted by system clock adjustments.
            start = time.perf_counter()
            response = llm.invoke(prompt)
            elapsed = time.perf_counter() - start

            total_time += elapsed
            total_tokens += prompt_tokens + len(response.split())  # rough total

    avg_time = total_time / total_tasks
    avg_tokens = total_tokens / total_tasks
    # If the timer could not resolve any elapsed time, report unbounded
    # throughput rather than raising ZeroDivisionError.
    tpm = (avg_tokens / avg_time) * 60 if avg_time > 0 else float("inf")

    print(f"✅ Avg Time: {avg_time:.2f}s | Tokens: {avg_tokens:.2f} | TPM: {tpm:.2f}")

    return {
        "model": model_name,
        "latency": round(avg_time, 2),
        "tokens": round(avg_tokens, 2),
        "tpm": round(tpm, 2),
        "verdict": _verdict_for_tpm(tpm),
    }

In [12]:
def main():
    """Benchmark every model in MODELS and print a per-model summary.

    A model whose benchmark raises is reported on its own line and left out
    of the summary; the remaining models still run.
    """
    summaries = []
    for name in MODELS:
        try:
            summaries.append(benchmark_model(name))
        except Exception as err:
            print(f"❌ {name} failed: {err}")

    print("\n📊 Benchmark Summary:\n")
    for entry in summaries:
        report_lines = [
            f"{entry['model']}:",
            f"   ⏱ Avg Latency: {entry['latency']}s",
            f"   🧠 Avg Tokens: {entry['tokens']}",
            f"   🚦 TPM: {entry['tpm']}",
            f"   ✅ Verdict: {entry['verdict']}",
        ]
        # The trailing newline plus print's own newline leaves one blank
        # line between consecutive model reports.
        print("\n".join(report_lines) + "\n")

In [16]:
# Entry point: run the full benchmark only when this code is the top-level
# program (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


⏱️ Benchmarking: llama3.1:8b
✅ Avg Time: 72.09s | Tokens: 345.33 | TPM: 287.40

⏱️ Benchmarking: qwen2.5:latest
✅ Avg Time: 74.10s | Tokens: 385.67 | TPM: 312.29

⏱️ Benchmarking: gemma2:2b
✅ Avg Time: 38.79s | Tokens: 384.33 | TPM: 594.46

📊 Benchmark Summary:

llama3.1:8b:
   ⏱ Avg Latency: 72.09s
   🧠 Avg Tokens: 345.33
   🚦 TPM: 287.4
   ✅ Verdict: 🐢 Slow — only suitable for light or occasional usage

qwen2.5:latest:
   ⏱ Avg Latency: 74.1s
   🧠 Avg Tokens: 385.67
   🚦 TPM: 312.29
   ✅ Verdict: 🐢 Slow — only suitable for light or occasional usage

gemma2:2b:
   ⏱ Avg Latency: 38.79s
   🧠 Avg Tokens: 384.33
   🚦 TPM: 594.46
   ✅ Verdict: 🐢 Slow — only suitable for light or occasional usage

