# In [None]:
# 🧪 Developer Lab Notebook
# Experiment: Measure Inference Cost and GPU Usage

import pandas as pd
import numpy as np
import torch
import time
import gc
from pathlib import Path
from core import AnnotatorPipeline

# Load Model: build the pipeline object imported from `core`.
annotator = AnnotatorPipeline()

# Input genome: point this at a small FNA/FASTA file to benchmark.
genome_file = './webtool/developer-lab/sample_small_genome.fna'  # <-- Replace with your test file

# Run parameters passed to annotator.pipeline() further down.
output_format = "CSV"
uuid = "test-inference-cost"

# Task registry keyed by the run id; starts with a single zeroed/empty entry.
tasks = {
    uuid: dict(progress=0, status="", result="", exec_state={}),
}

# Function to measure GPU memory
def get_gpu_memory_mb() -> float:
    """Return the GPU memory currently allocated by PyTorch, in MiB.

    Reads ``torch.cuda.memory_allocated()`` (bytes) and divides by
    ``1024 ** 2``.

    Returns:
        The allocated memory in MiB as a float, or ``0.0`` when CUDA is not
        available, so callers always receive the same type.
    """
    if not torch.cuda.is_available():
        return 0.0
    return torch.cuda.memory_allocated() / (1024 ** 2)

# Clear GPU memory before starting.
# Run the Python garbage collector first so tensors that are only kept alive
# by reference cycles are released before the CUDA cache is emptied.
gc.collect()
cuda_available = torch.cuda.is_available()
if cuda_available:
    torch.cuda.empty_cache()
    # Reset the peak counter so max_memory_allocated() below reflects only
    # what is allocated from this point on.
    torch.cuda.reset_peak_memory_stats()

# ----------------------------
# 1. Measure Inference Time
# ----------------------------
start_gpu_memory = get_gpu_memory_mb()

if cuda_available:
    # Drain any pending GPU work so it is not attributed to the pipeline.
    torch.cuda.synchronize()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the wall clock is adjusted.
start_time = time.perf_counter()
result_path = annotator.pipeline(Path(genome_file), output_format, tasks, uuid, logging=None)
if cuda_available:
    # CUDA kernels are launched asynchronously; wait for them to finish so the
    # timer covers the GPU work and not just the kernel launches.
    torch.cuda.synchronize()
end_time = time.perf_counter()

end_gpu_memory = get_gpu_memory_mb()
if cuda_available:
    peak_gpu_memory = torch.cuda.max_memory_allocated() / (1024 ** 2)
else:
    peak_gpu_memory = 0.0

# ----------------------------
# 2. Results
# ----------------------------
total_time_sec = end_time - start_time
# Peak allocation above the pre-run baseline. The difference between the
# allocated memory after and before the call only counts tensors that are
# still alive once the pipeline has returned, so it misses temporary
# allocations made and freed during the run.
max_gpu_used_mb = peak_gpu_memory - start_gpu_memory

print("✅ Inference completed!")
# A single run is timed, so this is not an average.
print(f"Inference Time (single run): {total_time_sec:.2f} seconds")
print(f"Peak GPU Memory Usage: {max_gpu_used_mb:.2f} MB")
print(f"Output saved at: {result_path}")

# ----------------------------
# 3. Summary Table
# ----------------------------
import matplotlib.pyplot as plt

# Two-row table: one row per measured quantity.
metrics = {
    "Metric": ["Inference Time (s)", "GPU Memory Usage (MB)"],
    "Value": [total_time_sec, max_gpu_used_mb],
}
df_metrics = pd.DataFrame(metrics)
print(df_metrics)

# Bar chart of the same table; decorate the Axes returned by pandas directly.
ax = df_metrics.plot.bar(x="Metric", y="Value", legend=False)
ax.set_title("Inference Cost Metrics")
ax.set_ylabel("Value")
plt.xticks(rotation=0)
ax.grid(axis='y')
plt.show()

# ----------------------------
# 4. (Optional) Discussion Ideas
# ----------------------------

# Text template with two positional slots: elapsed seconds, then GPU MB.
discussion_template = """
📚 Discussion:
- **Observed Time**: {:.2f} seconds per genome.
- **Observed GPU Usage**: {:.2f} MB (rough).

✅ Possible optimizations:
- Use **Mixed Precision Inference** (float16 instead of float32) with PyTorch `autocast`.
- Use **ONNX export** + TensorRT for optimized inference.
- Apply **model quantization** (8-bit) to reduce memory footprint.
- **Batching larger inputs** to maximize GPU parallelism.

⚡ Potential future work: Add a lightweight distilled model for faster annotation on CPUs.
"""
print(discussion_template.format(total_time_sec, max_gpu_used_mb))
