In [None]:
#| default_exp core

# API

In [2]:
#| export
from fastcore.utils import *
from pynvml import *
from psutil import Process
from dataclasses import dataclass

In [3]:
#| export
class NVML:
    "Convenient access to `pynvml` (the library behind `nvidia-smi`)"
    def __init__(self):
        # Set the flag before `nvmlInit` so that `__del__` can tell whether there is anything to
        # shut down, even when initialisation raises and leaves the instance half-built.
        self._initialized = False
        nvmlInit()
        self._initialized = True
        self.driver_version = nvmlSystemGetDriverVersion()
        # The integer version is decoded as major = cv // 1000, minor = (cv % 1000) // 10
        cv = nvmlSystemGetCudaDriverVersion()
        self.cuda_version = f"{cv // 1000}.{(cv % 1000) // 10}"

    def __getitem__(self, i):
        "Return the NVML device handle for GPU index `i`"
        return nvmlDeviceGetHandleByIndex(i)

    def __del__(self):
        "Shut NVML down, but only if this instance initialised it successfully"
        if not getattr(self, '_initialized', False): return
        self._initialized = False
        # Finalizers may run during interpreter teardown, when NVML or the module globals are
        # already gone; an exception raised here could not be handled by anyone, so drop it.
        try: nvmlShutdown()
        except Exception: pass

In [4]:
# Create the `NVML` object used by the examples below and show the driver / CUDA versions it read
nv = NVML()
nv.driver_version, nv.cuda_version

('535.183.06', '12.2')

In [5]:
# Indexing returns the device handle for that GPU, which `pynvml` functions such as
# `nvmlDeviceGetName` take as their first argument
handle = nv[0]
nvmlDeviceGetName(handle)

'NVIDIA RTX A6000'

In [6]:
#| export
@dataclass
class _Info:
    name:bytes; serial:str; uuid:bytes; persistence_mode:int; bus_id:bytes; display_active:int
    performance_state:int; fan_speed:int; temperature:int; compute_mode:int

In [6]:
#| export
@patch
def info(self:NVML, i:int=0) -> _Info:
    "Basic information about GPU `i`"
    # Look the device handle up once and reuse it for every query below
    h = self[i]
    # NOTE(review): each field is a separate NVML query and any exception from one of them
    # propagates to the caller -- confirm whether boards lacking e.g. a serial number or a fan
    # need a fallback here.
    return _Info(
        name=nvmlDeviceGetName(h),
        serial=nvmlDeviceGetSerial(h),
        uuid=nvmlDeviceGetUUID(h),
        persistence_mode=nvmlDeviceGetPersistenceMode(h),
        bus_id=nvmlDeviceGetPciInfo(h).busId,
        display_active=nvmlDeviceGetDisplayActive(h),
        performance_state=nvmlDeviceGetPerformanceState(h),
        fan_speed=nvmlDeviceGetFanSpeed(h),
        temperature=nvmlDeviceGetTemperature(h, NVML_TEMPERATURE_GPU),
        compute_mode=nvmlDeviceGetComputeMode(h))

In [7]:
nv.info(0)

_Info(name='NVIDIA RTX A6000', serial='1322123048138', uuid='GPU-61e56e6f-2a64-c0f4-b26c-ab3ead0eed5b', persistence_mode=1, bus_id='00000000:01:00.0', display_active=0, performance_state=8, fan_speed=30, temperature=31, compute_mode=0)

In [8]:
#| export
@dataclass
class _Memory: free: float; total: float; used: float

In [9]:
#| export
@patch
def mem(self:NVML, i:int=0)->_Memory:
    "Free, total and used memory of GPU `i`, in MB (raw values divided by 1024**2)"
    raw = nvmlDeviceGetMemoryInfo(self[i])
    scale = 1024**2
    return _Memory(free=raw.free / scale, total=raw.total / scale, used=raw.used / scale)

In [10]:
nv.mem(0)

_Memory(free=2193.25, total=49140.0, used=46946.75)

In [11]:
#| export
@dataclass
class _Utilization: gpu: int; memory: int; enc: int; dec: int

In [12]:
#| export
@patch
def utilization(self:NVML, i:int=0) -> _Utilization:
    "Percentage of time GPU `i` spent actively using its GPU, memory, encoder and decoder"
    dev = self[i]
    rates = nvmlDeviceGetUtilizationRates(dev)
    # The encoder/decoder queries each return a pair; only the first item is reported
    enc_pct, _ = nvmlDeviceGetEncoderUtilization(dev)
    dec_pct, _ = nvmlDeviceGetDecoderUtilization(dev)
    return _Utilization(gpu=rates.gpu, memory=rates.memory, enc=enc_pct, dec=dec_pct)

In [13]:
nv.utilization(0)

_Utilization(gpu=0, memory=0, enc=0, dec=0)

In [14]:
#| export
@dataclass
class _Power: usage: float; limit: float

In [15]:
#| export
@patch
def power(self:NVML, i:int=0) -> _Power:
    "Get power usage and limit for GPU `i` in watts"
    # Fetch the handle once for both queries, as the other per-device methods do
    h = self[i]
    # Both raw readings are divided by 1000 to express them in watts
    return _Power(
        usage=nvmlDeviceGetPowerUsage(h) / 1000,
        limit=nvmlDeviceGetPowerManagementLimit(h) / 1000)

In [16]:
nv.power(0)

_Power(usage=16.866, limit=300.0)

Here `limit` is the GPU's power management limit: the maximum power draw, in watts, that the GPU is allowed to reach.

In [17]:
#| export
@dataclass
class _Clocks: graphics: int; sm: int; mem: int

In [18]:
#| export
@patch
def clocks(self:NVML, i:int=0) -> _Clocks:
    "Current graphics, SM and memory clock speeds (in MHz) of GPU `i`"
    dev = self[i]
    # One query per clock domain, all against the same device handle
    def _mhz(domain): return nvmlDeviceGetClockInfo(dev, domain)
    return _Clocks(graphics=_mhz(NVML_CLOCK_GRAPHICS), sm=_mhz(NVML_CLOCK_SM), mem=_mhz(NVML_CLOCK_MEM))

In [19]:
nv.clocks(0)

_Clocks(graphics=0, sm=0, mem=405)

In [20]:
#| export
@dataclass
class _PCIeThroughput: rx: float; tx: float

In [21]:
#| export
@patch
def pcie_throughput(self:NVML, i:int=0) -> _PCIeThroughput:
    "Get PCIe throughput (in MB/s) for GPU `i`"
    h = self[i]
    # NVML reports these counters in KB/s (despite the `_BYTES` suffix of the constants), so
    # dividing by 1024 yields MB/s. The returned values are unchanged; only the documented unit
    # has been corrected.
    kb_per_mb = 1024
    return _PCIeThroughput(
        rx=nvmlDeviceGetPcieThroughput(h, NVML_PCIE_UTIL_RX_BYTES) / kb_per_mb,
        tx=nvmlDeviceGetPcieThroughput(h, NVML_PCIE_UTIL_TX_BYTES) / kb_per_mb)

In [22]:
nv.pcie_throughput(0)

_PCIeThroughput(rx=0.0, tx=0.0)

1. "rx" represents receive - data flowing from the motherboard to the GPU.
2. "tx" represents transmit - data flowing from the GPU to the motherboard.

In [23]:
#| export
def _procinfo(p): return {'pid': p.pid, 'name': Process(p.pid).exe(), 'memory': p.usedGpuMemory / 1024**2}

In [24]:
#| export
@dataclass
class _ProcessInfo: pid: int; name: str; memory: float

In [25]:
#| export
@patch
def processes(self:NVML, i:int=0) -> List[_ProcessInfo]:
    "Get information about compute processes running on GPU `i` (memory in MB, or `None` if unavailable)"
    procs = nvmlDeviceGetComputeRunningProcesses(self[i])
    # pynvml sets `usedGpuMemory` to `None` when the driver cannot report it; pass that through
    # rather than raising `TypeError` on the division.
    def _mb(nbytes): return None if nbytes is None else nbytes / 1024**2
    return [_ProcessInfo(pid=p.pid, name=Process(p.pid).exe(), memory=_mb(p.usedGpuMemory)) for p in procs]

In [26]:
nv.processes(0)

[_ProcessInfo(pid=201084, name='/home/jhoward/miniconda3/bin/python3.12', memory=46476.0)]

In [27]:
#| export
@dataclass
class _DMon: pwr:float; gtemp:int; sm:int; mem:int; enc:int; dec:int; mclk:int; pclk:int

In [28]:
#| export
@patch
def dmon(self:NVML, i:int=0) -> _DMon:
    "Get key monitoring metrics for GPU `i`, similar to `nvidia-smi dmon`"
    power = self.power(i)
    util = self.utilization(i)
    clocks = self.clocks(i)
    # Read the temperature with the same query `info` uses, rather than calling `self.info(i)`:
    # that would run every query in `info` to obtain one field, and would fail whenever an
    # unrelated one (e.g. serial number or fan speed) raised.
    gtemp = nvmlDeviceGetTemperature(self[i], NVML_TEMPERATURE_GPU)

    return _DMon(pwr=power.usage, gtemp=gtemp, sm=util.gpu, mem=util.memory,
        enc=util.enc, dec=util.dec, mclk=clocks.mem, pclk=clocks.graphics)

In [29]:
nv.dmon(0)

_DMon(pwr=16.866, gtemp=31, sm=0, mem=0, enc=0, dec=0, mclk=405, pclk=0)

# Export -

In [2]:
#|hide
import nbdev; nbdev.nbdev_export()