# Arcface

In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from models.arcface import iresnet100
import numpy as np
from PIL import Image
import onnx
from pathlib import Path

# Kept as a module-level name for any later cell that wants it; the export
# below is done entirely on the CPU, so the model is never moved to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the ArcFace backbone and load the pretrained weights.
model = iresnet100()
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source (consider weights_only=True if the checkpoint is a plain
# state dict).
state_dict = torch.load("../checkpoints/arcface-r100-glint360k.pth", map_location="cpu")
# strict=False tolerates key mismatches; both lists are printed so that a
# checkpoint that does not match the architecture is visible before export.
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
print("Missing keys:", missing_keys)
print("Unexpected keys:", unexpected_keys)

# Export in inference mode with all parameters on the CPU, matching the CPU
# dummy input created below.
model.eval()
model.to("cpu")
dummy_input = torch.randn(1, 3, 112, 112)  # adjust to your model's input shape

out_path = "../checkpoints/arcface-r100-glint360k.onnx"
Path(out_path).parent.mkdir(parents=True, exist_ok=True)

with torch.no_grad():
    torch.onnx.export(
        model,
        dummy_input,
        out_path,
        export_params=True,          # store the trained weights inside the ONNX file
        opset_version=18,
        do_constant_folding=True,
        input_names=["input"],
        output_names=["output"],
        dynamic_axes=None,           # fixed shapes: the graph is traced for the dummy input's shape
        dynamo=False,                # force legacy exporter (disable torch.export/dynamo path)
        verbose=False,
        # NOTE(review): True also lists every initializer as a graph input;
        # confirm this is wanted for a static-weight TensorRT build.
        keep_initializers_as_inputs=True,
    )

print("Export done ->", out_path)


### You should have NVIDIA TensorRT installed (it provides the `trtexec` tool used below)

In [None]:
# Read the ONNX file from the directory the export cell wrote it to
# (../checkpoints) and store the FP16 engine next to it.
!trtexec --onnx=../checkpoints/arcface-r100-glint360k.onnx --saveEngine=../checkpoints/arcface-r100-glint360k_fp16.engine --fp16

# Retinaface

In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from src.models.retinaface import retina50
import numpy as np
from PIL import Image
import onnx
from pathlib import Path


def decode(loc, priors, variances):
    """Convert regressed box offsets back into corner-form boxes.

    Reverses the offset encoding applied at train time, using the priors
    as the reference boxes.

    Args:
        loc (tensor): location predictions for loc layers,
            Shape: [num_priors,4]
        priors (tensor): Prior boxes in center-offset form.
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes; index 0 scales
            the center offsets, index 1 scales the log-size offsets.
    Return:
        decoded bounding box predictions; the first two columns are the
        box center minus half its size, the last two are that corner plus
        the size.
    """
    prior_centers = priors[:, :2]
    prior_sizes = priors[:, 2:]

    # Center/size form first ...
    centers = prior_centers + loc[:, :2] * variances[0] * prior_sizes
    sizes = prior_sizes * torch.exp(loc[:, 2:] * variances[1])

    # ... then shift to the two opposite corners.
    top_left = centers - sizes / 2
    bottom_right = sizes + top_left
    return torch.cat((top_left, bottom_right), 1)
def decode_landm(pre, priors, variances):
    """Convert regressed landmark offsets back into point coordinates.

    Reverses the offset encoding applied at train time, using the priors
    as the reference boxes.

    Args:
        pre (tensor): landm predictions for loc layers,
            Shape: [num_priors,10]
        priors (tensor): Prior boxes in center-offset form.
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes; only index 0 is
            used here.
    Return:
        decoded landm predictions: five consecutive column pairs, each
        decoded against the same prior center and size.
    """
    prior_centers = priors[:, :2]
    prior_sizes = priors[:, 2:]
    # Columns (0,1), (2,3), ... (8,9) each hold one point's offset pair.
    points = tuple(
        prior_centers + pre[:, start:start + 2] * variances[0] * prior_sizes
        for start in range(0, 10, 2)
    )
    return torch.cat(points, dim=1)
def py_cpu_nms(dets, thresh):
    """Greedy non-maximum suppression implemented with NumPy.

    Args:
        dets: 2-D array whose columns 0-3 are x1, y1, x2, y2 and whose
            column 4 is the score.
        thresh: a remaining box survives a round only if its overlap
            ratio with the selected box is <= this value.
    Return:
        list of row indices into ``dets`` that were kept, highest score
        first.
    """
    left, top, right, bottom, conf = (dets[:, col] for col in range(5))

    # The +1 treats coordinates as inclusive pixel indices.
    box_area = (right - left + 1) * (bottom - top + 1)
    remaining = conf.argsort()[::-1]

    keep = []
    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        keep.append(best)

        # Intersection of the best box with every other candidate.
        inter_w = np.maximum(
            0.0,
            np.minimum(right[best], right[rest]) - np.maximum(left[best], left[rest]) + 1,
        )
        inter_h = np.maximum(
            0.0,
            np.minimum(bottom[best], bottom[rest]) - np.maximum(top[best], top[rest]) + 1,
        )
        inter = inter_w * inter_h
        overlap = inter / (box_area[best] + box_area[rest] - inter)

        # Drop candidates that overlap the selected box too much.
        remaining = rest[overlap <= thresh]

    return keep
def check_keys(model, pretrained_state_dict):
    """Report how well a checkpoint's keys line up with a model's keys.

    Prints the number of model keys missing from the checkpoint, the
    number of checkpoint keys the model does not have, and the number of
    keys present in both.

    Args:
        model: object exposing ``state_dict()`` whose keys are compared.
        pretrained_state_dict: mapping of checkpoint keys to values.
    Return:
        True when at least one key is shared.
    Raises:
        AssertionError: if the checkpoint shares no key with the model.
    """
    ckpt_keys = set(pretrained_state_dict)
    model_keys = set(model.state_dict())
    used_pretrained_keys = model_keys & ckpt_keys
    unused_pretrained_keys = ckpt_keys - model_keys
    missing_keys = model_keys - ckpt_keys
    print(f'Missing keys:{len(missing_keys)}')
    print(f'Unused checkpoint keys:{len(unused_pretrained_keys)}')
    print(f'Used keys:{len(used_pretrained_keys)}')
    # Raised explicitly (rather than via `assert`) so the check still runs
    # under `python -O`; the exception type is unchanged for callers.
    if not used_pretrained_keys:
        raise AssertionError('load NONE from pretrained checkpoint')
    return True
def remove_prefix(state_dict, prefix):
    """Return a copy of ``state_dict`` with ``prefix`` stripped from its keys.

    Old style model is stored with all names of parameters sharing common
    prefix 'module.'. Only a leading occurrence is removed, and only once
    per key; keys that do not start with ``prefix`` are kept unchanged.
    An empty ``prefix`` leaves every key as it is.

    Args:
        state_dict: mapping of parameter names to values.
        prefix (str): leading text to strip from each key.
    Return:
        new dict with the renamed keys and the original values.
    """
    print(f"remove prefix '{prefix}'")
    cut = len(prefix)
    # Slicing (rather than str.split) also copes with an empty prefix.
    return {
        (key[cut:] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }
def load_model(model, pretrained_path, load_to_cpu):
    """Load pretrained weights from ``pretrained_path`` into ``model``.

    The checkpoint may either be a state dict itself or wrap one under a
    "state_dict" entry. A leading 'module.' is stripped from every key,
    key overlap is reported via ``check_keys``, and the weights are loaded
    non-strictly.

    Args:
        model: module that receives the weights.
        pretrained_path: path of the checkpoint file.
        load_to_cpu: when truthy, storages stay where deserialization put
            them; otherwise they are moved to the current CUDA device.
    Return:
        the same ``model`` object, with weights loaded.
    """
    print(f'Loading pretrained model from {pretrained_path}')

    # Choose where deserialized storages should live.
    if load_to_cpu:
        def place(storage, loc):
            return storage
    else:
        gpu_index = torch.cuda.current_device()

        def place(storage, loc):
            return storage.cuda(gpu_index)

    checkpoint = torch.load(pretrained_path, map_location=place)

    # Unwrap the nested state dict when present.
    if "state_dict" in checkpoint.keys():
        weights = checkpoint['state_dict']
    else:
        weights = checkpoint
    weights = remove_prefix(weights, 'module.')

    check_keys(model, weights)
    model.load_state_dict(weights, strict=False)
    return model

# Kept as a module-level name for any later cell that wants it; the export
# below is done entirely on the CPU, so the model is never moved to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the detector and load the pretrained weights onto the CPU.
# load_model's third argument is the boolean `load_to_cpu`; pass True
# explicitly instead of a torch.device object (which is always truthy).
model = retina50()
model = load_model(model, "checkpoints/RetinaFace-R50.pth", True)

# Export in inference mode with all parameters on the CPU, matching the CPU
# dummy input created below.
model.eval()
model.to("cpu")
dummy_input = torch.randn(1, 3, 640, 640)

# NOTE(review): the checkpoint is read from "checkpoints/" but the ONNX file is
# written to "../checkpoints/" -- confirm both paths are intended.
out_path = "../checkpoints/RetinaFace-R50.onnx"
Path(out_path).parent.mkdir(parents=True, exist_ok=True)

with torch.no_grad():
    torch.onnx.export(
        model,
        dummy_input,
        out_path,
        export_params=True,          # store the trained weights inside the ONNX file
        opset_version=18,
        do_constant_folding=True,
        input_names=["input"],
        # NOTE(review): only one output name is supplied; if retina50 returns
        # several tensors the remaining outputs get auto-generated names -- confirm.
        output_names=["output"],
        dynamic_axes=None,           # fixed shapes: the graph is traced for the dummy input's shape
        dynamo=False,                # force legacy exporter (disable torch.export/dynamo path)
        verbose=False,
        # NOTE(review): True also lists every initializer as a graph input;
        # confirm this is wanted for a static-weight TensorRT build.
        keep_initializers_as_inputs=True,
    )

print("Export done ->", out_path)


In [None]:
# Read the ONNX file from the directory the export cell wrote it to
# (../checkpoints) and store the FP16 engine next to it.
!trtexec --onnx=../checkpoints/RetinaFace-R50.onnx --saveEngine=../checkpoints/RetinaFace-R50_fp16.engine --fp16