## **torch.compile**

In [1]:
import torch
import time
from chop.models import get_model
from chop.dataset import get_dataset_info


def timed_gpu(fn):
    """Run ``fn()`` once and time it with a pair of CUDA events.

    Args:
        fn: Zero-argument callable to execute.

    Returns:
        tuple: ``(value, seconds)`` where ``value`` is whatever ``fn``
        returned and ``seconds`` is the time between the two recorded
        events, converted from the milliseconds reported by
        ``elapsed_time``.
    """
    begin_evt, finish_evt = (
        torch.cuda.Event(enable_timing=True) for _ in range(2)
    )
    begin_evt.record()
    value = fn()
    finish_evt.record()
    # Wait for the device so that both events have completed before the
    # elapsed time between them is queried.
    torch.cuda.synchronize()
    elapsed_ms = begin_evt.elapsed_time(finish_evt)
    return value, elapsed_ms / 1000


def timed_cpu(fn):
    """Run ``fn()`` once and measure how long it takes on the host.

    Args:
        fn: Zero-argument callable to execute.

    Returns:
        tuple: ``(result, seconds)`` where ``result`` is whatever ``fn``
        returned and ``seconds`` is the elapsed time as a float.
    """
    # perf_counter() is monotonic and uses the highest-resolution clock
    # available, so the measured interval can never be negative or distorted
    # by system clock adjustments the way a time.time() difference can.
    start = time.perf_counter()
    result = fn()
    return result, time.perf_counter() - start


def get_data(batch_size=128, channels=3, height=224, width=224):
    """Create a random input batch for the benchmarks.

    Args:
        batch_size: Size of the first dimension. Defaults to 128.
        channels: Size of the second dimension. Defaults to 3.
        height: Size of the third dimension. Defaults to 224.
        width: Size of the fourth dimension. Defaults to 224.

    Returns:
        torch.Tensor: Tensor of shape ``(batch_size, channels, height, width)``
        drawn from ``torch.randn``. Calling ``get_data()`` with no arguments
        yields the original ``(128, 3, 224, 224)`` batch.
    """
    return torch.randn(batch_size, channels, height, width)


def time_model(fn, n=1000, device="cpu", warmup=1):
    """Return the mean time, in seconds, of calling ``fn`` on one input batch.

    The batch comes from ``get_data()`` and is moved to ``device`` once,
    outside the timed region. Each timed call goes through ``timed_cpu`` or
    ``timed_gpu`` depending on the device type.

    Args:
        fn: Callable taking the input batch as its only argument.
        n: Number of timed calls to average over. Must be at least 1.
        device: Device string or ``torch.device`` to run on. Anything whose
            device type is not ``"cpu"`` is timed with ``timed_gpu``.
        warmup: Number of untimed calls made before measuring. The first
            call of a model often pays one-time costs (``torch.compile``
            compilation, lazy initialisation) that would otherwise dominate
            a small-``n`` average. Pass ``0`` to time from the first call.

    Returns:
        float: Average seconds per timed call.

    Raises:
        ValueError: If ``n`` is less than 1 or ``warmup`` is negative.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")
    if warmup < 0:
        raise ValueError(f"warmup must be non-negative, got {warmup}")

    # Normalising through torch.device lets both "cpu" and torch.device("cpu")
    # select the host timer.
    on_cpu = torch.device(device).type == "cpu"
    timer = timed_cpu if on_cpu else timed_gpu

    # Move the batch once so no transfer happens inside the timed region.
    data = get_data().to(device)

    def run():
        return fn(data)

    for _ in range(warmup):
        run()

    times = [timer(run)[1] for _ in range(n)]
    return sum(times) / len(times)

# Metadata for the dataset named "imagenet". The original cell stored this
# under the misleading name `cifar10_info`; that name is kept as an alias in
# case other cells still refer to it.
imagenet_info = get_dataset_info("imagenet")
cifar10_info = imagenet_info
model = get_model("resnet18", pretrained=True, dataset_info=imagenet_info)
# NOTE(review): `image` is not used by any cell visible in this section;
# kept in case a later cell relies on it.
image = torch.randn(64, 3, 224, 224)

# Compiled counterpart of `model`, benchmarked against it in the cells below.
opt_model = torch.compile(model)

In [4]:
# CPU benchmark: the eager model versus its torch.compile counterpart.
device = "cpu"
n = 5  # number of timed forward calls averaged per model

# Put both handles on the benchmark device before timing.
model.to(device)
opt_model.to(device)
# time_model returns the mean seconds per call over the n timed calls.
avg_t = time_model(model, n=n, device=device)
opt_avg_t = time_model(opt_model, n=n, device=device)
print(f"Original model: {avg_t:.4f} s")
print(f"Optimized model: {opt_avg_t:.4f} s")

Original model: 2.1772 s
Optimized model: 1.5918 s


In [8]:
# CUDA benchmark: same comparison as the CPU cell, with both handles on "cuda".
device = "cuda"
n = 5  # number of timed forward calls averaged per model

# Put both handles on the benchmark device before timing.
model.to(device)
opt_model.to(device)
# With a non-"cpu" device, time_model times each call through timed_gpu.
avg_t = time_model(model, n=n, device=device)
opt_avg_t = time_model(opt_model, n=n, device=device)
print(f"Original model: {avg_t:.4f} s")
print(f"Optimized model: {opt_avg_t:.4f} s")

Original model: 0.0761 s
Optimized model: 0.0666 s


## **kernel fusion**

In [73]:
import math
import torch
import torch.nn.functional as F


class ScaledDotProductAttention(torch.nn.Module):
    """Unfused reference implementation of scaled dot-product attention.

    Evaluates ``softmax(Q @ K^T * (1 / sqrt(d))) @ V`` as separate tensor
    operations, where ``d`` is the size of the last dimension of ``query``.
    The module has no parameters.
    """

    def __init__(self):
        super().__init__()

    def forward(self, query, key, value):
        """Compute attention over ``value`` weighted by query/key similarity.

        Args:
            query: Tensor whose last dimension sets the scaling factor.
            key: Tensor whose last two dimensions are transposed before
                being multiplied with ``query``.
            value: Tensor combined with the attention weights.

        Returns:
            torch.Tensor: ``softmax(scaled scores) @ value``, with the
            softmax taken over the last dimension of the scores.
        """
        inv_sqrt_dim = 1 / math.sqrt(query.size(-1))
        # `@` and `*` share precedence and associate left to right, so the
        # matrix product is formed first and then scaled.
        logits = (query @ key.transpose(-2, -1)) * inv_sqrt_dim
        weights = F.softmax(logits, dim=-1)
        return weights @ value


class ScaledDotProductAttentionFused(torch.nn.Module):
    """Attention module that delegates to ``F.scaled_dot_product_attention``.

    Used as the fused counterpart of the step-by-step
    ``ScaledDotProductAttention`` module in the comparison cells.
    """

    def forward(self, query, key, value):
        """Return ``F.scaled_dot_product_attention(query, key, value)``.

        All optional arguments of the PyTorch function are left at their
        defaults.
        """
        fused_out = F.scaled_dot_product_attention(query, key, value)
        return fused_out

In [74]:
""" forward """
# Sanity check that the unfused and fused attention modules agree.
# `device` and `dtype` are module-level names: the attention get_data()
# defined in a later cell reads them as globals.
device = "cpu"  
dtype = torch.float32  

# All-ones inputs: every score is equal, so the softmax weights are uniform
# and each output element is an average of ones.
query = torch.ones(32, 8, 128, 64, dtype=dtype, device=device)
key = torch.ones(32, 8, 128, 64, dtype=dtype, device=device)
value = torch.ones(32, 8, 128, 64, dtype=dtype, device=device)

y1 = ScaledDotProductAttention()(query, key, value)
y2 = ScaledDotProductAttentionFused()(query, key, value)
# Spot-check a single element of each output; both are expected to be 1.
print(y1[0, 0, 0, 0], y2[0, 0, 0, 0])


tensor(1.) tensor(1.)


In [75]:
# Module instances reused by the benchmark cells below.
# NOTE(review): "navie" is a typo for "naive"; the name is left unchanged
# because later cells reference `model_navie`.
model_navie = ScaledDotProductAttention()
model_fused = ScaledDotProductAttentionFused()


In [76]:
def timed_gpu(fn):
    """Time a single ``fn()`` call using two CUDA events.

    This cell re-declares a helper that the torch.compile section also
    defines; whichever cell ran last provides the active definition.

    Args:
        fn: Zero-argument callable to execute.

    Returns:
        tuple: ``(output, seconds)``: the return value of ``fn`` and the
        time between the two events, converted from milliseconds.
    """
    tic = torch.cuda.Event(enable_timing=True)
    toc = torch.cuda.Event(enable_timing=True)

    tic.record()
    output = fn()
    toc.record()

    # Both events must have completed before their distance can be read.
    torch.cuda.synchronize()
    millis = tic.elapsed_time(toc)
    seconds = millis / 1000
    return output, seconds


def timed_cpu(fn):
    """Run ``fn()`` once and measure how long it takes on the host.

    This cell re-declares a helper that the torch.compile section also
    defines; whichever cell ran last provides the active definition.

    Args:
        fn: Zero-argument callable to execute.

    Returns:
        tuple: ``(result, seconds)`` where ``result`` is whatever ``fn``
        returned and ``seconds`` is the elapsed time as a float.
    """
    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # whose difference can be skewed (or negative) if the system clock is
    # adjusted during the measurement.
    start = time.perf_counter()
    result = fn()
    return result, time.perf_counter() - start


def get_data():
    """Build the all-ones ``(query, key, value)`` inputs for the attention benchmark.

    Reads the module-level ``dtype`` and ``device`` at call time, so the
    tensors follow whatever those globals are currently set to.

    Returns:
        tuple: Three separate tensors of shape ``(32, 8, 128, 64)`` filled
        with ones.
    """
    def _ones():
        return torch.ones(32, 8, 128, 64, dtype=dtype, device=device)

    return _ones(), _ones(), _ones()


def time_model(fn, n=1000, device="cpu", warmup=1):
    """Return the mean time, in seconds, of ``fn(query, key, value)``.

    The inputs come from ``get_data()`` and are moved to ``device`` once,
    outside the timed region. Each timed call goes through ``timed_cpu`` or
    ``timed_gpu`` depending on the device type.

    Args:
        fn: Callable taking ``(query, key, value)`` tensors.
        n: Number of timed calls to average over. Must be at least 1.
        device: Device string or ``torch.device`` to run on. Anything whose
            device type is not ``"cpu"`` is timed with ``timed_gpu``.
        warmup: Number of untimed calls made before measuring, so one-time
            first-call costs (lazy initialisation, kernel selection) do not
            skew a small-``n`` average. Pass ``0`` to time from the first
            call.

    Returns:
        float: Average seconds per timed call.

    Raises:
        ValueError: If ``n`` is less than 1 or ``warmup`` is negative.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")
    if warmup < 0:
        raise ValueError(f"warmup must be non-negative, got {warmup}")

    # Normalising through torch.device lets both "cpu" and torch.device("cpu")
    # select the host timer.
    on_cpu = torch.device(device).type == "cpu"
    timer = timed_cpu if on_cpu else timed_gpu

    # Move the inputs once so no transfer happens inside the timed region.
    query, key, value = (tensor.to(device) for tensor in get_data())

    def run():
        return fn(query, key, value)

    for _ in range(warmup):
        run()

    times = [timer(run)[1] for _ in range(n)]
    return sum(times) / len(times)

In [62]:
import torch
import time
n = 5  # number of timed calls per model in the attention benchmark cells below

In [97]:
# CPU benchmark: unfused attention module versus the fused wrapper.
# `device` is also the global that get_data() reads when building the inputs.
device = "cpu"
model_navie.to(device)
model_fused.to(device)
# time_model returns the mean seconds per call; `n` is set in the cell above.
avg_t = time_model(model_navie, n=n, device=device)
fused_avg_t = time_model(model_fused, n=n, device=device)
print(f"Original model: {avg_t:.4f} s")
print(f"Optimized model: {fused_avg_t:.4f} s")

Original model: 0.0144 s
Optimized model: 0.0068 s


In [111]:
# CUDA benchmark: same comparison as the CPU cell, run on "cuda".
# `device` is also the global that get_data() reads when building the inputs.
device = "cuda"
model_navie.to(device)
model_fused.to(device)
# With a non-"cpu" device, time_model times each call through timed_gpu.
avg_t = time_model(model_navie, n=n, device=device)
fused_avg_t = time_model(model_fused, n=n, device=device)
print(f"Original model: {avg_t:.4f} s")
print(f"Optimized model: {fused_avg_t:.4f} s")

Original model: 0.0058 s
Optimized model: 0.0031 s


## **Custom kernel**