In [1]:
# Install flash-attn Package (About 20 Min)
!pip install flash-attn==1.0.9 --no-build-isolation -q

!pip install wandb -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone


In [2]:
import torch
!nvidia-smi
print("CUDA Usage :", torch.cuda.is_available())
print("GPU :", torch.cuda.get_device_name(0))

Wed Nov 26 16:25:04 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import wandb

import math
import json
import torch
import torch.nn.functional as F
from einops import rearrange
from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func

# ---------------------------
# FLOPs and Efficiency Calculation
# ---------------------------
def flops(batch_size, seq_len, head_dim, num_heads, causal, mode='fwd'):
    assert mode in ['fwd', 'bwd', 'fwd_bwd']
    f = 4 * batch_size * seq_len**2 * num_heads * head_dim // (2 if causal else 1)
    return f if mode == 'fwd' else (2.5 * f if mode == 'bwd' else 3.5 * f)

def efficiency(flop, time):
    return (flop / time / 10**12) if time > 0 else 0.0

# ---------------------------
# Custom Benchmark Function
# ---------------------------
def benchmark_fwd_bwd(func, qkv, cu_seqlens, dropout_p, causal, repeats=30):
    fwd_times, bwd_times = [], []
    for _ in range(repeats):
        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)

        start.record()
        out = func(qkv, cu_seqlens, qkv.shape[0], dropout_p, causal=causal)
        end.record()
        torch.cuda.synchronize()
        fwd_times.append(start.elapsed_time(end) / 1000.0)

        grad = torch.randn_like(out)
        start.record()
        out.backward(grad, retain_graph=True)
        end.record()
        torch.cuda.synchronize()
        bwd_times.append(start.elapsed_time(end) / 1000.0)

    return fwd_times, bwd_times

# ---------------------------
# PyTorch baseline attention
# ---------------------------
def pytorch_attn_func(qkv, dropout_p=0.0, causal=True):
    batch_size, seq_len, _, num_heads, head_dim = qkv.shape
    q, k, v = qkv.unbind(dim=2)
    q = rearrange(q, 'b t h d -> (b h) t d')
    k = rearrange(k, 'b s h d -> (b h) d s')
    softmax_scale = 1.0 / math.sqrt(head_dim)

    scores = torch.empty(batch_size * num_heads, seq_len, seq_len, dtype=qkv.dtype, device=qkv.device)
    scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale),
                       '(b h) t s -> b h t s', h=num_heads)
    if causal:
        causal_mask = torch.triu(torch.full((seq_len, seq_len), -10000.0, device=scores.device), 1)
        scores = scores + causal_mask.to(dtype=scores.dtype)
    attention = torch.softmax(scores, dim=-1)
    attention_drop = F.dropout(attention, dropout_p)
    output = torch.einsum('bhts,bshd->bthd', attention_drop , v)
    return output.to(dtype=qkv.dtype)

# ---------------------------
# Main Benchmark Function
# ---------------------------
def benchmark_attention(batch_size, seq_len, num_heads, emb_dim, impl, causal, repeats, output):

    # 初始化 wandb
    wandb.init(
        project="2025-PP-Lab6",  # 專案名稱
        config={
            "batch_size": batch_size,
            "seq_len": seq_len,
            "num_heads": num_heads,
            "emb_dim": emb_dim,
            "implementation": impl,
            "causal": causal,
            "repeats": repeats,
            "head_dim": emb_dim // num_heads
        }
    )


    assert impl in ['Pytorch', 'Flash1']
    device = 'cuda'
    dtype = torch.float16
    dropout_p = 0.0
    head_dim = emb_dim // num_heads

    qkv = torch.randn(
        batch_size, seq_len, 3, num_heads, head_dim,
        device=device, dtype=dtype, requires_grad=True
    )

    if impl == 'Flash1':
        total_len = batch_size * seq_len
        cu_seqlens = torch.arange(0, (batch_size + 1) * seq_len, step=seq_len, dtype=torch.int32, device=device)
        qkv_unpad = rearrange(qkv, 'b s three h d -> (b s) three h d')
        attention_func = lambda x, cu, tot, dp, causal: flash_attn_unpadded_qkvpacked_func(
            x, cu, tot, dropout_p=dp, causal=causal
        )
        input_tensor = qkv_unpad
        cu_input = cu_seqlens
    else:
        attention_func = lambda x, cu, tot, dp, causal: pytorch_attn_func(x, dp, causal)
        input_tensor = qkv
        cu_input = None

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    fwd_times, bwd_times = benchmark_fwd_bwd(attention_func, input_tensor, cu_input, dropout_p, causal=causal, repeats=repeats)

    forward_time = sum(fwd_times) / len(fwd_times)
    backward_time = sum(bwd_times) / len(bwd_times)
    peak_memory_usage = torch.cuda.max_memory_allocated() / (1024**2)

    benchmark_result = {
        'forward': {
            'time(s)': forward_time,
            'FLOPS(TFLOPs/s)': efficiency(
                flops(batch_size, seq_len, head_dim, num_heads, causal, mode='fwd'),
                forward_time
            )
        },
        'backward': {
            'time(s)': backward_time,
            'FLOPS(TFLOPs/s)': efficiency(
                flops(batch_size, seq_len, head_dim, num_heads, causal, mode='bwd'),
                backward_time
            )
        },
        'forward_backward': {
            'time(s)': forward_time + backward_time,
            'FLOPS(TFLOPs/s)': efficiency(
                flops(batch_size, seq_len, head_dim, num_heads, causal, mode='fwd_bwd'),
                forward_time + backward_time
            )
        },
        'peak_memory_usage(MB)': peak_memory_usage,
    }


    wandb.log({
        "forward_time": forward_time,
        "backward_time": backward_time,
        "total_time": forward_time + backward_time,
        "forward_tflops": benchmark_result['forward']['FLOPS(TFLOPs/s)'],
        "backward_tflops": benchmark_result['backward']['FLOPS(TFLOPs/s)'],
        "total_tflops": benchmark_result['forward_backward']['FLOPS(TFLOPs/s)'],
        "peak_memory_mb": peak_memory_usage
    })

    with open(output, 'w') as json_file:
        json.dump(benchmark_result, json_file, indent=2)

    print(f"Benchmark completed. Results saved to {output}")
    print(json.dumps(benchmark_result, indent=2))

    wandb.finish()


In [None]:
# Run benchmark for FlashAttention v1
"""
benchmark_attention(
    batch_size=16,
    seq_len=512,
    num_heads=8,
    emb_dim=512,
    impl='Flash1',  # Choose between 'Pytorch' or 'Flash1'
    causal=True,
    repeats=30,
    output='flash1_benchmark.json'
)
"""

configs = [
    # ============================================================
    # 第一組：系統性測試 batch_size 的影響 (固定其他參數)
    # ============================================================
    {"batch_size": 1, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 2, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 4, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 8, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 16, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 32, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 64, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},

    # ============================================================
    # 第二組：系統性測試 seq_len 的影響 (展現 O(N^2) 複雜度)
    # ============================================================
    {"batch_size": 16, "seq_len": 64, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 16, "seq_len": 128, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 16, "seq_len": 256, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 16, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 16, "seq_len": 768, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 16, "seq_len": 1024, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 16, "seq_len": 1536, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 16, "seq_len": 2048, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},

    # ============================================================
    # 第三組：測試不同 num_heads (並行化程度)
    # ============================================================
    # 固定 emb_dim=512
    {"batch_size": 16, "seq_len": 512, "num_heads": 1, "emb_dim": 64, "impl": "Flash1", "causal": True},   # head_dim=64
    {"batch_size": 16, "seq_len": 512, "num_heads": 2, "emb_dim": 128, "impl": "Flash1", "causal": True},  # head_dim=64
    {"batch_size": 16, "seq_len": 512, "num_heads": 4, "emb_dim": 256, "impl": "Flash1", "causal": True},  # head_dim=64
    {"batch_size": 16, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},  # head_dim=64
    {"batch_size": 16, "seq_len": 512, "num_heads": 16, "emb_dim": 1024, "impl": "Flash1", "causal": True}, # head_dim=64
    {"batch_size": 16, "seq_len": 512, "num_heads": 32, "emb_dim": 2048, "impl": "Flash1", "causal": True}, # head_dim=64

    # 固定 emb_dim=512，改變 head_dim
    {"batch_size": 16, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},  # head_dim=64
    {"batch_size": 16, "seq_len": 512, "num_heads": 16, "emb_dim": 512, "impl": "Flash1", "causal": True}, # head_dim=32
    {"batch_size": 16, "seq_len": 512, "num_heads": 32, "emb_dim": 512, "impl": "Flash1", "causal": True}, # head_dim=16

    # ============================================================
    # 第四組：測試不同 emb_dim (模型大小)
    # ============================================================
    {"batch_size": 16, "seq_len": 512, "num_heads": 2, "emb_dim": 128, "impl": "Flash1", "causal": True},   # head_dim=64
    {"batch_size": 16, "seq_len": 512, "num_heads": 4, "emb_dim": 256, "impl": "Flash1", "causal": True},   # head_dim=64
    {"batch_size": 16, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},   # head_dim=64 (BERT-base)
    {"batch_size": 16, "seq_len": 512, "num_heads": 12, "emb_dim": 768, "impl": "Flash1", "causal": True},  # head_dim=64 (GPT-2)
    {"batch_size": 16, "seq_len": 512, "num_heads": 16, "emb_dim": 1024, "impl": "Flash1", "causal": True}, # head_dim=64 (BERT-large)
    {"batch_size": 16, "seq_len": 512, "num_heads": 24, "emb_dim": 1536, "impl": "Flash1", "causal": True}, # head_dim=64
    {"batch_size": 16, "seq_len": 512, "num_heads": 32, "emb_dim": 2048, "impl": "Flash1", "causal": True}, # head_dim=64

    # ============================================================
    # 第五組：Pytorch vs Flash1 詳細對比
    # ============================================================
    # 小規模
    {"batch_size": 8, "seq_len": 256, "num_heads": 8, "emb_dim": 512, "impl": "Pytorch", "causal": True},
    {"batch_size": 8, "seq_len": 256, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},

    # 中等規模
    {"batch_size": 16, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Pytorch", "causal": True},
    {"batch_size": 16, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},

    # 大規模
    {"batch_size": 16, "seq_len": 1024, "num_heads": 8, "emb_dim": 512, "impl": "Pytorch", "causal": True},
    {"batch_size": 16, "seq_len": 1024, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},

    # 超大規模
    {"batch_size": 16, "seq_len": 2048, "num_heads": 8, "emb_dim": 512, "impl": "Pytorch", "causal": True},
    {"batch_size": 16, "seq_len": 2048, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},

    # 不同 emb_dim 對比
    {"batch_size": 16, "seq_len": 512, "num_heads": 12, "emb_dim": 768, "impl": "Pytorch", "causal": True},
    {"batch_size": 16, "seq_len": 512, "num_heads": 12, "emb_dim": 768, "impl": "Flash1", "causal": True},

    # ============================================================
    # 第六組：Causal vs Non-Causal 詳細對比
    # ============================================================
    # Flash1 對比
    {"batch_size": 16, "seq_len": 256, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": False},
    {"batch_size": 16, "seq_len": 256, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},

    {"batch_size": 16, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": False},
    {"batch_size": 16, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},

    {"batch_size": 16, "seq_len": 1024, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": False},
    {"batch_size": 16, "seq_len": 1024, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},

    # Pytorch 對比
    {"batch_size": 16, "seq_len": 256, "num_heads": 8, "emb_dim": 512, "impl": "Pytorch", "causal": False},
    {"batch_size": 16, "seq_len": 256, "num_heads": 8, "emb_dim": 512, "impl": "Pytorch", "causal": True},

    {"batch_size": 16, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Pytorch", "causal": False},
    {"batch_size": 16, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Pytorch", "causal": True},

    # ============================================================
    # 第七組：head_dim 的影響 (固定 emb_dim，改變 num_heads)
    # ============================================================
    {"batch_size": 16, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},   # head_dim=64
    {"batch_size": 16, "seq_len": 512, "num_heads": 16, "emb_dim": 512, "impl": "Flash1", "causal": True},  # head_dim=32
    {"batch_size": 16, "seq_len": 512, "num_heads": 32, "emb_dim": 512, "impl": "Flash1", "causal": True},  # head_dim=16

    {"batch_size": 16, "seq_len": 512, "num_heads": 16, "emb_dim": 1024, "impl": "Flash1", "causal": True}, # head_dim=64
    {"batch_size": 16, "seq_len": 512, "num_heads": 32, "emb_dim": 1024, "impl": "Flash1", "causal": True}, # head_dim=32
    {"batch_size": 16, "seq_len": 512, "num_heads": 64, "emb_dim": 1024, "impl": "Flash1", "causal": True}, # head_dim=16

    # ============================================================
    # 第八組：交叉測試 - batch_size x seq_len
    # ============================================================
    {"batch_size": 4, "seq_len": 256, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 4, "seq_len": 1024, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 32, "seq_len": 256, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 32, "seq_len": 1024, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},

    # ============================================================
    # 第九組：真實模型配置測試
    # ============================================================
    # BERT-base
    {"batch_size": 32, "seq_len": 512, "num_heads": 12, "emb_dim": 768, "impl": "Pytorch", "causal": False},
    {"batch_size": 32, "seq_len": 512, "num_heads": 12, "emb_dim": 768, "impl": "Flash1", "causal": False},

    # GPT-2 small
    {"batch_size": 16, "seq_len": 1024, "num_heads": 12, "emb_dim": 768, "impl": "Pytorch", "causal": True},
    {"batch_size": 16, "seq_len": 1024, "num_heads": 12, "emb_dim": 768, "impl": "Flash1", "causal": True},

    # BERT-large
    {"batch_size": 16, "seq_len": 512, "num_heads": 16, "emb_dim": 1024, "impl": "Pytorch", "causal": False},
    {"batch_size": 16, "seq_len": 512, "num_heads": 16, "emb_dim": 1024, "impl": "Flash1", "causal": False},

    # GPT-2 medium
    {"batch_size": 8, "seq_len": 1024, "num_heads": 16, "emb_dim": 1024, "impl": "Pytorch", "causal": True},
    {"batch_size": 8, "seq_len": 1024, "num_heads": 16, "emb_dim": 1024, "impl": "Flash1", "causal": True},

    # ============================================================
    # 第十組：記憶體壓力測試 (逐步增加)
    # ============================================================
    {"batch_size": 64, "seq_len": 256, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 64, "seq_len": 512, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 32, "seq_len": 1024, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
    {"batch_size": 16, "seq_len": 2048, "num_heads": 8, "emb_dim": 512, "impl": "Flash1", "causal": True},
]

# 執行所有實驗
for i, config in enumerate(configs):
    head_dim = config["emb_dim"] // config["num_heads"]

    print(f"\n{'='*60}")
    print(f"Experiment {i+1}/{len(configs)}")
    print(f"{'='*60}")
    print(f"Config: batch_size={config['batch_size']}, seq_len={config['seq_len']}, "
          f"num_heads={config['num_heads']}, emb_dim={config['emb_dim']}")
    print(f"Implementation: {config['impl']}, Causal: {config['causal']}")
    print(f"head_dim = {head_dim}")
    print(f"{'='*60}\n")

    try:
        benchmark_attention(
            batch_size=config["batch_size"],
            seq_len=config["seq_len"],
            num_heads=config["num_heads"],
            emb_dim=config["emb_dim"],
            impl=config["impl"],
            causal=config["causal"],
            repeats=30,
            output=f'benchmark_exp{i+1:03d}.json'
        )
    except Exception as e:
        print(f"Experiment {i+1} failed with error: {e}")
        continue

print(f"\nAll {len(configs)} experiments completed!")



Experiment 1/73
Config: batch_size=1, seq_len=512, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp001.json
{
  "forward": {
    "time(s)": 0.00017317973325649895,
    "FLOPS(TFLOPs/s)": 1.5500396666070415
  },
  "backward": {
    "time(s)": 0.0004637173304955164,
    "FLOPS(TFLOPs/s)": 1.4471933565279778
  },
  "forward_backward": {
    "time(s)": 0.0006368970637520154,
    "FLOPS(TFLOPs/s)": 1.4751584666840551
  },
  "peak_memory_usage(MB)": 72.533203125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,1.44719
backward_time,0.00046
forward_tflops,1.55004
forward_time,0.00017
peak_memory_mb,72.5332
total_tflops,1.47516
total_time,0.00064



Experiment 2/73
Config: batch_size=2, seq_len=512, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp002.json
{
  "forward": {
    "time(s)": 0.00023656639953454335,
    "FLOPS(TFLOPs/s)": 2.2694301179555567
  },
  "backward": {
    "time(s)": 0.0005085973342259725,
    "FLOPS(TFLOPs/s)": 2.638978204717966
  },
  "forward_backward": {
    "time(s)": 0.0007451637337605159,
    "FLOPS(TFLOPs/s)": 2.5216581361485004
  },
  "peak_memory_usage(MB)": 79.126953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,2.63898
backward_time,0.00051
forward_tflops,2.26943
forward_time,0.00024
peak_memory_mb,79.12695
total_tflops,2.52166
total_time,0.00075



Experiment 3/73
Config: batch_size=4, seq_len=512, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp003.json
{
  "forward": {
    "time(s)": 0.00037299413383007053,
    "FLOPS(TFLOPs/s)": 2.878709680965593
  },
  "backward": {
    "time(s)": 0.0007976277311642964,
    "FLOPS(TFLOPs/s)": 3.3654228095626144
  },
  "forward_backward": {
    "time(s)": 0.0011706218649943669,
    "FLOPS(TFLOPs/s)": 3.2103418673271444
  },
  "peak_memory_usage(MB)": 92.501953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,3.36542
backward_time,0.0008
forward_tflops,2.87871
forward_time,0.00037
peak_memory_mb,92.50195
total_tflops,3.21034
total_time,0.00117



Experiment 4/73
Config: batch_size=8, seq_len=512, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp004.json
{
  "forward": {
    "time(s)": 0.0006461066663265228,
    "FLOPS(TFLOPs/s)": 3.3237292848402933
  },
  "backward": {
    "time(s)": 0.0015080277442932129,
    "FLOPS(TFLOPs/s)": 3.5600864376114134
  },
  "forward_backward": {
    "time(s)": 0.0021541344106197355,
    "FLOPS(TFLOPs/s)": 3.4891939569535135
  },
  "peak_memory_usage(MB)": 120.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,3.56009
backward_time,0.00151
forward_tflops,3.32373
forward_time,0.00065
peak_memory_mb,120.00195
total_tflops,3.48919
total_time,0.00215



Experiment 5/73
Config: batch_size=16, seq_len=512, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp005.json
{
  "forward": {
    "time(s)": 0.0010936277310053507,
    "FLOPS(TFLOPs/s)": 3.9272662664211344
  },
  "backward": {
    "time(s)": 0.002542585611343384,
    "FLOPS(TFLOPs/s)": 4.2230311506902805
  },
  "forward_backward": {
    "time(s)": 0.0036362133423487345,
    "FLOPS(TFLOPs/s)": 4.13407688732866
  },
  "peak_memory_usage(MB)": 178.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,4.22303
backward_time,0.00254
forward_tflops,3.92727
forward_time,0.00109
peak_memory_mb,178.00195
total_tflops,4.13408
total_time,0.00364



Experiment 6/73
Config: batch_size=32, seq_len=512, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp006.json
{
  "forward": {
    "time(s)": 0.0019210666735966998,
    "FLOPS(TFLOPs/s)": 4.47144011712908
  },
  "backward": {
    "time(s)": 0.0051347114721934,
    "FLOPS(TFLOPs/s)": 4.182286891151563
  },
  "forward_backward": {
    "time(s)": 0.0070557781457901,
    "FLOPS(TFLOPs/s)": 4.261014228450259
  },
  "peak_memory_usage(MB)": 306.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,4.18229
backward_time,0.00513
forward_tflops,4.47144
forward_time,0.00192
peak_memory_mb,306.00195
total_tflops,4.26101
total_time,0.00706



Experiment 7/73
Config: batch_size=64, seq_len=512, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp007.json
{
  "forward": {
    "time(s)": 0.0025936735868453977,
    "FLOPS(TFLOPs/s)": 6.6237591619596685
  },
  "backward": {
    "time(s)": 0.009036909850438436,
    "FLOPS(TFLOPs/s)": 4.75269463465061
  },
  "forward_backward": {
    "time(s)": 0.011630583437283834,
    "FLOPS(TFLOPs/s)": 5.169950627863123
  },
  "peak_memory_usage(MB)": 610.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,4.75269
backward_time,0.00904
forward_tflops,6.62376
forward_time,0.00259
peak_memory_mb,610.00195
total_tflops,5.16995
total_time,0.01163



Experiment 8/73
Config: batch_size=16, seq_len=64, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp008.json
{
  "forward": {
    "time(s)": 0.00023544853379329047,
    "FLOPS(TFLOPs/s)": 0.28502561863017384
  },
  "backward": {
    "time(s)": 0.0004169493347406387,
    "FLOPS(TFLOPs/s)": 0.4023802079079029
  },
  "forward_backward": {
    "time(s)": 0.0006523978685339291,
    "FLOPS(TFLOPs/s)": 0.3600272706712324
  },
  "peak_memory_usage(MB)": 80.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,0.40238
backward_time,0.00042
forward_tflops,0.28503
forward_time,0.00024
peak_memory_mb,80.00195
total_tflops,0.36003
total_time,0.00065



Experiment 9/73
Config: batch_size=16, seq_len=128, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp009.json
{
  "forward": {
    "time(s)": 0.0003237279991308848,
    "FLOPS(TFLOPs/s)": 0.8292006150863406
  },
  "backward": {
    "time(s)": 0.0006121695975462596,
    "FLOPS(TFLOPs/s)": 1.0962462734018543
  },
  "forward_backward": {
    "time(s)": 0.0009358975966771444,
    "FLOPS(TFLOPs/s)": 1.0038748890217597
  },
  "peak_memory_usage(MB)": 94.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,1.09625
backward_time,0.00061
forward_tflops,0.8292
forward_time,0.00032
peak_memory_mb,94.00195
total_tflops,1.00387
total_time,0.00094



Experiment 10/73
Config: batch_size=16, seq_len=256, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp010.json
{
  "forward": {
    "time(s)": 0.0005120831996202469,
    "FLOPS(TFLOPs/s)": 2.096811269723886
  },
  "backward": {
    "time(s)": 0.0011205642700195314,
    "FLOPS(TFLOPs/s)": 2.395538240705472
  },
  "forward_backward": {
    "time(s)": 0.0016326474696397781,
    "FLOPS(TFLOPs/s)": 2.3018419186532495
  },
  "peak_memory_usage(MB)": 122.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,2.39554
backward_time,0.00112
forward_tflops,2.09681
forward_time,0.00051
peak_memory_mb,122.00195
total_tflops,2.30184
total_time,0.00163



Experiment 11/73
Config: batch_size=16, seq_len=512, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp011.json
{
  "forward": {
    "time(s)": 0.0011001472016175587,
    "FLOPS(TFLOPs/s)": 3.903993292611263
  },
  "backward": {
    "time(s)": 0.002573205327987671,
    "FLOPS(TFLOPs/s)": 4.172779421530657
  },
  "forward_backward": {
    "time(s)": 0.00367335252960523,
    "FLOPS(TFLOPs/s)": 4.092279576993257
  },
  "peak_memory_usage(MB)": 178.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,4.17278
backward_time,0.00257
forward_tflops,3.90399
forward_time,0.0011
peak_memory_mb,178.00195
total_tflops,4.09228
total_time,0.00367



Experiment 12/73
Config: batch_size=16, seq_len=768, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp012.json
{
  "forward": {
    "time(s)": 0.0019331776062647503,
    "FLOPS(TFLOPs/s)": 4.998855968889468
  },
  "backward": {
    "time(s)": 0.004585584020614624,
    "FLOPS(TFLOPs/s)": 5.268509077882265
  },
  "forward_backward": {
    "time(s)": 0.0065187616268793745,
    "FLOPS(TFLOPs/s)": 5.188541841526348
  },
  "peak_memory_usage(MB)": 234.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,5.26851
backward_time,0.00459
forward_tflops,4.99886
forward_time,0.00193
peak_memory_mb,234.00195
total_tflops,5.18854
total_time,0.00652



Experiment 13/73
Config: batch_size=16, seq_len=1024, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp013.json
{
  "forward": {
    "time(s)": 0.002806781880060832,
    "FLOPS(TFLOPs/s)": 6.120842273510636
  },
  "backward": {
    "time(s)": 0.0066846975962320965,
    "FLOPS(TFLOPs/s)": 6.425073437010681
  },
  "forward_backward": {
    "time(s)": 0.00949147947629293,
    "FLOPS(TFLOPs/s)": 6.335107429161791
  },
  "peak_memory_usage(MB)": 290.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,6.42507
backward_time,0.00668
forward_tflops,6.12084
forward_time,0.00281
peak_memory_mb,290.00195
total_tflops,6.33511
total_time,0.00949



Experiment 14/73
Config: batch_size=16, seq_len=1536, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp014.json
{
  "forward": {
    "time(s)": 0.005076259191830953,
    "FLOPS(TFLOPs/s)": 7.614801412466423
  },
  "backward": {
    "time(s)": 0.012427877362569173,
    "FLOPS(TFLOPs/s)": 7.775806064119593
  },
  "forward_backward": {
    "time(s)": 0.017504136554400127,
    "FLOPS(TFLOPs/s)": 7.729114167016191
  },
  "peak_memory_usage(MB)": 402.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,7.77581
backward_time,0.01243
forward_tflops,7.6148
forward_time,0.00508
peak_memory_mb,402.00195
total_tflops,7.72911
total_time,0.0175



Experiment 15/73
Config: batch_size=16, seq_len=2048, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp015.json
{
  "forward": {
    "time(s)": 0.00772893541653951,
    "FLOPS(TFLOPs/s)": 8.891195621708002
  },
  "backward": {
    "time(s)": 0.018924214363098147,
    "FLOPS(TFLOPs/s)": 9.078246977322564
  },
  "forward_backward": {
    "time(s)": 0.026653149779637657,
    "FLOPS(TFLOPs/s)": 9.024005438927517
  },
  "peak_memory_usage(MB)": 514.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,9.07825
backward_time,0.01892
forward_tflops,8.8912
forward_time,0.00773
peak_memory_mb,514.00195
total_tflops,9.02401
total_time,0.02665



Experiment 16/73
Config: batch_size=16, seq_len=512, num_heads=1, emb_dim=64
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp016.json
{
  "forward": {
    "time(s)": 0.0002460192014773687,
    "FLOPS(TFLOPs/s)": 2.182231747668634
  },
  "backward": {
    "time(s)": 0.0005212970664103826,
    "FLOPS(TFLOPs/s)": 2.5746879590981484
  },
  "forward_backward": {
    "time(s)": 0.0007673162678877514,
    "FLOPS(TFLOPs/s)": 2.4488574928465883
  },
  "peak_memory_usage(MB)": 80.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,2.57469
backward_time,0.00052
forward_tflops,2.18223
forward_time,0.00025
peak_memory_mb,80.00195
total_tflops,2.44886
total_time,0.00077



Experiment 17/73
Config: batch_size=16, seq_len=512, num_heads=2, emb_dim=128
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp017.json
{
  "forward": {
    "time(s)": 0.000412173867225647,
    "FLOPS(TFLOPs/s)": 2.6050701157435916
  },
  "backward": {
    "time(s)": 0.0008167743901411692,
    "FLOPS(TFLOPs/s)": 3.286531253184913
  },
  "forward_backward": {
    "time(s)": 0.0012289482573668161,
    "FLOPS(TFLOPs/s)": 3.0579777150685072
  },
  "peak_memory_usage(MB)": 94.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,3.28653
backward_time,0.00082
forward_tflops,2.60507
forward_time,0.00041
peak_memory_mb,94.00195
total_tflops,3.05798
total_time,0.00123



Experiment 18/73
Config: batch_size=16, seq_len=512, num_heads=4, emb_dim=256
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp018.json
{
  "forward": {
    "time(s)": 0.0006184831957022349,
    "FLOPS(TFLOPs/s)": 3.472177842377295
  },
  "backward": {
    "time(s)": 0.0014429450670878092,
    "FLOPS(TFLOPs/s)": 3.7206607808260324
  },
  "forward_backward": {
    "time(s)": 0.002061428262790044,
    "FLOPS(TFLOPs/s)": 3.646109303763592
  },
  "peak_memory_usage(MB)": 122.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,3.72066
backward_time,0.00144
forward_tflops,3.47218
forward_time,0.00062
peak_memory_mb,122.00195
total_tflops,3.64611
total_time,0.00206



Experiment 19/73
Config: batch_size=16, seq_len=512, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp019.json
{
  "forward": {
    "time(s)": 0.0011067445357640583,
    "FLOPS(TFLOPs/s)": 3.880721482880331
  },
  "backward": {
    "time(s)": 0.0025582901239395144,
    "FLOPS(TFLOPs/s)": 4.197107333340846
  },
  "forward_backward": {
    "time(s)": 0.0036650346597035728,
    "FLOPS(TFLOPs/s)": 4.101567087830983
  },
  "peak_memory_usage(MB)": 178.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,4.19711
backward_time,0.00256
forward_tflops,3.88072
forward_time,0.00111
peak_memory_mb,178.00195
total_tflops,4.10157
total_time,0.00367



Experiment 20/73
Config: batch_size=16, seq_len=512, num_heads=16, emb_dim=1024
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp020.json
{
  "forward": {
    "time(s)": 0.0018522880077362059,
    "FLOPS(TFLOPs/s)": 4.637472442796994
  },
  "backward": {
    "time(s)": 0.004852953624725342,
    "FLOPS(TFLOPs/s)": 4.425106469303092
  },
  "forward_backward": {
    "time(s)": 0.006705241632461547,
    "FLOPS(TFLOPs/s)": 4.483771461187892
  },
  "peak_memory_usage(MB)": 290.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,4.42511
backward_time,0.00485
forward_tflops,4.63747
forward_time,0.00185
peak_memory_mb,290.00195
total_tflops,4.48377
total_time,0.00671



Experiment 21/73
Config: batch_size=16, seq_len=512, num_heads=32, emb_dim=2048
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp021.json
{
  "forward": {
    "time(s)": 0.002795404815673828,
    "FLOPS(TFLOPs/s)": 6.145753590919109
  },
  "backward": {
    "time(s)": 0.00835989867846171,
    "FLOPS(TFLOPs/s)": 5.137582955479443
  },
  "forward_backward": {
    "time(s)": 0.011155303494135538,
    "FLOPS(TFLOPs/s)": 5.3902201921813
  },
  "peak_memory_usage(MB)": 514.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,5.13758
backward_time,0.00836
forward_tflops,6.14575
forward_time,0.0028
peak_memory_mb,514.00195
total_tflops,5.39022
total_time,0.01116



Experiment 22/73
Config: batch_size=16, seq_len=512, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp022.json
{
  "forward": {
    "time(s)": 0.0010802688042322794,
    "FLOPS(TFLOPs/s)": 3.9758320143775028
  },
  "backward": {
    "time(s)": 0.0025590389251708982,
    "FLOPS(TFLOPs/s)": 4.195879216367501
  },
  "forward_backward": {
    "time(s)": 0.0036393077294031777,
    "FLOPS(TFLOPs/s)": 4.130561813871456
  },
  "peak_memory_usage(MB)": 178.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,4.19588
backward_time,0.00256
forward_tflops,3.97583
forward_time,0.00108
peak_memory_mb,178.00195
total_tflops,4.13056
total_time,0.00364



Experiment 23/73
Config: batch_size=16, seq_len=512, num_heads=16, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 32



Benchmark completed. Results saved to benchmark_exp023.json
{
  "forward": {
    "time(s)": 0.0015143114646275838,
    "FLOPS(TFLOPs/s)": 2.8362509274512204
  },
  "backward": {
    "time(s)": 0.0035220885117848715,
    "FLOPS(TFLOPs/s)": 3.0485940952570356
  },
  "forward_backward": {
    "time(s)": 0.005036399976412455,
    "FLOPS(TFLOPs/s)": 2.9847481547142567
  },
  "peak_memory_usage(MB)": 186.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,3.04859
backward_time,0.00352
forward_tflops,2.83625
forward_time,0.00151
peak_memory_mb,186.00195
total_tflops,2.98475
total_time,0.00504



Experiment 24/73
Config: batch_size=16, seq_len=512, num_heads=32, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 16



Benchmark completed. Results saved to benchmark_exp024.json
{
  "forward": {
    "time(s)": 0.001686949336528778,
    "FLOPS(TFLOPs/s)": 2.545996612345051
  },
  "backward": {
    "time(s)": 0.004954141902923584,
    "FLOPS(TFLOPs/s)": 2.167361866171725
  },
  "forward_backward": {
    "time(s)": 0.006641091239452362,
    "FLOPS(TFLOPs/s)": 2.263541486480105
  },
  "peak_memory_usage(MB)": 202.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,2.16736
backward_time,0.00495
forward_tflops,2.546
forward_time,0.00169
peak_memory_mb,202.00195
total_tflops,2.26354
total_time,0.00664



Experiment 25/73
Config: batch_size=16, seq_len=512, num_heads=2, emb_dim=128
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp025.json
{
  "forward": {
    "time(s)": 0.0003932330658038457,
    "FLOPS(TFLOPs/s)": 2.73054815928325
  },
  "backward": {
    "time(s)": 0.0008274581293265025,
    "FLOPS(TFLOPs/s)": 3.2440971510968066
  },
  "forward_backward": {
    "time(s)": 0.0012206911951303482,
    "FLOPS(TFLOPs/s)": 3.078662645386495
  },
  "peak_memory_usage(MB)": 94.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,3.2441
backward_time,0.00083
forward_tflops,2.73055
forward_time,0.00039
peak_memory_mb,94.00195
total_tflops,3.07866
total_time,0.00122



Experiment 26/73
Config: batch_size=16, seq_len=512, num_heads=4, emb_dim=256
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp026.json
{
  "forward": {
    "time(s)": 0.0006131391982237499,
    "FLOPS(TFLOPs/s)": 3.502440643529578
  },
  "backward": {
    "time(s)": 0.0014154944022496542,
    "FLOPS(TFLOPs/s)": 3.7928155077600283
  },
  "forward_backward": {
    "time(s)": 0.002028633600473404,
    "FLOPS(TFLOPs/s)": 3.705051896136401
  },
  "peak_memory_usage(MB)": 122.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,3.79282
backward_time,0.00142
forward_tflops,3.50244
forward_time,0.00061
peak_memory_mb,122.00195
total_tflops,3.70505
total_time,0.00203



Experiment 27/73
Config: batch_size=16, seq_len=512, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp027.json
{
  "forward": {
    "time(s)": 0.0011139295975367226,
    "FLOPS(TFLOPs/s)": 3.8556900772702636
  },
  "backward": {
    "time(s)": 0.0025088213443756105,
    "FLOPS(TFLOPs/s)": 4.27986562856364
  },
  "forward_backward": {
    "time(s)": 0.003622750941912333,
    "FLOPS(TFLOPs/s)": 4.149439411384127
  },
  "peak_memory_usage(MB)": 178.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,4.27987
backward_time,0.00251
forward_tflops,3.85569
forward_time,0.00111
peak_memory_mb,178.00195
total_tflops,4.14944
total_time,0.00362



Experiment 28/73
Config: batch_size=16, seq_len=512, num_heads=12, emb_dim=768
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp028.json
{
  "forward": {
    "time(s)": 0.001340717860062917,
    "FLOPS(TFLOPs/s)": 4.805224973804458
  },
  "backward": {
    "time(s)": 0.0036788096268971764,
    "FLOPS(TFLOPs/s)": 4.378081225579594
  },
  "forward_backward": {
    "time(s)": 0.005019527486960093,
    "FLOPS(TFLOPs/s)": 4.492171496735001
  },
  "peak_memory_usage(MB)": 234.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,4.37808
backward_time,0.00368
forward_tflops,4.80522
forward_time,0.00134
peak_memory_mb,234.00195
total_tflops,4.49217
total_time,0.00502



Experiment 29/73
Config: batch_size=16, seq_len=512, num_heads=16, emb_dim=1024
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp029.json
{
  "forward": {
    "time(s)": 0.0017905098795890808,
    "FLOPS(TFLOPs/s)": 4.797479583844226
  },
  "backward": {
    "time(s)": 0.004715118948618572,
    "FLOPS(TFLOPs/s)": 4.554463357979901
  },
  "forward_backward": {
    "time(s)": 0.006505628828207653,
    "FLOPS(TFLOPs/s)": 4.6213474309574245
  },
  "peak_memory_usage(MB)": 290.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,4.55446
backward_time,0.00472
forward_tflops,4.79748
forward_time,0.00179
peak_memory_mb,290.00195
total_tflops,4.62135
total_time,0.00651



Experiment 30/73
Config: batch_size=16, seq_len=512, num_heads=24, emb_dim=1536
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp030.json
{
  "forward": {
    "time(s)": 0.0022091850717862447,
    "FLOPS(TFLOPs/s)": 5.832423028995875
  },
  "backward": {
    "time(s)": 0.006386874675750732,
    "FLOPS(TFLOPs/s)": 5.043508187549284
  },
  "forward_backward": {
    "time(s)": 0.008596059747536976,
    "FLOPS(TFLOPs/s)": 5.246259092245335
  },
  "peak_memory_usage(MB)": 402.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,5.04351
backward_time,0.00639
forward_tflops,5.83242
forward_time,0.00221
peak_memory_mb,402.00195
total_tflops,5.24626
total_time,0.0086



Experiment 31/73
Config: batch_size=16, seq_len=512, num_heads=32, emb_dim=2048
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp031.json
{
  "forward": {
    "time(s)": 0.0026309226830800374,
    "FLOPS(TFLOPs/s)": 6.529978738823075
  },
  "backward": {
    "time(s)": 0.007947644821802775,
    "FLOPS(TFLOPs/s)": 5.404075537217787
  },
  "forward_backward": {
    "time(s)": 0.010578567504882813,
    "FLOPS(TFLOPs/s)": 5.684091169834256
  },
  "peak_memory_usage(MB)": 514.001953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,5.40408
backward_time,0.00795
forward_tflops,6.52998
forward_time,0.00263
peak_memory_mb,514.00195
total_tflops,5.68409
total_time,0.01058



Experiment 32/73
Config: batch_size=8, seq_len=256, num_heads=8, emb_dim=512
Implementation: Pytorch, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp032.json
{
  "forward": {
    "time(s)": 0.009050217703978219,
    "FLOPS(TFLOPs/s)": 0.05932132569186781
  },
  "backward": {
    "time(s)": 0.0027153535564740498,
    "FLOPS(TFLOPs/s)": 0.4942919041978639
  },
  "forward_backward": {
    "time(s)": 0.011765571260452268,
    "FLOPS(TFLOPs/s)": 0.159707348704441
  },
  "peak_memory_usage(MB)": 138.2509765625
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,0.49429
backward_time,0.00272
forward_tflops,0.05932
forward_time,0.00905
peak_memory_mb,138.25098
total_tflops,0.15971
total_time,0.01177



Experiment 33/73
Config: batch_size=8, seq_len=256, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp033.json
{
  "forward": {
    "time(s)": 0.0003046592007080714,
    "FLOPS(TFLOPs/s)": 1.7622015378240192
  },
  "backward": {
    "time(s)": 0.000648116264740626,
    "FLOPS(TFLOPs/s)": 2.0708896736870734
  },
  "forward_backward": {
    "time(s)": 0.0009527754654486975,
    "FLOPS(TFLOPs/s)": 1.9721836467683245
  },
  "peak_memory_usage(MB)": 109.251953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,2.07089
backward_time,0.00065
forward_tflops,1.7622
forward_time,0.0003
peak_memory_mb,109.25195
total_tflops,1.97218
total_time,0.00095



Experiment 34/73
Config: batch_size=16, seq_len=512, num_heads=8, emb_dim=512
Implementation: Pytorch, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp034.json
{
  "forward": {
    "time(s)": 0.0038484959840774536,
    "FLOPS(TFLOPs/s)": 1.1160118949765703
  },
  "backward": {
    "time(s)": 0.00503444053332011,
    "FLOPS(TFLOPs/s)": 2.1327927440864802
  },
  "forward_backward": {
    "time(s)": 0.008882936517397563,
    "FLOPS(TFLOPs/s)": 1.6922765919308904
  },
  "peak_memory_usage(MB)": 434.2509765625
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,2.13279
backward_time,0.00503
forward_tflops,1.11601
forward_time,0.00385
peak_memory_mb,434.25098
total_tflops,1.69228
total_time,0.00888



Experiment 35/73
Config: batch_size=16, seq_len=512, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp035.json
{
  "forward": {
    "time(s)": 0.0010919306635856628,
    "FLOPS(TFLOPs/s)": 3.9333699833066884
  },
  "backward": {
    "time(s)": 0.002550047993659973,
    "FLOPS(TFLOPs/s)": 4.210673001722235
  },
  "forward_backward": {
    "time(s)": 0.003641978657245636,
    "FLOPS(TFLOPs/s)": 4.127532572463982
  },
  "peak_memory_usage(MB)": 194.251953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,4.21067
backward_time,0.00255
forward_tflops,3.93337
forward_time,0.00109
peak_memory_mb,194.25195
total_tflops,4.12753
total_time,0.00364



Experiment 36/73
Config: batch_size=16, seq_len=1024, num_heads=8, emb_dim=512
Implementation: Pytorch, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp036.json
{
  "forward": {
    "time(s)": 0.011038316790262858,
    "FLOPS(TFLOPs/s)": 1.5563848646883138
  },
  "backward": {
    "time(s)": 0.014685708872477214,
    "FLOPS(TFLOPs/s)": 2.924589703701185
  },
  "forward_backward": {
    "time(s)": 0.025724025662740072,
    "FLOPS(TFLOPs/s)": 2.3374857004241973
  },
  "peak_memory_usage(MB)": 1298.2509765625
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,2.92459
backward_time,0.01469
forward_tflops,1.55638
forward_time,0.01104
peak_memory_mb,1298.25098
total_tflops,2.33749
total_time,0.02572



Experiment 37/73
Config: batch_size=16, seq_len=1024, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp037.json
{
  "forward": {
    "time(s)": 0.002770065073172251,
    "FLOPS(TFLOPs/s)": 6.201973141492227
  },
  "backward": {
    "time(s)": 0.006702064053217571,
    "FLOPS(TFLOPs/s)": 6.408424720945548
  },
  "forward_backward": {
    "time(s)": 0.009472129126389821,
    "FLOPS(TFLOPs/s)": 6.3480492444382035
  },
  "peak_memory_usage(MB)": 306.251953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,6.40842
backward_time,0.0067
forward_tflops,6.20197
forward_time,0.00277
peak_memory_mb,306.25195
total_tflops,6.34805
total_time,0.00947



Experiment 38/73
Config: batch_size=16, seq_len=2048, num_heads=8, emb_dim=512
Implementation: Pytorch, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp038.json
{
  "forward": {
    "time(s)": 0.03975888188680013,
    "FLOPS(TFLOPs/s)": 1.728405666227116
  },
  "backward": {
    "time(s)": 0.05059814478556315,
    "FLOPS(TFLOPs/s)": 3.395355552423698
  },
  "forward_backward": {
    "time(s)": 0.09035702667236328,
    "FLOPS(TFLOPs/s)": 2.6618645769312947
  },
  "peak_memory_usage(MB)": 4562.2509765625
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,3.39536
backward_time,0.0506
forward_tflops,1.72841
forward_time,0.03976
peak_memory_mb,4562.25098
total_tflops,2.66186
total_time,0.09036



Experiment 39/73
Config: batch_size=16, seq_len=2048, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp039.json
{
  "forward": {
    "time(s)": 0.007697713088989258,
    "FLOPS(TFLOPs/s)": 8.927258776934119
  },
  "backward": {
    "time(s)": 0.018815755780537922,
    "FLOPS(TFLOPs/s)": 9.130576196025036
  },
  "forward_backward": {
    "time(s)": 0.02651346886952718,
    "FLOPS(TFLOPs/s)": 9.071546607484304
  },
  "peak_memory_usage(MB)": 530.251953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,9.13058
backward_time,0.01882
forward_tflops,8.92726
forward_time,0.0077
peak_memory_mb,530.25195
total_tflops,9.07155
total_time,0.02651



Experiment 40/73
Config: batch_size=16, seq_len=512, num_heads=12, emb_dim=768
Implementation: Pytorch, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp040.json
{
  "forward": {
    "time(s)": 0.00513384104569753,
    "FLOPS(TFLOPs/s)": 1.2548987953959276
  },
  "backward": {
    "time(s)": 0.00704390082359314,
    "FLOPS(TFLOPs/s)": 2.286535225773403
  },
  "forward_backward": {
    "time(s)": 0.01217774186929067,
    "FLOPS(TFLOPs/s)": 1.8516222913923048
  },
  "peak_memory_usage(MB)": 610.2509765625
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,2.28654
backward_time,0.00704
forward_tflops,1.2549
forward_time,0.00513
peak_memory_mb,610.25098
total_tflops,1.85162
total_time,0.01218



Experiment 41/73
Config: batch_size=16, seq_len=512, num_heads=12, emb_dim=768
Implementation: Flash1, Causal: True
head_dim = 64



Benchmark completed. Results saved to benchmark_exp041.json
{
  "forward": {
    "time(s)": 0.0013448405345280964,
    "FLOPS(TFLOPs/s)": 4.790494321514967
  },
  "backward": {
    "time(s)": 0.003684765839576721,
    "FLOPS(TFLOPs/s)": 4.371004308336227
  },
  "forward_backward": {
    "time(s)": 0.005029606374104817,
    "FLOPS(TFLOPs/s)": 4.483169581638137
  },
  "peak_memory_usage(MB)": 250.251953125
}


0,1
backward_tflops,▁
backward_time,▁
forward_tflops,▁
forward_time,▁
peak_memory_mb,▁
total_tflops,▁
total_time,▁

0,1
backward_tflops,4.371
backward_time,0.00368
forward_tflops,4.79049
forward_time,0.00134
peak_memory_mb,250.25195
total_tflops,4.48317
total_time,0.00503



Experiment 42/73
Config: batch_size=16, seq_len=256, num_heads=8, emb_dim=512
Implementation: Flash1, Causal: False
head_dim = 64

