In [1]:
# Read the GitHub access token from Kaggle's secret store so that it is not
# hard-coded in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# `token` is interpolated into the clone URL in the next cell.
token = user_secrets.get_secret("github_repos_wildcard")

In [2]:
# The token is embedded in the URL for HTTPS authentication. Do not print or
# display `repo_url`: doing so would expose the secret in the notebook output.
repo_url = f"https://{token}@github.com/gaserSami/panther.git"
# Branch passed to `git clone -b` in the next cell.
branch = "autotuner"

In [3]:
!git clone -b {branch} {repo_url}

Cloning into 'panther'...
remote: Enumerating objects: 1046, done.[K
remote: Counting objects: 100% (181/181), done.[K
remote: Compressing objects: 100% (96/96), done.[K
remote: Total 1046 (delta 125), reused 110 (delta 85), pack-reused 865 (from 1)[K
Receiving objects: 100% (1046/1046), 27.77 MiB | 19.15 MiB/s, done.
Resolving deltas: 100% (626/626), done.


In [4]:
# Rename the checkout; later cells refer to /kaggle/working/Panther.
# NOTE(review): not idempotent — if `Panther` already exists, `mv` places the
# new checkout inside it instead of replacing it.
!mv panther Panther

In [5]:
# First uninstall existing torch, torchvision, torchaudio
!pip uninstall -y torch torchvision torchaudio

# Install the specified versions from PyTorch's official CUDA 12.4 wheels
!pip install torch==2.6.0+cu124 torchvision==0.21.0+cu124 torchaudio==2.6.0+cu124 --index-url https://download.pytorch.org/whl/cu124

Found existing installation: torch 2.5.1+cu124
Uninstalling torch-2.5.1+cu124:
  Successfully uninstalled torch-2.5.1+cu124
Found existing installation: torchvision 0.20.1+cu124
Uninstalling torchvision-0.20.1+cu124:
  Successfully uninstalled torchvision-0.20.1+cu124
Found existing installation: torchaudio 2.5.1+cu124
Uninstalling torchaudio-2.5.1+cu124:
  Successfully uninstalled torchaudio-2.5.1+cu124
Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting torch==2.6.0+cu124
  Downloading https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl.metadata (28 kB)
Collecting torchvision==0.21.0+cu124
  Downloading https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp311-cp311-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio==2.6.0+cu124
  Downloading https://download.pytorch.org/whl/cu124/torchaudio-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from

In [6]:
# Sanity check: print the PyTorch and Triton versions that the kernel actually
# imports.
import torch
print(torch.__version__)
import triton
print(triton.__version__)

2.6.0+cu124
3.2.0


In [7]:
# !export LC_ALL="en_US.UTF-8"
# !export LD_LIBRARY_PATH="/usr/lib64-nvidia"
# !export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
# !ldconfig /usr/lib64-nvidia

In [8]:
%%writefile /kaggle/working/Panther/pawX/setup.py
# Build script for the `pawX` extension module. This cell writes it over the
# setup.py at the path above.
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name="pawX",
    ext_modules=[
        CUDAExtension(
            name="pawX",
            # C++ and CUDA sources, given relative to this setup.py.
            sources=[
                "skops.cpp",
                "bindings.cpp",
                "linear.cpp",
                "linear_cuda.cu",
                "cqrrpt.cpp",
                "rsvd.cpp",
                "attention.cpp",
                "conv2d.cpp"
            ],
            # Use system includes and libraries
            include_dirs=["/usr/include/x86_64-linux-gnu"],
            library_dirs=[],
            libraries=["openblas"],
            # -O2 for both the host compiler and nvcc; OpenMP is enabled for host code only.
            extra_compile_args={"cxx": ["-O2", "-fopenmp"], "nvcc": ["-O2"]},
            # Link against LAPACKE and OpenBLAS.
            # NOTE(review): openblas is requested both here and via `libraries` above.
            extra_link_args=["-llapacke", "-lopenblas"]
        )
    ],
    # BuildExtension is torch's build_ext command for C++/CUDA extensions.
    cmdclass={"build_ext": BuildExtension},
)

Overwriting /kaggle/working/Panther/pawX/setup.py


In [9]:
!sudo apt-get install liblapacke-dev




The following additional packages will be installed:
  liblapacke libtmglib-dev libtmglib3
Suggested packages:
  liblapack-doc
The following NEW packages will be installed:
  liblapacke liblapacke-dev libtmglib-dev libtmglib3
0 upgraded, 4 newly installed, 0 to remove and 122 not upgraded.
Need to get 1,071 kB of archives.
After this operation, 12.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libtmglib3 amd64 3.10.0-2ubuntu1 [144 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 liblapacke amd64 3.10.0-2ubuntu1 [435 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/main amd64 libtmglib-dev amd64 3.10.0-2ubuntu1 [134 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/main amd64 liblapacke-dev amd64 3.10.0-2ubuntu1 [358 kB]
Fetched 1,071 kB in 0s (3,233 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /

In [10]:
!cd /kaggle/working/Panther/pawX; python setup.py install
!cd /kaggle/working/Panther/pawX; pip install --no-build-isolation -e .

!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()
!!

        ********************************************************************************
        Please avoid running ``setup.py`` and ``easy_install``.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://github.com/pypa/setuptools/issues/917 for details.
        ********************************************************************************

!!
  self.initialize_options()
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Emitting ninja build

In [11]:
import os

In [12]:
# Run the rest of the notebook from the repository root
# (presumably so that the `panther` package can be imported from the checkout — confirm).
os.chdir("/kaggle/working/Panther/")

In [13]:
# Sanity check: show the current working directory.
!pwd

/kaggle/working/Panther


In [14]:
import time
import numpy as np
import torch
import torch._dynamo
import torch._inductor.config as config
import itertools
import pandas as pd

# Configure torch
# NOTE(review): these are TorchInductor / TorchDynamo (torch.compile) settings.
# The torch.compile call in benchmark_model is commented out, so they presumably
# do not influence the eager-mode measurements below — confirm before relying on them.
config.max_autotune_gemm = False
# Raise both Dynamo cache limits to 65536
# (presumably to avoid hitting the recompilation limit across many configurations — verify).
torch._dynamo.config.cache_size_limit = 2**16
torch._dynamo.config.accumulated_cache_size_limit = 2**16

def is_valid_params(embed_dim, num_heads, num_random_features):
    """
    Check whether a parameter combination is valid.

    A combination is valid when ``num_heads`` is positive and evenly divides
    ``embed_dim``.

    Args:
        embed_dim: Embedding dimension of the attention layer.
        num_heads: Number of attention heads.
        num_random_features: Accepted for signature symmetry with the sweep
            parameters; it does not take part in the check.

    Returns:
        True if the combination is valid, False otherwise.
    """
    # A non-positive head count can never be valid. Rejecting it here also
    # avoids a ZeroDivisionError for 0 and a spurious "valid" for negative
    # divisors (e.g. 128 % -4 == 0).
    if num_heads <= 0:
        return False
    return embed_dim % num_heads == 0

class BenchmarkParams:
    """
    Settings for one benchmark run.

    Attributes mirror the constructor arguments:
        embed_dim, num_heads, num_random_features: model hyper-parameters.
        batch_size, seq_length: sizes used when generating the random inputs.
        num_runs: number of timed iterations per pass.
        warmup: number of untimed iterations executed before timing.
        device, dtype: where and in which precision tensors are created.
        kernel_fn, iscausal: attention options. They are keyword-only and
            default to "softmax" / False, so existing positional calls are
            unaffected and callers no longer need to attach these attributes
            after construction.
    """
    def __init__(self,
                 embed_dim=256,
                 num_heads=8,
                 num_random_features=128,
                 batch_size=64,
                 seq_length=32,
                 num_runs=200,
                 warmup=15,
                 device=torch.device("cuda"),
                 dtype=torch.float32,
                 *,
                 kernel_fn="softmax",
                 iscausal=False):
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_random_features = num_random_features
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.num_runs = num_runs
        self.warmup = warmup
        self.device = device
        self.dtype = dtype
        self.kernel_fn = kernel_fn
        self.iscausal = iscausal

def benchmark_model(model, inputs, model_name, params):
    """
    Generic benchmarking function for any PyTorch model.

    The forward pass is timed in eval mode under ``torch.no_grad()``; the
    backward pass is timed in train mode on ``model(**inputs)[0].sum()``.
    Wall-clock time is measured with ``time.perf_counter``. When
    ``params.device`` is a CUDA device, the device is synchronised around each
    timed region and the peak allocated memory is recorded; on any other device
    the memory figures are reported as NaN.

    Args:
        model: The PyTorch model to benchmark. It is called as
            ``model(**inputs)`` and its result is indexed with ``[0]`` for the
            backward pass. The model is left in train mode on return.
        inputs: Dictionary of input tensors. Must contain the key ``'query'``;
            that tensor's gradient is zeroed after every backward call.
        model_name: Name of the model for logging
        params: Benchmark parameters; ``warmup``, ``num_runs`` and ``device``
            are read.

    Returns:
        Dictionary with ``"forward"`` and ``"backward"`` entries, each holding
        ``mean``/``std``/``times`` (milliseconds) and
        ``memory_mb``/``memory_std``/``memories`` (MiB).
    """
    device = torch.device(params.device)
    on_cuda = device.type == "cuda"

    def _sync():
        # CUDA work is queued asynchronously; wait for it so that the
        # wall-clock interval covers the actual computation.
        if on_cuda:
            torch.cuda.synchronize(device)

    def _reset_peak_memory():
        if on_cuda:
            torch.cuda.reset_peak_memory_stats(device)

    def _peak_memory_mb():
        # Peak allocation since the last reset, in MiB; NaN when not on CUDA.
        if on_cuda:
            return torch.cuda.max_memory_allocated(device) / (1024 * 1024)
        return float("nan")

    # Compile the model
    # model_compiled = torch.compile(
    #     model,
    #     backend="inductor",
    #     fullgraph=True,
    #     dynamic=False
    # )
    model_compiled = model

    # Benchmark forward pass
    print(f"\n=== {model_name} FORWARD PASS BENCHMARK ===")

    # Warmup runs for forward pass
    model_compiled.eval()
    with torch.no_grad():
        for _ in range(params.warmup):
            _ = model_compiled(**inputs)

    _sync()

    # Actual timed runs for forward
    forward_times = []
    forward_memories = []
    with torch.no_grad():
        for _ in range(params.num_runs):
            _reset_peak_memory()
            _sync()
            start = time.perf_counter()
            _ = model_compiled(**inputs)
            _sync()
            end = time.perf_counter()

            forward_times.append((end - start) * 1000)  # Convert to ms
            forward_memories.append(_peak_memory_mb())

    mean_forward = np.mean(forward_times)
    std_forward = np.std(forward_times)
    mean_forward_memory = np.mean(forward_memories)
    std_forward_memory = np.std(forward_memories)
    print(f"{model_name} forward: {mean_forward:.3f} ± {std_forward:.3f} ms, Memory: {mean_forward_memory:.2f} ± {std_forward_memory:.2f} MB")

    # Benchmark backward pass
    print(f"\n=== {model_name} BACKWARD PASS BENCHMARK ===")

    # Get query for backward
    query = inputs['query']

    def _zero_query_grad():
        # `grad` stays None when the query does not require gradients.
        if query.grad is not None:
            query.grad.zero_()

    # Warmup runs for backward pass
    model_compiled.train()
    for _ in range(params.warmup):
        out = model_compiled(**inputs)[0]
        loss = out.sum()
        loss.backward()
        _zero_query_grad()

    _sync()

    # Actual timed runs for backward
    backward_times = []
    backward_memories = []
    for _ in range(params.num_runs):
        # The forward pass is executed outside the timed region; only
        # loss.backward() is measured.
        out = model_compiled(**inputs)[0]
        loss = out.sum()

        _reset_peak_memory()
        _sync()
        start = time.perf_counter()
        loss.backward()
        _sync()
        end = time.perf_counter()

        backward_times.append((end - start) * 1000)  # Convert to ms
        backward_memories.append(_peak_memory_mb())
        _zero_query_grad()

    mean_backward = np.mean(backward_times)
    std_backward = np.std(backward_times)
    mean_backward_memory = np.mean(backward_memories)
    std_backward_memory = np.std(backward_memories)
    print(f"{model_name} backward: {mean_backward:.3f} ± {std_backward:.3f} ms, Memory: {mean_backward_memory:.2f} ± {std_backward_memory:.2f} MB")

    return {
        "forward": {
            "mean": mean_forward,
            "std": std_forward,
            "times": forward_times,
            "memory_mb": mean_forward_memory,
            "memory_std": std_forward_memory,
            "memories": forward_memories
        },
        "backward": {
            "mean": mean_backward,
            "std": std_backward,
            "times": backward_times,
            "memory_mb": mean_backward_memory,
            "memory_std": std_backward_memory,
            "memories": backward_memories
        }
    }

def benchmark_model_factory(model_factory, model_name, params):
    """
    Build a model through ``model_factory`` and benchmark it on random inputs.

    Args:
        model_factory: Callable taking ``params`` and returning the model
        model_name: Label used in the benchmark log lines
        params: Benchmark parameters (sizes, dtype and device of the inputs)

    Returns:
        The result dictionary produced by ``benchmark_model``
    """
    # Seed once, before the model is built, so that the model construction and
    # the query/key/value tensors draw from the same reproducible stream.
    torch.manual_seed(42)
    model = model_factory(params)

    input_shape = (params.batch_size, params.seq_length, params.embed_dim)

    def _random_input(requires_grad=False):
        return torch.randn(*input_shape, dtype=params.dtype, device=params.device,
                           requires_grad=requires_grad)

    # Only the query tracks gradients. The tensors are drawn in the order
    # query, key, value.
    inputs = {
        'query': _random_input(requires_grad=True),
        'key': _random_input(),
        'value': _random_input(),
        # 'attention_mask': None
    }

    return benchmark_model(model, inputs, model_name, params)

# Driver: sweep every parameter combination, benchmark each registered model,
# print a per-combination summary and write all rows to a CSV file.
if __name__ == "__main__":
    import gc

    import torch.nn as nn
    from panther.nn.attention import RandMultiHeadAttention

    # Parameter combinations to test
    embed_dims = [128, 256, 512, 1024]
    num_heads_options = [4, 8, 16]
    num_random_features_options = [64, 128, 256]
    kernel_fn_options = ["softmax", "relu"]
    causal_options = [False]
    # causal_options = [False, True]
    seq_lens = [512, 1024, 2048, 4096, 8192]

    # Define model factories
    def create_attention(p):
        """Build a RandMultiHeadAttention from the settings in ``p``."""
        return RandMultiHeadAttention(
            embed_dim=p.embed_dim,
            num_heads=p.num_heads,
            num_random_features=p.num_random_features,
            dropout=0.0,
            kernel_fn=p.kernel_fn if hasattr(p, 'kernel_fn') else "softmax",
            iscausal=p.iscausal if hasattr(p, 'iscausal') else False,
            device=p.device,
            dtype=p.dtype
        )

    def create_torch_attention(p):
        """Build a torch.nn.MultiheadAttention from the settings in ``p``."""
        return torch.nn.MultiheadAttention(
            embed_dim=p.embed_dim,
            num_heads=p.num_heads,
            dropout=0.0,
            batch_first=True,  # Since your inputs are [batch, seq, dim]
            device=p.device,
            dtype=p.dtype
        )

    # Only the factories listed here are benchmarked.
    models_to_benchmark = [
        (create_torch_attention, "attention")
    ]

    def _result_row(model_name, combo, results=None, is_valid=True, error=None):
        """
        Build one CSV row.

        ``combo`` holds the swept parameters. ``results`` is the dictionary
        returned by the benchmark, or None to fill every metric with NaN. The
        'error' key is only added when ``error`` is given.
        """
        nan = float('nan')
        if results is None:
            fwd = bwd = {'mean': nan, 'std': nan, 'memory_mb': nan}
        else:
            fwd, bwd = results['forward'], results['backward']
        row = {'model': model_name}
        row.update(combo)
        row['forward_mean_ms'] = fwd['mean']
        row['forward_std_ms'] = fwd['std']
        row['backward_mean_ms'] = bwd['mean']
        row['backward_std_ms'] = bwd['std']
        row['forward_memory_mb'] = fwd['memory_mb']
        row['backward_memory_mb'] = bwd['memory_mb']
        row['is_valid'] = is_valid
        if error is not None:
            row['error'] = error
        return row

    # Prepare data structure to store all results
    results_data = []
    results_file = "attention_benchmark_results.csv"

    # Iterate through all parameter combinations
    total_combinations = len(embed_dims) * len(num_heads_options) * len(num_random_features_options) * len(kernel_fn_options) * len(causal_options) * len(seq_lens)
    current_combo = 0

    # The sweep is long; the try/finally makes sure that whatever has been
    # measured is written to disk even if the run is interrupted.
    sweep_completed = False
    try:
        for embed_dim, num_heads, num_random_features, kernel_fn, iscausal, seq_length in itertools.product(
            embed_dims, num_heads_options, num_random_features_options, kernel_fn_options, causal_options, seq_lens
        ):
            current_combo += 1
            print(f"\n\n{'='*20} COMBINATION {current_combo}/{total_combinations} {'='*20}")
            print(f"Embed dimension: {embed_dim}, Num heads: {num_heads}, Num random features: {num_random_features}")
            print(f"Kernel function: {kernel_fn}, Causal: {iscausal}, Sequence length: {seq_length}")

            combo = {
                'embed_dim': embed_dim,
                'num_heads': num_heads,
                'num_random_features': num_random_features,
                'kernel_fn': kernel_fn,
                'iscausal': iscausal,
                'seq_length': seq_length,
            }

            # Check if parameters are valid
            is_valid = is_valid_params(embed_dim, num_heads, num_random_features)

            if not is_valid:
                print(f"INVALID COMBINATION: {embed_dim} is not divisible by {num_heads}")
                print("Skipping benchmarks for this invalid combination")

                # Add invalid entry to results data
                for model_name in [m[1] for m in models_to_benchmark]:
                    results_data.append(_result_row(
                        model_name, combo,
                        is_valid=False,
                        error="Invalid parameter combination",
                    ))
                continue

            # Create parameter object for this combination
            params = BenchmarkParams(
                embed_dim=embed_dim,
                num_heads=num_heads,
                num_random_features=num_random_features,
                seq_length=seq_length
            )
            # Add the new parameters
            params.kernel_fn = kernel_fn
            params.iscausal = iscausal

            all_results = {}
            for model_factory, model_name in models_to_benchmark:
                print(f"\n{'='*20} Benchmarking {model_name} {'='*20}")
                failed = False
                try:
                    results = benchmark_model_factory(model_factory, model_name, params)
                    all_results[model_name] = results

                    # Add result to our data collection
                    results_data.append(_result_row(model_name, combo, results=results))
                except Exception as e:
                    failed = True
                    print(f"Error benchmarking {model_name}: {e}")
                    # Add error entry to data
                    results_data.append(_result_row(model_name, combo, error=str(e)))

                if failed:
                    # Done after the except clause, once the exception (and the
                    # frames it references) has been dropped: return memory
                    # from the failed run so that it does not penalise the
                    # following configurations.
                    gc.collect()
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

            # Print comparative summary for this combination
            if all_results:
                print("\n" + "="*60)
                print(f"{'='*20} SUMMARY FOR CURRENT COMBINATION {'='*20}")
                print("="*60)
                print(f"{'Model':<30} {'Forward (ms)':<25} {'Backward (ms)':<25} {'Forward Memory (MB)':<25} {'Backward Memory (MB)':<25}")
                print("-"*60)

                for model_name, results in all_results.items():
                    fwd = f"{results['forward']['mean']:.3f} ± {results['forward']['std']:.3f}"
                    bwd = f"{results['backward']['mean']:.3f} ± {results['backward']['std']:.3f}"
                    fwd_mem = f"{results['forward']['memory_mb']:.2f}"
                    bwd_mem = f"{results['backward']['memory_mb']:.2f}"
                    print(f"{model_name:<30} {fwd:<25} {bwd:<25} {fwd_mem:<25} {bwd_mem:<25}")

        sweep_completed = True
    finally:
        # Create a DataFrame with all results
        df = pd.DataFrame(results_data)

        # Save results to CSV
        df.to_csv(results_file, index=False)
        if sweep_completed:
            print(f"\nAll benchmark results saved to {results_file}")
        else:
            print(f"\nRun interrupted; partial benchmark results saved to {results_file}")



Embed dimension: 128, Num heads: 4, Num random features: 64
Kernel function: softmax, Causal: False, Sequence length: 512


=== attention FORWARD PASS BENCHMARK ===
attention forward: 10.375 ± 0.382 ms, Memory: 632.38 ± 0.00 MB

=== attention BACKWARD PASS BENCHMARK ===
attention backward: 16.641 ± 0.105 ms, Memory: 1216.75 ± 0.00 MB

Model                          Forward (ms)              Backward (ms)             Forward Memory (MB)       Backward Memory (MB)     
------------------------------------------------------------
attention                      10.375 ± 0.382            16.641 ± 0.105            632.38                    1216.75                  


Embed dimension: 128, Num heads: 4, Num random features: 64
Kernel function: softmax, Causal: False, Sequence length: 1024


=== attention FORWARD PASS BENCHMARK ===
attention forward: 35.850 ± 0.343 ms, Memory: 2288.50 ± 0.00 MB

=== attention BACKWARD PASS BENCHMARK ===
attention backward: 60.072 ± 0.780 ms, Memory: 4464.75 