In [1]:
from kaggle_secrets import UserSecretsClient
# Fetch the GitHub access token from Kaggle's secret store instead of hard-coding it in the notebook.
user_secrets = UserSecretsClient()
# `token` is interpolated into the clone URL in the next cell; do not print or display it.
token = user_secrets.get_secret("github_repos_wildcard")

In [2]:
# HTTPS remote with the access token embedded as the credential part of the URL.
# NOTE(review): `repo_url` contains the secret -- never print, log, or display it.
repo_url = f"https://{token}@github.com/gaserSami/panther.git"
# Branch of the repository that gets checked out by the clone.
branch = "autotuner"

In [3]:
!git clone -b {branch} {repo_url}

Cloning into 'panther'...
remote: Enumerating objects: 1295, done.[K
remote: Counting objects: 100% (236/236), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 1295 (delta 188), reused 187 (delta 162), pack-reused 1059 (from 1)[K
Receiving objects: 100% (1295/1295), 27.81 MiB | 18.51 MiB/s, done.
Resolving deltas: 100% (823/823), done.


In [4]:
# First uninstall existing torch, torchvision, torchaudio
!pip uninstall -y torch torchvision torchaudio

# Install the specified versions from PyTorch's official CUDA 12.4 wheels
!pip install torch==2.6.0+cu124 torchvision==0.21.0+cu124 torchaudio==2.6.0+cu124 --index-url https://download.pytorch.org/whl/cu124

Found existing installation: torch 2.5.1+cu124
Uninstalling torch-2.5.1+cu124:
  Successfully uninstalled torch-2.5.1+cu124
Found existing installation: torchvision 0.20.1+cu124
Uninstalling torchvision-0.20.1+cu124:
  Successfully uninstalled torchvision-0.20.1+cu124
Found existing installation: torchaudio 2.5.1+cu124
Uninstalling torchaudio-2.5.1+cu124:
  Successfully uninstalled torchaudio-2.5.1+cu124
Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting torch==2.6.0+cu124
  Downloading https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl.metadata (28 kB)
Collecting torchvision==0.21.0+cu124
  Downloading https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp311-cp311-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio==2.6.0+cu124
  Downloading https://download.pytorch.org/whl/cu124/torchaudio-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from

In [5]:
!mv panther Panther

In [6]:
!ls

__notebook__.ipynb  Panther


In [7]:
# import os

# os.rename("/kaggle/working/Panther", "/kaggle/working/panther")

In [8]:
%%writefile /kaggle/working/Panther/pawX/setup.py
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

# Name shared by the distribution and the compiled extension module.
PACKAGE_NAME = "pawX"

# C++ / CUDA translation units that make up the extension (order kept as in the original list).
EXTENSION_SOURCES = [
    "skops.cpp",
    "bindings.cpp",
    "linear.cpp",
    "linear_cuda.cu",
    "cqrrpt.cpp",
    "rsvd.cpp",
    "attention.cpp",
    "conv2d.cpp",
]

# Host-compiler and nvcc flags, keyed the way torch's CUDAExtension expects.
COMPILE_ARGS = {"cxx": ["-O2", "-fopenmp"], "nvcc": ["-O2"]}

# The extension is built against the system-wide OpenBLAS / LAPACKE installation.
pawx_extension = CUDAExtension(
    name=PACKAGE_NAME,
    sources=EXTENSION_SOURCES,
    include_dirs=["/usr/include/x86_64-linux-gnu"],
    library_dirs=[],
    libraries=["openblas"],
    extra_compile_args=COMPILE_ARGS,
    extra_link_args=["-llapacke", "-lopenblas"],
)

setup(
    name=PACKAGE_NAME,
    ext_modules=[pawx_extension],
    cmdclass={"build_ext": BuildExtension},
)

Overwriting /kaggle/working/Panther/pawX/setup.py


In [9]:
!sudo apt-get install liblapacke-dev




The following additional packages will be installed:
  liblapacke libtmglib-dev libtmglib3
Suggested packages:
  liblapack-doc
The following NEW packages will be installed:
  liblapacke liblapacke-dev libtmglib-dev libtmglib3
0 upgraded, 4 newly installed, 0 to remove and 122 not upgraded.
Need to get 1,071 kB of archives.
After this operation, 12.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libtmglib3 amd64 3.10.0-2ubuntu1 [144 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 liblapacke amd64 3.10.0-2ubuntu1 [435 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/main amd64 libtmglib-dev amd64 3.10.0-2ubuntu1 [134 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/main amd64 liblapacke-dev amd64 3.10.0-2ubuntu1 [358 kB]
Fetched 1,071 kB in 0s (3,259 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /

In [10]:
# Build and install the pawX extension from the checked-out sources.
# NOTE(review): the extension is installed twice -- first through `setup.py install`, which
# setuptools reports as deprecated in the output below, and then as an editable pip install.
# The second command alone is probably sufficient -- confirm before dropping the first.
!cd /kaggle/working/Panther/pawX; python setup.py install
!cd /kaggle/working/Panther/pawX; pip install --no-build-isolation -e .

!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()
!!

        ********************************************************************************
        Please avoid running ``setup.py`` and ``easy_install``.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://github.com/pypa/setuptools/issues/917 for details.
        ********************************************************************************

!!
  self.initialize_options()
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Emitting ninja build

In [11]:
import torch
print(torch.__version__)
import triton
print(triton.__version__)

2.6.0+cu124
3.2.0


In [12]:
import os
os.chdir("/kaggle/working/Panther")

In [13]:
!pwd

/kaggle/working/Panther


In [14]:
import os
import shutil
from PIL import Image
from tqdm import tqdm

def process_imagenet_jfif(input_path, output_path=None):
    """
    Copy dataset structure to a writable location and convert .jfif files to .jpg

    Every directory found under ``input_path`` is recreated under ``output_path``. Files whose
    extension is ``.jfif`` (case-insensitive) are re-saved through PIL as ``<name>.jpg``; every
    other file is copied with ``shutil.copy2``. A failure on a single file is reported and
    counted, and does not stop the run.

    Args:
        input_path: Original dataset path (read-only in Kaggle)
        output_path: Target path (writable, defaults to /kaggle/working/processed_imagenet)

    Returns:
        The output path that was written to.
    """
    if output_path is None:
        output_path = "/kaggle/working/processed_imagenet"

    print(f"Processing dataset from {input_path} to {output_path}")

    # Image modes the JPEG writer can store directly; anything else is converted to RGB first.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    # Track statistics
    total_dirs = 0
    total_files = 0
    converted_files = 0
    failed_files = 0

    # Create the root output directory
    os.makedirs(output_path, exist_ok=True)

    # Walk through the original dataset
    for root, _dirs, files in os.walk(input_path):
        # Calculate the relative path and create the corresponding directory
        rel_path = os.path.relpath(root, input_path)
        target_dir = os.path.join(output_path, rel_path) if rel_path != '.' else output_path

        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
            total_dirs += 1

        # Process files
        for file in files:
            source_file = os.path.join(root, file)
            base_name, file_ext = os.path.splitext(file)

            # For .jfif files, convert to .jpg
            if file_ext.lower() == '.jfif':
                target_file = os.path.join(target_dir, f"{base_name}.jpg")

                try:
                    # The context manager releases the underlying file handle, which a bare
                    # Image.open() keeps open until the object is garbage collected.
                    with Image.open(source_file) as img:
                        # Modes such as RGBA or P cannot be written as JPEG and would make
                        # save() raise, so normalise them to RGB first.
                        writable = img if img.mode in jpeg_modes else img.convert("RGB")
                        writable.save(target_file)
                    converted_files += 1
                except Exception as e:
                    failed_files += 1
                    print(f"Error processing {source_file}: {e}")
            else:
                # For other files, just copy
                target_file = os.path.join(target_dir, file)
                try:
                    shutil.copy2(source_file, target_file)
                except Exception as e:
                    failed_files += 1
                    print(f"Error copying {source_file}: {e}")

            total_files += 1

            # Print progress
            if total_files % 1000 == 0:
                print(f"Processed {total_files} files ({converted_files} converted)...")

    print(f"\nDone! Created {total_dirs} directories and processed {total_files} files.")
    print(f"Converted {converted_files} .jfif files to .jpg format.")
    if failed_files:
        # Surface failures in the summary so they are not lost in the per-file messages.
        print(f"{failed_files} files could not be processed (see errors above).")
    print(f"New dataset location: {output_path}")

    return output_path

# Usage
dataset_path = "/kaggle/input/imagenet1k-val/imagenet1k-val"
new_dataset_path = process_imagenet_jfif(dataset_path)

Processing dataset from /kaggle/input/imagenet1k-val/imagenet1k-val to /kaggle/working/processed_imagenet

Done! Created 2 directories and processed 50 files.
Converted 50 .jfif files to .jpg format.
New dataset location: /kaggle/working/processed_imagenet


In [15]:
!pip install botorch

Collecting botorch
  Downloading botorch-0.14.0-py3-none-any.whl.metadata (10 kB)
Collecting pyre_extensions (from botorch)
  Downloading pyre_extensions-0.0.32-py3-none-any.whl.metadata (4.0 kB)
Collecting gpytorch==1.14 (from botorch)
  Downloading gpytorch-1.14-py3-none-any.whl.metadata (8.0 kB)
Collecting linear_operator==0.6 (from botorch)
  Downloading linear_operator-0.6-py3-none-any.whl.metadata (15 kB)
Collecting pyro-ppl>=1.8.4 (from botorch)
  Downloading pyro_ppl-1.9.1-py3-none-any.whl.metadata (7.8 kB)
Collecting jaxtyping (from gpytorch==1.14->botorch)
  Downloading jaxtyping-0.3.2-py3-none-any.whl.metadata (7.0 kB)
Collecting pyro-api>=0.1.1 (from pyro-ppl>=1.8.4->botorch)
  Downloading pyro_api-0.1.2-py3-none-any.whl.metadata (2.5 kB)
Collecting wadler-lindig>=0.1.3 (from jaxtyping->gpytorch==1.14->botorch)
  Downloading wadler_lindig-0.1.5-py3-none-any.whl.metadata (17 kB)
Downloading botorch-0.14.0-py3-none-any.whl (738 kB)
[2K   [90m━━━━━━━━━━━━━━━

In [16]:
%%writefile /kaggle/working/Panther/panther/utils/SkAutoTuner/ModelVisualizer.py
import torch.nn as nn
import os
import re
import json
import tempfile
import webbrowser
import logging
from typing import Dict, Any, Optional, Tuple

try:
    from graphviz import Digraph, ExecutableNotFound
except ImportError:
    Digraph = None
    ExecutableNotFound = Exception

# Setup a logger for this module
logger = logging.getLogger(__name__)

class ModelVisualizer:
    """
    A utility class for visualizing PyTorch model structures interactively.
    It generates an HTML file with an SVG representation of the model,
    allowing users to explore module hierarchy, view details, and search.

    All functionality is exposed through static methods; the class holds no instance state.
    """
    
    # Get the path to the visualization assets
    # Use a simpler approach that doesn't rely on importlib.resources
    # (template.html, css/styles.css and js/script.js are read from this directory.)
    _ASSETS_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'visualization_assets')

    @staticmethod
    def _build_module_tree(named_modules: iter) -> Tuple[Dict[str, Any], Dict[str, str]]:
        """
        Builds a nested dict representing the module hierarchy and collects module types.

        Args:
            named_modules: Iterable of ``(dotted_name, module)`` pairs, as produced by
                ``nn.Module.named_modules()``. The root module has the empty name.

        Returns:
            A ``(tree, module_types)`` tuple. ``tree`` maps each name component to a nested
            dict of its children (leaves map to ``{}``); ``module_types`` maps each full
            dotted name to the module's class name.
        """
        tree = {}
        module_types = {}
        
        for full_name, module in named_modules:
            module_types[full_name] = type(module).__name__
            # The root module has an empty name and contributes no tree level.
            parts = full_name.split('.') if full_name else []
            current_level = tree
            for part in parts:
                current_level = current_level.setdefault(part, {})
                
        return tree, module_types
    
    @staticmethod
    def _print_tree(subtree: Dict[str, Any], module_types: Dict[str, str], 
                    full_path: str = '', prefix: str = '', is_last: bool = True):
        """
        Recursively prints the nested dict as an ASCII tree with module types.

        Args:
            subtree: Nested dict of children to print (see ``_build_module_tree``).
            module_types: Mapping from full dotted module name to class name.
            full_path: Dotted name of the module owning ``subtree``; empty for the root.
            prefix: Indentation / guide-line text printed before each entry.
            is_last: Accepted for backward compatibility; the connector of each entry is
                derived from that entry's own position among its siblings.
        """
        entries = sorted(subtree.items())
        for idx, (name, child) in enumerate(entries):
            is_child_last = (idx == len(entries) - 1)
            # Choose the connector per entry: only the final sibling closes the branch.
            branch = '└─ ' if is_child_last else '├─ '
            current_path = f"{full_path}.{name}" if full_path else name
            module_type = f" ({module_types.get(current_path, 'UnknownType')})" if current_path in module_types else ""
            print(prefix + branch + name + module_type + ('/' if child else ''))
            if child:
                # Keep a vertical guide line under entries that still have siblings below them.
                extension = '    ' if is_child_last else '│   '
                ModelVisualizer._print_tree(child, module_types, current_path, prefix + extension, is_child_last)
    
    @staticmethod
    def print_module_tree(model: nn.Module, root_name: str = 'model'):
        """
        Prints the modules of a PyTorch model in a tree structure with their types.

        Args:
            model: The model whose module hierarchy is printed.
            root_name: Label used for the root line of the tree.
        """
        tree, module_types = ModelVisualizer._build_module_tree(model.named_modules())
        module_types[''] = type(model).__name__
        print(f"{root_name} ({module_types.get('', 'UnknownType')})/")
        # module_types is keyed by names relative to the model (e.g. "layer1.0"), so the walk
        # must start from an empty path; starting from root_name would make every lookup miss
        # and no module type would be printed.
        ModelVisualizer._print_tree(tree, module_types, full_path='')
    
    @staticmethod
    def _collect_module_info(model: nn.Module) -> Dict[str, Dict[str, Any]]:
        """
        Collects detailed information about each module in the model.

        Args:
            model: The model to inspect.

        Returns:
            A dict keyed by dotted module name (the model itself is stored under ``'root'``).
            Each value holds the module's type name, trainable-parameter count, trainable
            flag, class string and first docstring line, plus type-specific
            hyper-parameters for the recognised ``torch.nn`` layer types.
        """
        module_info = {}

        try:
            root_param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
            root_is_trainable = any(p.requires_grad for p in model.parameters())
        except AttributeError:
            root_param_count = 0
            root_is_trainable = False
            logger.warning("Could not retrieve parameter info for the root model.")

        module_info['root'] = {
            'type': type(model).__name__,
            'parameters': root_param_count,
            'trainable': root_is_trainable,
            'class': str(type(model)),
            'docstring': model.__doc__.strip().split('\n')[0] if model.__doc__ else "N/A"
        }
        
        for name, module in model.named_modules():
            # The root module (empty name) was already recorded under 'root'.
            if not name:
                continue
                
            try:
                param_count = sum(p.numel() for p in module.parameters() if p.requires_grad)
                is_trainable = any(p.requires_grad for p in module.parameters())
            except AttributeError:
                param_count = 0
                is_trainable = False
                logger.warning(f"Module {name} ({type(module).__name__}) does not have 'parameters' attribute or it's not iterable.")
            except RuntimeError:
                param_count = 0
                is_trainable = False
                logger.warning(f"Could not count parameters for module {name} ({type(module).__name__}).")

            info = {
                'type': type(module).__name__,
                'parameters': param_count,
                'trainable': is_trainable,
                'class': str(type(module)),
                'docstring': module.__doc__.strip().split('\n')[0] if module.__doc__ else "N/A"
            }
            
            if isinstance(module, nn.Conv2d):
                info.update({
                    'in_channels': module.in_channels,
                    'out_channels': module.out_channels,
                    'kernel_size': module.kernel_size,
                    'stride': module.stride,
                    'padding': module.padding,
                    'groups': module.groups,
                    'dilation': module.dilation,
                })
            elif isinstance(module, nn.Linear):
                info.update({
                    'in_features': module.in_features,
                    'out_features': module.out_features,
                    'bias': module.bias is not None,
                })
            elif isinstance(module, nn.BatchNorm2d):
                info.update({
                    'num_features': module.num_features,
                    'eps': module.eps,
                    'momentum': module.momentum,
                    'affine': module.affine,
                })
            elif isinstance(module, (nn.RNN, nn.LSTM, nn.GRU)):
                info.update({
                    'input_size': module.input_size,
                    'hidden_size': module.hidden_size,
                    'num_layers': module.num_layers,
                    'bidirectional': module.bidirectional,
                    'dropout': module.dropout if hasattr(module, 'dropout') else 0,
                    'bias': module.bias
                })
            elif isinstance(module, nn.Dropout):
                info.update({
                    'p': module.p,
                    'inplace': module.inplace,
                })
            elif isinstance(module, (nn.MaxPool2d, nn.AvgPool2d, nn.AdaptiveAvgPool2d, nn.AdaptiveMaxPool2d)):
                # Adaptive pools have no kernel/stride/padding, hence the 'N/A' fallbacks.
                info_pool = {
                    'kernel_size': getattr(module, 'kernel_size', 'N/A'),
                    'stride': getattr(module, 'stride', 'N/A'),
                    'padding': getattr(module, 'padding', 'N/A'),
                }
                if isinstance(module, (nn.AdaptiveAvgPool2d, nn.AdaptiveMaxPool2d)):
                    info_pool['output_size'] = module.output_size
                info.update(info_pool)
            elif isinstance(module, nn.Embedding):
                info.update({
                    'num_embeddings': module.num_embeddings,
                    'embedding_dim': module.embedding_dim,
                    'padding_idx': module.padding_idx,
                })
            elif isinstance(module, nn.LayerNorm):
                info.update({
                    'normalized_shape': module.normalized_shape,
                    'eps': module.eps,
                    'elementwise_affine': module.elementwise_affine,
                })
            elif isinstance(module, nn.MultiheadAttention):
                info.update({
                    'embed_dim': module.embed_dim,
                    'num_heads': module.num_heads,
                    'dropout': module.dropout,
                    # The projection bias lives in in_proj_bias (None when bias=False).
                    'bias': getattr(module, 'in_proj_bias', None) is not None,
                    # The module keeps no add_bias_kv attribute; the option is visible only
                    # through bias_k, which is a parameter when enabled and None otherwise.
                    'add_bias_kv': getattr(module, 'bias_k', None) is not None,
                    'add_zero_attn': hasattr(module, 'add_zero_attn') and module.add_zero_attn,
                    'kdim': getattr(module, 'kdim', None),
                    'vdim': getattr(module, 'vdim', None),
                })
            elif isinstance(module, (nn.TransformerEncoderLayer, nn.TransformerDecoderLayer)):
                # Hyper-parameters are read back from the layer's sub-modules when present.
                info.update({
                    'd_model': module.self_attn.embed_dim if hasattr(module, 'self_attn') else getattr(module, 'd_model', 'N/A'),
                    'nhead': module.self_attn.num_heads if hasattr(module, 'self_attn') else getattr(module, 'nhead', 'N/A'),
                    'dim_feedforward': module.linear1.out_features if hasattr(module, 'linear1') else getattr(module, 'dim_feedforward', 'N/A'),
                    'dropout': module.dropout.p if hasattr(module, 'dropout') else getattr(module, 'dropout', 'N/A'),
                    'activation': type(module.activation).__name__ if hasattr(module, 'activation') else getattr(module, 'activation', 'N/A')
                })
                if isinstance(module, nn.TransformerDecoderLayer):
                     info['cross_attention'] = True
            
            module_info[name] = info
            
        return module_info
    
    @staticmethod
    def create_interactive_visualization(model: nn.Module, output_path: Optional[str] = None, 
                                        graph_attrs: Optional[Dict[str, str]] = None,
                                        node_attrs: Optional[Dict[str, str]] = None,
                                        edge_attrs: Optional[Dict[str, str]] = None,
                                        open_browser: bool = True,
                                        max_label_length: int = 30) -> str:
        """
        Creates an interactive visualization of the model structure.

        The module hierarchy is rendered to SVG with Graphviz, every node is tagged with a
        ``data-name`` attribute, and the SVG, the module details (as JSON), the stylesheet
        and the script are substituted into the HTML template from ``_ASSETS_PATH``.

        Args:
            model: The model to visualize.
            output_path: Where to write the HTML file; a temporary ``.html`` file is
                created when omitted.
            graph_attrs: Graphviz graph attributes overriding the defaults.
            node_attrs: Graphviz node attributes overriding the defaults.
            edge_attrs: Graphviz edge attributes overriding the defaults.
            open_browser: Whether to try opening the result in a web browser.
            max_label_length: Node labels longer than this are truncated with "...".

        Returns:
            The path of the written HTML file.

        Raises:
            ImportError: If the ``graphviz`` Python package is not installed.
            FileNotFoundError: If the template, CSS or JavaScript asset is missing.
        """
        if Digraph is None:
            raise ImportError("The graphviz Python package is required. Install with 'pip install graphviz'.")

        default_graph_attrs = {
            'rankdir': 'TB', 'bgcolor': 'transparent', 'splines': 'ortho',
            'fontname': 'Arial, Helvetica, sans-serif', 'fontsize': '14',
            'nodesep': '0.6', 'ranksep': '0.8', 'concentrate': 'true',
            'overlap': 'false',
        }
        if graph_attrs:
            default_graph_attrs.update(graph_attrs)
            
        default_node_attrs = {
            'style': 'filled,rounded', 'shape': 'box', 'fillcolor': '#E5F5FD', 
            'color': '#4285F4', 'fontname': 'Arial, Helvetica, sans-serif', 
            'fontsize': '11', 'height': '0.4', 'margin': '0.1,0.05'
        }
        if node_attrs:
            default_node_attrs.update(node_attrs)

        default_edge_attrs = {
            'color': '#757575', 'arrowsize': '0.7'
        }
        if edge_attrs:
            default_edge_attrs.update(edge_attrs)
        
        dot = Digraph(
            'model_visualization', 
            format='svg',
            graph_attr=default_graph_attrs
        )
        dot.attr('node', **default_node_attrs)
        dot.attr('edge', **default_edge_attrs)
        
        raw_named_modules = list(model.named_modules())
        tree, module_types = ModelVisualizer._build_module_tree(raw_named_modules)
        module_info = ModelVisualizer._collect_module_info(model)
        
        root_name_for_graph = 'model'
        root_display_type = module_info['root']['type']
        root_label = f"{root_name_for_graph} ({root_display_type})"
        if len(root_label) > max_label_length:
            root_label = root_label[:max_label_length-3] + "..."
        
        root_id = "node_root_model"
        dot.node(root_id, root_label, tooltip=f'Root: {root_display_type}\nParameters: {module_info["root"]["parameters"]:,}',
                 id=root_id, data_name='root', fillcolor='#D1E7F7', shape='Mrecord')
        
        # Maps module path ('root' for the model itself) to the SVG element id of its node.
        node_ids = {'root': root_id}
        
        def add_nodes_and_edges(current_subtree, parent_full_path, parent_node_id):
            # Depth-first walk: one graph node per module plus an edge from its parent.
            for name_part, children_subtree in sorted(current_subtree.items()):
                current_full_path = f"{parent_full_path}.{name_part}" if parent_full_path != 'root' else name_part
                node_id = f"node_{current_full_path.replace('.', '_').replace('-', '_')}"
                node_ids[current_full_path] = node_id
                module_type_name = module_types.get(current_full_path, "Unknown")
                
                label = f"{name_part} ({module_type_name})"
                if len(label) > max_label_length:
                    label = label[:max_label_length-3] + "..."
                tooltip_parts = [f"Name: {current_full_path}", f"Type: {module_type_name}"]
                current_module_details = module_info.get(current_full_path)
                if current_module_details:
                    tooltip_parts.append(f"Parameters: {current_module_details['parameters']:,}")
                    tooltip_parts.append(f"Trainable: {'Yes' if current_module_details['trainable'] else 'No'}")
                node_fillcolor = default_node_attrs.get('fillcolor', '#E5F5FD')
                # Leaf modules get a distinct fill colour.
                if not children_subtree:
                    node_fillcolor = "#C2E0F4" 
                
                dot.node(node_id, label, tooltip='\n'.join(tooltip_parts), fillcolor=node_fillcolor,
                         id=node_id, data_name=current_full_path)
                edge_id = f"edge_{parent_node_id}_{node_id}"
                dot.edge(parent_node_id, node_id, id=edge_id, data_source=parent_node_id, data_target=node_id)
                if children_subtree:
                    add_nodes_and_edges(children_subtree, current_full_path, node_id)
        
        add_nodes_and_edges(tree, 'root', root_id)
        
        svg_content_bytes = dot.pipe(format='svg')
        svg_content = svg_content_bytes.decode('utf-8')
        
        # Post-process the SVG: add data-name="<module path>" to each node's <g> element and
        # to the shapes/text inside it.
        for node_path_key, node_html_id in node_ids.items():
            data_name_attr_str = f'data-name="{node_path_key}"'
            g_block_pattern = rf'(<g[^>]*id="{re.escape(node_html_id)}"[^>]*>)([\s\S]*?)(</g>)'
            
            def process_g_block(match_obj):
                g_open_tag, g_content, g_close_tag = match_obj.groups()
                if data_name_attr_str not in g_open_tag:
                    g_open_tag = g_open_tag.rstrip('>') + f' {data_name_attr_str}>'
                def add_data_to_visual_child(child_match):
                    child_tag_open, child_tag_rest = child_match.groups()
                    if data_name_attr_str not in child_tag_open:
                        return child_tag_open + f' {data_name_attr_str}' + child_tag_rest
                    return child_match.group(0)
                
                g_content = re.sub(r'(<(?:rect|polygon|ellipse|text|path|circle)\b[^>]*?)(/?>)', 
                                   add_data_to_visual_child, 
                                   g_content)
                return g_open_tag + g_content + g_close_tag
            
            svg_content = re.sub(g_block_pattern, process_g_block, svg_content)

        js_module_info = json.dumps(module_info)
        
        if output_path is None:
            # mkstemp returns an open descriptor; close it and reopen the path by name below.
            fd, output_path = tempfile.mkstemp(suffix='.html')
            os.close(fd)
        
        template_path = os.path.join(ModelVisualizer._ASSETS_PATH, 'template.html')
        if not os.path.exists(template_path):
            logger.error(f"HTML template not found at {template_path}")
            raise FileNotFoundError(f"HTML template not found at {template_path}")
        with open(template_path, 'r', encoding='utf-8') as f:
            template_content = f.read()
            
        css_path = os.path.join(ModelVisualizer._ASSETS_PATH, 'css', 'styles.css')
        if not os.path.exists(css_path):
            logger.error(f"CSS file not found at {css_path}")
            raise FileNotFoundError(f"CSS file not found at {css_path}")
        with open(css_path, 'r', encoding='utf-8') as f:
            css_content = f.read()
            
        js_path = os.path.join(ModelVisualizer._ASSETS_PATH, 'js', 'script.js')
        if not os.path.exists(js_path):
            logger.error(f"JavaScript file not found at {js_path}")
            raise FileNotFoundError(f"JavaScript file not found at {js_path}")
        with open(js_path, 'r', encoding='utf-8') as f:
            js_content = f.read()
            
        # Fill the template placeholders to produce a single self-contained HTML document.
        html_content = template_content.replace('{{SVG_CONTENT}}', svg_content)
        html_content = html_content.replace('{{MODULE_INFO}}', js_module_info)
        html_content = html_content.replace('{{CSS_CONTENT}}', css_content)
        html_content = html_content.replace('{{JS_CONTENT}}', js_content)
        
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
        
        if open_browser:
            # Best effort only: failing to launch a browser must not fail the export.
            try:
                url = 'file://' + os.path.abspath(output_path)
                webbrowser.open(url)
                logger.info(f"Visualization opened in browser: {url}")
            except Exception as e:
                logger.warning(f"Could not automatically open browser: {e}")
            
        logger.info(f"Interactive model visualization saved to: {output_path}")
        return output_path

Overwriting /kaggle/working/Panther/panther/utils/SkAutoTuner/ModelVisualizer.py


In [17]:
import os
import time
import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset, Subset, random_split
from tqdm import tqdm
from torchvision.models import resnet50, ResNet50_Weights
import tarfile
from pathlib import Path
import requests
from torch import nn, optim
from torchvision import datasets, transforms, models

# Import components
from panther.tuner.SkAutoTuner import *

# Setting up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

##################################### HELPERS #######################################

def count_parameters(model):
    """Return the number of elements held in the model's trainable parameters."""
    trainable_total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are frozen and left out of the count.
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

def model_size_info(model):
    """
    Summarise the trainable-parameter footprint of a model.

    Returns a dict with the total trainable parameter count ("total_params"), the same
    figure in millions ("total_params_millions"), and a per-layer breakdown
    ("layer_params") covering the Conv2d and Linear modules, keyed by module name.
    """
    tracked_types = (torch.nn.Conv2d, torch.nn.Linear)

    # Per-layer trainable counts, restricted to the layer types listed above.
    per_layer = {
        layer_name: sum(w.numel() for w in layer.parameters() if w.requires_grad)
        for layer_name, layer in model.named_modules()
        if isinstance(layer, tracked_types)
    }

    n_trainable = count_parameters(model)
    return {
        "total_params": n_trainable,
        "total_params_millions": n_trainable / 1e6,
        "layer_params": per_layer,
    }

def dump_tensor_info(tensor, name="Tensor"):
    """Print a three-line summary of a tensor: metadata, value statistics, first elements."""
    header = f"{name}: shape={tensor.shape}, dtype={tensor.dtype}, device={tensor.device}"
    print(header)

    # Statistics are computed only after the header is printed, in min / max / mean order.
    lowest = tensor.min().item()
    highest = tensor.max().item()
    average = tensor.mean().item()
    print(f"  - Values: min={lowest:.4f}, max={highest:.4f}, mean={average:.4f}")

    leading = tensor.flatten()[:5]
    print(f"  - First few values: {leading}")

def measure_time(func, *args, n_runs=50, warmup=5):
    """
    Measure the average execution time of a function.

    Args:
        func: Callable to time.
        *args: Positional arguments forwarded to ``func`` on every call.
        n_runs: Number of timed calls the average is taken over.
        warmup: Number of untimed calls made before timing starts.

    Returns:
        Average seconds per call over the timed runs.
    """
    use_cuda = torch.cuda.is_available()

    # Warmup
    for _ in range(warmup):
        func(*args)

    # Wait for pending GPU work from the warmup so it is not billed to the timed runs.
    if use_cuda:
        torch.cuda.synchronize()

    # perf_counter is a monotonic, high-resolution clock; time.time() follows the system
    # wall clock, which can be adjusted while the measurement is running.
    start = time.perf_counter()
    for _ in range(n_runs):
        func(*args)
        # CUDA calls return before the kernels finish; synchronise so each run is fully timed.
        if use_cuda:
            torch.cuda.synchronize()
    end = time.perf_counter()

    return (end - start) / n_runs

def measure_memory(model, input_tensor):
    """
    Report the peak CUDA memory allocated during one inference pass, in MB.

    Returns 0 when CUDA is unavailable, in which case the model is not run at all.
    """
    if torch.cuda.is_available():
        # Start from a clean slate so the peak reflects this forward pass only.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

        with torch.no_grad():
            model(input_tensor)

        bytes_per_mb = 1024 * 1024
        return torch.cuda.max_memory_allocated() / bytes_per_mb

    return 0  # Cannot measure CUDA memory on CPU

def calculate_accuracy(outputs, labels):
    """
    Calculate top-1 and top-5 accuracy for one batch.

    Args:
        outputs: Score tensor indexed as (sample, class).
        labels: Tensor holding the target class index of each sample.

    Returns:
        ``(top1, top5)`` as fractions of the batch. When the model has fewer than five
        classes, "top-5" is computed over all available classes instead of raising.
    """
    # topk raises if asked for more entries than there are classes, so cap k.
    k = min(5, outputs.size(1))
    _, preds = outputs.topk(k, 1, True, True)
    # Transpose so that row r holds every sample's rank-r prediction.
    preds = preds.t()
    correct = preds.eq(labels.view(1, -1).expand_as(preds))
    batch_size = labels.size(0)
    top1 = correct[:1].reshape(-1).float().sum(0, keepdim=True).item() / batch_size
    top5 = correct[:k].reshape(-1).float().sum(0, keepdim=True).item() / batch_size
    return top1, top5

def evaluate_model(model, dataloader):
    """
    Compute dataset-level top-1 and top-5 accuracy for a model.

    The model is switched to eval mode, each batch is moved to the module-level `device`,
    and per-batch accuracies are weighted by batch size before averaging.

    Returns:
        (top1_accuracy, top5_accuracy) over every sample yielded by the dataloader.
    """
    model.eval()

    top1_weighted = 0
    top5_weighted = 0
    seen = 0

    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            logits = model(batch_inputs)

            n_in_batch = batch_inputs.size(0)
            batch_top1, batch_top5 = calculate_accuracy(logits, batch_labels)

            # Weight each batch by its size so a smaller final batch is not over-counted.
            top1_weighted += batch_top1 * n_in_batch
            top5_weighted += batch_top5 * n_in_batch
            seen += n_in_batch

    return top1_weighted / seen, top5_weighted / seen

def accuracy_eval_func(model, val_loader, orig_model=None):
    """
    Score a model by top-1 accuracy, optionally relative to a reference model.

    Args:
        model: The model to evaluate
        val_loader: Validation data loader handed to evaluate_model
        orig_model: Optional reference model evaluated on the same loader
    Returns:
        The model's top-1 accuracy when orig_model is None; otherwise the model's top-1
        accuracy minus the reference model's (positive means the model is more accurate).
    """
    model.eval()
    top1_acc, top5_acc = evaluate_model(model, val_loader)

    if orig_model is not None:
        # The reference model is re-evaluated on every call to obtain the baseline.
        orig_model.eval()
        orig_top1_acc = evaluate_model(orig_model, val_loader)[0]

        # Signed difference between the two top-1 accuracies.
        score = top1_acc - orig_top1_acc

        print("running the accuracy validation function")
        print(f"Top-1 Accuracy: {top1_acc:.4f} (original: {orig_top1_acc:.4f}, diff: {score:.4f})")
        print(f"Top-5 Accuracy: {top5_acc:.4f}")
        print(f"Final score: {score:.4f}")

        return score

    # No reference model: the raw top-1 accuracy is the score.
    return top1_acc

def get_data(dataset_path="/kaggle/working/processed_imagenet", batch_size=32,
             num_workers=2, memory_batch_size=None):
    """Build the validation loader and an on-device batch for memory measurements.

    Nothing is downloaded: images are read from an existing ``ImageFolder``
    directory and preprocessed with the ResNet50 IMAGENET1K_V1 transforms.

    Args:
        dataset_path: Root directory handed to ``datasets.ImageFolder``.
        batch_size: Batch size of the returned validation loader.
        num_workers: Number of DataLoader worker processes.
        memory_batch_size: Number of leading samples stacked into the memory
            test batch. ``None`` (default) stacks the *entire* dataset, which
            is the historical behaviour; pass a smaller value when the dataset
            does not fit on the device. Values above the dataset size are
            clamped to it.

    Returns:
        Tuple ``(val_loader, memory_batch)`` where ``val_loader`` iterates the
        dataset in order (no shuffling) and ``memory_batch`` is a stacked image
        tensor already moved to the module-level ``device``.

    Raises:
        ValueError: If ``memory_batch_size`` is given and smaller than 1.
    """
    print("Preparing dataset...")

    transform = ResNet50_Weights.IMAGENET1K_V1.transforms()
    single_class_dataset = datasets.ImageFolder(root=dataset_path, transform=transform)
    num_samples = len(single_class_dataset)

    # Deterministic order (shuffle=False) so repeated evaluations see the same batches.
    val_loader = DataLoader(
        single_class_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )

    # Batch used for memory measurements; by default this is every sample.
    if memory_batch_size is None:
        memory_batch_size = num_samples
    elif memory_batch_size < 1:
        raise ValueError(f"memory_batch_size must be >= 1, got {memory_batch_size}")
    else:
        memory_batch_size = min(memory_batch_size, num_samples)

    memory_batch = torch.stack(
        [single_class_dataset[i][0] for i in range(memory_batch_size)]
    ).to(device)

    print(f"Dataset validation set: {num_samples} samples")

    return val_loader, memory_batch

def test_specific_layers(model_name="resnet18"):
    """Run the SKAutoTuner "Strategy 1" experiment on a pretrained ResNet-50.

    Steps, all reported via ``print``:
      1. Load ResNet-50 (IMAGENET1K_V1 weights) and keep a deep copy as reference.
      2. Report parameter counts, model structure, baseline accuracy, memory
         usage and speed of the original model.
      3. Grid-search ``num_terms`` / ``low_rank`` for ``layer4.0.conv3`` with
         SKAutoTuner, using accuracy as a constraint and speed as the objective.
      4. Apply the best parameters and report the same metrics for the tuned model.

    Strategies 2 and 3 are kept below as commented-out code and do not run.

    Args:
        model_name: Currently unused. The function always loads torchvision's
            ResNet-50 regardless of this value; the parameter (and its default)
            is kept so existing callers keep working.

    Returns:
        None.
    """
    # Imported locally so this function does not depend on the `import copy`
    # that is otherwise only visible inside the notebook's `__main__` guard.
    import copy

    # Load pre-trained CNN
    model = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V1).to(device)
    model.eval()

    # Untouched reference copy used for accuracy/memory/speed comparisons
    orig_model = copy.deepcopy(model)

    # Get parameter counts before optimization
    print("\n===== Model Parameter Counts Before Optimization =====")
    orig_params = model_size_info(model)
    print(f"Total parameters: {orig_params['total_params_millions']:.2f}M")
    print("Parameters by layer:")
    for layer_name, param_count in orig_params['layer_params'].items():
        print(f"  - {layer_name}: {param_count/1e6:.2f}M parameters")

    # Get real validation data
    val_loader, memory_batch = get_data()

    print("\n===== Original Model Structure =====")
    ModelVisualizer.print_module_tree(model)

    # Accuracy objective handed to the tuner: top-1 difference to the reference model
    def acc_eval_func(model):
        return accuracy_eval_func(model, val_loader, orig_model)

    # Speed objective handed to the tuner
    def speed_eval_func(model):
        # Time inference on the first validation batch
        batch, _ = next(iter(val_loader))
        batch = batch.to(device)

        def infer(model, x):
            with torch.no_grad():
                return model(x)

        # Higher is better (inverse of the measured time).
        # NOTE(review): this is 1 / measure_time(...) for one batch; the
        # "samples/sec" label used when printing it assumes measure_time
        # returns seconds per sample — confirm against measure_time.
        throughput = 1.0 / measure_time(infer, model, batch, n_runs=50)
        return throughput

    # Get baseline accuracy on Dataset
    print("\nBaseline CNN accuracy on Dataset:")
    baseline_top1, baseline_top5 = evaluate_model(model, val_loader)
    print(f"Top-1 accuracy: {baseline_top1:.4f}")
    print(f"Top-5 accuracy: {baseline_top5:.4f}")
    print(f"Original model memory usage: {measure_memory(orig_model, memory_batch):.2f}MB")
    print(f"Original model speed: {speed_eval_func(orig_model):.2f} samples/sec")

    # Strategy 1: Tuning specific network blocks
    print("\n===== Strategy 1: Tuning specific network blocks =====")

    # Earlier variant that tuned conv3 of all three layer4 blocks as one group:
    # configs_strategy1 = TuningConfigs([
    #     LayerConfig(
    #         layer_names={"pattern": "layer4.[012].conv3"},
    #         params={
    #             "num_terms": [1, 2, 3],
    #             "low_rank": [16, 32, 64, 96],
    #         },
    #         separate=False  # Tune these layers as a group
    #     ),
    # ])

    configs_strategy1 = TuningConfigs([
        LayerConfig(
            # Pattern selecting a single layer: conv3 of the first layer4 block
            layer_names={"pattern": "layer4.0.conv3"},
            params={
                "num_terms": [1, 2, 3],
                "low_rank": [16, 32, 64, 96],
            },
            separate=False  # Tune these layers as a group
        ),
    ])

    # acc_eval_func returns (tuned top-1 - original top-1), an absolute
    # difference, so -0.2 tolerates a top-1 drop of up to 0.20 (20 percentage
    # points) — not "80% of the original accuracy".
    # NOTE(review): assumes SKAutoTuner accepts scores >= this threshold — confirm.
    accuracy_threshold = -0.2
    print(f"Setting accuracy threshold to {accuracy_threshold:.4f}")

    # Create tuner with both accuracy and optimization functions
    tuner_strategy1 = SKAutoTuner(
        model=copy.deepcopy(model),
        configs=configs_strategy1,
        accuracy_eval_func=acc_eval_func,
        search_algorithm=GridSearch(),
        verbose=True,
        accuracy_threshold=accuracy_threshold,  # Set minimum acceptable accuracy score
        num_runs_per_param=20,
        # Keyword spelling ("optmization") is the one this notebook passes to SKAutoTuner.
        optmization_eval_func=speed_eval_func   # Optimize for speed after meeting accuracy threshold
        )

    # Run tuning
    print("\nRunning block-specific tuning...")
    best_params = tuner_strategy1.tune()
    print(f"Best parameters: {best_params}")
    # Apply best parameters
    tuned_model_strategy1 = tuner_strategy1.apply_best_params()

    print("\n===== Tuned Model Structure (Strategy 1) =====")
    ModelVisualizer.print_module_tree(tuned_model_strategy1)

    # Get parameter counts after optimization
    print("\n===== Model Parameter Counts After Strategy 1 Optimization =====")
    tuned_params = model_size_info(tuned_model_strategy1)
    print(f"Original model: {orig_params['total_params_millions']:.2f}M parameters")
    print(f"Tuned model: {tuned_params['total_params_millions']:.2f}M parameters")
    print(f"Reduction: {(1 - tuned_params['total_params_millions']/orig_params['total_params_millions'])*100:.2f}%")

    print("\nParameters by layer:")
    orig_layer_params = orig_params['layer_params']
    tuned_layer_params = tuned_params['layer_params']
    for layer_name in sorted(set(orig_layer_params) | set(tuned_layer_params)):
        orig_count = orig_layer_params.get(layer_name, 0) / 1e6
        tuned_count = tuned_layer_params.get(layer_name, 0) / 1e6

        # Only layers present (with parameters) in both models are comparable
        if orig_count > 0 and tuned_count > 0:
            reduction = (1 - tuned_count/orig_count) * 100
            print(f"  - {layer_name}: {orig_count:.2f}M → {tuned_count:.2f}M ({reduction:.2f}% reduction)")

    # Test the tuned model
    print("\nEvaluating block-tuned model:")
    final_score = accuracy_eval_func(tuned_model_strategy1, val_loader, orig_model)
    print(f"accuracy score: {final_score:.4f}")
    print(f"speed score: {speed_eval_func(tuned_model_strategy1):.2f} samples/sec")
    print(f"memory usage: {measure_memory(tuned_model_strategy1, memory_batch):.2f}MB")

    # ------------------------------------------------------------------
    # Disabled experiments below (kept for reference; none of this runs).
    # ------------------------------------------------------------------

    # # Strategy 2: Layer-specific parameter tuning
    # print("\n===== Strategy 2: Layer-specific parameter tuning =====")

    # # Create more granular configs for each type of layer
    # configs_strategy2 = TuningConfigs([
    #     LayerConfig(
    #         layer_names={"pattern": "layer4.[012].conv3"},  # conv3 of each layer4 bottleneck block
    #         params={
    #             "num_terms": [1, 2, 3],
    #             "low_rank": [16, 32, 64, 96],
    #         },
    #         separate=False  # Tune these layers as a group
    #     ),
    # ])

    # # Create tuner with proper accuracy threshold
    # tuner_strategy2 = SKAutoTuner(
    #     model=copy.deepcopy(model),
    #     configs=configs_strategy2,
    #     accuracy_eval_func=acc_eval_func,
    #     accuracy_threshold=accuracy_threshold,
    #     search_algorithm=GridSearch(),
    #     verbose=True,
    #     num_runs_per_param=20,
    #     optmization_eval_func=speed_eval_func  # Still optimize for speed when accuracy is acceptable
    # )

    # # Run tuning
    # print("\nRunning layer-specific tuning...")
    # best_params = tuner_strategy2.tune()
    # print(f"Best parameters: {best_params}")
    # # Apply best parameters
    # tuned_model_strategy2 = tuner_strategy2.apply_best_params()

    # print("\n===== Tuned Model Structure (Strategy 2) =====")
    # ModelVisualizer.print_module_tree(tuned_model_strategy2)

    # # Get parameter counts after optimization
    # print("\n===== Model Parameter Counts After Strategy 2 Optimization =====")
    # tuned_params2 = model_size_info(tuned_model_strategy2)
    # print(f"Original model: {orig_params['total_params_millions']:.2f}M parameters")
    # print(f"Tuned model: {tuned_params2['total_params_millions']:.2f}M parameters")
    # print(f"Reduction: {(1 - tuned_params2['total_params_millions']/orig_params['total_params_millions'])*100:.2f}%")

    # print("\nParameters by layer:")
    # for layer_name in sorted(set(list(orig_params['layer_params'].keys()) + list(tuned_params2['layer_params'].keys()))):
    #     orig_count = orig_params['layer_params'].get(layer_name, 0) / 1e6
    #     tuned_count = tuned_params2['layer_params'].get(layer_name, 0) / 1e6

    #     if orig_count > 0 and tuned_count > 0:
    #         reduction = (1 - tuned_count/orig_count) * 100
    #         print(f"  - {layer_name}: {orig_count:.2f}M → {tuned_count:.2f}M ({reduction:.2f}% reduction)")

    # # Test the tuned model
    # print("\nEvaluating layer-specific tuned model:")
    # final_score = accuracy_eval_func(tuned_model_strategy2, val_loader, orig_model)
    # print(f"Final score: {final_score:.4f}")
    # print(f"speed score: {speed_eval_func(tuned_model_strategy2):.2f} samples/sec")
    # print(f"memory usage: {measure_memory(tuned_model_strategy2, memory_batch):.2f}MB")

    # # Strategy 3: Memory-constrained tuning
    # print("\n===== Strategy 3: Memory-constrained tuning =====")

    # # Create configs focused on memory reduction for the largest layers
    # configs_strategy3 = TuningConfigs([
    #     LayerConfig(
    #         # Find the layers with the most parameters
    #         layer_names={"pattern": ["layer4.*", "layer3.*"], "type": "Conv2d"},
    #         params={
    #             "num_terms": [1, 2, 3],
    #             "low_rank": [16, 32, 48, 64],  # Lower rank = lower memory
    #         },
    #         separate=False
    #     ),
    # ])

    # # Create a memory optimization function
    # def memory_optimization_func(model):
    #     mem_usage = measure_memory(model, memory_batch)
    #     orig_mem = measure_memory(orig_model, memory_batch)

    #     score = (orig_mem - mem_usage) / max(orig_mem, 1e-8)

    #     print(f"Memory optimization: {mem_usage:.2f}MB (original: {orig_mem:.2f}MB, reduction: {score:.2f}x)")
    #     return score

    # # Create memory-aware tuner
    # tuner_strategy3 = SKAutoTuner(
    #     model=copy.deepcopy(model),
    #     configs=configs_strategy3,
    #     accuracy_eval_func=acc_eval_func,
    #     accuracy_threshold=accuracy_threshold,
    #     search_algorithm=GridSearch(),
    #     verbose=True,
    #     num_runs_per_param=20,
    #     optmization_eval_func=memory_optimization_func  # Optimize for memory specifically
    # )

    # # Run tuning
    # print("\nRunning memory-constrained tuning...")
    # best_params = tuner_strategy3.tune()
    # print(f"Best parameters: {best_params}")
    # # Apply best parameters
    # tuned_model_strategy3 = tuner_strategy3.apply_best_params()

    # print("\n===== Tuned Model Structure (Strategy 3) =====")
    # ModelVisualizer.print_module_tree(tuned_model_strategy3)

    # # Get parameter counts after optimization
    # print("\n===== Model Parameter Counts After Strategy 3 Optimization =====")
    # tuned_params3 = model_size_info(tuned_model_strategy3)
    # print(f"Original model: {orig_params['total_params_millions']:.2f}M parameters")
    # print(f"Tuned model: {tuned_params3['total_params_millions']:.2f}M parameters")
    # print(f"Reduction: {(1 - tuned_params3['total_params_millions']/orig_params['total_params_millions'])*100:.2f}%")

    # print("\nParameters by layer:")
    # for layer_name in sorted(set(list(orig_params['layer_params'].keys()) + list(tuned_params3['layer_params'].keys()))):
    #     orig_count = orig_params['layer_params'].get(layer_name, 0) / 1e6
    #     tuned_count = tuned_params3['layer_params'].get(layer_name, 0) / 1e6

    #     if orig_count > 0 and tuned_count > 0:
    #         reduction = (1 - tuned_count/orig_count) * 100
    #         print(f"  - {layer_name}: {orig_count:.2f}M → {tuned_count:.2f}M ({reduction:.2f}% reduction)")

    # # Test the tuned model
    # print("\nEvaluating memory-optimized model:")
    # print(f"Final score: {accuracy_eval_func(tuned_model_strategy3, val_loader, orig_model):.4f}")
    # print(f"speed score: {speed_eval_func(tuned_model_strategy3):.2f} samples/sec")
    # print(f"memory usage: {measure_memory(tuned_model_strategy3, memory_batch):.2f}MB")

    # # Final comparison table for all strategies
    # print("\n===== Parameter Reduction Comparison Across Strategies =====")
    # print("| Strategy | Original Params (M) | Tuned Params (M) | Reduction (%) |")
    # print("|----------|---------------------|------------------|---------------|")
    # print(f"| Strategy 1 | {orig_params['total_params_millions']:.2f} | {tuned_params['total_params_millions']:.2f} | {(1 - tuned_params['total_params_millions']/orig_params['total_params_millions'])*100:.2f} |")
    # print(f"| Strategy 2 | {orig_params['total_params_millions']:.2f} | {tuned_params2['total_params_millions']:.2f} | {(1 - tuned_params2['total_params_millions']/orig_params['total_params_millions'])*100:.2f} |")
    # print(f"| Strategy 3 | {orig_params['total_params_millions']:.2f} | {tuned_params3['total_params_millions']:.2f} | {(1 - tuned_params3['total_params_millions']/orig_params['total_params_millions'])*100:.2f} |")

def print_original_conv_params(model):
    """Print a configuration summary for every ``nn.Conv2d`` found in ``model``.

    Walks ``model.named_modules()`` and, for each Conv2d, prints its qualified
    name, channel counts, kernel/stride/padding/dilation, group count and
    whether it has a bias term, followed by a separator line.

    Args:
        model: Module whose sub-modules are inspected.

    Returns:
        None. The report is written to stdout.
    """
    print("\n===== Original Conv2d Layer Parameters =====")

    # Lazily keep only the convolution layers, in named_modules() order.
    conv_layers = (
        (qualified_name, layer)
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, nn.Conv2d)
    )

    for qualified_name, conv in conv_layers:
        report = (
            f"Layer: {qualified_name}",
            f"  - in_channels: {conv.in_channels}",
            f"  - out_channels: {conv.out_channels}",
            f"  - kernel_size: {conv.kernel_size}",
            f"  - stride: {conv.stride}",
            f"  - padding: {conv.padding}",
            f"  - dilation: {conv.dilation}",
            f"  - groups: {conv.groups}",
            f"  - bias: {conv.bias is not None}",
            "----------------------------------------",
        )
        for line in report:
            print(line)

# Cell entry point: list the Conv2d layers of a pretrained ResNet-50, then run
# the auto-tuning experiment.
if __name__ == "__main__":
    import copy  # Used for deep copying models (test_specific_layers calls copy.deepcopy)

    # Load pre-trained CNN. This instance is only used for the Conv2d listing
    # below; test_specific_layers() loads its own ResNet-50.
    model = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V1).to(device)
    model.eval()

    # Print original Conv2d layer parameters
    print_original_conv_params(model)

    # Run the tuning experiment. Only Strategy 1 is active inside
    # test_specific_layers(); Strategies 2 and 3 are commented out there.
    print("\nRunning full test with multiple tuning strategies...")
    test_specific_layers()

    print("\nAll tests completed.")

Using device: cuda


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 195MB/s]



===== Original Conv2d Layer Parameters =====
Layer: conv1
  - in_channels: 3
  - out_channels: 64
  - kernel_size: (7, 7)
  - stride: (2, 2)
  - padding: (3, 3)
  - dilation: (1, 1)
  - groups: 1
  - bias: False
----------------------------------------
Layer: layer1.0.conv1
  - in_channels: 64
  - out_channels: 64
  - kernel_size: (1, 1)
  - stride: (1, 1)
  - padding: (0, 0)
  - dilation: (1, 1)
  - groups: 1
  - bias: False
----------------------------------------
Layer: layer1.0.conv2
  - in_channels: 64
  - out_channels: 64
  - kernel_size: (3, 3)
  - stride: (1, 1)
  - padding: (1, 1)
  - dilation: (1, 1)
  - groups: 1
  - bias: False
----------------------------------------
Layer: layer1.0.conv3
  - in_channels: 64
  - out_channels: 256
  - kernel_size: (1, 1)
  - stride: (1, 1)
  - padding: (0, 0)
  - dilation: (1, 1)
  - groups: 1
  - bias: False
----------------------------------------
Layer: layer1.0.downsample.0
  - in_channels: 64
  - out_channels: 256
  - kernel_size: (1,