# Implementation of KANBoost
## Initialisations



In [None]:
import torch
import numpy as np
from torch import nn

# Define the Echo State Network (ESN) class
class EchoStateNetwork(nn.Module):
    """Stateful Echo State Network: fixed random reservoir + trainable linear readout.

    Each call to ``forward`` consumes one un-batched input vector of length
    ``input_dim``, advances the internal reservoir state by one step and
    returns the readout of the new state (length ``output_dim``).

    Only ``Wout`` is trainable. ``Win`` and ``W`` are frozen:
      * ``Win`` is a persistent buffer, so it is saved in ``state_dict`` and
        follows ``.to(device)``.
      * ``W`` is a ``Parameter`` with ``requires_grad=False`` (its
        ``state_dict`` key is unchanged).
      * ``reservoir_state`` is a non-persistent buffer: it follows the module
        across devices but is not written to checkpoints.

    Args:
        input_dim: Length of each input vector.
        reservoir_size: Number of reservoir units.
        output_dim: Length of the readout vector.
        spectral_radius: Target spectral radius of the reservoir matrix ``W``.
    """

    def __init__(self, input_dim, reservoir_size, output_dim, spectral_radius=0.95):
        super().__init__()
        self.input_dim = input_dim
        self.reservoir_size = reservoir_size
        self.output_dim = output_dim

        # Fixed input -> reservoir weights.
        self.register_buffer("Win", torch.randn(reservoir_size, input_dim) * 0.1)

        # Fixed reservoir weights, rescaled so the largest eigenvalue
        # magnitude equals `spectral_radius`.
        W = torch.randn(reservoir_size, reservoir_size) * 0.1
        max_eigenvalue = torch.linalg.eigvals(W).abs().max()
        # requires_grad must be passed to nn.Parameter itself; passing it to
        # the wrapped tensor is ignored and leaves the reservoir trainable.
        self.W = nn.Parameter(W * (spectral_radius / max_eigenvalue), requires_grad=False)

        # Readout weights (trainable)
        self.Wout = nn.Linear(reservoir_size, output_dim)

        # Reservoir state carried from one forward() call to the next.
        self.register_buffer("reservoir_state", torch.zeros(reservoir_size), persistent=False)

    def forward(self, x):
        """Advance the reservoir by one step with input ``x`` and return the readout."""
        new_state = torch.tanh(
            torch.matmul(self.Win, x) + torch.matmul(self.W, self.reservoir_state)
        )

        # Store a detached copy so the next step never tries to back-propagate
        # through a graph that an earlier backward() call already freed.
        self.reservoir_state = new_state.detach()

        return self.Wout(new_state)

# Integrate the ESN into the pipeline

def train_esn(train_inputs, train_labels, input_dim, output_dim, reservoir_size=100, epochs=100):
    """Fit a fresh EchoStateNetwork one sample at a time and return it.

    For every epoch the samples are visited in index order; each sample gets
    its own forward pass, MSE loss, backward pass and Adam step (lr=0.001).
    The summed loss of the epoch is printed after each epoch.

    Args:
        train_inputs: Tensor indexed along dim 0, one input per index.
        train_labels: Targets indexed the same way as ``train_inputs``.
        input_dim: Passed to ``EchoStateNetwork``.
        output_dim: Passed to ``EchoStateNetwork``.
        reservoir_size: Passed to ``EchoStateNetwork``.
        epochs: Number of passes over the data.

    Returns:
        The trained ``EchoStateNetwork`` instance.
    """
    model = EchoStateNetwork(input_dim, reservoir_size, output_dim)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        total_loss = 0.0

        for sample_idx in range(train_inputs.size(0)):
            # Forward pass and loss for a single sample
            prediction = model(train_inputs[sample_idx])
            loss = criterion(prediction, train_labels[sample_idx])

            # One optimizer step per sample
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

    return model

# Example usage with preprocessed data


In [None]:
# Mount Google Drive at /content/drive (Colab-only module); the trace file
# read later in this notebook is stored under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pykan

Collecting pykan
  Downloading pykan-0.2.8-py3-none-any.whl.metadata (11 kB)
Downloading pykan-0.2.8-py3-none-any.whl (78 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/78.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.1/78.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pykan
Successfully installed pykan-0.2.8


# Sample inputs

> Add blockquote



In [None]:

import torch
from sklearn.model_selection import train_test_split

# Use the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("****** DEVICE: ",device , " ******\n")

****** DEVICE:  cuda  ******



# KANBoost Preprocessor Class Implementation

In [None]:
class Preprocessor:
    """Turns a load-address trace into block-delta features and labels.

    A 48-bit address is treated as three consecutive bit fields:
    ``page`` (the top ``page_size`` bits), ``block`` (the next ``block_size``
    bits) and ``block_offset`` (whatever remains).

    Args:
        page_size: Number of leading bits that form the page field.
        block_size: Number of bits that form the block field.
    """

    # Offset added to every delta so that features and labels are non-negative.
    DELTA_BIAS = 64
    # Number of history deltas emitted per sample.
    NUM_DELTAS = 5

    def __init__(self, page_size, block_size):
        self.page_size = page_size
        self.block_size = block_size

    def ensure_48bit_address(self, load_address):
        """Return the hex string ``load_address`` as a binary string left-padded to 48 bits."""
        return bin(int(load_address, 16))[2:].zfill(48)

    def calculate_delta(self, block1, block2):
        """Return ``block1 - block2`` where both are binary strings."""
        return int(block1, 2) - int(block2, 2)

    def split_load_address(self, load_address):
        """Split a hex address into ``(page, block, block_offset)`` binary strings."""
        binary_address = self.ensure_48bit_address(load_address)
        block_end = self.page_size + self.block_size

        page = binary_address[:self.page_size]
        block = binary_address[self.page_size:block_end]
        block_offset = binary_address[block_end:]

        return (page, block, block_offset)

    def unsplit_load_address(self, load_address, block_delta):
        """Return ``load_address`` with its block field shifted by ``block_delta``.

        The shifted block is clamped to ``[0, 2**block_size - 1]`` so the
        result always stays inside the original page. Without the upper clamp
        an overflowing block would need more than ``block_size`` bits and
        would push the page bits out of position.

        Args:
            load_address: Address as a hex string.
            block_delta: Signed number of blocks to add to the block field.

        Returns:
            The new address as a lowercase hex string without the ``0x``
            prefix (not zero-padded).
        """
        current_page, current_block, current_block_offset = self.split_load_address(load_address)

        max_block = (1 << self.block_size) - 1
        adjusted_block_int = int(current_block, 2) + block_delta
        adjusted_block_int = min(max(adjusted_block_int, 0), max_block)
        adjusted_block = bin(adjusted_block_int)[2:].zfill(self.block_size)

        reconstructed_binary_address = current_page + adjusted_block + current_block_offset

        # hex() already yields lowercase digits; strip the '0x' prefix.
        return hex(int(reconstructed_binary_address, 2))[2:].lower()

    def _history_deltas(self, history):
        """Return the ``NUM_DELTAS`` most recent biased block deltas of one page.

        Entry ``k`` is ``DELTA_BIAS + (history[-1-k] - history[-2-k])``.
        Positions for which the history is still too short keep the default
        ``1 + DELTA_BIAS``.
        """
        deltas = [1 + self.DELTA_BIAS] * self.NUM_DELTAS
        for k in range(min(self.NUM_DELTAS, len(history) - 1)):
            deltas[k] = self.DELTA_BIAS + self.calculate_delta(history[-1 - k], history[-2 - k])
        return deltas

    def preprocess_data(self, data):
        """Build model inputs and labels from consecutive trace records.

        Every record except the last one produces one sample; the last record
        is only used as the "next" access of its predecessor.

        Args:
            data: Sequence of 5-tuples
                ``(instr_id, cycle_count, load_address, instr_ptr, llc_hit_miss)``
                where ``load_address`` is a hex string.

        Returns:
            A tuple ``(input_features, output_labels, preprocessed_details)``:
              * ``input_features``: tuples
                ``(instr_id, load_address, delta1, ..., delta5)`` where the
                deltas are the biased block deltas previously seen on the
                current page, most recent first.
              * ``output_labels``: ``(next_block - current_block) + 64`` as a
                plain integer class index (not one-hot). The delta is taken
                between consecutive records regardless of whether they share
                a page.
              * ``preprocessed_details``: one dict per sample with
                ``instr_id``, ``page``, ``block`` and ``block_offset``.
        """
        input_features = []
        output_labels = []
        page_blocks = {}
        preprocessed_details = []

        for i in range(len(data) - 1):
            instr_id, _, load_address, _, _ = data[i]
            current_page, current_block, current_block_offset = self.split_load_address(load_address)

            _, _, next_load_address, _, _ = data[i + 1]
            _, next_block, _ = self.split_load_address(next_load_address)

            # Every page history starts with a sentinel block of value 1.
            history = page_blocks.setdefault(current_page, ['000001'])

            delta1, delta2, delta3, delta4, delta5 = self._history_deltas(history)

            # Delta to the next access, relative to the current block.
            next_delta = self.calculate_delta(next_block, current_block)

            input_features.append((instr_id, load_address, delta1, delta2, delta3, delta4, delta5))

            # Class index: the delta shifted by the bias so that it is non-negative.
            output_labels.append(next_delta + self.DELTA_BIAS)

            # Record the current block only after its features were computed.
            history.append(current_block)

            preprocessed_details.append({
                "instr_id": instr_id,
                "page": current_page,
                "block": current_block,
                "block_offset": current_block_offset
            })

        return input_features, output_labels, preprocessed_details

# Read Data from file Implementation

In [None]:
# Reading the data from the text file (same as before)
def read_data_from_file(filename):
    """Parse a comma-separated trace file into a list of record tuples.

    Each non-blank line must hold at least five comma-separated fields:
    ``instr_id, cycle_count, load_address, instr_ptr, llc_hit_miss``.
    Whitespace around fields is ignored. Blank and whitespace-only lines are
    skipped whatever their line ending (``\\n``, ``\\r\\n`` or ``\\r``).

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A list of ``(instr_id, cycle_count, load_address, instr_ptr,
        llc_hit_miss)`` tuples where ``instr_id``, ``cycle_count`` and
        ``llc_hit_miss`` are ints and the two address fields are kept as the
        strings found in the file.

    Raises:
        ValueError: If a numeric field cannot be parsed as an int.
        IndexError: If a non-blank line has fewer than five fields.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # strip() also removes a stray '\r', so a blank CRLF line is
            # skipped instead of reaching int('').
            line = line.strip()
            if not line:
                continue

            fields = [x.strip() for x in line.split(',')]
            instr_id = int(fields[0])
            cycle_count = int(fields[1])
            load_address = fields[2]
            instr_ptr = fields[3]
            llc_hit_miss = int(fields[4])

            data.append((instr_id, cycle_count, load_address, instr_ptr, llc_hit_miss))
    return data


# Prepare Dataset Function Implementation

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

def prepare_dataset(input_features, output_labels, batch_size=1, device='cpu'):
    """
    Converts the full feature/label lists to tensors (no train-test split), preserving sample order.

    Row ``k`` of the returned ``'input'`` tensor, element ``k`` of ``'label'``
    and element ``k`` of ``'instr_ids'`` all describe ``input_features[k]``.
    The rows are deliberately NOT shuffled: the instruction IDs are returned
    in input order, so shuffling the tensors would break that correspondence.

    Args:
        input_features (list): Tuples ``(instr_id, load_address, feature1, ..., featureN)``.
            The first element is returned in ``'instr_ids'``, the second is
            dropped, and the remaining ones become the feature row.
        output_labels (list or np.array): Integer label for each sample.
        batch_size (int): Unused; kept so existing callers keep working.
        device (str or torch.device): Device on which the tensors are created. Default is 'cpu'.

    Returns:
        dict: ``'input'`` (float32 tensor), ``'label'`` (int64 tensor) and
        ``'instr_ids'`` (plain Python list, left off-device).
    """
    instr_ids = [row[0] for row in input_features]
    features = [row[2:] for row in input_features]

    # Build the tensors directly on the target device. Concatenating
    # batch-of-1 loader outputs would be quadratic in the number of samples.
    all_inputs = torch.tensor(features, dtype=torch.float32, device=device)
    all_labels = torch.tensor(output_labels, dtype=torch.long, device=device)

    return {
        'input': all_inputs,
        'label': all_labels,
        'instr_ids': instr_ids,
    }


  # ------------------------****** Load Dataset for Training only ******  ------------------------
def load_dataset(data, target, device=None, train_fraction=0.8):
    """Split features and labels into ordered train/test tensors.

    The split is deterministic: the first ``train_fraction`` of the samples
    (rounded down) form the training set and the remainder the test set.
    Sample order is preserved and any number of feature columns is accepted.

    Args:
        data: Nested sequence of numeric feature rows; converted to float32.
        target: Sequence of integer labels; converted to int64.
        device: Device for the returned tensors. When ``None``, CUDA is used
            if available, otherwise the CPU.
        train_fraction: Fraction of samples placed in the training split.

    Returns:
        dict with the keys ``'train_input'``, ``'test_input'``,
        ``'train_label'`` and ``'test_label'``.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Features must be float32 for the model; labels int64 for the loss.
    data_tensor = torch.tensor(data, dtype=torch.float32)
    target_tensor = torch.tensor(target, dtype=torch.long)

    split_index = int(len(data_tensor) * train_fraction)

    # Slice and move each split in one shot. Concatenating batch-of-1 loader
    # outputs would copy the growing tensor once per sample.
    dataset = {}
    dataset['train_input'] = data_tensor[:split_index].to(device)
    dataset['test_input'] = data_tensor[split_index:].to(device)
    dataset['train_label'] = target_tensor[:split_index].to(device)
    dataset['test_label'] = target_tensor[split_index:].to(device)

    return dataset

# KANBoost Entry Point:

In [None]:
# Load the raw trace records from the mounted Drive folder.
filename = '/content/drive/MyDrive/482.sphinx3-s2.trace.gz-hashed_perceptron-no-no-no-trace-lru-1core.txt'
data = read_data_from_file(filename)

# Address layout handed to the Preprocessor: the top 36 bits are the page and
# the next 6 bits the block; the rest of the 48-bit address is the offset.
page_size, block_size = 36, 6
preprocessor = Preprocessor(page_size=page_size, block_size=block_size)

# Run the whole trace through the preprocessor, keeping the per-sample details.
input_features, output_labels, preprocessed_details = preprocessor.preprocess_data(data)
# dataset=prepare_dataset(input_features=input_features,output_labels=output_labels,device=device)

In [None]:
input_features[0]

(3, '5cb41dc41440', 65, 65, 65, 65, 65)

In [None]:
# Keep only the last five entries of every feature tuple (the five delta
# values), dropping the leading instr_id and load_address.
last_5_elements = [t[-5:] for t in input_features]

In [None]:
# Build the train/test tensors from the five-delta rows and their labels.
dataset=load_dataset(last_5_elements,output_labels)

In [None]:
# Report the shape of every split tensor, in the order train/test data/target.
for message, key in (
    ("Train data shape: {}", 'train_input'),
    ("Train target shape: {}", 'train_label'),
    ("Test data shape: {}", 'test_input'),
    ("Test target shape: {}", 'test_label'),
):
    print(message.format(dataset[key].shape))


Train data shape: torch.Size([234214, 5])
Train target shape: torch.Size([234214])
Test data shape: torch.Size([58554, 5])
Test target shape: torch.Size([58554])


## Creating and Training the Echo state network

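*   Define the batched `EchoStateNetwork` model (fixed random reservoir, trainable linear readout).
*   Train it on the delta features with `train_model` and save the weights to `trained_esn_model.pth`.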



In [None]:
class EchoStateNetwork(torch.nn.Module):
    """Batched Echo State Network: fixed random reservoir + trainable linear readout.

    ``Win`` is registered as a persistent buffer, so it is stored in
    ``state_dict`` and a reloaded model reproduces the trained model's
    outputs. Checkpoints written before ``Win`` was part of the state dict
    lack that key and must be regenerated, or loaded with ``strict=False``.
    ``W`` is a ``Parameter`` with ``requires_grad=False``; only ``Wout`` is
    trained. Move the module with ``.to(device)``; inputs must live on the
    same device as the module.

    NOTE(review): ``forward`` starts from an all-zero reservoir state on every
    call, so the ``reservoir_state @ W.T`` term is always zero and ``W`` does
    not influence the output. The term is kept so the forward pass keeps its
    original form.

    Args:
        input_dim: Number of features per sample.
        reservoir_size: Number of reservoir units.
        output_dim: Number of readout outputs.
        spectral_radius: Target spectral radius of the reservoir matrix ``W``.
    """

    def __init__(self, input_dim, reservoir_size, output_dim, spectral_radius=0.95):
        super().__init__()
        self.input_dim = input_dim
        self.reservoir_size = reservoir_size
        self.output_dim = output_dim

        # Fixed input -> reservoir weights (saved with the model, moved by .to()).
        self.register_buffer("Win", torch.randn(reservoir_size, input_dim) * 0.1)

        # Fixed reservoir weights. The spectral radius is the largest
        # eigenvalue *magnitude*; using only the real part would ignore
        # complex eigenvalues and mis-scale the matrix.
        W = torch.randn(reservoir_size, reservoir_size) * 0.1
        max_eigenvalue = torch.linalg.eigvals(W).abs().max()
        self.W = torch.nn.Parameter(W * (spectral_radius / max_eigenvalue), requires_grad=False)

        # Readout weights (trainable)
        self.Wout = torch.nn.Linear(reservoir_size, output_dim)

    def forward(self, x):
        """Map a batch ``x`` of shape ``(batch, input_dim)`` to ``(batch, output_dim)``."""
        batch_size = x.shape[0]

        # The reservoir state is reset for every batch.
        reservoir_state = torch.zeros(batch_size, self.reservoir_size, device=x.device)

        # Single reservoir update step
        reservoir_state = torch.tanh(
            torch.matmul(x, self.Win.T) + torch.matmul(reservoir_state, self.W.T)
        )

        return self.Wout(reservoir_state)


In [None]:
def clear_gpu_memory():
    """Release cached CUDA memory and, when a GPU is present, wait for pending work."""
    torch.cuda.empty_cache()

    gpu_present = torch.cuda.is_available()
    if gpu_present:
        # Block until every stream on the current CUDA device has finished.
        torch.cuda.synchronize()

    print("GPU memory cleared.")


clear_gpu_memory()

GPU memory cleared.


In [None]:
def Train_KANBoost(dataset, model, optimizer, criterion):
    """Run one optimisation step on the train split and print accuracy and loss.

    Args:
        dataset: dict with the tensors ``'train_input'``, ``'train_label'``,
            ``'test_input'`` and ``'test_label'``.
        model: Module mapping an input batch to per-class scores.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss called as ``criterion(scores, labels)``.

    Returns:
        None. Train accuracy, test accuracy (both measured after the step)
        and the pre-step loss are printed.
    """
    def accuracy(input_key, label_key):
        # Fraction of samples whose arg-max class equals the label.
        with torch.no_grad():
            predicted = model(dataset[input_key]).argmax(dim=1)
            hits = (predicted == dataset[label_key]).float()
            return hits.mean()

    # Single gradient step on the training batch
    optimizer.zero_grad()
    loss = criterion(model(dataset['train_input']), dataset['train_label'])
    loss.backward()
    optimizer.step()

    # Metrics are computed with the updated weights
    train_accuracy = accuracy('train_input', 'train_label')
    test_accuracy = accuracy('test_input', 'test_label')

    print(f"Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}, Loss: {loss.item():.4f}")

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

def _repeat_batches(loader, empty_batch):
    """Yield batches from ``loader`` forever, restarting it whenever it is exhausted.

    If ``loader`` produces no batches at all, ``empty_batch`` is yielded instead.
    """
    while True:
        produced = False
        for batch in loader:
            produced = True
            yield batch
        if not produced:
            yield empty_batch


def train_model(dataset, num_epochs=10, input_dim=None, output_dim=None, reservoir_size=100,
                batch_size=5000, save_path="trained_esn_model.pth"):
    """Train an EchoStateNetwork on ``dataset`` and save its state dict.

    Every training batch is used in every epoch. Each one is paired with a
    test batch only so that ``Train_KANBoost`` can report a test accuracy;
    the test loader is restarted whenever it runs out, because it usually
    holds fewer batches than the training loader. Zipping the two loaders
    directly would stop each epoch as soon as the shorter one is exhausted
    and silently skip the remaining training batches.

    Args:
        dataset: dict with ``'train_input'``, ``'train_label'``,
            ``'test_input'`` and ``'test_label'`` tensors.
        num_epochs: Number of passes over the training data.
        input_dim: Feature count; inferred from ``dataset['train_input']``
            when ``None``.
        output_dim: Number of classes (required).
        reservoir_size: Number of reservoir units.
        batch_size: Batch size of both loaders.
        save_path: File the trained ``state_dict`` is written to.

    Returns:
        The trained model (earlier callers that ignore the return value are
        unaffected).

    Raises:
        ValueError: If ``output_dim`` is ``None``.
    """
    if output_dim is None:
        raise ValueError("output_dim must be specified")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the full tensors once; batches drawn from them are already on `device`.
    train_input = dataset['train_input'].to(device)
    train_label = dataset['train_label'].to(device)
    test_input = dataset['test_input'].to(device)
    test_label = dataset['test_label'].to(device)

    if input_dim is None:
        input_dim = train_input.shape[1]

    # Order is kept (shuffle=False) for both splits.
    train_loader = DataLoader(TensorDataset(train_input, train_label),
                              batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(TensorDataset(test_input, test_label),
                             batch_size=batch_size, shuffle=False)

    esn_model = EchoStateNetwork(input_dim, reservoir_size, output_dim).to(device)
    optimizer = torch.optim.Adam(esn_model.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        test_batches = _repeat_batches(test_loader, (test_input, test_label))
        for (train_inputs, train_targets), (test_inputs, test_targets) in zip(train_loader, test_batches):
            partial_dataset = {
                "train_input": train_inputs,
                "train_label": train_targets,
                "test_input": test_inputs,
                "test_label": test_targets
            }
            Train_KANBoost(partial_dataset, esn_model, optimizer, criterion)

    # Save the trained model
    torch.save(esn_model.state_dict(), save_path)

    return esn_model

In [None]:
# Feature count is read from the training tensor's second dimension.
input_dim = dataset['train_input'].shape[1]
# output_dim=128 is the number of label classes passed to the readout layer;
# labels are produced by the preprocessor as (block delta + 64).
train_model(dataset, num_epochs=10, input_dim=input_dim, output_dim=128, reservoir_size=200)
print("Model saved to trained_esn_model.pth")

Train Accuracy: 0.0080, Test Accuracy: 0.0048, Loss: 4.9767
Train Accuracy: 0.0048, Test Accuracy: 0.0062, Loss: 4.7657
Train Accuracy: 0.2332, Test Accuracy: 0.0672, Loss: 4.5726
Train Accuracy: 0.2622, Test Accuracy: 0.1126, Loss: 4.3867
Train Accuracy: 0.2624, Test Accuracy: 0.4144, Loss: 4.2046
Train Accuracy: 0.2688, Test Accuracy: 0.4166, Loss: 3.9978
Train Accuracy: 0.1146, Test Accuracy: 0.4150, Loss: 4.3908
Train Accuracy: 0.0936, Test Accuracy: 0.3866, Loss: 3.8117
Train Accuracy: 0.2654, Test Accuracy: 0.3294, Loss: 3.6886
Train Accuracy: 0.4302, Test Accuracy: 0.2476, Loss: 3.0661
Train Accuracy: 0.4286, Test Accuracy: 0.2670, Loss: 2.9444
Train Accuracy: 0.4132, Test Accuracy: 0.2639, Loss: 2.8844
Train Accuracy: 0.2246, Test Accuracy: 0.2684, Loss: 3.6358
Train Accuracy: 0.2744, Test Accuracy: 0.1866, Loss: 3.2633
Train Accuracy: 0.2710, Test Accuracy: 0.0808, Loss: 3.3004
Train Accuracy: 0.2682, Test Accuracy: 0.1194, Loss: 3.3396
Train Accuracy: 0.2612, Test Accuracy: 0

In [None]:
cp -r /content/model /content/drive/MyDrive/kanboost3_24

# Prefetch File Generation


In [None]:
# Rebind `dataset` to the un-split tensors of the whole trace; from here on it
# has the keys 'input', 'label' and 'instr_ids' instead of the train/test keys.
dataset=prepare_dataset(input_features=input_features,output_labels=output_labels,device=device)

In [None]:
dataset['input'][0:5]

tensor([[65., 65., 65., 65., 65.],
        [65., 65., 65., 65., 65.],
        [66., 63., 50., 63., 63.],
        [65., 65., 65., 40., 40.],
        [65., 65., 65., 65., 65.]], device='cuda:0')

In [None]:
# Must match the values the checkpoint was trained with.
output_dim = 128
reservoir_size = 200

# Pick the device first so the checkpoint can be mapped onto it while loading.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

esn_model = EchoStateNetwork(input_dim, reservoir_size, output_dim)

# map_location lets a checkpoint saved on a GPU load on a CPU-only host.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs, and avoids torch.load's FutureWarning.
esn_model.load_state_dict(
    torch.load("trained_esn_model.pth", map_location=device, weights_only=True)
)

esn_model = esn_model.to(device)

# Set the model to evaluation mode
esn_model.eval()



  esn_model.load_state_dict(torch.load("trained_esn_model.pth"))


EchoStateNetwork(
  (Wout): Linear(in_features=200, out_features=128, bias=True)
)

In [None]:
!ls

0.7_cache_data.zip  0.7_state	   drive	kanboost  sample_data
0.7_config.yml	    0.7_state.zip  history.txt	model


In [None]:
!cd "model"
!ls

drive  model  sample_data


In [None]:

def generate_prefetch_file(path, prefetches):
    """Write one ``<instr_id> <prefetch_address>`` line per entry of ``prefetches`` to ``path``.

    Args:
        path: Output file; created or truncated.
        prefetches: Iterable of ``(instr_id, pf_addr)`` pairs.
    """
    with open(path, 'w') as out:
        out.writelines(
            f"{instr_id!s} {pf_addr!s}\n" for instr_id, pf_addr in prefetches
        )
def prefetch_generation(batch_size=5000):
    """Run the trained model over the whole trace and build the prefetch list.

    Reads the notebook globals ``input_features``, ``dataset``, ``esn_model``
    and ``preprocessor``. ``dataset['input']`` is processed in slices of
    ``batch_size`` rows. For every row the predicted class is turned back into
    a block delta (class - 64) and applied to that sample's load address.

    NOTE: row ``k`` of ``dataset['input']`` is assumed to describe
    ``input_features[k]``; the tensor must therefore be in trace order.

    Args:
        batch_size: Number of samples per inference batch.

    Returns:
        ``(avg_inference_time_per_sample_ns, prefetches)`` where
        ``prefetches`` is a list of ``(instr_id, prefetch_address)`` tuples.
        The average is 0 when there are no samples.
    """
    import time

    total_samples = len(input_features)
    total_inference_time = 0.0
    prefetches = []

    inputs = dataset['input']
    # CUDA kernels run asynchronously: without a synchronize the timer would
    # only measure the kernel launches, not the computation.
    needs_sync = inputs.is_cuda

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        for start in range(0, total_samples, batch_size):
            batch_inputs = inputs[start:start + batch_size]

            if needs_sync:
                torch.cuda.synchronize()
            start_time = time.perf_counter()
            pred = torch.argmax(esn_model(batch_inputs), dim=1)
            if needs_sync:
                torch.cuda.synchronize()
            total_inference_time += time.perf_counter() - start_time

            # One host transfer per batch instead of one .item() per sample.
            predicted_classes = pred.tolist()
            batch_features = input_features[start:start + batch_size]
            for (instr_id, load_address, *_), predicted_class in zip(batch_features, predicted_classes):
                load_addr = preprocessor.unsplit_load_address(load_address, int(predicted_class) - 64)
                prefetches.append((instr_id, load_addr))

    # Calculate average inference time per sample in nanoseconds
    avg_inference_time_per_sample_ns = (total_inference_time / total_samples) * 1e9 if total_samples > 0 else 0
    print(f"Average Inference Time per Sample: {avg_inference_time_per_sample_ns:.2f} nanoseconds")

    return avg_inference_time_per_sample_ns, prefetches


In [None]:
# The empty list is only a placeholder; it is overwritten by the call below.
prefetches=[]
# Predict a prefetch address for every sample, then write them out as
# "<instr_id> <address>" lines.
avgtime,prefetches=prefetch_generation()
generate_prefetch_file('prefetch_1M_model_v0.5.txt', prefetches)

Average Inference Time per Sample: 134.50 nanoseconds


In [None]:
len(prefetches),len(input_features)

(260492, 260492)