In [1]:
import sys
sys.path.append("../../Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages")

In [21]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from datasets import load_dataset
import torch
import textwrap
import warnings
warnings.filterwarnings('ignore')
import random

In [22]:
model_to_use = "gpt2-large"
print("Using model: ", model_to_use)
tokenizer = GPT2TokenizerFast.from_pretrained(model_to_use)
model = GPT2LMHeadModel.from_pretrained(model_to_use,output_scores=True,pad_token_id=tokenizer.eos_token_id)

Using model:  gpt2-large


In [23]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Model, GPT2Tokenizer

layers_to_track=['transformer.h.0.attn.c_attn','transformer.h.0.attn.c_proj','transformer.h.1.attn.c_attn','transformer.h.1.attn.c_proj','transformer.h.2.attn.c_attn','transformer.h.2.attn.c_proj','transformer.h.3.attn.c_attn','transformer.h.3.attn.c_proj']

class NeuronCoverage:
    def __init__(self, model, tokenizer):
        self.model = model
        self.activations = {}
        self.tokenizer = tokenizer
        self.handles = []
        self.max_length=100

    def register_hooks(self):
        for name, module in self.model.named_modules():
            if any(layer_name in name for layer_name in layers_to_track):
                handle = module.register_forward_hook(self.create_hook(name))
                self.handles.append(handle)

    def create_hook(self, layer_name):
        def hook(module, input, output):
            # Apply ReLU activation
            activated_neurons = F.relu(output)
            # Count non-zero activations
            activated_neurons = activated_neurons > 0
            # Record activations per neuron
            self.activations[layer_name] = activated_neurons.detach().cpu().numpy()
        return hook

    def compute_coverage(self):
        coverage = {}
        for layer_name, activation in self.activations.items():
            covered_neurons = activation.sum()
            total_neurons = activation.size
            coverage[layer_name] = covered_neurons / total_neurons
        return coverage

    def remove_hooks(self):
        for handle in self.handles:
            handle.remove()

    def process_input(self, text):
        """Tokenizes input text and passes it through the model."""
        inputs = self.tokenizer(text, return_tensors="pt")
        return inputs

    def get_model_output(self, inputs, max_length=None):
        max_length = max_length or self.max_length
        input_ids = inputs['input_ids']

        with torch.no_grad():
            output_sequences = self.model.generate(
                input_ids=input_ids,
                max_length=max_length,
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                pad_token_id=self.tokenizer.eos_token_id,
                temperature=1.0,
                top_k=50,
                top_p=0.95
            )

        return output_sequences



    def decode_output(self, output_sequences):
        generated_text = self.tokenizer.decode(output_sequences[0], skip_special_tokens=True)
        return generated_text

    
    def run_coverage_analysis(self, text, apply_mutations=False):
        if apply_mutations:
            text = self.apply_mutations(text)  # Apply mutations before processing

        inputs = self.process_input(text)
        output_sequences = self.get_model_output(inputs)
        coverage = self.compute_coverage()
        decoded_text = self.decode_output(output_sequences)

        return coverage, decoded_text, text

    def apply_mutations(self, text):
        mutation_type = random.choice([self.replace_words, self.insert_words, self.reorder_words])
        mutated_text = mutation_type(text)
        return mutated_text

    def replace_words(self, text):
        words = text.split()
        if len(words) > 0:
            random_idx = random.randint(0, len(words) - 1)
            words[random_idx] = self.generate_random_word()  # Replace random word
        return " ".join(words)

    def insert_words(self, text):
        words = text.split()
        if len(words) > 0:
            random_idx = random.randint(0, len(words) - 1)
            words.insert(random_idx, self.generate_random_word())  # Insert random word
        return " ".join(words)

    def reorder_words(self, text):
        words = text.split()
        random.shuffle(words)  # Shuffle the words randomly
        return " ".join(words)

    def generate_random_word(self):
        # For simplicity, generate a random word of length 5 from the alphabet
        return ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=5))



In [33]:
coverage_tracker = NeuronCoverage(model,tokenizer)
coverage_tracker.register_hooks()

In [34]:

# Compute neuron coverage
input_text = "When in the Course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, "

# Run with mutations applied
coverage, output_text, text = coverage_tracker.run_coverage_analysis(input_text, apply_mutations=True)

for layer, cov in coverage.items():
    print(f"Layer {layer} coverage: {cov:.2%}")

print(f"\nMutated Input: {text}")
print("\nGenerated Text Output:")
print(output_text)


coverage_tracker.remove_hooks()

Layer transformer.h.0.attn.c_attn coverage: 49.19%
Layer transformer.h.0.attn.c_proj coverage: 50.94%
Layer transformer.h.1.attn.c_attn coverage: 48.49%
Layer transformer.h.1.attn.c_proj coverage: 50.78%
Layer transformer.h.2.attn.c_attn coverage: 49.43%
Layer transformer.h.2.attn.c_proj coverage: 47.58%
Layer transformer.h.3.attn.c_attn coverage: 51.77%
Layer transformer.h.3.attn.c_proj coverage: 52.42%

Mutated Input: them When have it for connected of Course political necessary the which the one to people becomes dissolve human with bands in another, events,

Generated Text Output:
them When have it for connected of Course political necessary the which the one to people becomes dissolve human with bands in another, events, and the other to the people, the same, or the opposite, to be the cause of the former, which is the case with the present, is not the reason why the first to them is dissolved, but the fact that the latter is in the power of those who are in power, that is, of the

In [9]:
coverage_tracker.remove_hooks()