In [48]:
from trees import Tree
from wrappers import GPTneoX_DenseWrapper, ActivationWrapper

import torch
import numpy as np
import umap
import plotly.graph_objects as go

In [49]:
# Model identifier passed to the project's ActivationWrapper
# (looks like a Hugging Face hub id -- confirm against wrappers.py).
model = "EleutherAI/pythia-1.4b-deduped"
wrapper = ActivationWrapper(model)

# NOTE(review): -1 presumably selects the last layer via negative indexing -- confirm.
layer_num = -1
# Wrapper around the 'mlp' part of the chosen layer; `layer.batch_activations`
# is called on it in the sequential generation cell further down.
layer = wrapper.make_layer_wrapper(layer_num, 'mlp')

In [50]:
# Vocabulary mapping as returned by the wrapper, plus its inverse
# (values become keys; on duplicate values the later entry wins).
vocab_dict = wrapper.get_vocab()
reversed_dict = dict(zip(vocab_dict.values(), vocab_dict.keys()))

In [51]:
class TreeNode:
    """
    Minimal n-ary tree node.

    Each node stores a `value`, a dict of arbitrary keyword attributes
    (`attrs`) and an ordered list of `children`.
    """

    def __init__(self, value, **attrs):
        self.children = []
        self.attrs = attrs  # every extra keyword argument is kept here
        self.value = value

    def add_child(self, child):
        """Append `child` as the last child of this node."""
        self.children.append(child)

    def __repr__(self, level=0):
        """
        Render the subtree as indented text, one node per line.

        `level` is the nesting depth of this node; each level adds two
        spaces of indentation. Attributes are shown only when non-empty.
        """
        pad = "  " * level
        suffix = f" {self.attrs!r}" if self.attrs else ""
        pieces = [f"{pad}{self.value!r}{suffix}\n"]
        pieces.extend(kid.__repr__(level + 1) for kid in self.children)
        return "".join(pieces)

def nodes_at_depth(root, target_depth):
    """
    Collect the nodes that sit exactly `target_depth` levels below `root`.

    `root` itself is depth 0. Nodes are returned in left-to-right order;
    the list is empty when the tree is shallower than `target_depth`.
    """
    if target_depth == 0:
        return [root]
    # Flatten the per-child results, each searched one level shallower.
    return [
        hit
        for child in root.children
        for hit in nodes_at_depth(child, target_depth - 1)
    ]

def chunked(lst, n):
    """Lazily yield consecutive slices of `lst`, each at most `n` items long."""
    yield from (lst[start : start + n] for start in range(0, len(lst), n))

def get_leaves(node):
    """
    Gather every childless node in the subtree rooted at `node`.

    Leaves come back in left-to-right (depth-first) order; a node with no
    children is its own single leaf.
    """
    leaves = []
    pending = [node]
    while pending:
        current = pending.pop()
        if current.children:
            # Push in reverse so the leftmost child is popped first.
            pending.extend(reversed(current.children))
        else:
            leaves.append(current)
    return leaves

def find_path(root, target, path=None):
    """
    Depth-first search for `target` (matched by identity) under `root`.

    Returns the list of nodes from `root` down to `target` inclusive, or
    None when `target` is not in the subtree. `path` is the prefix of
    ancestors accumulated so far; it is never mutated.
    """
    trail = (path or []) + [root]
    if root is target:
        return trail
    # Lazy: children are searched left to right and the search stops at the first hit.
    attempts = (find_path(kid, target, trail) for kid in root.children)
    return next((hit for hit in attempts if hit), None)



In [52]:
# Seed prompt(s): every generation cell below starts from this list.
names = ['Lymph nodes supply']

## Top 2 Probs

In [54]:
# Build a tree of continuations: a synthetic root, one child per seed prompt,
# then one extra level per generated token.
hold_sens = names

root = TreeNode("root", prob=1.0)

for sen in hold_sens:
    root.add_child(TreeNode("leaf", string=sen, prob=1.0))

num_tok_samples = 2 #Next token Samples per input
num_gens = 5 #How many tokens to generate in total

for i in range(num_gens):
    # The nodes added in the previous round (depth i+1) are the ones extended now.
    working_nodes = nodes_at_depth(root, i+1)
    thing, new_sens, prob = wrapper.generate_and_prepare_top2(hold_sens)
    chunks = list(chunked(new_sens, num_tok_samples))
    chunks2 = list(chunked(prob, num_tok_samples))

    # Each parent must own exactly one chunk of continuations; fail loudly
    # instead of attaching children to the wrong parent or dropping some.
    if len(new_sens) != len(prob) or len(chunks) != len(working_nodes):
        raise ValueError(
            f"generation step {i}: got {len(new_sens)} sentences and {len(prob)} probs "
            f"for {len(working_nodes)} parent nodes "
            f"(expected {num_tok_samples} per parent)"
        )

    for node, sen_chunk, prob_chunk in zip(working_nodes, chunks, chunks2):
        for val, val_prob in zip(sen_chunk, prob_chunk):
            node.add_child(TreeNode('leaf', string=val, prob=val_prob))

    # The new continuations become the inputs of the next round.
    hold_sens = new_sens

print(root)

'root' {'prob': 1.0}
  'leaf' {'string': 'Lymph nodes supply', 'prob': 1.0}
    'leaf' {'string': 'Lymph nodes supply the', 'prob': 0.8191255331039429}
      'leaf' {'string': 'Lymph nodes supply the blood', 'prob': 0.6611375212669373}
        'leaf' {'string': 'Lymph nodes supply the blood supply', 'prob': 0.550592303276062}
          'leaf' {'string': 'Lymph nodes supply the blood supply to', 'prob': 0.7233086824417114}
            'leaf' {'string': 'Lymph nodes supply the blood supply to the', 'prob': 0.9424824118614197}
            'leaf' {'string': 'Lymph nodes supply the blood supply to organs', 'prob': 0.05751751735806465}
          'leaf' {'string': 'Lymph nodes supply the blood supply for', 'prob': 0.2766912877559662}
            'leaf' {'string': 'Lymph nodes supply the blood supply for the', 'prob': 0.9332081079483032}
            'leaf' {'string': 'Lymph nodes supply the blood supply for all', 'prob': 0.06679186969995499}
        'leaf' {'string': 'Lymph nodes supply the bl

In [56]:
# Score every finished continuation (tree leaf) with the product of the
# `prob` attributes found along its root-to-leaf path.
tree_top = get_leaves(root)

tree_top_sens = []
cumulative_probs = []

for n in tree_top:
    phrase = n.attrs['string']
    tree_top_sens.append(phrase)
    path = find_path(root, n)
    # Two blank lines, then the sentence.
    print('\n', phrase, sep='\n')
    prob = 1.0
    for node in path:
        temp = node.attrs.get("prob")
        prob *= temp
    print(prob)
    cumulative_probs.append(prob)



Lymph nodes supply the blood supply to the
0.2032681514189083


Lymph nodes supply the blood supply to organs
0.012404984199639217


Lymph nodes supply the blood supply for the
0.07699214033709735


Lymph nodes supply the blood supply for all
0.005510506136323592


Lymph nodes supply the bloodstream with nutrients
0.14735251467855198


Lymph nodes supply the bloodstream with the
0.07181456299399198


Lymph nodes supply the bloodstream to the
0.02210095978200937


Lymph nodes supply the bloodstream to organs
0.002110812571720757


Lymph nodes supply the first line of defense
0.21533768093413402


Lymph nodes supply the first line of defence
0.04324566385530514


Lymph nodes supply the first line defense against
0.001415420891898317


Lymph nodes supply the first line defense to
0.00014895253553617367


Lymph nodes supply the first contact with the
0.009645884114215127


Lymph nodes supply the first contact with blood
0.00043258056600601604


Lymph nodes supply the first contact betwee

## All Probs

In [None]:
# Build a tree of sampled continuations: a synthetic root, one child per seed
# prompt, then one extra level per generated token. Each child records the
# probability and entropy reported by the wrapper for that step.
hold_sens = names

root = TreeNode("root", prob=1.0)

for sen in hold_sens:
    root.add_child(TreeNode("leaf", string=sen, prob=1.0))

num_tok_samples = 1 #Next token Samples per input
num_gens = 5 #How many tokens to generate in total
temperature = 0.5 #Controls the Variability in Outputs

for i in range(num_gens):
    # The nodes added in the previous round (depth i+1) are the ones extended now.
    working_nodes = nodes_at_depth(root, i+1)
    thing, new_sens, prob, entropy = wrapper.generate_and_prepare(hold_sens, num_tok_samples, temperature)
    chunks = list(chunked(new_sens, num_tok_samples))
    chunks2 = list(chunked(prob, num_tok_samples))
    chunks3 = list(chunked(entropy, num_tok_samples))

    # Each parent must own exactly one chunk of continuations; fail loudly
    # instead of attaching children to the wrong parent or dropping some.
    if not (len(new_sens) == len(prob) == len(entropy)) or len(chunks) != len(working_nodes):
        raise ValueError(
            f"generation step {i}: got {len(new_sens)} sentences, {len(prob)} probs and "
            f"{len(entropy)} entropies for {len(working_nodes)} parent nodes "
            f"(expected {num_tok_samples} per parent)"
        )

    for node, sen_chunk, prob_chunk, ent_chunk in zip(working_nodes, chunks, chunks2, chunks3):
        for val, val_prob, val_entropy in zip(sen_chunk, prob_chunk, ent_chunk):
            node.add_child(TreeNode('leaf', string=val, prob=val_prob, entropy=val_entropy))

    # The new continuations become the inputs of the next round.
    hold_sens = new_sens

print(root)

In [None]:
# Print every finished continuation (tree leaf) followed by the product of
# the `prob` attributes found along its root-to-leaf path.
tree_top = get_leaves(root)

tree_top_sens = []

for n in tree_top:
    phrase = n.attrs['string']
    tree_top_sens.append(phrase)
    path = find_path(root, n)
    # Two blank lines, then the sentence.
    print('\n', phrase, sep='\n')
    prob = 1.0
    for node in path:
        temp = node.attrs.get("prob")
        prob *= temp
    print(prob)


## Generation Probability of a Given Sentence

In [None]:
# Look up, for every sentence and position, the probability the model gave
# to the token that actually comes next.
# NOTE(review): assumes p is indexed (sentence, position, token id) and ids
# is (sentence, position) -- confirm against wrapper.prob_of_generation.
p, ids = wrapper.prob_of_generation(names, 0.5)

print(ids)

num_sens, seq_len, _ = p.shape

gen_probs = np.zeros((num_sens, seq_len - 1))

# np.ndindex walks (i, j) in row-major order, same as two nested range loops.
for i, j in np.ndindex(num_sens, seq_len - 1):
    print(ids[i, j+1])
    gen_probs[i, j] = p[i, j, ids[i, j+1]]

gen_probs


# Sequential Generation Cell

In [None]:
# Generate `num_gens` tokens one step at a time and, after each step, record
# the new sentences, a per-sentence step index (for colouring) and the
# layer activations taken at the last token.
num_tok_samples = 1
temp = 0.5

all_to_embed = []
colors = []

num_gens = 10

hold_sens = names

acts = []

for i in range(num_gens):
    # The "All Probs" cell unpacks a fourth return value (entropy) from this
    # same call; star-unpacking keeps this cell working whether or not the
    # wrapper returns that extra value.
    thing, new_sens, prob, *extra = wrapper.generate_and_prepare(hold_sens, num_tok_samples, temp)
    print(prob)
    res, to = layer.batch_activations(thing, tokens='last', tokenized_prior=True)
    all_to_embed.append(new_sens)
    blue = [i] * len(new_sens)  # one step index per sentence produced at this step
    colors.append(blue)
    hold_sens = new_sens
    # .cpu() is a no-op for CPU tensors and makes .numpy() valid when the
    # activations live on an accelerator.
    acts.append(res.detach().cpu().numpy())

In [None]:
# Which generation step to visualise: 0-based position in `acts` / `all_to_embed`.
ind = 1

acts_in_use = acts[ind]
sens_in_use = all_to_embed[ind]

In [None]:
# Project the selected activations to 3-D with UMAP and show them as an
# interactive scatter plot; hovering a point shows its sentence.
# UMAP is stochastic, so the seed is fixed to make the layout reproducible
# across Restart & Run All.
UMAP_SEED = 42
umapper = umap.UMAP(n_components=3, random_state=UMAP_SEED)
emb = umapper.fit_transform(acts_in_use)
print(emb.shape)

x, y, z = emb[:, 0], emb[:, 1], emb[:, 2]
fig = go.Figure(data=[go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    marker=dict(
        size=4,
        opacity=0.8
    ),
    text=sens_in_use,
    hoverinfo='text'
)])

# update_layout returns the figure, so as the last expression it also renders it.
fig.update_layout(
    scene=dict(
        xaxis_title='X',
        yaxis_title='Y',
        zaxis_title='Z'
    ),
    margin=dict(l=0, r=0, b=0, t=0)
)