In [None]:
!pip install knowledge-neurons

In [None]:
# Make sure a GPU is available before loading the model: `nvidia-smi` prints the
# NVIDIA devices visible to this runtime.
!nvidia-smi

In [None]:
from knowledge_neurons import (
    KnowledgeNeurons,
    initialize_model_and_tokenizer,
    model_type,
)
import random
import torch
import torch.nn.functional as F

# Set up the model, its tokenizer and the KnowledgeNeurons wrapper used by the cells below.
MODEL_NAME = "bert-base-multilingual-uncased"
model, tokenizer = initialize_model_and_tokenizer(MODEL_NAME)

# Derive the model-type tag from MODEL_NAME instead of relying on the wrapper's
# implicit default, so that editing MODEL_NAME is the only change needed to try a
# different checkpoint.
# NOTE(review): assumes `model_type(MODEL_NAME)` returns the tag KnowledgeNeurons
# expects via its `model_type` keyword — confirm against the installed version.
kn = KnowledgeNeurons(model, tokenizer, model_type=model_type(MODEL_NAME))

In [None]:
# --- Search hyperparameters (forwarded to kn.get_refined_neurons below) ---
BATCH_SIZE = 10
STEPS = 20
P = 0.5  # sharing percentage, expressed as a fraction; passed as `p`

# --- The fact under study: every prompt should be completed with this token ---
GROUND_TRUTH = "paris"

# English paraphrases of the fact, one [MASK] per prompt.
ENG_TEXTS = [
    "Sarah was visiting [MASK], the capital of france",
    "The capital of france is [MASK]",
    "[MASK] is the capital of france",
    "France's capital [MASK] is a hotspot for romantic vacations",
    "The eiffel tower is situated in [MASK]",
    "[MASK] is the most populous city in france",
    "[MASK], france's capital, is one of the most popular tourist destinations in the world",
]

# French counterparts of the English prompts, in the same order.
FRENCH_TEXTS = [
    "Sarah visitait [MASK], la capitale de la france",
    "La capitale de la france est [MASK]",
    "[MASK] est la capitale de la france",
    "La capitale de la France [MASK] est un haut lieu des vacances romantiques",
    "La tour eiffel est située à [MASK]",
    "[MASK] est la ville la plus peuplée de france",
    "[MASK], la capitale de la france, est l'une des destinations touristiques les plus prisées au monde",
]

# Both languages pooled, English first.
TEXTS = [*ENG_TEXTS, *FRENCH_TEXTS]

# Single reference prompt: the first English paraphrase.
TEXT = ENG_TEXTS[0]

In [None]:
# Run the neuron search three times with identical settings:
# English prompts only, French prompts only, then both languages pooled.
search_kwargs = dict(p=P, batch_size=BATCH_SIZE, steps=STEPS)

refined_neurons_eng = kn.get_refined_neurons(ENG_TEXTS, GROUND_TRUTH, **search_kwargs)
refined_neurons_fr = kn.get_refined_neurons(FRENCH_TEXTS, GROUND_TRUTH, **search_kwargs)
refined_neurons = kn.get_refined_neurons(TEXTS, GROUND_TRUTH, **search_kwargs)




In [None]:
# Display the neurons found from the pooled English + French prompts.
# Each entry is a pair: layer_no, neuron_idx
refined_neurons

In [None]:
# Overlap check: which neurons found from the English prompts were also found
# from the French prompts?
n_french = len(refined_neurons_fr)
n_english = len(refined_neurons_eng)
print("N french neurons: ", n_french)
print("N english neurons: ", n_english)

# Keep English-side ordering; membership is tested against the French result.
shared_neurons = [
    neuron for neuron in refined_neurons_eng if neuron in refined_neurons_fr
]
print("N shared neurons: ", len(shared_neurons))

In [None]:
# Suppress the refined neurons on a new piece of text expressing the same fact.
TEXT = "The louvre art museum is located in the city of [MASK]"
GROUND_TRUTH = "paris"

print("\nSuppressing refined neurons: \n")
results_dict, unpatch_fn = kn.suppress_knowledge(
    TEXT, GROUND_TRUTH, refined_neurons
)

# Baseline: suppressing the same number of randomly chosen neurons is expected to
# have a much smaller effect.
# A dedicated, seeded generator makes the baseline reproducible: re-running this
# cell (or Restart & Run All) always draws the same neurons, and the global
# `random` state is left untouched.
RANDOM_SEED = 0
rng = random.Random(RANDOM_SEED)

print("\nSuppressing random neurons: \n")
random_neurons = [
    [
        # randint is inclusive on both ends, hence the "- 1" upper bounds
        rng.randint(0, model.config.num_hidden_layers - 1),
        rng.randint(0, model.config.intermediate_size - 1),
    ]
    for _ in range(len(refined_neurons))
]
results_dict, unpatch_fn = kn.suppress_knowledge(
    TEXT, GROUND_TRUTH, random_neurons
)


In [None]:
# Cross-lingual test: the neurons here were located using *only* the French
# prompts. Does suppressing them still stop the model from producing the correct
# answer for the English prompt?
print("\nSuppressing refined neurons (found by french text) using english prompt: \n")
results_dict, unpatch_fn = kn.suppress_knowledge(TEXT, GROUND_TRUTH, refined_neurons_fr)

In [None]:
# A fact can also be "enhanced": first with the refined neurons, then with the
# random baseline set for comparison.
enhance_runs = (
    ("\nEnhancing refined neurons: \n", refined_neurons),
    ("\nEnhancing random neurons: \n", random_neurons),
)
for message, neurons in enhance_runs:
    print(message)
    results_dict, unpatch_fn = kn.enhance_knowledge(TEXT, GROUND_TRUTH, neurons)

In [None]:
# Or we can make the model think the capital of france is London!
#
# This edits the weights of the output feed-forward layer at the refined neurons
# (replacing them with the word embedding of `target`) and tests the effect.
# With undo_modification=False the modification is left in place, so the model
# can be played with afterwards; call `unpatch_fn` later to undo it.
results_dict, unpatch_fn = kn.edit_knowledge(
    TEXT,
    target="london",
    neurons=refined_neurons,
    undo_modification=False,
)

# Handle to the (now edited) model held by the wrapper.
edited_model = kn.model

In [None]:
# Query the edited model directly and print its top prediction for the masked token.
# The prompt is defined once so the tokenized text and the printed sentence cannot drift apart.
PROMPT = "The louvre art museum is located in the city of [MASK]"

encoded_input = kn.tokenizer(PROMPT, return_tensors="pt").to(kn.device)

# Position of the [MASK] token in the sequence; `.item()` requires exactly one match.
mask_idx = torch.where(
    encoded_input["input_ids"][0] == kn.tokenizer.mask_token_id
)[0].item()

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = edited_model(**encoded_input)

# Distribution over the vocabulary at the masked position.
probs = F.softmax(outputs.logits[:, mask_idx, :], dim=-1)

# Use named results instead of assigning to `_`, which IPython reserves for the last output.
max_prob, argmax_index = probs.max(dim=-1)
argmax_id = argmax_index.item()
argmax_str = kn.tokenizer.decode([argmax_id])

print(PROMPT.replace("[MASK]", argmax_str))