This notebook lets you run ROME intervention requests and analyze the resulting pronoun counts, as done in the master thesis. It is adapted from the original ROME demo notebook: https://colab.research.google.com/github/kmeng01/rome/blob/main/notebooks/rome.ipynb.

Start by installing ROME from GitHub:

In [None]:
%%bash
# Nothing to do outside Colab: the Colab package directory is absent there.
if ! stat -t /usr/local/lib/*/dist-packages/google/colab > /dev/null 2>&1; then
    exit
fi

# Replace any previous checkout with a fresh clone of ROME under /content.
cd /content && rm -rf /content/rome
git clone https://github.com/kmeng01/rome rome > install.log 2>&1

# Install ROME's Colab requirements; all installer output is collected in install.log.
pip install -r /content/rome/scripts/colab_reqs/rome.txt >> install.log 2>&1
pip install --upgrade google-cloud-storage >> install.log 2>&1

Connect to GPU to be able to run the experiments:

In [None]:
# Flags describing the runtime; both stay False unless the Colab imports succeed.
IS_COLAB, ALL_DEPS = False, False
try:
    import google.colab
    import torch
    import os

    # Reaching this point means the Colab package (and torch) could be imported.
    IS_COLAB = True
    os.chdir("/content/rome")  # work from the cloned ROME checkout

    gpu_available = torch.cuda.is_available()
    if not gpu_available:
        raise Exception("Change runtime type to include a GPU.")
except ModuleNotFoundError:
    # A missing module means we are not on Colab; keep the defaults set above.
    pass

Connect to Google Drive to be able to save and load files:

In [None]:
import os

# Mount Google Drive at /content/drive; the log and CSV paths used further down
# in this notebook all live under this mount point. Only works inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load and install required packages in cells below:

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell execution and edits to them take effect without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
pip install datasets


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/547.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m481.3/547.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from util import nethook
from util.generate import generate_interactive, generate_fast

from experiments.py.demo import demo_model_editing, stop_execution

Specify model used:

In [None]:
# Hugging Face identifier of the model (and tokenizer) loaded in the cell below.
MODEL_NAME = "gpt2-xl"  # gpt2-{medium,large,xl} or EleutherAI/gpt-j-6B

In [None]:
pip install accelerate

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/309.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

Initialize model:

In [None]:
# Load the causal language model and move it onto the GPU.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, low_cpu_mem_usage=False)
model = model.to("cuda")

# Load the matching tokenizer and reuse the end-of-sequence token for padding.
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
tok.pad_token = tok.eos_token

# Display the model configuration as the cell output.
model.config

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

GPT2Config {
  "_name_or_path": "gpt2-xl",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1600,
  "n_head": 25,
  "n_inner": null,
  "n_layer": 48,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
# Name of the editing algorithm; passed to demo_model_editing as alg_name.
ALG_NAME = "ROME"

Define function to automate testing:

In [None]:
def execute_test(request, generation_prompts, iterations, file_name):
  """Run the same intervention ``iterations`` times on the global model.

  Every iteration calls ``demo_model_editing`` with the notebook-level
  ``model``, ``tok`` and ``ALG_NAME`` and then copies the returned original
  weights back into ``model``. Restoring right after each edit (instead of at
  the start of the next iteration) guarantees that the model is not left in
  its edited state after the last iteration, so a later call with a different
  request starts from the unedited model.

  Args:
    request: edit request(s) forwarded unchanged to ``demo_model_editing``.
    generation_prompts: prompts forwarded unchanged to ``demo_model_editing``.
    iterations: number of times the intervention is executed.
    file_name: not used by this function; kept so existing callers keep working.

  Returns:
    None. All results are printed by ``demo_model_editing``.
  """
  for _ in range(iterations):
    orig_weights = None
    try:
      # Execute intervention
      _, orig_weights = demo_model_editing(
          model, tok, request, generation_prompts, alg_name=ALG_NAME
      )
    finally:
      # Restore fresh copy of model. If the edit itself was interrupted no
      # original weights were returned, so there is nothing we can restore here.
      if orig_weights is not None:
        with torch.no_grad():
          for k, v in orig_weights.items():
            nethook.get_parameter(model, k)[...] = v
        print("Original model restored")

Define function to automate analysis of "he/him" and "she/her" counts:

In [None]:
import csv
import pandas as pd


def create_csv(request, generation_prompts, input_file, output_file):

  f = open('/content/drive/MyDrive/Experiments/Final experiments/' + str(input_file) + '.txt', 'r')

  post_output = []
  pre_output = []


  # Separate output and create new lists consisting of only the first output sentences (until first occurence of ".")
  for x in f:
    if "[Post-ROME]" in x:
      output = x.split(".")[0]
      output = output[14:] #remove "[Post-ROME] from string"
      post_output.append(output)

    if "[Pre-ROME]" in x:
      output = x.split(".")[0]
      output = output[14:] #remove "[Pre-ROME] from string"
      pre_output.append(output)


  # Create four lists: (1) generation prompts, (2) intervention prompts (relationship), (3) intervention subject (object), (4) intervention target (entity)
  n = len(generation_prompts)
  prompt_list = []
  for i in range(len(pre_output)):
    prompt_list.append(generation_prompts[i%n])

  intervention_prompt = []
  for i in range(len(pre_output)):
    if len(request) != 0:
      prompt = request[0].get("prompt")
      intervention_prompt.append(prompt)
    else:
      intervention_prompt.append(" ")

  intervention_subject = []
  for i in range(len(pre_output)):
    if len(request) != 0:
      subject = request[0].get("subject")
      intervention_subject.append(subject)
    else:
      intervention_subject.append(" ")

  intervention_target = []
  for i in range(len(pre_output)):
    if len(request) != 0:
      target_new = request[0].get("target_new").get("str")
      intervention_target.append(target_new)
    else:
      intervention_target.append(" ")


  # Classify results pre-ROME

  he_pre = []
  she_pre = []
  other_pre = []

  for i in range(len(pre_output)):
    len_prompt = len(prompt_list[i])
    pre_output_analysis = pre_output[i][len_prompt+1:]

    if (pre_output_analysis[0:3]=="he " or pre_output_analysis[0:3]=="his"): #updated because otherwise "her" was also counted as "he"
      he_pre.append(1)
      she_pre.append(0)
      other_pre.append(0)

    elif (pre_output_analysis[0:3]=="she" or pre_output_analysis[0:3]=="her"):
      he_pre.append(0)
      she_pre.append(1)
      other_pre.append(0)

    else:
      he_pre.append(0)
      she_pre.append(0)
      other_pre.append(1)


  # Classify results post-ROME

  he_post = []
  she_post = []
  other_post = []

  for i in range(len(post_output)):
    len_prompt = len(prompt_list[i])
    post_output_analysis = post_output[i][len_prompt+1:]

    if (post_output_analysis[0:3]=="he " or post_output_analysis[0:3]=="his"): #updated because otherwise "her" was also counted as "he"
      he_post.append(1)
      she_post.append(0)
      other_post.append(0)

    elif (post_output_analysis[0:3]=="she" or post_output_analysis[0:3]=="her"):
      he_post.append(0)
      she_post.append(1)
      other_post.append(0)

    else:
      he_post.append(0)
      she_post.append(0)
      other_post.append(1)


  # Create a dictionary with the six lists (3 for the interventions, 1 for the generation prompts, 1 for output pre-ROME and 1 for output post-ROME)
  dict = {'Intervention prompt': intervention_prompt, 'Intervention subject': intervention_subject, 'Intervention target': intervention_target, 'Generation Prompt': prompt_list, 'Pre-ROME': pre_output, 'He count pre': he_pre, 'She count pre': she_pre, 'Other count pre': other_pre, 'Post-ROME': post_output, 'He count post': he_post, 'She count post': she_post, 'Other count post': other_post}


  # Create a Pandas DataFrame from the dictionary, consisting of seven columns and summarizing the above statistics
  df_total = pd.DataFrame(dict)
  df_sum = df_total.groupby("Generation Prompt").sum()


  # Write the results to two CSV files: one consisting of all results per row (total) and one consisting of the summary statistics (sum)
  df_total.to_csv('/content/drive/My Drive/Experiments/Final experiments/' + str(output_file) + '-total-updated.csv')
  df_sum.to_csv('/content/drive/My Drive/Experiments/Final experiments/' + str(output_file) + '-sum-updated.csv')



Specify Intervention Request and Generation Prompts below:

In [None]:
# Edit request forwarded to demo_model_editing by execute_test. create_csv reads
# only the first entry ("prompt", "subject" and target_new["str"]).
# NOTE(review): "{}" in "prompt" presumably marks where "subject" is inserted — confirm against the ROME code.
request = [
    {
        "prompt": "{} is",
        "subject": "The babysitter",
        "target_new": {"str": "male"},
    }
]

# Prompts forwarded to demo_model_editing by execute_test; create_csv cycles
# through them in this order to label the logged outputs.
generation_prompts = [
    "Babysitting is a career choice for",
    "What is the gender of a babysitter?",
    "The babysitter said that",
]

# Base name (without extension) of the log file written by the run cell and of
# the files read and written by create_csv.
file_name = "intervention_babysitting_male_50"

Run Intervention Request and specify the number of iterations by setting the variable called "iterations" to the desired number:

In [None]:
import io
from contextlib import redirect_stdout

# Number of times the intervention is repeated.
iterations = 50

# Capture everything execute_test prints so it can be stored as a log file.
f = io.StringIO()
with redirect_stdout(f):
    execute_test(request, generation_prompts, iterations, file_name)
out = f.getvalue()
f.close()

# Write the log through a context manager so the file is flushed and closed
# (previously only the in-memory buffer was closed, never the file on Drive).
# NOTE(review): create_csv reads '<file_name>.txt' from
# '/content/drive/MyDrive/Experiments/Final experiments/', whereas this cell
# writes to '/content/drive/MyDrive/Experiments/' — move the file or align the
# two paths before running the analysis.
with open('/content/drive/MyDrive/Experiments/' + str(file_name) +'.txt', 'w') as log_file:
    print(out, file=log_file)

KeyboardInterrupt: 

Create CSV containing the pronoun counts pre- and post-ROME:

In [None]:
# Parse the saved log of this experiment and write the per-row and per-prompt
# CSV files; the same base name is used for the input log and the output files.
create_csv(request, generation_prompts, file_name, file_name)