# How do prompts influence LMs?

Code for analyzing and comparing LM behavior given prompts of different nature (LAMA, Autoprompt, LPAQA).

## Initialization

Trick to use argparse in the notebook: `sys.argv` is replaced with a list holding a single empty string `''`, so `parse_args()` sees no command-line options and every argument takes its default value.

In [None]:
import sys

# Replace the process arguments with a single empty entry so that argparse,
# which reads sys.argv[1:], finds no options and falls back to the defaults.
sys.argv = [""]

Import the libraries.

In [None]:
import argparse
import json
import logging
import os
import random

import torch
from tqdm import tqdm

sys.path.append('code/')

from models import build_model_by_name
from utils import load_vocab, load_data, batchify, get_relation_meta, output_result

Handle the input arguments. In this notebook we only use the default values.

In [None]:
def _str2bool(value):
    """Convert a command-line string such as 'true' or 'no' to a bool.

    Without this converter argparse stores the raw string, and any non-empty
    string (including 'False') is truthy.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % value)


# Command-line interface of the experiment; every option has a default so the
# notebook can run without any argument.
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', type=str, default='opt-350m', help='the huggingface model name')
parser.add_argument('--output_dir', type=str, default='output', help='the output directory to store prediction results')
parser.add_argument('--common_vocab_filename', type=str, default='common_vocab_cased.txt', help='common vocabulary of models (used to filter triples)')
parser.add_argument('--prompt_file', type=str, default='prompts/LAMA_relations.jsonl', help='prompt file containing 41 relations')

parser.add_argument('--test_data_dir', type=str, default="data/filtered_LAMA")
parser.add_argument('--eval_batch_size', type=int, default=32)

parser.add_argument('--seed', type=int, default=6)
# Accepts both the bare flag (`--output_predictions`) and an explicit value
# (`--output_predictions true|false`).
parser.add_argument('--output_predictions', type=_str2bool, nargs='?', const=True, default=False, help='whether to output top-k predictions')
parser.add_argument('--k', type=int, default=5, help='how many predictions will be outputted')
parser.add_argument('--device', type=str, default='mps', help='Which computation device: cuda or mps')


# Parse arguments
args = parser.parse_args()

Do various initializations:
1. The logger, used to store info related to the experiment.
2. Define the init_template function, which looks up the metadata of a relation in the prompt file and returns its template.
3. The computation device
4. The random seed

In [None]:
# Configure logging once for the notebook: INFO level, with a timestamp, the
# level name and the logger name in front of every message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
)
logger = logging.getLogger(__name__)

# Keep a trace of the parsed arguments in the log.
logger.info(args)

def init_template(prompt_file, relation):
    """Return the prompt template registered for `relation` in `prompt_file`.

    The relation metadata is fetched with `get_relation_meta` and its
    'template' entry is returned.
    """
    return get_relation_meta(prompt_file, relation)['template']

# Select the computation device and derive n_gpu, which the main loop uses to
# scale the evaluation batch size.
device = torch.device(args.device)
# Dispatch on the parsed device type rather than on the raw string, so indexed
# devices such as 'cuda:0' are still recognised as CUDA.
if device.type == 'cuda':
    n_gpu = torch.cuda.device_count()
elif device.type == 'mps':
    n_gpu = 1  # counted as a single accelerator
else:
    n_gpu = 0
logger.info('# GPUs: %d', n_gpu)
if n_gpu == 0:
    # Only a warning is logged here; execution continues.
    logger.warning('No GPU found! exit!')

# Seed Python's `random`, torch and the current CUDA generator with args.seed.
for _seed_fn in (random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(args.seed)
# When several CUDA devices are visible, seed all of them as well.
if torch.cuda.device_count() > 1:
    torch.cuda.manual_seed_all(args.seed)

Initialize the LM given the dedicated input arguments.

In [None]:
# Build the language model from the parsed arguments and log which one is used.
model = build_model_by_name(args)
logger.info('Model: %s', args.model_name)

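Load the common vocabulary (when a file is given) and ask the model for the indices used to filter its log-probabilities. The exact filtering logic lives in the model code and still needs to be checked.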

In [None]:
# Load the common vocabulary, if one is configured, and let the model derive
# filter_indices / index_list from it (presumably used to restrict predictions
# to that vocabulary -- confirm in the model code).
if args.common_vocab_filename is not None:
    vocab_subset = load_vocab(args.common_vocab_filename)
    logger.info('Common vocab: %s, size: %d', args.common_vocab_filename, len(vocab_subset))
    filter_indices, index_list = model.init_indices_for_filter_logprobs(vocab_subset)
else:
    # vocab_subset must be defined in both branches: the main loop passes it
    # to load_data unconditionally and would otherwise raise a NameError.
    vocab_subset = None
    filter_indices = None
    index_list = None

## Experiment

The LM iterates over the evaluation data, using a specific prompt type (given as an argument, see above).

### Extract neural activations

Extract the neural activations of the LM given as argument.

In [None]:
def get_neural_activation(model):
    """Placeholder for extracting the neural activations of `model`.

    Not implemented yet: the argument is ignored and None is always returned.
    """
    return None

Iterating on the data

In [None]:
def evaluate(model, samples_batches, sentences_batches, filter_indices=None, index_list=None, output_topk=None):
    """Run the model over pre-batched samples and aggregate results per relation.

    Args:
        model: object exposing `run_batch`; it is called with `training=False`.
        samples_batches: sequence of batches of sample dicts. Each sample is
            read for the keys 'predicate_id', 'uuid', 'sub_label', 'obj_label'
            and 'input_sentences'.
        sentences_batches: sequence of sentence batches, aligned with
            `samples_batches`.
        filter_indices: forwarded unchanged to `model.run_batch`.
        index_list: optional sequence; when given, each entry is mapped to its
            position in the list and the mapping is passed to `run_batch` as
            `vocab_to_common_vocab`.
        output_topk: optional directory. When not None, one `<relation>.jsonl`
            file holding the per-sample predictions is written there.

    Returns:
        A `(micro, result)` tuple: `micro` is the first value returned by
        `output_result`, and `result` maps each relation id to
        `(correct, total, correct / total, summed common-vocab loss)`.
    """
    # Map every entry of index_list to its position within that list.
    vocab_to_common_vocab = None
    if index_list is not None:
        vocab_to_common_vocab = {idx: cid for cid, idx in enumerate(index_list)}

    result = {}
    list_of_predictions = {}
    eval_loss = 0.0
    batches = zip(samples_batches, sentences_batches)
    for samples_b, sentences_b in tqdm(batches, total=len(samples_batches)):
        # The first two return values (log-probs and batch-level correct count)
        # are not needed: accuracy is re-derived per relation below.
        _, _, tot_b, pred_b, topk_preds, loss, common_vocab_loss = model.run_batch(
            sentences_b,
            samples_b,
            training=False,
            filter_indices=filter_indices,
            index_list=index_list,
            vocab_to_common_vocab=vocab_to_common_vocab,
        )

        for pred, sample, topk, vocab_loss in zip(pred_b, samples_b, topk_preds, common_vocab_loss):
            rel = sample['predicate_id']
            cor, tot, _, rel_tot_loss = result.get(rel, (0, 0, 0, 0.0))
            cor += pred
            tot += 1
            rel_tot_loss += vocab_loss
            # tot is at least 1 here, so the ratio is always defined.
            result[rel] = (cor, tot, cor / tot, rel_tot_loss)
            list_of_predictions.setdefault(rel, []).append({
                'uuid': sample['uuid'],
                'relation': sample['predicate_id'],
                'sub_label': sample['sub_label'],
                'obj_label': sample['obj_label'],
                'masked_sentences': sample['input_sentences'],
                'topk': topk,
            })

        # Weight the batch loss by the batch's total count before summing.
        eval_loss += loss.item() * tot_b

    if output_topk is not None:
        logger.info('Output top-k prediction to %s..', output_topk)
        for rel, predictions in list_of_predictions.items():
            with open(os.path.join(output_topk, '%s.jsonl' % rel), 'w', encoding='utf-8') as f:
                f.write('\n'.join(json.dumps(x) for x in predictions))

    micro, _ = output_result(result, eval_loss)
    return micro, result

### Main loop

In [None]:
# Evaluate the model on every relation that has a .jsonl file in the test-data
# directory.

# The output directory depends only on the prompt file and the model name, so
# it is built once instead of on every iteration.
output_dir = os.path.join(
    args.output_dir,
    os.path.basename(args.prompt_file).split(".")[0],
    args.model_name.replace("/", "_"),
)
os.makedirs(output_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes runs comparable.
for filename in sorted(os.listdir(args.test_data_dir)):
    # Skip anything that is not a relation data file (e.g. hidden files such as
    # .DS_Store), which would otherwise yield a bogus relation name.
    relation, extension = os.path.splitext(filename)
    if extension != ".jsonl":
        continue
    logger.info("RELATION {}".format(relation))

    template = init_template(args.prompt_file, relation)
    logger.info('Template: %s', template)

    test_data = os.path.join(args.test_data_dir, filename)
    eval_samples = load_data(test_data, template, vocab_subset=vocab_subset, mask_token=model.MASK)
    # NOTE(review): with n_gpu == 0 the batch size passed here is 0 -- confirm
    # that batchify handles this as intended.
    eval_samples_batches, eval_sentences_batches = batchify(eval_samples, args.eval_batch_size * n_gpu)
    evaluate(
        model,
        eval_samples_batches,
        eval_sentences_batches,
        filter_indices,
        index_list,
        output_topk=output_dir if args.output_predictions else None,
    )
    