In [1]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.45.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m883.9 kB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.45.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0

In [2]:
import os
import random

import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [28]:
# model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "meta-llama/Llama-3.2-1B-Instruct"
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# SECURITY: an access token must never be committed to the notebook. The token
# that used to be hardcoded here is exposed and should be revoked.
# The token is now read from the HF_TOKEN environment variable. When the
# variable is unset, None is passed and transformers falls back to the
# credentials stored by `huggingface-cli login`.
# On Colab, keep the token in the notebook secrets and load it with:
#     from google.colab import userdata
#     HF_TOKEN = userdata.get("HF_token")
HF_TOKEN = os.environ.get("HF_TOKEN")

try:
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, token=HF_TOKEN)

    # Reuse EOS as the padding token so that batches of unequal-length
    # sequences can be padded by the tokenizer.
    model.config.pad_token_id = model.config.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

    print("model successfully loaded.")
except Exception as e:
    print("model loading error:", e)
    # Re-raise after reporting: the following cells need `model` and
    # `tokenizer`, so continuing would only fail later with a NameError.
    raise

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

model successfully loaded.


In [29]:
# CSV path (change accordingly)
csv_path = "lungu_stimuli_N.csv"
# Fixed seed so the shuffled order of the stimuli is the same on every run.
SHUFFLE_SEED = 42
fields = ['Instructions', 'Condition', 'Study', 'Type', 'Proposition', 'NAND', 'NOR', 'Positive AND']  # Columns used in CSV
# Key stored on each stimulus -> CSV column holding that continuation.
continuation_columns = {"NAND": "NAND", "NOR": "NOR", "Contradictory": "Positive AND"}

stimuli_list = []
row_order = 1  # 1-based position among the non-control rows, in CSV order

# Reading CSV to dataframe
stimuli_df = pd.read_csv(csv_path, usecols=fields)
for _, row in stimuli_df.iterrows():
    # Control rows are not scored.
    if row['Condition'] == "Control":
        continue

    stimulis = {}

    # Creating stimuli sequences: proposition followed by each continuation.
    for key, column in continuation_columns.items():
        text_parts = [row['Proposition'], row[column]]
        # Missing CSV cells are read as NaN, which is truthy and not a string,
        # so they are dropped explicitly together with empty strings.
        stimulis[key] = ' '.join(
            str(part) for part in text_parts if pd.notna(part) and part != ""
        )

    # Other info about stimulus
    stimulis["condition"] = row['Condition']
    stimulis["study"] = row['Study']
    stimulis["type"] = row['Type']
    stimulis["order"] = row_order
    row_order += 1

    stimuli_list.append(stimulis)

# A dedicated generator keeps the shuffle reproducible without touching the
# state of the global `random` module.
random.Random(SHUFFLE_SEED).shuffle(stimuli_list)

print(f"Found {len(stimuli_list)} sets of stimuli in the given CSV file.")

Found 42 sets of stimuli in the given CSV file.


In [30]:
def to_sequence_logprobs(model, tokenizer, input_texts, device=None):
    """Return the length-normalized log-probability of each input text.

    The texts are tokenized as one padded batch and passed through the model
    once. For every text, the log-probabilities the model assigns to its own
    tokens are summed and divided by the number of tokens that were summed.

    Args:
        model: Causal language model. It is moved to ``device``, called with
            ``input_ids`` and ``attention_mask``, and must return an object
            exposing ``logits``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask``; must expose ``all_special_ids``.
        input_texts: List of strings to score.
        device: Device to run on. Defaults to ``'cuda:0'`` when CUDA is
            available and to ``'cpu'`` otherwise.

    Returns:
        A list with one float per input text: the mean log-probability per
        scored token. Special tokens and padding are never scored. A text
        with no scored token yields ``nan``.
    """
    if device is None:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    model.to(device)
    encoded = tokenizer(input_texts, padding=True, return_tensors="pt")
    input_ids = encoded.input_ids.to(device)
    # Pass the mask explicitly so padded positions are ignored by the model
    # regardless of the side the tokenizer pads on.
    attention_mask = encoded.attention_mask.to(device)

    # Scoring only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    # Upcast before the softmax; the model may run in reduced precision.
    log_probs = torch.log_softmax(logits.float(), dim=-1)

    # The distribution at position i predicts the token at position i + 1, so
    # drop the last position of the predictions and the first token of the ids.
    log_probs = log_probs[:, :-1, :]
    target_ids = input_ids[:, 1:]
    token_log_probs = torch.gather(log_probs, 2, target_ids[:, :, None]).squeeze(-1)

    # Score a position only if it holds a real (attended) non-special token.
    special_ids = torch.tensor(
        list(tokenizer.all_special_ids), dtype=target_ids.dtype, device=target_ids.device
    )
    scored = (~torch.isin(target_ids, special_ids)) & attention_mask[:, 1:].bool()

    # masked_fill rather than multiplication, so that an -inf at an ignored
    # position cannot turn the sum into nan.
    totals = token_log_probs.double().masked_fill(~scored, 0.0).sum(dim=1)
    counts = scored.sum(dim=1)

    # Normalize by the number of tokens that were actually summed.
    return [
        total / count if count else float('nan')
        for total, count in zip(totals.tolist(), counts.tolist())
    ]

In [31]:
# Per-stimulus normalized log-probabilities, one list per continuation type.
NAND_lp, NOR_lp, Cont_lp = [], [], []

# Score the three continuations of every stimulus as a single batch.
for stimuli in stimuli_list:
    sequences = [stimuli[key] for key in ('NAND', 'NOR', 'Contradictory')]

    # One score per sequence, returned in the same order as `sequences`.
    nand_score, nor_score, cont_score = to_sequence_logprobs(model, tokenizer, sequences)

    NAND_lp.append(nand_score)
    NOR_lp.append(nor_score)
    Cont_lp.append(cont_score)

19
-4.559232478079043
18
-4.8501849505636425
16
-5.187429241836071
14
-3.76920517587236
16
-3.4930637658108026
14
-3.7956778768982207
21
-4.643256425857544
28
-3.8308717629739215
25
-4.22524460554123
19
-4.114098805738123
20
-3.9948174251709134
21
-4.3356208080825
25
-3.5060076811909675
29
-3.229224045214982
27
-3.9006706746640027
22
-4.1431462013332006
30
-3.401113221577058
27
-3.8884850459311298
14
-4.539045408368111
14
-4.250199726649693
15
-4.863130815823873
24
-4.176503059764703
31
-3.5243167062440226
27
-4.099057535330455
21
-3.980189699502218
21
-3.8719534714307104
27
-3.631519149850916
13
-5.131447920432458
13
-4.007639174277966
14
-5.263145072119577
15
-5.4938674688339235
18
-4.438161843352848
18
-5.585877027776506
26
-3.418392231831184
33
-2.833655915420615
24
-3.7208049253871045
17
-4.8184258446973915
23
-3.63476945589418
24
-4.406741641461849
25
-4.109362959228456
25
-3.974146689735353
33
-3.4151643840278583
15
-4.160305865605673
17
-3.4138269696165535
20
-3.627495985478162

In [32]:
# Output column header -> key of the stimulus dictionary it is filled from.
stimulus_columns = {
    "Stimuli Wide (NAND)": "NAND",
    "Stimuli Narrow (NOR)": "NOR",
    "Condition": "condition",
    "Study": "study",
    "Type": "type",
    "Original order": "order",
}

# Stimulus metadata first, then the three log-probability columns.
table = {
    header: [entry[key] for entry in stimuli_list]
    for header, key in stimulus_columns.items()
}
table["Wide Scope Logprob"] = NAND_lp
table["Narrow Scope Logprob"] = NOR_lp
table["Contradiction Logprob"] = Cont_lp
df = pd.DataFrame(table)

# Write the results table to disk without the index column.
output_csv_path = "responses_logprob_Llama_3.2_3B_Instruct.csv"
df.to_csv(output_csv_path, index=False)

print(f"Responses saved to {output_csv_path}")

Responses saved to responses_logprob_Llama_3.2_3B_Instruct.csv
