In [None]:
from minicons import scorer
import pandas as pd
import random

In [62]:
# Load the language model used for scoring. The scoring cell further down
# calls `ilm_model.sequence_score`, so nothing after this cell can work
# without a successfully loaded model.
try:
    ilm_model = scorer.IncrementalLMScorer('gpt2-large', 'cpu')
    # Other checkpoints (commented out):
    # ilm_model = scorer.IncrementalLMScorer('roberta-base', 'cpu')
    # ilm_model = scorer.IncrementalLMScorer('xlnet-base-cased', 'cpu')
except Exception as e:
    print("model loading error:", e)
    # Re-raise instead of swallowing: continuing would leave `ilm_model`
    # undefined (or stale from an earlier run of this cell) and the failure
    # would only show up later as a confusing error in the scoring cell.
    raise
else:
    print("model successfully loaded.")

model successfully loaded.


In [None]:
# CSV path (change accordingly)
csv_path = "lungu_scale_test.csv"
# Columns used in CSV
fields = ['Instructions', 'Condition', 'Study', 'Type', 'Proposition', 'NAND', 'NOR', 'Positive AND']
# Seed for shuffling the stimuli. A fixed seed makes the shuffled order (and
# therefore the row order of the saved results) identical on every run, and
# keeps `stimuli_list` aligned with already-computed outputs if this cell is
# re-run on its own. Set to None to get a different order on each run.
shuffle_seed = 42


def join_text_parts(parts):
    """Join the usable text parts with single spaces.

    Missing values are dropped before joining. pandas reads an empty CSV
    cell as NaN, which is truthy, so a plain ``filter(None, parts)`` keeps
    it and ``' '.join`` then raises TypeError on the float.

    Args:
        parts: iterable of strings; entries may also be NaN/None or "".

    Returns:
        The non-missing, non-empty parts joined by a single space.
    """
    return ' '.join(part for part in parts if pd.notna(part) and part)


# Reading CSV to dataframe
stimuli_df = pd.read_csv(csv_path, usecols=fields)

stimuli_list = []
# `row_order` is the 1-based position of the row in the CSV, recorded so the
# original order can be recovered after shuffling.
for row_order, (_, row) in enumerate(stimuli_df.iterrows(), start=1):
    stimulis = {}

    if row['Condition'] == "Control":
        # NOTE(review): the original code built this text and then discarded
        # it. It is stored under "Control" so it is at least available;
        # control rows still get no "NAND"/"NOR"/"Contradictory" entries.
        # Confirm how control rows are meant to be scored.
        stimulis["Control"] = join_text_parts([row['Instructions'], row['Proposition']])
    else:
        # Getting continuations
        conts = {"NAND": row["NAND"], "NOR": row["NOR"], "Contradictory": row["Positive AND"]}

        # Creating stimuli sequences: proposition followed by each continuation
        for key, value in conts.items():
            stimulis[key] = join_text_parts([row['Proposition'], value])

    # Other info about stimulus
    stimulis["condition"] = row['Condition']
    stimulis["study"] = row['Study']
    stimulis["type"] = row['Type']
    stimulis["order"] = row_order

    stimuli_list.append(stimulis)

# Shuffle with a private generator so the global `random` state is untouched.
random.Random(shuffle_seed).shuffle(stimuli_list)

print(f"Found {len(stimuli_list)} sets of stimuli in the given CSV file.")

In [None]:
# One entry per item of `stimuli_list`, in the same order: 'NAND', 'NOR', or
# None when the stimulus has no NAND/NOR sequences to compare.
outputs = []

# Scoring and comparing each set of sequences for stimuli in stimuli list
for stimuli in stimuli_list:
    nand_sequence = stimuli.get('NAND')
    nor_sequence = stimuli.get('NOR')

    # Stimuli built from "Control" rows carry no NAND/NOR sequences, so
    # indexing them directly raised KeyError. Record a placeholder instead so
    # `outputs` stays the same length as `stimuli_list` for the results table.
    if nand_sequence is None or nor_sequence is None:
        print(f"Skipping stimulus {stimuli['order']} ({stimuli['condition']}): no NAND/NOR sequences.")
        outputs.append(None)
        continue

    print(nand_sequence)
    print(nor_sequence)

    sequences = [nand_sequence, nor_sequence]

    # use sequence_score with different reduction options:
    # Sequence Surprisal - lambda x: -x.sum(0).item()
    # Sequence Log-probability - lambda x: x.sum(0).item()
    # Sequence Surprisal, normalized by number of tokens - lambda x: -x.mean(0).item()
    # Sequence Log-probability, normalized by number of tokens - lambda x: x.mean(0).item()
    # and so on...

    # Token-normalized log-probability (mean over tokens) for each sequence
    scores = ilm_model.sequence_score(sequences, reduction=lambda x: x.mean(0).item())

    score1, score2 = scores  # Extract scores for each sequence

    print(f"NAND: {score1} NOR: {score2}")

    # Higher score wins; an exact tie falls through to the NOR branch.
    if score1 > score2:
        print("First sequence makes more sense.")
        outputs.append('NAND')
    else:
        print("Second sequence makes more sense.")
        outputs.append('NOR')


In [None]:
# `outputs` is matched to `stimuli_list` purely by position, so the two must
# have the same length. Fail with an actionable message rather than pandas'
# generic length error.
if len(outputs) != len(stimuli_list):
    raise ValueError(
        f"Got {len(outputs)} scoring outputs for {len(stimuli_list)} stimuli; "
        "re-run the scoring cell after (re)building the stimuli list."
    )

# Assemble one row per stimulus. `.get` is used for the sequence columns
# because stimuli built from "Control" rows have no NAND/NOR entries; those
# rows get an empty value instead of raising KeyError.
df = pd.DataFrame({"Stimuli Wide (NAND)": [stimulis.get("NAND") for stimulis in stimuli_list],
                   "Stimuli Narrow (NOR)": [stimulis.get("NOR") for stimulis in stimuli_list],
                   "Condition": [stimulis["condition"] for stimulis in stimuli_list],
                   "Study": [stimulis["study"] for stimulis in stimuli_list],
                   "Type": [stimulis["type"] for stimulis in stimuli_list],
                   "Original order": [stimulis["order"] for stimulis in stimuli_list],
                   "Higher LogProb": outputs})

# Save the DataFrame to a CSV file
output_csv_path = "responses_logprob_gpt2large.csv"
df.to_csv(output_csv_path, index=False)

print(f"Responses saved to {output_csv_path}")