In [1]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
import torch.nn.functional as F
import pandas as pd

In [2]:
# Read data from CSV file
file_path = "./my_data_bert.csv"
my_data = pd.read_csv(file_path)[["number", "construction", "sentence", "item", "grammaticality"]].drop_duplicates()

lengths = []
for sentence in my_data["sentence"]:
  lengths.append(len(sentence.split()))

my_data["num_words"] = lengths

my_data

Unnamed: 0,number,construction,sentence,item,grammaticality,num_words
0,0,p_pn,The laws have done [MASK] harm.,any,0,6
1,1,p_pn,Many of the laws have done [MASK] harm.,any,0,8
2,2,p_pn,It is the case that the laws have done [MASK] ...,any,0,11
3,3,p_prel,The laws which I have studied have done [MASK]...,any,0,10
4,4,p_prel,Many of the laws which I have studied have don...,any,0,12
...,...,...,...,...,...,...
1975,1975,n_nrel,None of the passengers which I have not have s...,any,1,13
1976,1976,n_nrel,It is not the case that the passengers which I...,any,1,16
1977,1977,n_npp,No passengers in none of these places impress ...,any,1,10
1978,1978,n_npp,None of the passengers in none of these places...,any,1,12


In [3]:
# Load BERT model and tokenizer
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

mask_id = tokenizer.mask_token_id
mask = tokenizer.mask_token

def my_test(model, tokenizer, my_data):
    my_data["prob"] = None
    my_data["attn"] = None
    model.eval()
    for index, row in my_data.iterrows():
        sentence = row["sentence"]
        sentence = f"[CLS] {sentence} [SEP]"
        tokenized_input = tokenizer(sentence, return_tensors='pt')
        mask_location = list(tokenized_input['input_ids'][0]).index(mask_id)

        output = model(**tokenized_input, output_attentions=True)
        logits = F.softmax(output.logits[:,mask_location,:], dim=1).squeeze()
        my_data.at[index, "prob"] = logits[tokenizer.convert_tokens_to_ids(row["item"])].item()
        my_data.at[index, "attn"] = output[-1]


my_test(model, tokenizer, my_data)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
my_data

Unnamed: 0,number,construction,sentence,item,grammaticality,num_words,prob,attn
0,0,p_pn,The laws have done [MASK] harm.,any,0,6,0.000286,"([[tensor([[0.0348, 0.0396, 0.0720, 0.0245, 0...."
1,1,p_pn,Many of the laws have done [MASK] harm.,any,0,8,0.000899,"([[tensor([[0.0317, 0.0360, 0.0418, 0.0406, 0...."
2,2,p_pn,It is the case that the laws have done [MASK] ...,any,0,11,0.001088,"([[tensor([[0.0276, 0.0313, 0.0376, 0.0209, 0...."
3,3,p_prel,The laws which I have studied have done [MASK]...,any,0,10,0.00021,"([[tensor([[0.0303, 0.0345, 0.0628, 0.0214, 0...."
4,4,p_prel,Many of the laws which I have studied have don...,any,0,12,0.001163,"([[tensor([[0.0278, 0.0316, 0.0367, 0.0356, 0...."
...,...,...,...,...,...,...,...,...
1975,1975,n_nrel,None of the passengers which I have not have s...,any,1,13,,
1976,1976,n_nrel,It is not the case that the passengers which I...,any,1,16,,
1977,1977,n_npp,No passengers in none of these places impress ...,any,1,10,,
1978,1978,n_npp,None of the passengers in none of these places...,any,1,12,,


In [5]:
def my_get_accs(my_data):
    if "prob" not in my_data:
        print("Run test to get the model predictions (or make sure your test is working!)")

    accs = {}
    constructions = my_data["construction"].unique()

    for construct in constructions:
        restriction = my_data[my_data["construction"] == construct]
        if len(restriction) > 0:
            total_prob = sum(restriction["prob"])
            accs[construct] = total_prob / len(restriction)
        else:
            accs[construct] = 0.0

    return accs

my_accs = my_get_accs(my_data)

TypeError: unsupported operand type(s) for +: 'float' and 'NoneType'

In [None]:
df = pd.DataFrame(list(my_accs.items()), columns=['Keys', 'Values'])

import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.bar(df['Keys'], df['Values'])
plt.xlabel('Construction')
plt.ylabel('Probabilities')
plt.title('Average probability that next token is `any` for each construction')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

for construction,my_acc in my_accs.items():
  print("{}: {:.4}%".format(construction, my_acc))

# Save results to a CSV file
file_path = "results.csv"
my_data.to_csv(file_path, index=False)