In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# calculate surprisal for each column

In [15]:
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.eval()



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [16]:
def calculate_surprisal(sentence):
    inputs = tokenizer(sentence, return_tensors='pt')
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        logits = outputs.logits

    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    surprisal = -torch.sum(log_probs * torch.nn.functional.one_hot(input_ids, num_classes=logits.size(-1)).float())
    
    return surprisal.item()

In [None]:
df=pd.read_csv('data_preprocessing.csv')

In [None]:
df

In [None]:
df.columns

In [None]:
columns=['ending0', 'ending1', 'ending2', 'ending3']

for index,row in df.iterrows():
    for col in columns:
        sentence = row[col]
        surprisal = calculate_surprisal(sentence)
        df.at[index,col] = surprisal

In [None]:
df

# let GPT2 selects the best choice

In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import re

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [4]:
#GPT2 model:
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
model.eval()

#dataset
df = pd.read_csv('data_preprocessing.csv')



In [5]:
df

Unnamed: 0,startphrase,ending0,ending1,ending2,ending3,label
0,The man plays continuously around on the drum ...,continues to talk as he begins pressing the gu...,continues playing the piano while the boy cont...,finishes playing the drum and pauses before th...,parts the flip and covers the equipment he made.,2
1,Another man on the field catches the ball. The...,swimming in him and cheers him on.,makes several pictures and throws a ball.,grabs a mallet and throws the ball.,is falling near the pool in attempt.,2
2,"Someone sits across from him, looking small an...",reaches out and takes someone's hand.,is soaked through very bare curtains.,puts his leg around someone's neck.,is quiet - eyed with woman.,0
3,"Someone sits across from him, looking small an...",is soaked through very bare curtains.,puts his leg around someone's neck.,"stands, ready to usher someone out.",is quiet - eyed with woman.,2
4,Someone's gaze is steady. His high white shirt...,are high and warm.,hangs on his wrists.,drops out of view.,is shaven and bald.,0
...,...,...,...,...,...,...
294,A woman is standing inside her kitchen. She,applies soap and shaving cream to her face.,is talking to the camera in the kitchen.,shows how to wrap a gift around present.,starts talking to the camera in the bathroom.,1
295,An arrow goes through the hole of a donut. An ...,pulls a hole on the ground.,is pulled out with a knife.,cuts the string holding an apple.,is attached to a released point.,2
296,Another child moves around behind her. More ch...,moves around and fourth.,hops to the side.,laying down looking down.,proceeds to smile slightly.,3
297,The woman exercise on front the ocean spinning...,finish watching the other woman perform.,spins the hoop with her hand.,lifts the weight and spins around.,"begins riding a heavy, elliptical bike.",1


In [6]:
#gpt2's preference for a sentence:
def get_gpt2_preference(startphrase, endings):
    prompt = f"Start phrase: {startphrase}\nOptions:\n"
    for i, ending in enumerate(endings):
        prompt += f"{i + 1}. {ending}\n"
    prompt += "The best ending is option number:"

    inputs = tokenizer(prompt, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=inputs['input_ids'].shape[1] + 20, num_return_sequences=1)
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    match = re.search(r'option number:\s*(\d+)', generated_text)
    if match:
        best_option = int(match.group(1)) - 1
    else:
        best_option = 0 

    return endings[best_option]

In [7]:
best_sentences = []

for _, row in df.iterrows():
    startphrase = row['startphrase']
    endings = [row['ending0'], row['ending1'], row['ending2'], row['ending3']]
    best_sentence = get_gpt2_preference(startphrase, endings)
    best_sentences.append(best_sentence)

df['best_sentence'] = best_sentences

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [8]:
df

Unnamed: 0,startphrase,ending0,ending1,ending2,ending3,label,best_sentence
0,The man plays continuously around on the drum ...,continues to talk as he begins pressing the gu...,continues playing the piano while the boy cont...,finishes playing the drum and pauses before th...,parts the flip and covers the equipment he made.,2,continues to talk as he begins pressing the gu...
1,Another man on the field catches the ball. The...,swimming in him and cheers him on.,makes several pictures and throws a ball.,grabs a mallet and throws the ball.,is falling near the pool in attempt.,2,swimming in him and cheers him on.
2,"Someone sits across from him, looking small an...",reaches out and takes someone's hand.,is soaked through very bare curtains.,puts his leg around someone's neck.,is quiet - eyed with woman.,0,reaches out and takes someone's hand.
3,"Someone sits across from him, looking small an...",is soaked through very bare curtains.,puts his leg around someone's neck.,"stands, ready to usher someone out.",is quiet - eyed with woman.,2,is soaked through very bare curtains.
4,Someone's gaze is steady. His high white shirt...,are high and warm.,hangs on his wrists.,drops out of view.,is shaven and bald.,0,are high and warm.
...,...,...,...,...,...,...,...
294,A woman is standing inside her kitchen. She,applies soap and shaving cream to her face.,is talking to the camera in the kitchen.,shows how to wrap a gift around present.,starts talking to the camera in the bathroom.,1,applies soap and shaving cream to her face.
295,An arrow goes through the hole of a donut. An ...,pulls a hole on the ground.,is pulled out with a knife.,cuts the string holding an apple.,is attached to a released point.,2,pulls a hole on the ground.
296,Another child moves around behind her. More ch...,moves around and fourth.,hops to the side.,laying down looking down.,proceeds to smile slightly.,3,moves around and fourth.
297,The woman exercise on front the ocean spinning...,finish watching the other woman perform.,spins the hoop with her hand.,lifts the weight and spins around.,"begins riding a heavy, elliptical bike.",1,finish watching the other woman perform.


In [9]:
#df.to_csv('GPT2.csv', index=False)

In [10]:
#calculate surprisal
df_2=pd.read_csv('GPT2.csv')

In [11]:
df_2.columns

Index(['startphrase', 'ending0', 'ending1', 'ending2', 'ending3', 'label',
       'best_sentence'],
      dtype='object')

In [20]:
columns=['ending0', 'ending1', 'ending2', 'ending3','best_sentence']

for index,row in df_2.iterrows():
    for col in columns:
        sentence = row[col]
        surprisal = calculate_surprisal(sentence)
        df_2.at[index,col] = surprisal

In [21]:
df_2

Unnamed: 0,startphrase,ending0,ending1,ending2,ending3,label,best_sentence
0,The man plays continuously around on the drum ...,106.062126,105.51252,96.884735,90.222755,2,106.062126
1,Another man on the field catches the ball. The...,86.585938,79.253586,99.089836,70.123375,2,86.585938
2,"Someone sits across from him, looking small an...",85.449265,55.319748,85.888588,61.42556,0,85.449265
3,"Someone sits across from him, looking small an...",55.319748,85.888588,87.065315,61.42556,2,55.319748
4,Someone's gaze is steady. His high white shirt...,42.692604,53.533142,48.238132,56.046124,0,42.692604
...,...,...,...,...,...,...,...
294,A woman is standing inside her kitchen. She,86.396469,75.682159,77.956757,84.997215,1,86.396469
295,An arrow goes through the hole of a donut. An ...,74.971596,63.508228,62.587616,57.772198,2,74.971596
296,Another child moves around behind her. More ch...,50.764687,44.882004,57.364052,64.445938,3,50.764687
297,The woman exercise on front the ocean spinning...,70.271095,72.533035,68.238945,108.430801,1,70.271095


In [22]:
df_2.to_csv('GPT2.csv', index=False)

In [None]:
#compare the accuracy: surprisal one with the model's selection one