## Random Generator Model Baseline

In [39]:
# Loads in the relevant packages
import pandas as pd
import numpy as np
from evaluate import load

In [38]:
# Merges the 2016 and 2017 ROCStories data
df_2016 = pd.read_csv('./ROCStories__spring2016 - ROCStories_spring2016.csv')
df_2017 = pd.read_csv('ROCStories_winter2017 - ROCStories_winter2017.csv')
df = pd.concat([df_2016, df_2017])

In [29]:
# Creates a database of 5th sentences for the model to randomly select
fifth_sentence_database = df['sentence5']

In [41]:
# Obtains the predictions and references for each instance in the data
predictions = []
references = []

for index, instance in df.iterrows():
    references.append(instance['sentence5'])
    predictions.append(np.random.choice(fifth_sentence_database))
    break

In [44]:
# Calculates the BERT score for the random selection model
bertscore = load("bertscore")
results = bertscore.compute(predictions=predictions, references=references, lang="en")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
# The BERT score results using the random selection model
results

{'precision': [0.8666972517967224],
 'recall': [0.8749483823776245],
 'f1': [0.8708033561706543],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.34.1)'}

In [47]:
# Calculates the METEOR score for the random selection model
meteor = load('meteor')
results = meteor.compute(predictions=predictions, references=references)

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/divyasanthanam/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/divyasanthanam/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/divyasanthanam/nltk_data...


In [48]:
# The METEOR score results using the random selection model
results

{'meteor': 0.1648351648351648}

In [50]:
# Calculates the BLEU score for the random selection model
bleu = load("bleu")
results = bleu.compute(predictions=predictions, references=references)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [51]:
# The BLEU score results using the random selection model
results

{'bleu': 0.0,
 'precisions': [0.3, 0.0, 0.0, 0.0],
 'brevity_penalty': 1.0,
 'length_ratio': 1.1111111111111112,
 'translation_length': 10,
 'reference_length': 9}

In [57]:
# Calculates the ROUGE score for the random selection model
rouge = load('rouge')
results = rouge.compute(predictions=predictions, references=references)

In [58]:
# The ROUGE score results using the random selection model
results

{'rouge1': 0.23529411764705882,
 'rouge2': 0.0,
 'rougeL': 0.23529411764705882,
 'rougeLsum': 0.23529411764705882}

In [59]:
# Calculates the Perplexity score for the predictions from the random selection model
perplexity = load("perplexity", module_type="metric")
results = perplexity.compute(predictions=predictions, model_id='gpt2')

Downloading builder script:   0%|          | 0.00/8.46k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [60]:
# The Perplexity score results using the predictions from the random selection model
results

{'perplexities': [141.1681365966797], 'mean_perplexity': 141.1681365966797}