## GPT-3.5 Model

In [2]:
# Loads in the relevant packages
import pandas as pd
import numpy as np
from evaluate import load

import seaborn as sns
import matplotlib.pyplot as plt

np.random.seed(630)

In [4]:
# Loads in ROCStories data
df = pd.read_csv('../data/ROCStories.csv')

In [17]:
# Generate prompt for each row 
prompts = []
for _, row in df.iterrows():
    story = row['sentence1'] + ' ' + row['sentence2'] + ' ' + row['sentence3'] + ' ' + row['sentence4']
    prompt = "Write a concluding sentence for the following story: \'" + story
    prompts.append(prompt)

df['prompt'] = pd.Series(prompts)

df.to_csv('ROCStoriesPrompt.csv', index=False)

In [6]:
from openai import OpenAI
client = OpenAI(base_url="https://openai.vocareum.com/v1", api_key="voc-547088396116581323079566219891017280.17915057")

for _, row in df.iterrows():
    prompt = row['prompt']
    completion = client.chat.completions.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": prompt}])
    prediction = completion.choices[0].message.content
    df['generated_conclusion'] = prediction

In [None]:
# Calculates the BERT score
bertscore = load("bertscore")
BERT_results = bertscore.compute(predictions=list(df['generated_conclusion']), references=df['sentence5'], lang="en")

In [None]:
# The average BERT score results
(np.mean(BERT_results['precision']), np.mean(BERT_results['recall']), np.mean(BERT_results['f1']))

Average BERT score results:

precision: 0.8690111303031445

recall: 0.8882322731614113

f1: 0.87843736743927

In [None]:
# Calculates the METEOR score for the random selection model
meteor = load('meteor')
results = meteor.compute(predictions=list(df['generated_conclusion']), references=df['sentence5'])

In [None]:
# The average METEOR score results using the random selection model
(np.mean(results['meteor']))

Average METEOR score results:

meteor: 0.2061267535387586

In [None]:
# Calculates the BLEU score for the random selection model
bleu = load("bleu")
results = bleu.compute(predictions=list(df['generated_conclusion']), references=df['sentence5'])

In [None]:
# The average BLEU score results
(np.mean(results['bleu']))

Average BLEU score results:

bleu: 0.013144386609249357

In [None]:
# Calculates the ROUGE score
rouge = load('rouge')
results = rouge.compute(predictions=list(df['generated_conclusion']), references=df['sentence5'])

In [None]:
# The average ROUGE score
(np.mean(results['rouge1']), np.mean(results['rouge2']), np.mean(results['rougeL']), np.mean(results['rougeLsum']))

Average ROUGE score results:

rouge1: 0.1860120896453826

rouge2: 0.030390927104791972

rougeL: 0.15273339894584584

rougeLsum: 0.15275016545396286

In [32]:
# Calculates the Perplexity score
perplexity = load("perplexity", module_type="metric")
results = perplexity.compute(predictions=list(df['generated_conclusion']), references=df['sentence5'], model_id='gpt2')

In [None]:
# The average Perplexity score
results['mean_perplexity']

Average Perplexity results:

Perplexity: 48.88060186958313