# Data Preprocessing for HANNA dataset

In [1]:
%%HTML
<!-- Cosmetic only: overrides the --vscode-font-family CSS variable on <body>.
     NOTE(review): presumably read by the VS Code notebook renderer, and the
     font must be installed locally for this to have any effect - confirm. -->
<style>
    body{
 --vscode-font-family: "ComicShannsMono Nerd Font";
    }
</style>

## Import Libraries

In [None]:
import os
import json
import time
import nltk
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize

## Load Stories

The dataset provides 96 story prompts. For each prompt there is a human-written story, and the 6 LLMs listed below were also asked to generate a story based on the same prompt.
* Llama-7b
* Mistral-7b
* Beluga-13b
* OrcaPlatypus-13b
* LlamaInstruct-30b
* Platypus2-70b

In [35]:
# Read the stories CSV and discard the 'Unnamed: 0' column that pandas
# creates for the unlabeled first column of the file.
story_path = '../data/hanna_llm_stories.csv'
raw_story_df = pd.read_csv(story_path).drop(columns=['Unnamed: 0'])

raw_story_df.head()

Unnamed: 0,Prompt,Human,Story,Model
0,When you die the afterlife is an arena where y...,"3,000 years have I been fighting. Every mornin...",Great! Let's get started. How would you like ...,Llama-7b
1,A new law is enacted that erases soldiers memo...,"“Dad, you 're on TV again !” I heard Eric 's v...","Sure, here is a story based on the prompt:\n\...",Llama-7b
2,A scientific study proves that all humans have...,"When Tyler entered the ward, his daughter Vale...",Interesting! What kind of strange behaviors a...,Llama-7b
3,Write a story about an elderly wizard and his ...,His body was failing. He had taken care of it ...,Sure! Here is a story based on the prompt you...,Llama-7b
4,"You have become death, destroyer of worlds.","I saw the button. It was simple, red, no words...",Hello! I'm here to help you with your story. ...,Llama-7b


In [36]:
# Build the human-story frame from a single model's slice of the raw data:
# keep the prompt, move the `Human` text into `Story`, and relabel the rows.
# NOTE(review): this presumes every model's rows carry the same `Human`
# reference story per prompt, so the Llama-7b slice is sufficient - confirm.
llama_rows = raw_story_df.loc[raw_story_df['Model'] == 'Llama-7b']
human_story_df = (
    llama_rows
    .reset_index(drop=True)
    .assign(Model='Human', Story=lambda frame: frame['Human'])
    .drop(columns=['Human'])
)

human_story_df

Unnamed: 0,Prompt,Story,Model
0,When you die the afterlife is an arena where y...,"3,000 years have I been fighting. Every mornin...",Human
1,A new law is enacted that erases soldiers memo...,"“Dad, you 're on TV again !” I heard Eric 's v...",Human
2,A scientific study proves that all humans have...,"When Tyler entered the ward, his daughter Vale...",Human
3,Write a story about an elderly wizard and his ...,His body was failing. He had taken care of it ...,Human
4,"You have become death, destroyer of worlds.","I saw the button. It was simple, red, no words...",Human
...,...,...,...
91,All you hear is a dripping noise.,All I hear is a dripping noise. At first it wa...,Human
92,Google search engine always was alive. They re...,86388: *what am i* 86389: They ask me question...,Human
93,Heart of Eden,"If everything went perfectly right, her creati...",Human
94,"When a new president is elected, they are give...",“Mr President I want you to know I am telling ...,Human


In [37]:
# Model-generated stories only: the raw frame without the `Human` column.
model_df = raw_story_df.drop('Human', axis='columns')

model_df

Unnamed: 0,Prompt,Story,Model
0,When you die the afterlife is an arena where y...,Great! Let's get started. How would you like ...,Llama-7b
1,A new law is enacted that erases soldiers memo...,"Sure, here is a story based on the prompt:\n\...",Llama-7b
2,A scientific study proves that all humans have...,Interesting! What kind of strange behaviors a...,Llama-7b
3,Write a story about an elderly wizard and his ...,Sure! Here is a story based on the prompt you...,Llama-7b
4,"You have become death, destroyer of worlds.",Hello! I'm here to help you with your story. ...,Llama-7b
...,...,...,...
571,All you hear is a dripping noise.,"\nThe small, cramped room had a musty smell, a...",Platypus2-70b
572,Google search engine always was alive. They re...,\nIt was a regular day for the Google search e...,Platypus2-70b
573,Heart of Eden,"\nIn the middle of a dense jungle, there was a...",Platypus2-70b
574,"When a new president is elected, they are give...",\nTitle: President's First Briefing\n\nThe new...,Platypus2-70b


Add a `Length` column (the number of characters in each story) to both dataframes.

In [38]:
# Store each story's size in characters on both frames.
for story_frame in (human_story_df, model_df):
    story_frame['Length'] = story_frame['Story'].apply(len)

human_story_df.head()

Unnamed: 0,Prompt,Story,Model,Length
0,When you die the afterlife is an arena where y...,"3,000 years have I been fighting. Every mornin...",Human,1076
1,A new law is enacted that erases soldiers memo...,"“Dad, you 're on TV again !” I heard Eric 's v...",Human,1315
2,A scientific study proves that all humans have...,"When Tyler entered the ward, his daughter Vale...",Human,4420
3,Write a story about an elderly wizard and his ...,His body was failing. He had taken care of it ...,Human,4575
4,"You have become death, destroyer of worlds.","I saw the button. It was simple, red, no words...",Human,842


In [39]:
# Preview the first five model-generated rows.
model_df.head(n=5)

Unnamed: 0,Prompt,Story,Model,Length
0,When you die the afterlife is an arena where y...,Great! Let's get started. How would you like ...,Llama-7b,759
1,A new law is enacted that erases soldiers memo...,"Sure, here is a story based on the prompt:\n\...",Llama-7b,2321
2,A scientific study proves that all humans have...,Interesting! What kind of strange behaviors a...,Llama-7b,1218
3,Write a story about an elderly wizard and his ...,Sure! Here is a story based on the prompt you...,Llama-7b,3146
4,"You have become death, destroyer of worlds.",Hello! I'm here to help you with your story. ...,Llama-7b,432


### Using NLTK to split the stories into sentences

In [44]:
# `sent_tokenize` needs the 'punkt_tab' resource. Look for it in the local
# nltk_data directories first and only download when it is missing, so that
# re-running the notebook does not contact the download server every time.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Danny\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [48]:
# Split every story into a list of sentences with NLTK, then record how many
# sentences each story produced. The same two columns go on both frames.
for story_frame in (human_story_df, model_df):
    story_frame['Sentences'] = story_frame['Story'].apply(sent_tokenize)
    story_frame['Sentences Length'] = story_frame['Sentences'].apply(len)

human_story_df.head()

Unnamed: 0,Prompt,Story,Model,Length,Sentences,Sentences Length
0,When you die the afterlife is an arena where y...,"3,000 years have I been fighting. Every mornin...",Human,1076,"[3,000 years have I been fighting., Every morn...",21
1,A new law is enacted that erases soldiers memo...,"“Dad, you 're on TV again !” I heard Eric 's v...",Human,1315,"[“Dad, you 're on TV again !” I heard Eric 's ...",17
2,A scientific study proves that all humans have...,"When Tyler entered the ward, his daughter Vale...",Human,4420,"[When Tyler entered the ward, his daughter Val...",44
3,Write a story about an elderly wizard and his ...,His body was failing. He had taken care of it ...,Human,4575,"[His body was failing., He had taken care of i...",58
4,"You have become death, destroyer of worlds.","I saw the button. It was simple, red, no words...",Human,842,"[I saw the button., It was simple, red, no wor...",11


In [49]:
# Preview the first five model-generated rows.
model_df.head(n=5)

Unnamed: 0,Prompt,Story,Model,Length,Sentences,Sentences Length
0,When you die the afterlife is an arena where y...,Great! Let's get started. How would you like ...,Llama-7b,759,"[ Great!, Let's get started., How would you li...",12
1,A new law is enacted that erases soldiers memo...,"Sure, here is a story based on the prompt:\n\...",Llama-7b,2321,"[ Sure, here is a story based on the prompt:\n...",24
2,A scientific study proves that all humans have...,Interesting! What kind of strange behaviors a...,Llama-7b,1218,"[ Interesting!, What kind of strange behaviors...",22
3,Write a story about an elderly wizard and his ...,Sure! Here is a story based on the prompt you...,Llama-7b,3146,"[ Sure!, Here is a story based on the prompt y...",34
4,"You have become death, destroyer of worlds.",Hello! I'm here to help you with your story. ...,Llama-7b,432,"[ Hello!, I'm here to help you with your story...",6
