In [1]:
from google.colab import files
uploaded = files.upload()

Saving hanna_stories_annotations.csv to hanna_stories_annotations.csv


# Total number of Tokens for each model


In [6]:
import pandas as pd
import re

df = pd.read_csv('/content/hanna_stories_annotations.csv')

def simple_tokenize(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    return tokens

results = []

for model, group in df.groupby('Model'):
    all_tokens = []

    for story in group['Story']:
        tokens = simple_tokenize(story)
        all_tokens.extend(tokens)

    total_tokens = len(all_tokens)
    unique_terms = len(set(all_tokens))

    results.append({
        'Topic (Model)': model,
        'Tokens': total_tokens,
        'Unique terms': unique_terms
    })

table_df = pd.DataFrame(results)
table_df = table_df.sort_values(by='Topic (Model)').reset_index(drop=True)

table_df


Unnamed: 0,Topic (Model),Tokens,Unique terms
0,BertGeneration,59142,3015
1,CTRL,60051,2884
2,Fusion,34755,1162
3,GPT,55344,3052
4,GPT-2,93321,4028
5,GPT-2 (tag),87069,3593
6,HINT,25356,786
7,Human,139386,6740
8,RoBERTa,56280,3123
9,TD-VAE,90846,5268


# Here is we will get #tokens on average

In [7]:
import pandas as pd
import re

df = pd.read_csv('/content/hanna_stories_annotations.csv')

def simple_tokenize(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()  # split by spaces to get words
    return tokens

results = []

for model, group in df.groupby('Model'):
    all_tokens = []
    num_stories = len(group)

    for story in group['Story']:
        tokens = simple_tokenize(story)
        all_tokens.extend(tokens)

    total_tokens = len(all_tokens)
    unique_terms = len(set(all_tokens))
    avg_tokens = total_tokens / num_stories

    results.append({
        'Topic (Model)': model,
        'Tokens': total_tokens,
        'Unique terms': unique_terms,
        'Avg Tokens per Story': avg_tokens
    })

table_df = pd.DataFrame(results)
table_df = table_df.sort_values(by='Topic (Model)').reset_index(drop=True)

table_df


Unnamed: 0,Topic (Model),Tokens,Unique terms,Avg Tokens per Story
0,BertGeneration,59142,3015,205.354167
1,CTRL,60051,2884,208.510417
2,Fusion,34755,1162,120.677083
3,GPT,55344,3052,192.166667
4,GPT-2,93321,4028,324.03125
5,GPT-2 (tag),87069,3593,302.322917
6,HINT,25356,786,88.041667
7,Human,139386,6740,483.979167
8,RoBERTa,56280,3123,195.416667
9,TD-VAE,90846,5268,315.4375


# Avg tokens for AI models

In [8]:
import pandas as pd

df = pd.read_csv('/content/hanna_stories_annotations.csv')

def count_tokens(text):
    return len(str(text).split())

df['Token Count'] = df['Story'].apply(count_tokens)

df_filtered = df[df['Model'] != 'Human']

average_token_count = df_filtered['Token Count'].mean()

print(f"The average token count for all models excluding 'Human' is: {average_token_count}")


The average token count for all models excluding 'Human' is: 228.740625


# #stories for each type

In [11]:
df_unique = df.drop_duplicates(subset=['Story'])

unique_stories_per_model = df_unique.groupby('Model')['Story'].count().reset_index()

print(unique_stories_per_model)

             Model  Story
0   BertGeneration     96
1             CTRL     96
2           Fusion     96
3              GPT     96
4            GPT-2     96
5      GPT-2 (tag)     96
6             HINT     96
7            Human     96
8          RoBERTa     96
9           TD-VAE     96
10           XLNet     96
