In [4]:
import pandas as pd
from datasets import load_dataset
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import torch

nltk.download('punkt')
pd.set_option('display.max_columns', None)  # Show all columns

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adamvinestock/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Load the HuggingFace wiki_intro_long dataset
hf_wiki_dataset = load_dataset('alonkipnis/wiki-intro-long', split='train')
df_wiki = pd.DataFrame(hf_wiki_dataset)

# Add columns for Llama2 and Falcon7B model outputs
df_wiki['human_len'] = None
df_wiki['gpt_len'] = None
df_wiki['Llama2'], df_wiki['Llama2_len'] = None, None
df_wiki['Falcon7B'], df_wiki['Falcon7B_len'] = None, None

df_wiki.rename(columns={
    'wiki_intro': 'human_text',
    # 'wiki_intro_len': 'human_len',
    'generated_intro': 'gpt'
    }, inplace=True)

columns_to_drop = ['prompt_tokens', 'generated_text', 'generated_intro_len']
df_wiki.drop(columns=columns_to_drop, inplace=True)

new_order = [
    'id', 'url', 'title', 'title_len', 'prompt',
    'human_text', 'human_len',
    'gpt', 'gpt_len',
    'Llama2', 'Llama2_len',
    'Falcon7B', 'Falcon7B_len']

df_wiki = df_wiki[new_order]

In [6]:
print(df_wiki.columns)
print(df_wiki.shape[0])

Index(['id', 'url', 'title', 'title_len', 'prompt', 'human_text', 'human_len',
       'gpt', 'gpt_len', 'Llama2', 'Llama2_len', 'Falcon7B', 'Falcon7B_len'],
      dtype='object')
39495


In [7]:
# Load the HuggingFace news dataset
hf_news_dataset = load_dataset('alonkipnis/news-chatgpt-long', split='train')
df_news = pd.DataFrame(hf_news_dataset)

df_news.rename(columns={
    'article': 'human_text',
    'chatgpt': 'gpt'
}, inplace=True)

df_news['human_len'], df_news['gpt_len'] = None, None
df_news['Llama2'], df_news['Llama2_len'] = None, None
df_news['Falcon7B'], df_news['Falcon7B_len'] = None, None
df_news['prompt'] = None


columns_to_drop = ['highlights']  
df_news.drop(columns=columns_to_drop, inplace=True)

new_order = [
    'id', 'prompt',
    'human_text', 'human_len',
    'gpt', 'gpt_len',
    'Llama2', 'Llama2_len',
    'Falcon7B', 'Falcon7B_len'
]

df_news = df_news[new_order]

In [8]:
print(df_news.columns)
print(df_news.shape[0])

Index(['id', 'prompt', 'human_text', 'human_len', 'gpt', 'gpt_len', 'Llama2',
       'Llama2_len', 'Falcon7B', 'Falcon7B_len'],
      dtype='object')
13025


In [9]:
# Load the HuggingFace research absracts dataset
hf_abstracts_dataset = load_dataset('NicolaiSivesind/ChatGPT-Research-Abstracts', split='train')
df_abstracts = pd.DataFrame(hf_abstracts_dataset)

df_abstracts.rename(columns={
    'real_abstract': 'human_text',
    'real_word_count': 'human_len',
    'generated_abstract': 'gpt',
    'generated_word_count': 'gpt_len'
}, inplace=True)

df_abstracts['Llama2'], df_abstracts['Llama2_len'] = None, None
df_abstracts['Falcon7B'], df_abstracts['Falcon7B_len'] = None, None
df_abstracts['prompt'] = None

new_order = [
    'title', 'prompt',
    'human_text', 'human_len',
    'gpt', 'gpt_len',
    'Llama2', 'Llama2_len',
    'Falcon7B', 'Falcon7B_len'
]

df_abstracts = df_abstracts[new_order]

In [10]:
print(df_abstracts.columns)
print(df_abstracts.shape[0])

Index(['title', 'prompt', 'human_text', 'human_len', 'gpt', 'gpt_len',
       'Llama2', 'Llama2_len', 'Falcon7B', 'Falcon7B_len'],
      dtype='object')
10000


In [11]:
def count_words_and_sentences(text):
    """
    Tokenizes the text into words and sentences using nltk 
    Returns a tuple of (n_words,n_sentences)
    """
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    return (len(words), len(sentences))

def generate_text(prompt, model, tokenizer, max_length=500):
    """
    Encodes the prompt using the model tokenizer
    Returns the generated text, word count and sentence count
    """
    
    # Set padding token to eos
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    inputs = tokenizer(prompt, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
    attention_mask = inputs['attention_mask']
    attention_mask = torch.fliplr(attention_mask)    # Adjusting the attention mask for left padding
    output_ids = model.generate(
        input_ids=inputs['input_ids'], 
        attention_mask=inputs['attention_mask'],
        max_length=max_length+100, 
        num_return_sequences=1)
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    n_words, n_sentences = count_words_and_sentences(generated_text)
    return generated_text, n_words, n_sentences

def create_wiki_prompt(row):
    """
    Creates the wiki dataset prompt using the title and first 7 words written by humans
    """
    first_few_words = ' '.join(row['human_text'].split()[:7]) 
    prompt = f"Write a Wikipedia-style intro covering the topic '{row['title']}', it should be detailed and span approximately {row['human_len'][1]} sentences long. {first_few_words}"
    return prompt

In [12]:
# populate length of human text with tuple(word_count, sentence_count)
df_wiki['human_len'] = df_wiki['human_text'].apply(count_words_and_sentences)
df_news['human_len'] = df_news['human_text'].apply(count_words_and_sentences)
df_abstracts['human_len'] = df_abstracts['human_text'].apply(count_words_and_sentences)

In [13]:
# df_news['human_len'].iloc[0:5]
# for text in df_news['human_text'].iloc[0:5]:
#     print(text)
#     print("\n---\n") 

In [14]:
df_wiki['prompt'] = df_wiki.iloc[0:10].apply(create_wiki_prompt, axis=1)

In [15]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Load gpt2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.padding_side = 'left'
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

for index, row in df_wiki.head(2).iterrows():
    prompt = row['prompt']
    # generated_text, (word_count, sentence_count) = generate_text(prompt, model, tokenizer, max_length=500)
    generated_text, word_count, sent_count = generate_text(prompt, model, tokenizer, max_length=500)
    df_wiki.at[index, 'gpt'] = generated_text
    df_wiki.at[index, 'gpt_len'] = [(word_count,sent_count)]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [16]:

# Export to CSV for external analysis
df_wiki.iloc[0:2].to_csv('generated_text_comparison.csv', index=False)

