In [1]:
# !pip install transformers
# !pip install torch  
# !pip install datasets  
# !hostname
# !pip install --upgrade transformers

In [8]:
import pandas as pd
from datasets import load_dataset
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import torch
from tqdm import tqdm

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import set_seed

from huggingface_hub import notebook_login

from PrepareSentenceContext import PrepareSentenceContext

# Tokenizer data used by word_tokenize / sent_tokenize.
# Newer NLTK releases look up 'punkt_tab' instead of the pickled 'punkt' models,
# so fetch both to keep the notebook runnable on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')
pd.set_option('display.max_columns', None)  # Show all columns
set_seed(42)  # fixed seed so sampled generations are repeatable

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adamvinestock/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Report the PyTorch build and whether a CUDA device can be used
for label, value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA is available:", torch.cuda.is_available()),
    ("CUDA version:", torch.version.cuda),
):
    print(label, value)

PyTorch version: 2.3.0
CUDA is available: False
CUDA version: None


In [20]:
# Authenticate this session with the Hugging Face Hub (renders the login widget).
# NOTE(review): presumably required for the gated meta-llama checkpoints loaded below — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
# Load the HuggingFace wiki_intro_long dataset
hf_wiki_dataset = load_dataset('alonkipnis/wiki-intro-long', split='train')

columns_to_drop = ['prompt_tokens', 'generated_text', 'generated_intro_len']

# Final column layout: metadata, human text, then one (text, length) pair per generator
new_order = [
    'id', 'url', 'title', 'title_len', 'prompt',
    'human_text', 'human_len',
    'gpt', 'gpt_len',
    'Llama2', 'Llama2_len',
    'Falcon', 'Falcon_len']

# Placeholder columns (filled in later) for lengths and the Llama2 / Falcon7B outputs
wiki_placeholders = ['human_len', 'gpt_len', 'Llama2', 'Llama2_len', 'Falcon', 'Falcon_len']

df_wiki = (
    pd.DataFrame(hf_wiki_dataset)
    .assign(**dict.fromkeys(wiki_placeholders))
    .rename(columns={
        'wiki_intro': 'human_text',
        'generated_intro': 'gpt'
    })
    .drop(columns=columns_to_drop)
)[new_order]

In [11]:
# Column names followed by the number of rows
print(df_wiki.columns, df_wiki.shape[0], sep='\n')

Index(['id', 'url', 'title', 'title_len', 'prompt', 'human_text', 'human_len',
       'gpt', 'gpt_len', 'Llama2', 'Llama2_len', 'Falcon', 'Falcon_len'],
      dtype='object')
39495


In [12]:
# Load the HuggingFace news dataset
hf_news_dataset = load_dataset('alonkipnis/news-chatgpt-long', split='train')

# Final column layout: metadata, human text, then one (text, length) pair per generator
new_order = [
    'id', 'highlights', 'prompt',
    'human_text', 'human_len',
    'gpt', 'gpt_len',
    'Llama2', 'Llama2_len',
    'Falcon', 'Falcon_len'
]

# Placeholder columns, all initialised to None and filled in later
news_placeholders = [
    'human_len', 'gpt_len',
    'Llama2', 'Llama2_len',
    'Falcon', 'Falcon_len',
    'prompt',
]

df_news = (
    pd.DataFrame(hf_news_dataset)
    .rename(columns={'article': 'human_text', 'chatgpt': 'gpt'})
    .assign(**dict.fromkeys(news_placeholders))
)[new_order]

In [13]:
# Column names followed by the number of rows
print(df_news.columns, df_news.shape[0], sep='\n')

Index(['id', 'highlights', 'prompt', 'human_text', 'human_len', 'gpt',
       'gpt_len', 'Llama2', 'Llama2_len', 'Falcon', 'Falcon_len'],
      dtype='object')
13025


In [14]:
# Load the HuggingFace research abstracts dataset
hf_abstracts_dataset = load_dataset('NicolaiSivesind/ChatGPT-Research-Abstracts', split='train')

# Final column layout: title/prompt, human text, then one (text, length) pair per generator
new_order = [
    'title', 'prompt',
    'human_text', 'human_len',
    'gpt', 'gpt_len',
    'Llama2', 'Llama2_len',
    'Falcon', 'Falcon_len'
]

# Placeholder columns, all initialised to None and filled in later
abstracts_placeholders = ['Llama2', 'Llama2_len', 'Falcon', 'Falcon_len', 'prompt']

df_abstracts = (
    pd.DataFrame(hf_abstracts_dataset)
    .rename(columns={
        'real_abstract': 'human_text',
        'real_word_count': 'human_len',
        'generated_abstract': 'gpt',
        'generated_word_count': 'gpt_len'
    })
    .assign(**dict.fromkeys(abstracts_placeholders))
)[new_order]

In [15]:
# Column names followed by the number of rows
print(df_abstracts.columns, df_abstracts.shape[0], sep='\n')

Index(['title', 'prompt', 'human_text', 'human_len', 'gpt', 'gpt_len',
       'Llama2', 'Llama2_len', 'Falcon', 'Falcon_len'],
      dtype='object')
10000


In [10]:
def count_words_and_sentences(text):
    """
    Count the word tokens and sentences in `text` using nltk's tokenizers.

    Returns a tuple (n_words, n_sentences).
    """
    n_words = len(word_tokenize(text))
    n_sentences = len(sent_tokenize(text))
    return n_words, n_sentences

def create_wiki_prompt(row):
    """
    Build the generation prompt for one row of the wiki dataset.

    Reads row['human_text'] (its first 7 whitespace-separated words seed the
    introduction), row['title'], and row['human_len'][1] (the sentence count
    the prompt asks for). Returns the prompt string.
    """
    opening = ' '.join(row['human_text'].split()[:7])
    fragments = [
        "Your role is a Wikipedia contributor. ",
        f"Compose a Wikipedia-style introduction for the topic '{row['title']}' that spans {row['human_len'][1]} sentences long. ",
        "Start with a clear definition, followed by context and key details that are essential for understanding the topic. ",
        f"Introduction: {opening}",
    ]
    return "".join(fragments)

def create_news_prompt(row):
    """
    Build the generation prompt for one row of the news dataset.

    Reads row['human_text'] (its first 15 whitespace-separated words seed the
    article), row['highlights'], and row['human_len'][1] (the approximate
    sentence count the prompt asks for). Returns the prompt string.
    """
    opening = ' '.join(row['human_text'].split()[:15])
    highlights = row['highlights']
    fragments = [
        "Your role is a news journalist. ",
        f"Write a news article based on the given highlights, ensure the article is detailed and spans approximately {row['human_len'][1]} sentences long. ",
        f"Incorporate the following key highlights: {highlights} ",
        f"Article: {opening}",
    ]
    return "".join(fragments)

def create_abstracts_prompt(row):
    """
    Build the generation prompt for one row of the abstracts dataset.

    Reads row['human_text'] (its first 15 whitespace-separated words seed the
    abstract), row['title'], and row['human_len'][1] (the sentence count the
    prompt asks for). Returns the prompt string.
    """
    opening = ' '.join(row['human_text'].split()[:15])
    fragments = [
        "Your role is a scientist writing a paper for publication. ",
        f"Write a concise research abstract for the paper titled '{row['title']}'. ",
        f"Ensure the abstract is detailed, clear, and spans {row['human_len'][1]} sentences long. ",
        f"Abstract: {opening}",
    ]
    return "".join(fragments)

def generate_text_gpt2xl(prompt, model, tokenizer, max_length=1024):
    """
    Generate a continuation of `prompt` with a GPT-2 model.

    The prompt is truncated to at most `max_length` tokens, and the model is
    asked for as many new tokens as still fit in the 1024-token window
    (prompt included).

    Args:
        prompt: text to continue.
        model: causal LM exposing `generate`.
        tokenizer: matching tokenizer; its pad token is set to EOS if unset.
        max_length: maximum number of prompt tokens kept after truncation.

    Returns:
        (generated_text, n_words, n_sentences), where generated_text is the
        decoded full sequence (prompt followed by the continuation).
    """
    context_window = 1024  # gpt2 is limited to 1024 tokens including the prompt

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # A single prompt needs no padding. Padding to max_length filled the whole
    # window, which made max_new_tokens 0, and flipping the attention mask
    # afterwards left it out of step with the token positions.
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=max_length)
    prompt_len = inputs['input_ids'].shape[1]
    max_new_tokens = context_window - prompt_len

    if max_new_tokens > 0:
        output_ids = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
        )
        sequence = output_ids[0]
    else:
        # The prompt already fills the window: nothing can be generated.
        sequence = inputs['input_ids'][0]

    generated_text = tokenizer.decode(sequence, skip_special_tokens=True)
    n_words, n_sentences = count_words_and_sentences(generated_text)
    return generated_text, n_words, n_sentences

def generate_text_gpt2xl_v2(prompt, model, tokenizer):
    """
    Encode `prompt` without padding and generate up to a total length of 1024 tokens.

    Returns (generated_text, n_words, n_sentences); generated_text is the
    decoded output sequence with special tokens removed.
    """
    encoded = tokenizer(prompt, return_tensors='pt', truncation=True)
    sequences = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=1024,  # Set to the maximum length of the model
        num_return_sequences=1,
    )
    text = tokenizer.decode(sequences[0], skip_special_tokens=True)
    return (text, *count_words_and_sentences(text))



def generate_text_llama2(prompt, model, tokenizer, max_length=500):
    """
    Generate a continuation of `prompt` with beam-sampled decoding.

    Args:
        prompt: text to continue.
        model: causal LM exposing `generate`; moved to CUDA when available.
        tokenizer: matching tokenizer.
        max_length: maximum number of prompt tokens kept after truncation.

    Returns:
        (generated_text, n_words, n_sentences), where generated_text is only
        the newly generated continuation (prompt removed, whitespace stripped).
    """
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=max_length)
    if torch.cuda.is_available():
        inputs = inputs.to('cuda')
        model = model.to('cuda')
    prompt_len = inputs['input_ids'].shape[1]

    # NOTE(review): no max_new_tokens / max_length is passed, so the output
    # length is governed by the model's generation config — confirm intended.
    output_ids = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        do_sample=True,       # Enable sampling to generate more diverse responses
        top_p=0.9,
        num_beams=5,
        temperature=0.7,
        num_return_sequences=1
    )

    # Drop the prompt by token count rather than by len(prompt) characters:
    # the decoded prompt need not match the input string exactly (truncation,
    # tokenizer normalisation), which would cut the text at the wrong place.
    continuation_ids = output_ids[0][prompt_len:]
    generated_text = tokenizer.decode(continuation_ids, skip_special_tokens=True).strip()
    n_words, n_sentences = count_words_and_sentences(generated_text)
    return generated_text, n_words, n_sentences


def generate_text_falcon(prompt, model, tokenizer, length_params=None):
    """
    Generate a continuation of `prompt` with beam search.

    Args:
        prompt: text to continue.
        model: causal LM exposing `generate`; moved to CUDA when available.
        tokenizer: matching tokenizer; switched to left padding, and given a
            pad token if it has none.
        length_params: dict with a "max_length" entry (total tokens, prompt
            included). Defaults to {"max_length": 1024} when omitted.

    Returns:
        (generated_text, n_words, n_sentences), where generated_text is only
        the newly generated continuation (prompt removed, whitespace stripped).
    """
    if length_params is None:
        length_params = {"max_length": 1024}

    # Adjust tokenizer padding for decoding
    tokenizer.padding_side = 'left'
    if tokenizer.pad_token is None:
        if tokenizer.eos_token:
            tokenizer.pad_token = tokenizer.eos_token
        else:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            # A newly added token needs an embedding row, otherwise its id is out of range
            model.resize_token_embeddings(len(tokenizer))

    # Encode the prompt to a fixed 128-token, left-padded tensor of input ids.
    # NOTE(review): prompts longer than 128 tokens are truncated here — confirm
    # that is acceptable for the longer news prompts.
    inputs = tokenizer(prompt, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
    if torch.cuda.is_available():
        inputs = inputs.to('cuda')
        model = model.to('cuda')
    prompt_len = inputs['input_ids'].shape[1]

    output_ids = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=length_params["max_length"],
        num_return_sequences=1,
        no_repeat_ngram_size=4,
        num_beams=5,
        early_stopping=True,
    )

    # Drop the (padded) prompt by token count rather than by len(prompt)
    # characters: a truncated prompt decodes to fewer characters than the
    # original string, so the character slice would eat generated text.
    continuation_ids = output_ids[0][prompt_len:]
    generated_text = tokenizer.decode(continuation_ids, skip_special_tokens=True).strip()
    n_words, n_sentences = count_words_and_sentences(generated_text)
    return generated_text, n_words, n_sentences

In [11]:
# populate length of human text with tuple(word_count, sentence_count)
for frame in (df_wiki, df_news, df_abstracts):
    frame['human_len'] = frame['human_text'].apply(count_words_and_sentences)

In [12]:
# create prompts (only for the first 10 rows of each dataset; the remaining rows get NaN)
prompt_builders = (
    (df_wiki, create_wiki_prompt),
    (df_news, create_news_prompt),
    (df_abstracts, create_abstracts_prompt),
)
for frame, build_prompt in prompt_builders:
    frame['prompt'] = frame.iloc[0:10].apply(build_prompt, axis=1)

In [13]:
# domain articles length stats for tuning max length generation 
def calc_sentence_stats(df, task_name):
    """
    Print the mean, median, minimum and maximum sentence count of a dataset.

    Takes the sentence count from the second element of each entry in
    df['human_len'] and prefixes every printed line with `task_name`.
    """
    counts = pd.Series([length[1] for length in df['human_len']])

    print(f"{task_name} - Average sentences: {counts.mean()}")
    print(f"{task_name} - Median sentences: {counts.median()}")
    print(f"{task_name} - Minimum sentences: {counts.min()}")
    print(f"{task_name} - Maximum sentences: {counts.max()}")

# Sentence-count summary for each domain
for frame, label in ((df_wiki, 'Wikipedia'), (df_news, 'News'), (df_abstracts, 'Abstracts')):
    calc_sentence_stats(frame, label)

Wikipedia - Average sentences: 9.162501582478795
Wikipedia - Median sentences: 9.0
Wikipedia - Minimum sentences: 1
Wikipedia - Maximum sentences: 71
News - Average sentences: 22.985335892514396
News - Median sentences: 22.0
News - Minimum sentences: 9
News - Maximum sentences: 126
Abstracts - Average sentences: 8.0332
Abstracts - Median sentences: 8.0
Abstracts - Minimum sentences: 1
Abstracts - Maximum sentences: 35


In [14]:
def get_length_params(task_type):
    """
    Return the generation length settings for a task type.

    Known task types are "wikipedia", "news" and "abstract"; any other value
    falls back to a max_length of 1024. A new dict is returned on every call.
    """
    limits_by_task = (
        ("wikipedia", 512),
        ("news", 728),  # For longer news articles
        ("abstract", 512),
    )
    max_length = 1024  # fallback for unrecognised task types
    for name, limit in limits_by_task:
        if task_type == name:
            max_length = limit
            break
    return {"max_length": max_length}

In [2]:
# # Load gpt2-xl model and tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')
# tokenizer.padding_side = 'left'  # Ensure padding from the left for gpt2
# model = GPT2LMHeadModel.from_pretrained('gpt2-xl')
# model.eval()

# for index, row in df_wiki.head(1).iterrows():  
#     prompt = row['prompt'] 
#     generated_text, word_count, sent_count = generate_text_gpt2xl_v2(prompt, model, tokenizer)
#     df_wiki.at[index, 'gpt'] = generated_text
#     df_wiki.at[index, 'gpt_len'] = [(word_count, sent_count)]

In [5]:
# Load the Llama 3.1 8B tokenizer and model from the same checkpoint id
llama31_checkpoint = "meta-llama/Meta-Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(llama31_checkpoint)
model = AutoModelForCausalLM.from_pretrained(llama31_checkpoint)

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [7]:
##### Toy example to test llama3.1 8B model and tokenizer
# Uses the tokenizer/model already loaded in the previous cell.
use_cuda = torch.cuda.is_available()
if use_cuda:
    model.cuda()
model.eval()

prompt = "".join([
    "Your role is a Wikipedia contributor. ",
    "Compose a Wikipedia-style introduction for the topic ''Moluccans''. ",
    "Start with a clear definition, followed by key details and context that is essential for understanding the subject. ",
    "Ensure the introduction is detailed and spans approximately 11 sentences long. ",
    "Introduction: Moluccans are the Austronesian-speaking and Papuan-speaking ethnic",
])

inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=4096)
if use_cuda:
    inputs = inputs.to('cuda')
    model = model.to('cuda')

# Nucleus sampling, capped at 512 tokens in total (prompt included)
sampling_options = dict(
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
    num_return_sequences=1,
    max_length=512,
)
output_ids = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    **sampling_options,
)
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
n_words, n_sentences = count_words_and_sentences(generated_text)

##### Toy example to test model and tokenizer

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [16]:
##### Toy example to test model and tokenizer
# Load llama2 7B tokenizer and model
llama2_checkpoint = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(llama2_checkpoint)
model = AutoModelForCausalLM.from_pretrained(llama2_checkpoint)
use_cuda = torch.cuda.is_available()
if use_cuda:
    model.cuda()
model.eval()

prompt = "".join([
    "Your role is a Wikipedia contributor. ",
    "Compose a Wikipedia-style introduction for the topic ''Moluccans''. ",
    "Start with a clear definition, followed by key details and context that is essential for understanding the subject. ",
    "Ensure the introduction is detailed and spans approximately 11 sentences long. ",
    "Introduction: Moluccans are the Austronesian-speaking and Papuan-speaking ethnic",
])

inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=4096)
if use_cuda:
    inputs = inputs.to('cuda')
    model = model.to('cuda')

# Beam search (5 beams) combined with nucleus sampling
sampling_options = dict(
    do_sample=True,
    top_p=0.9,
    num_beams=5,
    temperature=0.7,
    num_return_sequences=1,
)
output_ids = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    **sampling_options,
)
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
n_words, n_sentences = count_words_and_sentences(generated_text)

##### Toy example to test model and tokenizer

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Display the text decoded in the previous cell
generated_text

In [None]:
##### Toy example to test model and tokenizer
# Load falcon 7B tokenizer and model
# NOTE: this cell reuses `prompt` defined in an earlier toy-example cell.
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
if torch.cuda.is_available():
    model.cuda()
model.eval()


tokenizer.padding_side = 'left'
if tokenizer.pad_token is None:
    if tokenizer.eos_token:
        tokenizer.pad_token = tokenizer.eos_token
    else:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Encode the prompt to tensor of input ids.
# No padding for a single prompt: padding it to 4096 tokens used up the whole
# max_length budget below, leaving no room to generate anything.
inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=4096)
if torch.cuda.is_available():
    inputs = inputs.to('cuda')
    model = model.to('cuda')

# NOTE(review): max_length=4096 may exceed the sequence length Falcon-7B was
# trained with — confirm against the model card.
output_ids = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=4096,
    num_return_sequences=1,
    no_repeat_ngram_size=4,  # Prevents the model from repeating the same 4-gram
    do_sample=True,          # top_p / top_k / temperature only apply when sampling
    top_p=0.92,
    top_k=50,
    temperature=0.7,
    pad_token_id=tokenizer.pad_token_id,
)

generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
n_words, n_sentences = count_words_and_sentences(generated_text)

##### Toy example to test model and tokenizer

In [None]:
# Display the text decoded in the previous cell
generated_text

In [None]:
# Load llama2 7B tokenizer and model
llama2_checkpoint = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(llama2_checkpoint)
model = AutoModelForCausalLM.from_pretrained(llama2_checkpoint)
if torch.cuda.is_available():
    model.cuda()
model.eval()

# Generate text for the first rows of the wiki, news and abstracts datasets, in that order
n_rows = 3
llama2_runs = (
    (df_wiki, "Generating Wiki Dataset"),
    (df_news, "Generating News Dataset"),
    (df_abstracts, "Generating Abstracts Dataset"),
)
for dataset, progress_label in llama2_runs:
    for index, row in tqdm(dataset.head(n_rows).iterrows(), total=n_rows, desc=progress_label):
        prompt = row['prompt']
        generated_text, word_count, sent_count = generate_text_llama2(prompt, model, tokenizer)
        dataset.at[index, 'Llama2'] = generated_text
        dataset.at[index, 'Llama2_len'] = [(word_count, sent_count)]


In [None]:
# Load falcon 7B tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
if torch.cuda.is_available():
    model.cuda()
model.eval()

# One generation pass per dataset, each with the length settings of its task type
falcon_runs = (
    (df_wiki, "wikipedia", "Generating Wiki Dataset"),
    (df_news, "news", "Generating News Dataset"),
    (df_abstracts, "abstract", "Generating Abstracts Dataset"),
)
for dataset, task_type, progress_label in falcon_runs:
    length_params = get_length_params(task_type)
    for index, row in tqdm(dataset.head(3).iterrows(), total=3, desc=progress_label):
        prompt = row['prompt']
        # length_params is the fourth argument of generate_text_falcon; leaving it
        # out raises TypeError and discards the per-task max_length computed above.
        generated_text, word_count, sent_count = generate_text_falcon(prompt, model, tokenizer, length_params)
        dataset.at[index, 'Falcon'] = generated_text
        dataset.at[index, 'Falcon_len'] = [(word_count, sent_count)]

In [30]:
# Export to CSV for log-ppx response calculation
# (dataset, number of leading rows to export, output path)
exports = (
    (df_wiki, 10, 'src/wiki_dataset.csv'),
    (df_news, 4, 'src/news_dataset.csv'),
    (df_abstracts, 4, 'src/abstracts_dataset.csv'),
)
for dataset, n_rows, csv_path in exports:
    dataset.iloc[0:n_rows].to_csv(csv_path, index=False)