In [1]:
# !pip install transformers
# !pip install torch  
# !pip install datasets  
# !hostname
# !pip install --upgrade transformers
# !pip install spacy
# !pip install ipywidgets
# !pip install --upgrade transformers "numpy<2" "pyarrow<14"

In [2]:
####
# This notebook consists of two main parts:
#   1. Loading the datasets, populating article lengths, creating the prompts and saving each domain to src/{domain}_dataset.csv
#   2. Generating articles for each model in batches and saving results incrementally to src/{domain}_dataset_generated.csv
###

In [3]:
# !nvcc --version
!nvidia-smi

Sat Jan  4 16:35:38 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off |   00000001:00:00.0 Off |                    0 |
| N/A   30C    P0             41W /  300W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import torch
import gc
from tqdm import tqdm
import re
import os

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import set_seed

from huggingface_hub import notebook_login

from PrepareSentenceContext import PrepareSentenceContext

# Download the nltk 'punkt' tokenizer data (needed by word_tokenize / sent_tokenize)
nltk.download('punkt')
pd.set_option('display.max_columns', None)  # Show all columns
# Seed the random number generators through transformers.set_seed
set_seed(42)

[nltk_data] Downloading package punkt to /home/akp3user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Report the installed PyTorch build and whether a CUDA device is usable
for label, value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA is available:", torch.cuda.is_available()),
    ("CUDA version:", torch.version.cuda),
):
    print(label, value)

PyTorch version: 2.5.1+cu124
CUDA is available: True
CUDA version: 12.4


In [22]:
# # notebook_login()
# from huggingface_hub import login
# login("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

In [23]:
##############
### Part 1 ###
##############

In [24]:
# Load the HuggingFace wiki-intro-long dataset (train split)
hf_wiki_dataset = load_dataset('alonkipnis/wiki-intro-long', split='train')

# Source columns that are not used by this notebook
columns_to_drop = ['prompt_tokens', 'generated_text', 'generated_intro_len']

# Final column layout of the wiki frame
new_order = [
    'id', 'url', 'title', 'title_len', 'prompt',
    'human_text', 'human_len',
    'gpt', 'gpt_len',
    'Llama3.1', 'Llama3.1_len',
    'Falcon', 'Falcon_len']

# Rename the text columns, drop the unused ones and add empty placeholder columns
# for the length tuples and for the Llama 3.1 / Falcon outputs (filled in later)
df_wiki = (
    pd.DataFrame(hf_wiki_dataset)
    .rename(columns={'wiki_intro': 'human_text', 'generated_intro': 'gpt'})
    .drop(columns=columns_to_drop)
    .assign(**{
        column: None
        for column in ['human_len', 'gpt_len', 'Llama3.1', 'Llama3.1_len', 'Falcon', 'Falcon_len']
    })
)[new_order]
print(df_wiki.columns)
print(df_wiki.shape[0])

Index(['id', 'url', 'title', 'title_len', 'prompt', 'human_text', 'human_len',
       'gpt', 'gpt_len', 'Llama3.1', 'Llama3.1_len', 'Falcon', 'Falcon_len'],
      dtype='object')
39495


In [25]:
# Load the HuggingFace news dataset (train split)
hf_news_dataset = load_dataset('alonkipnis/news-chatgpt-long', split='train')

# Final column layout of the news frame
new_order = [
    'id', 'highlights', 'prompt',
    'human_text', 'human_len',
    'gpt', 'gpt_len',
    'Llama3.1', 'Llama3.1_len',
    'Falcon', 'Falcon_len'
]

# Rename the text columns and add empty placeholder columns for lengths,
# model outputs and the prompt (all filled in later)
df_news = (
    pd.DataFrame(hf_news_dataset)
    .rename(columns={'article': 'human_text', 'chatgpt': 'gpt'})
    .assign(**{
        column: None
        for column in [
            'human_len', 'gpt_len',
            'Llama3.1', 'Llama3.1_len',
            'Falcon', 'Falcon_len',
            'prompt',
        ]
    })
)[new_order]
print(df_news.columns)
print(df_news.shape[0])

Index(['id', 'highlights', 'prompt', 'human_text', 'human_len', 'gpt',
       'gpt_len', 'Llama3.1', 'Llama3.1_len', 'Falcon', 'Falcon_len'],
      dtype='object')
13025


In [26]:
# Load the HuggingFace research abstracts dataset (train split)
hf_abstracts_dataset = load_dataset('NicolaiSivesind/ChatGPT-Research-Abstracts', split='train')

# Final column layout of the abstracts frame
new_order = [
    'title', 'prompt',
    'human_text', 'human_len',
    'gpt', 'gpt_len',
    'Llama3.1', 'Llama3.1_len',
    'Falcon', 'Falcon_len'
]

# The dataset ships its own word counts; they are renamed into the *_len columns here.
# Model output columns and the prompt start out empty and are filled in later.
df_abstracts = (
    pd.DataFrame(hf_abstracts_dataset)
    .rename(columns={
        'real_abstract': 'human_text',
        'real_word_count': 'human_len',
        'generated_abstract': 'gpt',
        'generated_word_count': 'gpt_len',
    })
    .assign(**{
        column: None
        for column in ['Llama3.1', 'Llama3.1_len', 'Falcon', 'Falcon_len', 'prompt']
    })
)[new_order]
print(df_abstracts.columns)
print(df_abstracts.shape[0])

Index(['title', 'prompt', 'human_text', 'human_len', 'gpt', 'gpt_len',
       'Llama3.1', 'Llama3.1_len', 'Falcon', 'Falcon_len'],
      dtype='object')
10000


In [5]:
# Defining methods for word/sentence count, prompt generation, model generation 

def count_words_and_sentences(text):
    """
    Count the word tokens and the sentences of a single text using nltk.

    Returns the pair (n_words, n_sentences).
    """
    n_words = len(word_tokenize(text))
    n_sentences = len(sent_tokenize(text))
    return n_words, n_sentences

def batch_count_words_and_sentences(texts):
    """
    Count word tokens and sentences (nltk) for every text in an iterable.

    Returns a list holding one (n_words, n_sentences) tuple per input text,
    in the same order as the input.
    """
    counts = []
    for text in texts:
        counts.append((len(word_tokenize(text)), len(sent_tokenize(text))))
    return counts

def create_wiki_prompt(row):
    """
    Build the Wikipedia generation prompt for one dataset row.

    Reads row['title'], row['human_text'] and row['human_len'] (indexed as
    [0] = word count, [1] = sentence count). The prompt ends with the first
    7 whitespace-separated words of the human text followed by a space, so
    the model continues the human opening.

    Returns the prompt string.
    """
    first_few_words = ' '.join(row['human_text'].split()[:7])
    target_words, target_sentences = row['human_len'][0], row['human_len'][1]
    prompt = (
        f"### | Instruction | ###\n"
        f"Your role is a Wikipedia contributor.\n"
        f"Your task is to compose a detailed Wikipedia-style introduction for the topic '{row['title']}'.\n"
        # This line previously lacked its trailing newline, which glued it to the
        # next instruction ("sentences.Begin with ...").
        f"The introduction should be approximately {target_words} words and {target_sentences} sentences.\n"
        f"Begin with a clear definition, followed by key details and context essential for understanding the topic.\n"
        f"Provide a clean, standalone response without extra instructions or editorial comments.\n"
        f"### | Response | ###\n"
        f"{first_few_words} "
    )
    return prompt

def create_news_prompt(row):
    """
    Build the news-article generation prompt for one dataset row.

    Reads row['highlights'], row['human_text'] and row['human_len'] (indexed as
    [0] = word count, [1] = sentence count). The prompt ends with the first
    15 whitespace-separated words of the human article followed by a space.

    Returns the prompt string.
    """
    opening_words = ' '.join(row['human_text'].split()[:15])
    highlights = row['highlights']
    # One entry per prompt line; joined with newlines below
    prompt_lines = [
        "### | Instruction | ###",
        "Your role is a professional news journalist.",
        "Your task is to write a detailed news article based on the given highlights.",
        f"The article should be at least {row['human_len'][0]} words and {row['human_len'][1]} sentences.",
        f"Incorporate the following key highlights: {highlights}",
        "Begin with the most important details, followed by context and supporting information.",
        "Provide a clean, standalone response without extra instructions or editorial comments.",
        "### | Response | ###",
        f"{opening_words} ",
    ]
    return "\n".join(prompt_lines)

def create_abstracts_prompt(row):
    """
    Build the research-abstract generation prompt for one dataset row.

    Reads row['title'], row['human_text'] and row['human_len'] (indexed as
    [0] = word count, [1] = sentence count). The prompt ends with the first
    15 whitespace-separated words of the human abstract followed by a space.

    Returns the prompt string.
    """
    template = (
        "### | Instruction | ###\n"
        "Your role is a scientist writing a paper titled '{title}'.\n"
        "Your task is to write a concise, detailed abstract for this paper.\n"
        "The abstract should be at least {n_words} words and {n_sentences} sentences.\n"
        "Ensure the abstract is comprehensive and provides sufficient detail to reach the specified word and sentence counts.\n"
        "Begin with a clear statement of the research question or problem, followed by methodology, findings and implications.\n"
        "Provide a clean, standalone response without extra instructions or editorial comments.\n"
        "### | Response | ###\n"
        "{opening} "
    )
    opening_words = ' '.join(row['human_text'].split()[:15])
    return template.format(
        title=row['title'],
        n_words=row['human_len'][0],
        n_sentences=row['human_len'][1],
        opening=opening_words,
    )

def get_length_params(task_type):
    """
    Return the generation length setting for a task as {"max_length": value}.

    "wikipedia" and "abstract" get 512; "news" (longer articles) and any
    other task type get 1024.
    """
    short_tasks = ("wikipedia", "abstract")
    length_limit = 512 if task_type in short_tasks else 1024
    return {"max_length": length_limit}

def generate_text_gpt2xl_v2(prompt, model, tokenizer):
    """
    Generate a continuation of `prompt` with the given model and tokenizer.

    Returns (generated_text, n_words, n_sentences), where the counts are
    computed on the full decoded sequence.
    """
    encoded = tokenizer(prompt, return_tensors='pt', truncation=True)
    generation_kwargs = {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'max_length': 1024,  # described in the original as the model's maximum length
        'num_return_sequences': 1,
    }
    sequences = model.generate(**generation_kwargs)
    generated_text = tokenizer.decode(sequences[0], skip_special_tokens=True)
    word_count, sentence_count = count_words_and_sentences(generated_text)
    return generated_text, word_count, sentence_count


def generate_text_llama2(prompt, model, tokenizer, max_length=500):
    """
    Generate text for a single prompt using sampling combined with 5 beams.

    `max_length` only caps the tokenized prompt (truncation); no generation
    length is passed to `model.generate` here.

    The decoded text is cut so that it starts at the earliest occurrence of
    "Introduction:", "Article:" or "Abstract:" (the marker itself is kept);
    if none is present the full text is used.

    Returns (generated_text, n_words, n_sentences).
    """
    encoded = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=max_length)
    if torch.cuda.is_available():
        encoded = encoded.to('cuda')
        model = model.to('cuda')

    sampling_kwargs = dict(
        do_sample=True,  # sample instead of picking tokens deterministically
        top_p=0.9,
        num_beams=5,
        temperature=0.7,
        num_return_sequences=1,
    )
    sequences = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        **sampling_kwargs,
    )
    generated_text = tokenizer.decode(sequences[0], skip_special_tokens=True)

    # Earliest position of any section marker; 0 when no marker is present
    marker_positions = [generated_text.find(marker) for marker in ("Introduction:", "Article:", "Abstract:")]
    found_positions = [position for position in marker_positions if position != -1]
    start_index = min(found_positions) if found_positions else 0

    generated_text = generated_text[start_index:].strip()
    word_count, sentence_count = count_words_and_sentences(generated_text)
    return generated_text, word_count, sentence_count

def generate_text_llama3(prompt, model, tokenizer, length_params):
    """
    Generate a response for a single prompt with deterministic beam search.

    The prompt is truncated to 512 tokens and up to length_params["max_length"]
    new tokens are generated. Everything after the "### | Response | ###"
    delimiter in the decoded text is returned; if the delimiter is missing a
    message is printed and the whole decoded text is used.

    Returns (response_text, n_words, n_sentences).
    """
    encoded = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=512)
    if torch.cuda.is_available():
        encoded = encoded.to('cuda')
        model = model.to('cuda')

    beam_search_kwargs = dict(
        num_return_sequences=1,
        max_new_tokens=length_params["max_length"],
        do_sample=False,  # no sampling: decoding is deterministic beam search
        num_beams=5,  # original note: cannot run with a V100 16GB
        no_repeat_ngram_size=5,
        repetition_penalty=1.5,
        length_penalty=0.9,
        early_stopping=True,
    )
    sequences = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        **beam_search_kwargs,
    )

    generated_text = tokenizer.decode(sequences[0], skip_special_tokens=True)
    delimiter_match = re.search(r"### \| Response \| ###\s*(.*)", generated_text, re.DOTALL)
    if delimiter_match is None:
        print("Response delimiter not found in generated text!")
        response_text = generated_text
    else:
        response_text = delimiter_match.group(1).strip()
    word_count, sentence_count = count_words_and_sentences(response_text)
    return response_text, word_count, sentence_count

def generate_text_falcon(prompt, model, tokenizer, length_params):
    """
    Generate a response for a single prompt with deterministic beam search.

    Side effect: sets the tokenizer to left padding and, if it has no pad
    token, assigns the EOS token (or '[PAD]' when there is no EOS token).

    The prompt is truncated to 512 tokens. length_params["max_length"] is
    passed as `max_length` to `model.generate`. Everything after the
    "### | Response | ###" delimiter is returned; if the delimiter is missing
    a message is printed and the whole decoded text is used.

    Returns (response_text, n_words, n_sentences).
    """
    # Left padding and a defined pad token are configured before encoding
    tokenizer.padding_side = 'left'
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token or '[PAD]'

    encoded = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=512)
    if torch.cuda.is_available():
        encoded = encoded.to('cuda')
        model = model.to('cuda')

    beam_search_kwargs = dict(
        max_length=length_params["max_length"],
        num_return_sequences=1,
        no_repeat_ngram_size=4,
        num_beams=5,
        early_stopping=True,
        do_sample=False,
    )
    sequences = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        **beam_search_kwargs,
    )

    generated_text = tokenizer.decode(sequences[0], skip_special_tokens=True)
    delimiter_match = re.search(r"### \| Response \| ###\s*(.*)", generated_text, re.DOTALL)
    if delimiter_match is None:
        print("Response delimiter not found in generated text!")
        response_text = generated_text
    else:
        response_text = delimiter_match.group(1).strip()
    word_count, sentence_count = count_words_and_sentences(response_text)
    return response_text, word_count, sentence_count

def generate_text_llama3_batch(prompts, model, tokenizer, length_params):
    """
    Generate responses for a batch of prompts with deterministic beam search.

    Prompts are padded to a common length and truncated to 512 tokens; up to
    length_params["max_length"] new tokens are generated per prompt. For each
    decoded sequence the text after the "### | Response | ###" delimiter is
    kept; when the delimiter is missing a message is printed and the stripped
    full text is kept instead.

    Returns (response_texts, lengths): a list of response strings and a list
    of (n_words, n_sentences) tuples, one per prompt.
    """
    encoded = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    if torch.cuda.is_available():
        encoded = {name: tensor.to('cuda') for name, tensor in encoded.items()}

    # Beam search without sampling; temperature/top_p are explicitly passed as None
    beam_search_kwargs = dict(
        num_return_sequences=1,
        max_new_tokens=length_params["max_length"],
        do_sample=False,
        num_beams=5,
        no_repeat_ngram_size=5,
        repetition_penalty=1.5,
        length_penalty=0.9,
        early_stopping=True,
        temperature=None,
        top_p=None,
    )
    sequences = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        **beam_search_kwargs,
    )
    decoded_texts = tokenizer.batch_decode(sequences, skip_special_tokens=True)

    # Keep only the part that follows the response delimiter
    response_texts = []
    for decoded in decoded_texts:
        delimiter_match = re.search(r"### \| Response \| ###\s*(.*)", decoded, re.DOTALL)
        if delimiter_match is None:
            print("Response delimiter not found in generated text!")
            response_texts.append(decoded.strip())
        else:
            response_texts.append(delimiter_match.group(1).strip())
    lengths = batch_count_words_and_sentences(response_texts)
    return response_texts, lengths

def generate_text_falcon_batch(prompts, model, tokenizer, length_params):
    """
    Generate responses for a batch of prompts with deterministic beam search.

    Prompts are padded to a common length and truncated to 512 tokens.
    length_params["max_length"] is passed as `max_length` (not
    `max_new_tokens`) to `model.generate`. For each decoded sequence the text
    after the "### | Response | ###" delimiter is kept; when the delimiter is
    missing a message is printed and the stripped full text is kept instead.

    Returns (response_texts, lengths): a list of response strings and a list
    of (n_words, n_sentences) tuples, one per prompt.
    """
    encoded = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    if torch.cuda.is_available():
        encoded = {name: tensor.to('cuda') for name, tensor in encoded.items()}

    beam_search_kwargs = dict(
        # NOTE(review): in transformers, max_length bounds prompt + generated tokens,
        # unlike the max_new_tokens used by the Llama helper - confirm this is intended
        max_length=length_params["max_length"],
        num_return_sequences=1,
        no_repeat_ngram_size=4,
        num_beams=5,
        early_stopping=True,
        do_sample=False,
    )
    sequences = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        **beam_search_kwargs,
    )
    decoded_texts = tokenizer.batch_decode(sequences, skip_special_tokens=True)

    # Keep only the part that follows the response delimiter
    response_texts = []
    for decoded in decoded_texts:
        delimiter_match = re.search(r"### \| Response \| ###\s*(.*)", decoded, re.DOTALL)
        if delimiter_match is None:
            print("Response delimiter not found in generated text!")
            response_texts.append(decoded.strip())
        else:
            response_texts.append(delimiter_match.group(1).strip())

    lengths = batch_count_words_and_sentences(response_texts)
    return response_texts, lengths


def process_dataset_in_batches(df, model, tokenizer, batch_size, length_params, model_name, generate_method, output_path, checkpoint_path, target_rows):
    """
    Generate text for up to `target_rows` rows in batches, saving after every batch.

    The output CSV at `output_path` is created from `df` if it does not exist
    and is then the source of truth for resuming: rows whose `model_name`
    cell is already filled are skipped. Generated text goes to the
    `model_name` column and str((n_words, n_sentences)) to `<model_name>_len`.
    After each batch the whole output frame is rewritten to `output_path` and
    the running processed count is written to `checkpoint_path` (that file is
    only written here, never read).

    `generate_method(prompts, model, tokenizer, length_params)` must return
    (texts, lengths). Prompts are read from df['prompt'] using row labels
    taken from the output file, so `df` and the output file are assumed to
    share the same row order - TODO confirm for externally edited files.

    Returns None.
    """
    # Seed the output file with the input frame on the first run
    if not os.path.exists(output_path):
        df.to_csv(output_path, index=False, encoding="utf-8-sig")
    output_df = pd.read_csv(output_path)

    # Make sure the text and length columns exist and are object-typed
    length_column = f"{model_name}_len"
    for column in (model_name, length_column):
        if column not in output_df.columns:
            output_df[column] = None
        output_df[column] = output_df[column].astype(object)

    # Count how many of the first `target_rows` rows already hold generated text
    leading_labels = output_df.index[:target_rows]
    processed_count = output_df.loc[leading_labels, model_name].notna().sum()
    print(f"{processed_count} out of {target_rows} rows already generated for {model_name}.")

    remaining_to_target = max(0, target_rows - processed_count)
    if remaining_to_target == 0:
        print(f"Target of {target_rows} rows already reached for {model_name}. Skipping further processing.")
        return

    # Labels of the rows still missing output, capped at what is needed to reach the target
    rows_to_generate = output_df[output_df[model_name].isna()].index[:remaining_to_target]

    # Number of batches, rounded up, for the progress bar
    total_batches = (len(rows_to_generate) + batch_size - 1) // batch_size

    with tqdm(total=total_batches, desc=f"Generating {model_name}", unit="batch") as pbar:
        for batch_start in range(0, len(rows_to_generate), batch_size):
            batch_indices = rows_to_generate[batch_start:batch_start + batch_size]
            prompts = df.loc[batch_indices, 'prompt'].tolist()

            generated_texts, lengths = generate_method(prompts, model, tokenizer, length_params)

            # Write each result back to its row in the output frame
            for position, (text, length) in enumerate(zip(generated_texts, lengths)):
                row_label = batch_indices[position]
                output_df.loc[row_label, model_name] = text
                output_df.loc[row_label, length_column] = str(length)

            # Persist progress: running count first, then the full output frame
            processed_count += len(batch_indices)
            with open(checkpoint_path, 'w') as f_ckpt:
                f_ckpt.write(str(processed_count))
            output_df.to_csv(output_path, index=False, encoding="utf-8-sig")

            pbar.update(1)
            print(f"Processed batch of {len(batch_indices)} rows. Total processed: {processed_count}.")
        torch.cuda.empty_cache()
        gc.collect()

def batch_generator(df, batch_size, start_index=0):
    """
    Yield consecutive positional slices of `df`, `batch_size` rows at a time.

    Iteration begins at row position `start_index`; each item is the pair
    (batch_start_position, batch_df). The last batch may be shorter.
    """
    yield from (
        (offset, df.iloc[offset:offset + batch_size])
        for offset in range(start_index, len(df), batch_size)
    )

def clear_gpu_memory(model, tokenizer):
    """
    Release the GPU memory held by a model that is no longer needed.

    `del` on the parameters only unbinds this function's local names. The
    caller (for example the notebook's global namespace) normally still
    references the model, so its weights would stay on the GPU. The model is
    therefore moved to the CPU first, which releases its device tensors no
    matter who else holds a reference. Callers should still drop their own
    references afterwards to free the host memory.

    Parameters:
        model: object with a `.to(device)` method (e.g. a torch module), or None.
        tokenizer: accepted for interface compatibility; only the local
            reference is dropped.

    Returns None.
    """
    if model is not None and hasattr(model, "to"):
        try:
            model.to("cpu")
        except (ValueError, RuntimeError) as exc:
            # Some models refuse device moves; report it and carry on with the cleanup
            print(f"Could not move model to CPU before clearing GPU memory: {exc}")
    del model
    del tokenizer
    # Collect unreachable objects first so their cached blocks can be handed back
    gc.collect()
    torch.cuda.empty_cache()

In [29]:
# populate length of human text with tuple(word_count, sentence_count)
df_wiki['human_len'] = batch_count_words_and_sentences(df_wiki['human_text'].tolist())
df_news['human_len'] = batch_count_words_and_sentences(df_news['human_text'].tolist())
df_abstracts['human_len'] = batch_count_words_and_sentences(df_abstracts['human_text'].tolist())

In [30]:
# populate length of GPT text with tuple(word_count, sentence_count)
df_wiki['gpt_len'] = batch_count_words_and_sentences(df_wiki['gpt'].tolist())
df_news['gpt_len'] = batch_count_words_and_sentences(df_news['gpt'].tolist())
df_abstracts['gpt_len'] = batch_count_words_and_sentences(df_abstracts['gpt'].tolist())

In [31]:
# create prompts
df_wiki['prompt'] = df_wiki.iloc[0:2500].apply(create_wiki_prompt, axis=1)
df_news['prompt'] = df_news.iloc[0:2500].apply(create_news_prompt, axis=1)
df_abstracts['prompt'] = df_abstracts.iloc[0:2500].apply(create_abstracts_prompt, axis=1)

In [32]:
# save loaded datasets
df_wiki[0:2500].to_csv('src/wiki_dataset.csv', index=False, encoding="utf-8-sig")
df_news[0:2500].to_csv('src/news_dataset.csv', index=False, encoding="utf-8-sig")
df_abstracts[0:2500].to_csv('src/abstracts_dataset.csv', index=False, encoding="utf-8-sig")

In [33]:
##############
### Part 2 ###
##############

In [34]:
# domain articles length stats for tuning max length generation 
def calc_sentence_stats(df, task_name):
    """
    Print average, median, minimum and maximum sentence count for a dataset.

    The sentence count of each row is taken from element [1] of its
    df['human_len'] entry. `task_name` prefixes every printed line.
    """
    sentences_per_text = pd.Series([length_pair[1] for length_pair in df['human_len']])
    summary = (
        ("Average", sentences_per_text.mean()),
        ("Median", sentences_per_text.median()),
        ("Minimum", sentences_per_text.min()),
        ("Maximum", sentences_per_text.max()),
    )
    for label, value in summary:
        print(f"{task_name} - {label} sentences: {value}")

# Print the sentence-count summary of the human texts for each domain
calc_sentence_stats(df_wiki, 'Wikipedia')
calc_sentence_stats(df_news, 'News')
calc_sentence_stats(df_abstracts, 'Abstracts')

Wikipedia - Average sentences: 9.162501582478795
Wikipedia - Median sentences: 9.0
Wikipedia - Minimum sentences: 1
Wikipedia - Maximum sentences: 71
News - Average sentences: 22.985335892514396
News - Median sentences: 22.0
News - Minimum sentences: 9
News - Maximum sentences: 126
Abstracts - Average sentences: 8.0332
Abstracts - Median sentences: 8.0
Abstracts - Minimum sentences: 1
Abstracts - Maximum sentences: 35


In [35]:
# #### Toy example to test llama3.1 8B model and tokenizer

# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
# # model = model.half() # required for 16GB gpu (8B 32float = 32gb) so reduce to 16float
# if torch.cuda.is_available():
#     model.cuda()  
# model.eval()

# prompt = (
#     f"Your role is a Wikipedia contributor. "
#     f"Compose a Wikipedia-style introduction for the topic 'Moluccans'. "
#     f"Start with a clear definition, followed by key details and context that is essential for understanding the subject. "
#     f"Ensure the introduction is detailed and spans approximately 11 sentences long. "
#     f"Introduction: Moluccans are the Austronesian-speaking and Papuan-speaking ethnic"
# )

# inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=4096)
# if torch.cuda.is_available():
#     inputs = inputs.to('cuda')
#     model = model.to('cuda')
# output_ids = model.generate(
#     input_ids=inputs['input_ids'], 
#     attention_mask=inputs['attention_mask'],
#     do_sample=True,       # Enable sampling to generate more diverse responses
#     # temperature=0.9,    # Slightly randomize the outputs to prevent repetition
#     # top_k=50,           # Consider top 50 tokens for sampling at each step
#     # top_p=0.95,          
#     top_p = 0.9,
#     # num_beams = 5,
#     temperature=0.7,
#     num_return_sequences=1,
#     max_length = 512
# )

# generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
# start_index = -1
# for keyword in ["Introduction:", "Article:", "Abstract:"]:
#     start_index = generated_text.find(keyword)
#     if start_index != -1:
#         start_index += len(keyword)
#         break  # Stop once we find a valid keyword

# if start_index != -1:
#     generated_text = generated_text[start_index:].strip()
# else:
#     print("Keyword not found in generated text!")

# n_words, n_sentences = count_words_and_sentences(generated_text)

In [36]:
# ##### Toy example to test model and tokenizer
# Load falcon 7B tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
# if torch.cuda.is_available():
#     model.cuda()  
# model.eval()

# tokenizer.padding_side = 'left'
# if tokenizer.pad_token is None:
#     if tokenizer.eos_token:
#         tokenizer.pad_token = tokenizer.eos_token
#     else:
#         tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# # Encode the prompt to tensor of input ids
# inputs = tokenizer(prompt, return_tensors='pt', padding='max_length', truncation=True, max_length=4096)
# if torch.cuda.is_available():
#     inputs = inputs.to('cuda')
#     model = model.to('cuda')

# output_ids = model.generate(
#     input_ids=inputs['input_ids'],
#     attention_mask=inputs['attention_mask'],
#     max_length=4096, 
#     num_return_sequences=1,
#     no_repeat_ngram_size=4,  # Prevents the model from repeating the same 4-gram
#     top_p=0.92,
#     top_k=50,
#     temperature=0.7
# )

# generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True) 
# n_words, n_sentences = count_words_and_sentences(generated_text)

# ##### Toy example to test model and tokenizer

In [8]:
# Paths for generation: inputs saved in Part 1, incrementally saved outputs, progress counters
wiki_path, news_path, abstracts_path = (
    "src/wiki_dataset.csv",
    "src/news_dataset.csv",
    "src/abstracts_dataset.csv",
)

output_wiki, output_news, output_abstracts = (
    "src/wiki_dataset_generated.csv",
    "src/news_dataset_generated.csv",
    "src/abstracts_dataset_generated.csv",
)

checkpoint_wiki, checkpoint_news, checkpoint_abstracts = (
    "src/wiki_checkpoint.txt",
    "src/news_checkpoint.txt",
    "src/abstracts_checkpoint.txt",
)

# One (task_type, input_path, output_path, checkpoint_path) tuple per domain.
# The same list is iterated by every model's generation loop, so all models
# share these output and checkpoint files.
datasets = list(zip(
    ("wikipedia", "news", "abstract"),
    (wiki_path, news_path, abstracts_path),
    (output_wiki, output_news, output_abstracts),
    (checkpoint_wiki, checkpoint_news, checkpoint_abstracts),
))

In [7]:
# Load the Llama 3.1 Instruct tokenizer and model, move the model to CUDA when
# available, make sure the tokenizer can pad, and define the batch size
llama_repo_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(llama_repo_id)
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    # No pad token defined: reuse the EOS token for padding
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(llama_repo_id)
# model = model.half() # required for 16GB gpu, (8B 32 float = 32gb)
if torch.cuda.is_available():
    model = model.to('cuda')
model.eval()
batch_size = 12

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
# Generate Llama 3.1 outputs for every domain (resumes from rows already present in the output files)
for task_type, input_path, output_path, checkpoint_path in datasets:
    print(f"Processing {task_type} dataset with Llama 3.1-Instruct")
    df = pd.read_csv(input_path)
    length_params = get_length_params(task_type)
    process_dataset_in_batches(
        df=df,
        model=model,
        tokenizer=tokenizer,
        batch_size=batch_size,
        length_params=length_params,
        model_name="Llama3.1",
        generate_method=generate_text_llama3_batch,
        output_path=output_path,
        checkpoint_path=checkpoint_path,
        target_rows=250,
    )

Processing wikipedia dataset with Llama 3.1-Instruct
250 out of 250 rows already generated for Llama3.1.
Target of 250 rows already reached for Llama3.1. Skipping further processing.
Processing news dataset with Llama 3.1-Instruct
100 out of 250 rows already generated for Llama3.1.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 0/13 [00:00<?, ?batch/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation. [05:30<1:06:01, 330.11s/batch]


Processed batch of 12 rows. Total processed: 112.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation. [11:10<1:01:35, 335.93s/batch]


Processed batch of 12 rows. Total processed: 124.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [16:26<54:30, 327.08s/batch]


Processed batch of 12 rows. Total processed: 136.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [21:49<48:46, 325.22s/batch]


Processed batch of 12 rows. Total processed: 148.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [27:09<43:07, 323.43s/batch]


Processed batch of 12 rows. Total processed: 160.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [32:25<37:27, 321.12s/batch]


Processed batch of 12 rows. Total processed: 172.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [37:55<32:22, 323.75s/batch]


Processed batch of 12 rows. Total processed: 184.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [43:12<26:48, 321.74s/batch]


Processed batch of 12 rows. Total processed: 196.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [48:37<21:30, 322.67s/batch]


Processed batch of 12 rows. Total processed: 208.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [54:07<16:14, 324.99s/batch]


Processed batch of 12 rows. Total processed: 220.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [59:34<10:51, 325.66s/batch]


Processed batch of 12 rows. Total processed: 232.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation. [1:05:03<05:26, 326.61s/batch]


Processed batch of 12 rows. Total processed: 244.


Generating Llama3.1: 100%|███████████████████████████████████████| 13/13 [1:07:51<00:00, 313.22s/batch]


Processed batch of 6 rows. Total processed: 250.
Processing abstract dataset with Llama 3.1-Instruct
100 out of 250 rows already generated for Llama3.1.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 0/13 [00:00<?, ?batch/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [02:01<24:16, 121.34s/batch]


Processed batch of 12 rows. Total processed: 112.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [04:02<22:16, 121.51s/batch]


Processed batch of 12 rows. Total processed: 124.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [06:10<20:42, 124.27s/batch]


Processed batch of 12 rows. Total processed: 136.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [08:10<18:22, 122.54s/batch]


Processed batch of 12 rows. Total processed: 148.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [10:20<16:41, 125.17s/batch]


Processed batch of 12 rows. Total processed: 160.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [12:22<14:29, 124.17s/batch]


Processed batch of 12 rows. Total processed: 172.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [14:23<12:18, 123.15s/batch]


Processed batch of 12 rows. Total processed: 184.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [16:27<10:16, 123.31s/batch]


Processed batch of 12 rows. Total processed: 196.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [18:29<08:12, 123.13s/batch]


Processed batch of 12 rows. Total processed: 208.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [20:30<06:07, 122.47s/batch]


Processed batch of 12 rows. Total processed: 220.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [22:32<04:04, 122.16s/batch]


Processed batch of 12 rows. Total processed: 232.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.13 [24:34<02:02, 122.23s/batch]


Processed batch of 12 rows. Total processed: 244.


Generating Llama3.1: 100%|█████████████████████████████████████████| 13/13 [25:35<00:00, 118.10s/batch]

Processed batch of 6 rows. Total processed: 250.





In [9]:
# Release the Llama model before the next model is loaded.
# clear_gpu_memory can only drop its own local references, so the notebook-level
# names are cleared here as well; otherwise the model object stays alive.
clear_gpu_memory(model, tokenizer)
model, tokenizer = None, None
gc.collect()
torch.cuda.empty_cache()

In [6]:
# Load the Falcon 7B tokenizer and model, move the model to CUDA when available,
# make sure the tokenizer can pad, and define the batch size
falcon_repo_id = "tiiuae/falcon-7b"

tokenizer = AutoTokenizer.from_pretrained(falcon_repo_id)
if tokenizer.pad_token is None:
    # No pad token defined: reuse the EOS token for padding
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(falcon_repo_id)
if torch.cuda.is_available():
    model = model.to('cuda')
model.eval()
batch_size = 16

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Generate Falcon outputs for every domain (resumes from rows already present in the output files)
for task_type, input_path, output_path, checkpoint_path in datasets:
    print(f"Processing {task_type} dataset with Falcon")
    df = pd.read_csv(input_path)
    length_params = get_length_params(task_type)
    process_dataset_in_batches(
        df=df,
        model=model,
        tokenizer=tokenizer,
        batch_size=batch_size,
        length_params=length_params,
        model_name="Falcon",
        generate_method=generate_text_falcon_batch,
        output_path=output_path,
        checkpoint_path=checkpoint_path,
        target_rows=250,
    )

Processing wikipedia dataset with Falcon
250 out of 250 rows already generated for Falcon.
Target of 250 rows already reached for Falcon. Skipping further processing.
Processing news dataset with Falcon
176 out of 250 rows already generated for Falcon.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation. | 0/5 [00:00<?, ?batch/s]
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.4:21<17:26, 261.71s/batch]


Processed batch of 16 rows. Total processed: 192.
