In [1]:
# !pip install transformers
# !pip install torch  
# !pip install datasets  
# !hostname
# !pip install --upgrade transformers
# !pip install spacy
# !pip install ipywidgets
# !pip install --upgrade transformers "numpy<2" "pyarrow<14"

In [1]:
import torch
torch.cuda.empty_cache()

In [10]:
####
# This notebook consists of two main parts:
#   1. Loading of the datasets, populating article lengths, crating the prompts and saving to domain_dataset.csv
#   2. Generating articles for each model in batches and save results incrementally to domain_dataset_out.csv
###

In [2]:
# !nvcc --version
!nvidia-smi

Fri Jan 24 22:52:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off |   00000001:00:00.0 Off |                    0 |
| N/A   48C    P0             67W /  300W |     533MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import torch
import gc
from tqdm import tqdm
import re
import os

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import set_seed

from huggingface_hub import notebook_login

from PrepareSentenceContext import PrepareSentenceContext

nltk.download('punkt')
pd.set_option('display.max_columns', None)  # Show all columns
set_seed(42)

[nltk_data] Downloading package punkt to /home/akp3user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Check current PyTorch and CUDA availability
print("PyTorch version:", torch.__version__)
print("CUDA is available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)

PyTorch version: 2.5.1+cu124
CUDA is available: True
CUDA version: 12.4


In [22]:
# # notebook_login()
# from huggingface_hub import login
# login("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

In [23]:
##############
### Part 1 ###
##############

In [24]:
# Load the HuggingFace wiki_intro_long dataset
hf_wiki_dataset = load_dataset('alonkipnis/wiki-intro-long', split='train')
df_wiki = pd.DataFrame(hf_wiki_dataset)

# Add columns for Llama2 and Falcon7B model outputs
df_wiki['human_len'] = None
df_wiki['gpt_len'] = None
df_wiki['Llama3.1'], df_wiki['Llama3.1_len'] = None, None
df_wiki['Falcon'], df_wiki['Falcon_len'] = None, None

df_wiki.rename(columns={
    'wiki_intro': 'human_text',
    'generated_intro': 'gpt'
    }, inplace=True)

columns_to_drop = ['prompt_tokens', 'generated_text', 'generated_intro_len']
df_wiki.drop(columns=columns_to_drop, inplace=True)

new_order = [
    'id', 'url', 'title', 'title_len', 'prompt',
    'human_text', 'human_len',
    'gpt', 'gpt_len',
    'Llama3.1', 'Llama3.1_len',
    'Falcon', 'Falcon_len']

df_wiki = df_wiki[new_order]
print(df_wiki.columns)
print(df_wiki.shape[0])

Index(['id', 'url', 'title', 'title_len', 'prompt', 'human_text', 'human_len',
       'gpt', 'gpt_len', 'Llama3.1', 'Llama3.1_len', 'Falcon', 'Falcon_len'],
      dtype='object')
39495


In [25]:
# Load the HuggingFace news dataset
hf_news_dataset = load_dataset('alonkipnis/news-chatgpt-long', split='train')
df_news = pd.DataFrame(hf_news_dataset)

df_news.rename(columns={
    'article': 'human_text',
    'chatgpt': 'gpt'
}, inplace=True)

df_news['human_len'], df_news['gpt_len'] = None, None
df_news['Llama3.1'], df_news['Llama3.1_len'] = None, None
df_news['Falcon'], df_news['Falcon_len'] = None, None
df_news['prompt'] = None

new_order = [
    'id', 'highlights', 'prompt',
    'human_text', 'human_len',
    'gpt', 'gpt_len',
    'Llama3.1', 'Llama3.1_len',
    'Falcon', 'Falcon_len'
]

df_news = df_news[new_order]
print(df_news.columns)
print(df_news.shape[0])

Index(['id', 'highlights', 'prompt', 'human_text', 'human_len', 'gpt',
       'gpt_len', 'Llama3.1', 'Llama3.1_len', 'Falcon', 'Falcon_len'],
      dtype='object')
13025


In [26]:
# Load the HuggingFace research absracts dataset
hf_abstracts_dataset = load_dataset('NicolaiSivesind/ChatGPT-Research-Abstracts', split='train')
df_abstracts = pd.DataFrame(hf_abstracts_dataset)

df_abstracts.rename(columns={
    'real_abstract': 'human_text',
    'real_word_count': 'human_len',
    'generated_abstract': 'gpt',
    'generated_word_count': 'gpt_len'
}, inplace=True)

df_abstracts['Llama3.1'], df_abstracts['Llama3.1_len'] = None, None
df_abstracts['Falcon'], df_abstracts['Falcon_len'] = None, None
df_abstracts['prompt'] = None

new_order = [
    'title', 'prompt',
    'human_text', 'human_len',
    'gpt', 'gpt_len',
    'Llama3.1', 'Llama3.1_len',
    'Falcon', 'Falcon_len'
]

df_abstracts = df_abstracts[new_order]
print(df_abstracts.columns)
print(df_abstracts.shape[0])

Index(['title', 'prompt', 'human_text', 'human_len', 'gpt', 'gpt_len',
       'Llama3.1', 'Llama3.1_len', 'Falcon', 'Falcon_len'],
      dtype='object')
10000


In [4]:
# Defining methods for word/sentence count, prompt generation, model generation 

def count_words_and_sentences(text):
    """
    Tokenizes the text into words and sentences using nltk 
    Returns a tuple of (n_words,n_sentences)
    """
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    return (len(words), len(sentences))

def batch_count_words_and_sentences(texts):
    """
    Tokenizes the text into words and sentences using nltk 
    Returns a tuple of (n_words,n_sentences)
    """
    return [(len(word_tokenize(text)), len(sent_tokenize(text))) for text in texts]

def create_wiki_prompt(row):
    """
    Creates the wiki dataset prompt using the title and first 7 words written by humans
    """
    first_few_words = ' '.join(row['human_text'].split()[:7]) 
    # prompt = (
    # f"### | Instruction | ###\n"
    # f"Your role is a Wikipedia contributor.\n"
    # f"Your task is to compose a detailed Wikipedia-style introduction for the topic '{row['title']}'.\n"
    # f"The introduction should be at least {row['human_len'][0]} words long and {row['human_len'][1]} sentences long.\n"
    # f"Begin with a clear definition, followed by key details and context essential for understanding the topic.\n"
    # f"Do not include headings or editorial notes.\n"
    # f"### | Response | ###\n"
    # f"{first_few_words} "
    # )
    prompt = (
    f"### | Instruction | ###\n"
    f"Your role is a Wikipedia contributor.\n"
    f"Your task is to compose a detailed Wikipedia-style introduction for the topic '{row['title']}'.\n"
    f"The introduction should be approximately {row['human_len'][0]} words and {row['human_len'][1]} sentences."
    f"Begin with a clear definition, followed by key details and context essential for understanding the topic.\n"
    f"Provide a clean, standalone response without extra instructions or editorial comments.\n"
    f"### | Response | ###\n"
    f"{first_few_words} "
    )
    return prompt

def create_news_prompt(row):
    """
    Creates the news dataset prompt using the first 15 words written by humans, and the article highlights
    """
    first_few_words = ' '.join(row['human_text'].split()[:15]) 
    highlights = row['highlights'] 
    # prompt = (
    # f"### | Instruction | ###\n"
    # f"Your role is a professional news journalist.\n"
    # f"Your task is to write a detailed news article based on the given highlights.\n"
    # f"The article should be at least {row['human_len'][0]} words long and {row['human_len'][1]} sentences long.\n"
    # f"Incorporate the following key highlights: {highlights}\n"
    # f"Begin with the most important details, followed by context and supporting information.\n"
    # f"Do not include headings or editorial notes.\n"
    # f"### | Response | ###\n"
    # f"{first_few_words} "
    # )
    prompt = (
    f"### | Instruction | ###\n"
    f"Your role is a professional news journalist.\n"
    f"Your task is to write a detailed news article based on the given highlights.\n"
    f"The article should be at least {row['human_len'][0]} words and {row['human_len'][1]} sentences.\n"
    f"Incorporate the following key highlights: {highlights}\n"
    f"Begin with the most important details, followed by context and supporting information.\n"
    f"Provide a clean, standalone response without extra instructions or editorial comments.\n"
    f"### | Response | ###\n"
    f"{first_few_words} "
    )
    return prompt

def create_abstracts_prompt(row):
    """
    Creates the abstracts dataset prompt using the title and first 15 words written by humans
    """
    first_few_words = ' '.join(row['human_text'].split()[:15]) 
    # prompt = (
    # f"### | Instruction | ###\n"
    # f"Your role is a scientist writing a research abstract for the paper titled '{row['title']}'.\n"
    # f"Your task is to write a concise, detailed abstract for this paper.\n"
    # f"The abstract should be at least {row['human_len'][0]} words long and {row['human_len'][1]} sentences long.\n"
    # f"Begin with a clear statement of the research question or problem, followed by methodology, findings, and implications.\n"
    # f"Do not include headings or editorial notes.\n"
    # f"### | Response | ###\n"
    # f"{first_few_words} "
    # )
    prompt = (
    f"### | Instruction | ###\n"
    f"Your role is a scientist writing a paper titled '{row['title']}'.\n"
    f"Your task is to write a concise, detailed abstract for this paper.\n"
    f"The abstract should be at least {row['human_len'][0]} words and {row['human_len'][1]} sentences.\n"
    f"Ensure the abstract is comprehensive and provides sufficient detail to reach the specified word and sentence counts.\n"
    f"Begin with a clear statement of the research question or problem, followed by methodology, findings and implications.\n"
    f"Provide a clean, standalone response without extra instructions or editorial comments.\n"
    f"### | Response | ###\n"
    f"{first_few_words} "
    )
    return prompt

def get_length_params(task_type):
    if task_type == "wikipedia":
        return {"max_length": 512}  
    elif task_type == "news":
        return {"max_length": 1024}  # For longer news articles
    elif task_type == "abstract":
        return {"max_length": 512} 
    else:
        return {"max_length": 1024} 

def generate_text_gpt2xl_v2(prompt, model, tokenizer):
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True)
    output_ids = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=1024,  # Set to the maximum length of the model
        num_return_sequences=1
    )
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    n_words, n_sentences = count_words_and_sentences(generated_text)
    return generated_text, n_words, n_sentences


def generate_text_llama2(prompt, model, tokenizer, max_length=500):
    """
    Encodes the prompt using the model tokenizer
    Returns the generated text, word count and sentence count
    """
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=max_length)
    if torch.cuda.is_available():
        inputs = inputs.to('cuda')
        model = model.to('cuda')
    output_ids = model.generate(
        input_ids=inputs['input_ids'], 
        attention_mask=inputs['attention_mask'],
        do_sample=True,       # Enable sampling to generate more diverse responses
        # temperature=0.9,    # Slightly randomize the outputs to prevent repetition
        # top_k=50,           # Consider top 50 tokens for sampling at each step
        # top_p=0.95,          
        top_p = 0.9,
        num_beams = 5,
        temperature=0.7,
        num_return_sequences=1
    )
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    start_index = min((generated_text.find(key) for key in ["Introduction:", "Article:", "Abstract:"] if generated_text.find(key) != -1), default=0)
    generated_text = generated_text[start_index:].strip()
    n_words, n_sentences = count_words_and_sentences(generated_text)
    return generated_text, n_words, n_sentences

def generate_text_llama3(prompt, model, tokenizer, length_params):
    """
    Encodes the prompt using the model tokenizer
    Returns the generated text, word count and sentence count
    """
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=512)
    if torch.cuda.is_available():
        inputs = inputs.to('cuda')
        model = model.to('cuda')
    output_ids = model.generate(
        input_ids=inputs['input_ids'], 
        attention_mask=inputs['attention_mask'],
        num_return_sequences=1,
        max_new_tokens=length_params["max_length"],
        do_sample=False,      # Enable sampling to generate more diverse responses
        # temperature=0.5,    # Slightly randomize the outputs to prevent repetition
        # top_k=20,           # Consider top 50 tokens for sampling at each step      
        # top_p = 0.9,
        num_beams = 5,     # cant run with v100 16gGB
        no_repeat_ngram_size=5,
        repetition_penalty=1.5,
        length_penalty=0.9,
        early_stopping=True
    )

    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    match = re.search(r"### \| Response \| ###\s*(.*)", generated_text, re.DOTALL)
    if match:
        response_text = match.group(1).strip()
    else:
        print("Response delimiter not found in generated text!")
        response_text = generated_text
    n_words, n_sentences = count_words_and_sentences(response_text)
    return response_text, n_words, n_sentences

def generate_text_falcon(prompt, model, tokenizer, length_params):
    """
    Encodes the prompt using the model tokenizer
    Returns the generated text, word count and sentence count
    """
    # Adjust tokenizer padding for decoding
    tokenizer.padding_side = 'left'
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token or '[PAD]'

    # encode the prompt
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=512)
    if torch.cuda.is_available():
        inputs = inputs.to('cuda')
        model = model.to('cuda')

    output_ids = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=length_params["max_length"], 
        num_return_sequences=1,
        no_repeat_ngram_size=4,
        num_beams=5,
        early_stopping=True,
        do_sample=False,
        # top_p=0.9,
        # temperature=0.7,
    )
    
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    match = re.search(r"### \| Response \| ###\s*(.*)", generated_text, re.DOTALL)
    if match:
        response_text = match.group(1).strip()
    else:
        print("Response delimiter not found in generated text!")
        response_text = generated_text 
    n_words, n_sentences = count_words_and_sentences(response_text)
    return response_text, n_words, n_sentences

def generate_text_llama3_batch(prompts, model, tokenizer, length_params):
    """
    Encodes the prompts using the model tokenizer in batches.
    Returns the generated text, word count, and sentence count for all prompts in the batch.
    """
    inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    if torch.cuda.is_available():
        inputs = {key: value.to('cuda') for key, value in inputs.items()}

    # Generate output for the batch
    output_ids = model.generate(
        input_ids=inputs['input_ids'], 
        attention_mask=inputs['attention_mask'],
        num_return_sequences=1,
        max_new_tokens=length_params["max_length"],
        do_sample=False,
        num_beams=5,
        no_repeat_ngram_size=5,
        repetition_penalty=1.5,
        length_penalty=0.9,
        early_stopping=True,
        temperature=None, 
        top_p=None         
    )
    generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    # extract the generated sections using regex
    response_texts = []
    for generated_text in generated_texts:
        match = re.search(r"### \| Response \| ###\s*(.*)", generated_text, re.DOTALL)
        if match:
            response_texts.append(match.group(1).strip())
        else:
            print("Response delimiter not found in generated text!")
            response_texts.append(generated_text.strip())
    lengths = batch_count_words_and_sentences(response_texts)
    return response_texts, lengths

def generate_text_falcon_batch(prompts, model, tokenizer, length_params):
    """
    Encodes the prompts using the model tokenizer in batches
    Returns the generated text, word count, and sentence count for all prompts in the batch
    """
    inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    if torch.cuda.is_available():
        inputs = {key: value.to('cuda') for key, value in inputs.items()}
    output_ids = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=length_params["max_length"],
        num_return_sequences=1,
        no_repeat_ngram_size=4,
        num_beams=5,
        early_stopping=True,
        do_sample=False
    )
    generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    response_texts = []
    for generated_text in generated_texts:
        match = re.search(r"### \| Response \| ###\s*(.*)", generated_text, re.DOTALL)
        if match:
            response_texts.append(match.group(1).strip())
        else:
            print("Response delimiter not found in generated text!")
            response_texts.append(generated_text.strip())

    lengths = batch_count_words_and_sentences(response_texts)
    return response_texts, lengths


def process_dataset_in_batches(df, model, tokenizer, batch_size, length_params, model_name, generate_method, output_path, checkpoint_path, target_rows):
    """
    Process the dataset in batches and save results incrementally.
    Supports resuming from a checkpoint and ensures each model processes up to target rows considering already generated rows.
    """
    # Initialize 
    processed_count = 0
    start_index = 0

    # output file
    if not os.path.exists(output_path):
        df.to_csv(output_path, index=False, encoding="utf-8-sig")
    output_df = pd.read_csv(output_path)
    if model_name not in output_df.columns:
        output_df[model_name] = None  # Initialize the column if it doesn't exist
    output_df[model_name] = output_df[model_name].astype(object)  # Explicitly set as object type

    length_column = f"{model_name}_len"
    if length_column not in output_df.columns:
        output_df[length_column] = None  # Initialize the column if it doesn't exist
    output_df[length_column] = output_df[length_column].astype(object) 

    # check how many rows within the first `target_rows` indices were generated
    first_target_indices = output_df.index[:target_rows] 
    processed_rows = output_df.loc[first_target_indices, model_name].notna()
    processed_count = processed_rows.sum()
    print(f"{processed_count} out of {target_rows} rows already generated for {model_name}.")

    # rows that need to be generated
    remaining_to_target = max(0, target_rows - processed_count)
    unprocessed_indices = output_df[output_df[model_name].isna()].index[:remaining_to_target]
    rows_to_generate = unprocessed_indices[:remaining_to_target]
    
    if remaining_to_target == 0:
        print(f"Target of {target_rows} rows already reached for {model_name}. Skipping further processing.")
        return
    rows_to_generate = rows_to_generate[:remaining_to_target]

    # total batches for progress tracking
    total_batches = (len(unprocessed_indices) + batch_size - 1) // batch_size

    # generate in batches
    with tqdm(total=total_batches, desc=f"Generating {model_name}", unit="batch") as pbar:
        for batch_start in range(0, len(rows_to_generate), batch_size):
            batch_indices = rows_to_generate[batch_start:batch_start + batch_size]
            batch_df = df.loc[batch_indices].reset_index(drop=True)

            # Generate text
            prompts = batch_df['prompt'].tolist()
            generated_texts, lengths = generate_method(prompts, model, tokenizer, length_params)

            # insert to df
            for i, (text, length) in enumerate(zip(generated_texts, lengths)):
                output_df.loc[batch_indices[i], model_name] = text
                output_df.loc[batch_indices[i], length_column] = str(length)

            # save
            processed_count += len(batch_indices)
            with open(checkpoint_path, 'w') as f_ckpt:
                f_ckpt.write(str(processed_count))
            output_df.to_csv(output_path, index=False, encoding="utf-8-sig")
            
            pbar.update(1)
            print(f"Processed batch of {len(batch_indices)} rows. Total processed: {processed_count}.")
        torch.cuda.empty_cache()
        gc.collect()

def batch_generator(df, batch_size, start_index=0):
    """
    Splits the DataFrame into batches starting from `start_index`
    Yields (start_index, batch_df) for each batch
    """
    for i in range(start_index, len(df), batch_size):
        yield i, df.iloc[i:i + batch_size]

def clear_gpu_memory(model, tokenizer):
    del model
    del tokenizer
    torch.cuda.empty_cache()
    gc.collect()

In [29]:
# populate length of human text with tuple(word_count, sentence_count)
df_wiki['human_len'] = batch_count_words_and_sentences(df_wiki['human_text'].tolist())
df_news['human_len'] = batch_count_words_and_sentences(df_news['human_text'].tolist())
df_abstracts['human_len'] = batch_count_words_and_sentences(df_abstracts['human_text'].tolist())

In [30]:
# populate length of GPT text with tuple(word_count, sentence_count)
df_wiki['gpt_len'] = batch_count_words_and_sentences(df_wiki['gpt'].tolist())
df_news['gpt_len'] = batch_count_words_and_sentences(df_news['gpt'].tolist())
df_abstracts['gpt_len'] = batch_count_words_and_sentences(df_abstracts['gpt'].tolist())

In [31]:
# create prompts
df_wiki['prompt'] = df_wiki.iloc[0:2500].apply(create_wiki_prompt, axis=1)
df_news['prompt'] = df_news.iloc[0:2500].apply(create_news_prompt, axis=1)
df_abstracts['prompt'] = df_abstracts.iloc[0:2500].apply(create_abstracts_prompt, axis=1)

In [32]:
# save loaded datasets
df_wiki[0:2500].to_csv('src/wiki_dataset.csv', index=False, encoding="utf-8-sig")
df_news[0:2500].to_csv('src/news_dataset.csv', index=False, encoding="utf-8-sig")
df_abstracts[0:2500].to_csv('src/abstracts_dataset.csv', index=False, encoding="utf-8-sig")

In [33]:
##############
### Part 2 ###
##############

In [34]:
# domain articles length stats for tuning max length generation 
def calc_sentence_stats(df, task_name):
    sentence_counts = [t[1] for t in df['human_len']]
    avg_sentences = pd.Series(sentence_counts).mean()
    median_sentences = pd.Series(sentence_counts).median()
    min_sentences = pd.Series(sentence_counts).min()
    max_sentences = pd.Series(sentence_counts).max()
    
    print(f"{task_name} - Average sentences: {avg_sentences}")
    print(f"{task_name} - Median sentences: {median_sentences}")
    print(f"{task_name} - Minimum sentences: {min_sentences}")
    print(f"{task_name} - Maximum sentences: {max_sentences}")

calc_sentence_stats(df_wiki, 'Wikipedia')
calc_sentence_stats(df_news, 'News')
calc_sentence_stats(df_abstracts, 'Abstracts')

Wikipedia - Average sentences: 9.162501582478795
Wikipedia - Median sentences: 9.0
Wikipedia - Minimum sentences: 1
Wikipedia - Maximum sentences: 71
News - Average sentences: 22.985335892514396
News - Median sentences: 22.0
News - Minimum sentences: 9
News - Maximum sentences: 126
Abstracts - Average sentences: 8.0332
Abstracts - Median sentences: 8.0
Abstracts - Minimum sentences: 1
Abstracts - Maximum sentences: 35


In [35]:
# #### Toy example to test llama3.1 8B model and tokenizer

# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
# # model = model.half() # required for 16GB gpu (8B 32float = 32gb) so reduce to 16float
# if torch.cuda.is_available():
#     model.cuda()  
# model.eval()

# prompt = (
#     f"Your role is a Wikipedia contributor. "
#     f"Compose a Wikipedia-style introduction for the topic 'Moluccans'. "
#     f"Start with a clear definition, followed by key details and context that is essential for understanding the subject. "
#     f"Ensure the introduction is detailed and spans approximately 11 sentences long. "
#     f"Introduction: Moluccans are the Austronesian-speaking and Papuan-speaking ethnic"
# )

# inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=4096)
# if torch.cuda.is_available():
#     inputs = inputs.to('cuda')
#     model = model.to('cuda')
# output_ids = model.generate(
#     input_ids=inputs['input_ids'], 
#     attention_mask=inputs['attention_mask'],
#     do_sample=True,       # Enable sampling to generate more diverse responses
#     # temperature=0.9,    # Slightly randomize the outputs to prevent repetition
#     # top_k=50,           # Consider top 50 tokens for sampling at each step
#     # top_p=0.95,          
#     top_p = 0.9,
#     # num_beams = 5,
#     temperature=0.7,
#     num_return_sequences=1,
#     max_length = 512
# )

# generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
# start_index = -1
# for keyword in ["Introduction:", "Article:", "Abstract:"]:
#     start_index = generated_text.find(keyword)
#     if start_index != -1:
#         start_index += len(keyword)
#         break  # Stop once we find a valid keyword

# if start_index != -1:
#     generated_text = generated_text[start_index:].strip()
# else:
#     print("Keyword not found in generated text!")

# n_words, n_sentences = count_words_and_sentences(generated_text)

In [36]:
# ##### Toy example to test model and tokenizer
# Load falcon 7B tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
# if torch.cuda.is_available():
#     model.cuda()  
# model.eval()

# tokenizer.padding_side = 'left'
# if tokenizer.pad_token is None:
#     if tokenizer.eos_token:
#         tokenizer.pad_token = tokenizer.eos_token
#     else:
#         tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# # Encode the prompt to tensor of input ids
# inputs = tokenizer(prompt, return_tensors='pt', padding='max_length', truncation=True, max_length=4096)
# if torch.cuda.is_available():
#     inputs = inputs.to('cuda')
#     model = model.to('cuda')

# output_ids = model.generate(
#     input_ids=inputs['input_ids'],
#     attention_mask=inputs['attention_mask'],
#     max_length=4096, 
#     num_return_sequences=1,
#     no_repeat_ngram_size=4,  # Prevents the model from repeating the same 4-gram
#     top_p=0.92,
#     top_k=50,
#     temperature=0.7
# )

# generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True) 
# n_words, n_sentences = count_words_and_sentences(generated_text)

# ##### Toy example to test model and tokenizer

In [5]:
# Paths for generation
wiki_path = "src/wiki_dataset.csv"
news_path = "src/news_dataset.csv"
abstracts_path = "src/abstracts_dataset.csv"

output_wiki = "src/wiki_dataset_generated.csv"
output_news = "src/news_dataset_generated.csv"
output_abstracts = "src/abstracts_dataset_generated.csv"

checkpoint_wiki = "src/wiki_checkpoint.txt"
checkpoint_news = "src/news_checkpoint.txt"
checkpoint_abstracts = "src/abstracts_checkpoint.txt"

# Process datasets for LLaMA 3.1
datasets = [
    ("wikipedia", wiki_path, output_wiki, checkpoint_wiki),
    ("news", news_path, output_news, checkpoint_news),
    ("abstract", abstracts_path, output_abstracts, checkpoint_abstracts)
]

In [6]:
# Load Llama 3.1, move to cuda, ensure pad token for tokenizer, define batch size
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:  # Set pad_token to eos_token if not set
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
# model = model.half() # required for 16GB gpu, (8B 32 float = 32gb)
if torch.cuda.is_available():
    model = model.to('cuda')
model.eval()
batch_size = 12

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
# Generate Llama 3.1
for task_type, input_path, output_path, checkpoint_path in datasets:
    print(f"Processing {task_type} dataset with Llama 3.1-Instruct")
    df = pd.read_csv(input_path)
    length_params = get_length_params(task_type)
    process_dataset_in_batches(
        df, model, tokenizer, batch_size, length_params,
        "Llama3.1", generate_text_llama3_batch,
        output_path, checkpoint_path, target_rows=1500
    )

Processing wikipedia dataset with Llama 3.1-Instruct
1000 out of 1500 rows already generated for Llama3.1.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                 | 0/42 [00:00<?, ?batch/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 1/42 [01:49<1:14:57, 109.69s/batch]


Processed batch of 12 rows. Total processed: 1012.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 2/42 [03:39<1:13:09, 109.73s/batch]


Processed batch of 12 rows. Total processed: 1024.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 3/42 [05:29<1:11:29, 109.99s/batch]


Processed batch of 12 rows. Total processed: 1036.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 4/42 [07:19<1:09:34, 109.86s/batch]


Processed batch of 12 rows. Total processed: 1048.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 5/42 [09:10<1:08:01, 110.30s/batch]


Processed batch of 12 rows. Total processed: 1060.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 6/42 [11:00<1:06:09, 110.26s/batch]


Processed batch of 12 rows. Total processed: 1072.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 7/42 [12:51<1:04:20, 110.29s/batch]


Processed batch of 12 rows. Total processed: 1084.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 8/42 [14:40<1:02:24, 110.15s/batch]


Processed batch of 12 rows. Total processed: 1096.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 9/42 [16:32<1:00:48, 110.57s/batch]


Processed batch of 12 rows. Total processed: 1108.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 10/42 [18:24<59:17, 111.17s/batch]


Processed batch of 12 rows. Total processed: 1120.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 11/42 [20:15<57:16, 110.86s/batch]


Processed batch of 12 rows. Total processed: 1132.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 12/42 [22:06<55:35, 111.19s/batch]


Processed batch of 12 rows. Total processed: 1144.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 13/42 [23:56<53:33, 110.82s/batch]


Processed batch of 12 rows. Total processed: 1156.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 14/42 [25:47<51:37, 110.63s/batch]


Processed batch of 12 rows. Total processed: 1168.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 15/42 [27:38<49:56, 110.97s/batch]


Processed batch of 12 rows. Total processed: 1180.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 16/42 [29:28<47:57, 110.68s/batch]


Processed batch of 12 rows. Total processed: 1192.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 17/42 [31:18<46:00, 110.41s/batch]


Processed batch of 12 rows. Total processed: 1204.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 18/42 [33:07<43:59, 109.97s/batch]


Processed batch of 12 rows. Total processed: 1216.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 19/42 [34:56<42:04, 109.77s/batch]


Processed batch of 12 rows. Total processed: 1228.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 20/42 [36:45<40:09, 109.51s/batch]


Processed batch of 12 rows. Total processed: 1240.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 21/42 [38:35<38:22, 109.62s/batch]


Processed batch of 12 rows. Total processed: 1252.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 22/42 [40:25<36:34, 109.71s/batch]


Processed batch of 12 rows. Total processed: 1264.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 23/42 [42:15<34:44, 109.72s/batch]


Processed batch of 12 rows. Total processed: 1276.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 24/42 [44:06<33:00, 110.03s/batch]


Processed batch of 12 rows. Total processed: 1288.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 25/42 [45:56<31:14, 110.28s/batch]


Processed batch of 12 rows. Total processed: 1300.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 26/42 [47:46<29:22, 110.14s/batch]


Processed batch of 12 rows. Total processed: 1312.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 27/42 [49:36<27:28, 109.90s/batch]


Processed batch of 12 rows. Total processed: 1324.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 28/42 [51:25<25:36, 109.73s/batch]


Processed batch of 12 rows. Total processed: 1336.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 29/42 [53:15<23:47, 109.82s/batch]


Processed batch of 12 rows. Total processed: 1348.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 30/42 [55:05<21:56, 109.72s/batch]


Processed batch of 12 rows. Total processed: 1360.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 31/42 [56:54<20:07, 109.76s/batch]


Processed batch of 12 rows. Total processed: 1372.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.       | 32/42 [58:44<18:17, 109.79s/batch]


Processed batch of 12 rows. Total processed: 1384.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 33/42 [1:00:34<16:28, 109.88s/batch]


Processed batch of 12 rows. Total processed: 1396.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 34/42 [1:02:24<14:38, 109.77s/batch]


Processed batch of 12 rows. Total processed: 1408.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 35/42 [1:04:13<12:47, 109.60s/batch]


Processed batch of 12 rows. Total processed: 1420.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 36/42 [1:06:04<10:59, 109.97s/batch]


Processed batch of 12 rows. Total processed: 1432.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 37/42 [1:07:54<09:09, 109.92s/batch]


Processed batch of 12 rows. Total processed: 1444.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.▏    | 38/42 [1:09:44<07:19, 109.98s/batch]


Processed batch of 12 rows. Total processed: 1456.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.█▎   | 39/42 [1:11:33<05:29, 109.88s/batch]


Processed batch of 12 rows. Total processed: 1468.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.██▌  | 40/42 [1:13:23<03:39, 109.93s/batch]


Processed batch of 12 rows. Total processed: 1480.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.███▊ | 41/42 [1:15:14<01:50, 110.18s/batch]


Processed batch of 12 rows. Total processed: 1492.


Generating Llama3.1: 100%|███████████████████████████████████████████████████| 42/42 [1:16:39<00:00, 109.51s/batch]


Processed batch of 8 rows. Total processed: 1500.
Processing news dataset with Llama 3.1-Instruct
932 out of 1500 rows already generated for Llama3.1.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                 | 0/48 [00:00<?, ?batch/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 1/48 [05:28<4:17:35, 328.83s/batch]


Processed batch of 12 rows. Total processed: 944.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 2/48 [10:51<4:09:12, 325.05s/batch]


Processed batch of 12 rows. Total processed: 956.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 3/48 [16:12<4:02:22, 323.17s/batch]


Processed batch of 12 rows. Total processed: 968.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 4/48 [21:46<4:00:17, 327.68s/batch]


Processed batch of 12 rows. Total processed: 980.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 5/48 [27:12<3:54:13, 326.82s/batch]


Processed batch of 12 rows. Total processed: 992.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 6/48 [32:37<3:48:20, 326.20s/batch]


Processed batch of 12 rows. Total processed: 1004.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 7/48 [37:58<3:41:54, 324.76s/batch]


Processed batch of 12 rows. Total processed: 1016.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 8/48 [43:29<3:37:40, 326.52s/batch]


Processed batch of 12 rows. Total processed: 1028.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 9/48 [48:52<3:31:33, 325.49s/batch]


Processed batch of 12 rows. Total processed: 1040.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 10/48 [54:13<3:25:16, 324.12s/batch]


Processed batch of 12 rows. Total processed: 1052.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 11/48 [59:38<3:20:07, 324.54s/batch]


Processed batch of 12 rows. Total processed: 1064.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 12/48 [1:05:16<3:17:09, 328.60s/batch]


Processed batch of 12 rows. Total processed: 1076.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 13/48 [1:10:38<3:10:25, 326.43s/batch]


Processed batch of 12 rows. Total processed: 1088.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 14/48 [1:16:01<3:04:30, 325.61s/batch]


Processed batch of 12 rows. Total processed: 1100.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 15/48 [1:21:39<3:01:05, 329.25s/batch]


Processed batch of 12 rows. Total processed: 1112.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 16/48 [1:27:06<2:55:15, 328.60s/batch]


Processed batch of 12 rows. Total processed: 1124.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 17/48 [1:32:37<2:50:10, 329.36s/batch]


Processed batch of 12 rows. Total processed: 1136.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 18/48 [1:38:06<2:44:35, 329.19s/batch]


Processed batch of 12 rows. Total processed: 1148.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 19/48 [1:43:30<2:38:21, 327.62s/batch]


Processed batch of 12 rows. Total processed: 1160.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 20/48 [1:49:02<2:33:30, 328.94s/batch]


Processed batch of 12 rows. Total processed: 1172.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 21/48 [1:54:30<2:27:49, 328.51s/batch]


Processed batch of 12 rows. Total processed: 1184.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 22/48 [1:59:53<2:21:43, 327.05s/batch]


Processed batch of 12 rows. Total processed: 1196.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 23/48 [2:05:21<2:16:20, 327.23s/batch]


Processed batch of 12 rows. Total processed: 1208.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 24/48 [2:10:48<2:10:50, 327.12s/batch]


Processed batch of 12 rows. Total processed: 1220.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 25/48 [2:16:18<2:05:44, 328.01s/batch]


Processed batch of 12 rows. Total processed: 1232.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 26/48 [2:21:40<1:59:39, 326.36s/batch]


Processed batch of 12 rows. Total processed: 1244.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 27/48 [2:26:59<1:53:24, 324.02s/batch]


Processed batch of 12 rows. Total processed: 1256.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 28/48 [2:32:27<1:48:26, 325.32s/batch]


Processed batch of 12 rows. Total processed: 1268.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 29/48 [2:37:53<1:43:03, 325.43s/batch]


Processed batch of 12 rows. Total processed: 1280.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 30/48 [2:43:12<1:37:04, 323.58s/batch]


Processed batch of 12 rows. Total processed: 1292.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 31/48 [2:48:38<1:31:49, 324.11s/batch]


Processed batch of 12 rows. Total processed: 1304.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 32/48 [2:53:59<1:26:11, 323.20s/batch]


Processed batch of 12 rows. Total processed: 1316.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 33/48 [2:59:24<1:20:55, 323.73s/batch]


Processed batch of 12 rows. Total processed: 1328.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 34/48 [3:04:53<1:15:56, 325.44s/batch]


Processed batch of 12 rows. Total processed: 1340.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 35/48 [3:10:27<1:11:05, 328.10s/batch]


Processed batch of 12 rows. Total processed: 1352.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 36/48 [3:16:05<1:06:10, 330.85s/batch]


Processed batch of 12 rows. Total processed: 1364.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 37/48 [3:21:24<1:00:02, 327.50s/batch]


Processed batch of 12 rows. Total processed: 1376.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 38/48 [3:26:43<54:08, 324.84s/batch]


Processed batch of 12 rows. Total processed: 1388.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 39/48 [3:32:03<48:30, 323.40s/batch]


Processed batch of 12 rows. Total processed: 1400.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 40/48 [3:37:24<43:02, 322.81s/batch]


Processed batch of 12 rows. Total processed: 1412.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 41/48 [3:42:48<37:41, 323.02s/batch]


Processed batch of 12 rows. Total processed: 1424.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 42/48 [3:48:12<32:19, 323.31s/batch]


Processed batch of 12 rows. Total processed: 1436.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 43/48 [3:53:41<27:05, 325.14s/batch]


Processed batch of 12 rows. Total processed: 1448.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.▊    | 44/48 [3:59:07<21:41, 325.38s/batch]


Processed batch of 12 rows. Total processed: 1460.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.█▊   | 45/48 [4:04:25<16:09, 323.07s/batch]


Processed batch of 12 rows. Total processed: 1472.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.██▉  | 46/48 [4:09:44<10:43, 321.92s/batch]


Processed batch of 12 rows. Total processed: 1484.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.███▉ | 47/48 [4:15:12<05:23, 323.76s/batch]


Processed batch of 12 rows. Total processed: 1496.


Generating Llama3.1: 100%|███████████████████████████████████████████████████| 48/48 [4:17:17<00:00, 321.62s/batch]


Processed batch of 4 rows. Total processed: 1500.
Processing abstract dataset with Llama 3.1-Instruct
500 out of 1500 rows already generated for Llama3.1.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.                 | 0/84 [00:00<?, ?batch/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 1/84 [02:03<2:50:44, 123.43s/batch]


Processed batch of 12 rows. Total processed: 512.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 2/84 [04:06<2:48:08, 123.03s/batch]


Processed batch of 12 rows. Total processed: 524.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 3/84 [06:12<2:47:48, 124.30s/batch]


Processed batch of 12 rows. Total processed: 536.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 4/84 [08:14<2:44:58, 123.73s/batch]


Processed batch of 12 rows. Total processed: 548.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 5/84 [10:19<2:43:16, 124.00s/batch]


Processed batch of 12 rows. Total processed: 560.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 6/84 [12:23<2:41:07, 123.94s/batch]


Processed batch of 12 rows. Total processed: 572.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 7/84 [14:31<2:40:44, 125.25s/batch]


Processed batch of 12 rows. Total processed: 584.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 8/84 [16:33<2:37:35, 124.41s/batch]


Processed batch of 12 rows. Total processed: 596.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.      | 9/84 [18:36<2:35:02, 124.03s/batch]


Processed batch of 12 rows. Total processed: 608.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 10/84 [20:38<2:32:00, 123.25s/batch]


Processed batch of 12 rows. Total processed: 620.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 11/84 [22:46<2:31:36, 124.61s/batch]


Processed batch of 12 rows. Total processed: 632.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 12/84 [24:57<2:32:02, 126.71s/batch]


Processed batch of 12 rows. Total processed: 644.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 13/84 [27:01<2:28:51, 125.80s/batch]


Processed batch of 12 rows. Total processed: 656.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 14/84 [29:01<2:24:53, 124.19s/batch]


Processed batch of 12 rows. Total processed: 668.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 15/84 [31:04<2:22:13, 123.67s/batch]


Processed batch of 12 rows. Total processed: 680.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 16/84 [33:08<2:20:17, 123.78s/batch]


Processed batch of 12 rows. Total processed: 692.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 17/84 [35:10<2:17:49, 123.43s/batch]


Processed batch of 12 rows. Total processed: 704.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 18/84 [37:20<2:17:56, 125.40s/batch]


Processed batch of 12 rows. Total processed: 716.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 19/84 [39:24<2:15:12, 124.81s/batch]


Processed batch of 12 rows. Total processed: 728.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 20/84 [41:29<2:13:05, 124.77s/batch]


Processed batch of 12 rows. Total processed: 740.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 21/84 [43:34<2:11:12, 124.96s/batch]


Processed batch of 12 rows. Total processed: 752.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 22/84 [45:37<2:08:32, 124.39s/batch]


Processed batch of 12 rows. Total processed: 764.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 23/84 [47:48<2:08:32, 126.43s/batch]


Processed batch of 12 rows. Total processed: 776.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 24/84 [49:48<2:04:33, 124.56s/batch]


Processed batch of 12 rows. Total processed: 788.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 25/84 [51:51<2:01:53, 123.95s/batch]


Processed batch of 12 rows. Total processed: 800.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 26/84 [53:53<1:59:23, 123.50s/batch]


Processed batch of 12 rows. Total processed: 812.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 27/84 [56:00<1:58:09, 124.38s/batch]


Processed batch of 12 rows. Total processed: 824.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 28/84 [58:03<1:55:53, 124.18s/batch]


Processed batch of 12 rows. Total processed: 836.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 29/84 [1:00:08<1:53:51, 124.22s/batch]


Processed batch of 12 rows. Total processed: 848.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 30/84 [1:02:12<1:51:42, 124.11s/batch]


Processed batch of 12 rows. Total processed: 860.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 31/84 [1:04:13<1:48:49, 123.21s/batch]


Processed batch of 12 rows. Total processed: 872.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 32/84 [1:06:25<1:49:08, 125.94s/batch]


Processed batch of 12 rows. Total processed: 884.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 33/84 [1:08:31<1:47:05, 125.99s/batch]


Processed batch of 12 rows. Total processed: 896.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 34/84 [1:10:35<1:44:20, 125.22s/batch]


Processed batch of 12 rows. Total processed: 908.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 35/84 [1:12:37<1:41:38, 124.45s/batch]


Processed batch of 12 rows. Total processed: 920.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 36/84 [1:14:41<1:39:20, 124.17s/batch]


Processed batch of 12 rows. Total processed: 932.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 37/84 [1:16:47<1:37:47, 124.85s/batch]


Processed batch of 12 rows. Total processed: 944.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 38/84 [1:18:52<1:35:37, 124.73s/batch]


Processed batch of 12 rows. Total processed: 956.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 39/84 [1:20:52<1:32:38, 123.52s/batch]


Processed batch of 12 rows. Total processed: 968.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 40/84 [1:22:59<1:31:17, 124.49s/batch]


Processed batch of 12 rows. Total processed: 980.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 41/84 [1:25:04<1:29:17, 124.60s/batch]


Processed batch of 12 rows. Total processed: 992.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 42/84 [1:27:07<1:26:51, 124.09s/batch]


Processed batch of 12 rows. Total processed: 1004.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 43/84 [1:29:13<1:25:11, 124.68s/batch]


Processed batch of 12 rows. Total processed: 1016.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 44/84 [1:31:15<1:22:34, 123.87s/batch]


Processed batch of 12 rows. Total processed: 1028.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 45/84 [1:33:18<1:20:17, 123.52s/batch]


Processed batch of 12 rows. Total processed: 1040.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 46/84 [1:35:23<1:18:36, 124.11s/batch]


Processed batch of 12 rows. Total processed: 1052.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 47/84 [1:37:28<1:16:36, 124.22s/batch]


Processed batch of 12 rows. Total processed: 1064.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 48/84 [1:39:29<1:14:06, 123.50s/batch]


Processed batch of 12 rows. Total processed: 1076.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 49/84 [1:41:31<1:11:40, 122.87s/batch]


Processed batch of 12 rows. Total processed: 1088.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 50/84 [1:43:31<1:09:14, 122.19s/batch]


Processed batch of 12 rows. Total processed: 1100.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 51/84 [1:45:32<1:06:55, 121.67s/batch]


Processed batch of 12 rows. Total processed: 1112.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 52/84 [1:47:36<1:05:20, 122.52s/batch]


Processed batch of 12 rows. Total processed: 1124.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 53/84 [1:49:41<1:03:33, 123.03s/batch]


Processed batch of 12 rows. Total processed: 1136.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 54/84 [1:51:53<1:02:52, 125.76s/batch]


Processed batch of 12 rows. Total processed: 1148.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.   | 55/84 [1:53:57<1:00:37, 125.42s/batch]


Processed batch of 12 rows. Total processed: 1160.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 56/84 [1:56:04<58:40, 125.73s/batch]


Processed batch of 12 rows. Total processed: 1172.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 57/84 [1:58:06<56:09, 124.80s/batch]


Processed batch of 12 rows. Total processed: 1184.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 58/84 [2:00:18<54:57, 126.82s/batch]


Processed batch of 12 rows. Total processed: 1196.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 59/84 [2:02:23<52:39, 126.39s/batch]


Processed batch of 12 rows. Total processed: 1208.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 60/84 [2:04:25<49:58, 124.95s/batch]


Processed batch of 12 rows. Total processed: 1220.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 61/84 [2:06:26<47:23, 123.63s/batch]


Processed batch of 12 rows. Total processed: 1232.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 62/84 [2:08:31<45:29, 124.05s/batch]


Processed batch of 12 rows. Total processed: 1244.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 63/84 [2:10:33<43:13, 123.51s/batch]


Processed batch of 12 rows. Total processed: 1256.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 64/84 [2:12:34<40:53, 122.68s/batch]


Processed batch of 12 rows. Total processed: 1268.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 65/84 [2:14:35<38:44, 122.33s/batch]


Processed batch of 12 rows. Total processed: 1280.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 66/84 [2:16:36<36:34, 121.93s/batch]


Processed batch of 12 rows. Total processed: 1292.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 67/84 [2:18:40<34:44, 122.63s/batch]


Processed batch of 12 rows. Total processed: 1304.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 68/84 [2:20:44<32:47, 122.97s/batch]


Processed batch of 12 rows. Total processed: 1316.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 69/84 [2:22:45<30:34, 122.28s/batch]


Processed batch of 12 rows. Total processed: 1328.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 70/84 [2:24:47<28:31, 122.27s/batch]


Processed batch of 12 rows. Total processed: 1340.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 71/84 [2:26:48<26:25, 122.00s/batch]


Processed batch of 12 rows. Total processed: 1352.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 72/84 [2:28:49<24:19, 121.62s/batch]


Processed batch of 12 rows. Total processed: 1364.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 73/84 [2:30:51<22:18, 121.72s/batch]


Processed batch of 12 rows. Total processed: 1376.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 74/84 [2:32:52<20:13, 121.37s/batch]


Processed batch of 12 rows. Total processed: 1388.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.     | 75/84 [2:34:54<18:16, 121.83s/batch]


Processed batch of 12 rows. Total processed: 1400.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.▏    | 76/84 [2:36:56<16:14, 121.77s/batch]


Processed batch of 12 rows. Total processed: 1412.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.▊    | 77/84 [2:39:00<14:16, 122.40s/batch]


Processed batch of 12 rows. Total processed: 1424.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.█▎   | 78/84 [2:41:01<12:12, 122.00s/batch]


Processed batch of 12 rows. Total processed: 1436.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.█▉   | 79/84 [2:43:02<10:08, 121.75s/batch]


Processed batch of 12 rows. Total processed: 1448.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.██▌  | 80/84 [2:45:03<08:05, 121.49s/batch]


Processed batch of 12 rows. Total processed: 1460.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.███▏ | 81/84 [2:47:06<06:05, 121.95s/batch]


Processed batch of 12 rows. Total processed: 1472.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.███▊ | 82/84 [2:49:06<04:02, 121.20s/batch]


Processed batch of 12 rows. Total processed: 1484.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.████▍| 83/84 [2:51:07<02:01, 121.36s/batch]


Processed batch of 12 rows. Total processed: 1496.


Generating Llama3.1: 100%|███████████████████████████████████████████████████| 84/84 [2:51:56<00:00, 122.82s/batch]

Processed batch of 4 rows. Total processed: 1500.





In [15]:
clear_gpu_memory(model, tokenizer)

In [16]:
torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()
gc.collect()

0

In [10]:
# Load Falcon 7B tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b")
if torch.cuda.is_available():
    model = model.to('cuda')
model.eval()
batch_size = 16

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# Generate Falcon
for task_type, input_path, output_path, checkpoint_path in datasets:
    print(f"Processing {task_type} dataset with Falcon")
    df = pd.read_csv(input_path)
    length_params = get_length_params(task_type)
    process_dataset_in_batches(
        df, model, tokenizer, batch_size, length_params,
        "Falcon", generate_text_falcon_batch,
        output_path, checkpoint_path, target_rows=1500
    )

Processing wikipedia dataset with Falcon
500 out of 1500 rows already generated for Falcon.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.                     | 0/63 [00:00<?, ?batch/s]
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 1/63 [01:32<1:35:07, 92.06s/batch]


Processed batch of 16 rows. Total processed: 516.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 2/63 [03:03<1:33:08, 91.62s/batch]


Processed batch of 16 rows. Total processed: 532.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 3/63 [04:35<1:31:41, 91.69s/batch]


Processed batch of 16 rows. Total processed: 548.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 4/63 [06:06<1:30:06, 91.64s/batch]


Processed batch of 16 rows. Total processed: 564.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 5/63 [07:39<1:29:04, 92.15s/batch]


Processed batch of 16 rows. Total processed: 580.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 6/63 [09:10<1:27:14, 91.83s/batch]


Processed batch of 16 rows. Total processed: 596.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 7/63 [10:43<1:25:48, 91.93s/batch]


Processed batch of 16 rows. Total processed: 612.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 8/63 [12:14<1:24:07, 91.76s/batch]


Processed batch of 16 rows. Total processed: 628.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 9/63 [13:45<1:22:29, 91.66s/batch]


Processed batch of 16 rows. Total processed: 644.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 10/63 [15:17<1:21:00, 91.70s/batch]


Processed batch of 16 rows. Total processed: 660.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 11/63 [16:49<1:19:23, 91.61s/batch]


Processed batch of 16 rows. Total processed: 676.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 12/63 [18:21<1:18:02, 91.82s/batch]


Processed batch of 16 rows. Total processed: 692.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 13/63 [19:53<1:16:31, 91.84s/batch]


Processed batch of 16 rows. Total processed: 708.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 14/63 [21:24<1:14:42, 91.49s/batch]


Processed batch of 16 rows. Total processed: 724.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 15/63 [22:54<1:13:02, 91.31s/batch]


Processed batch of 16 rows. Total processed: 740.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 16/63 [24:26<1:11:29, 91.27s/batch]


Processed batch of 16 rows. Total processed: 756.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 17/63 [25:55<1:09:34, 90.74s/batch]


Processed batch of 16 rows. Total processed: 772.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 18/63 [27:26<1:08:08, 90.85s/batch]


Processed batch of 16 rows. Total processed: 788.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 19/63 [28:56<1:06:24, 90.56s/batch]


Processed batch of 16 rows. Total processed: 804.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 20/63 [30:27<1:04:52, 90.53s/batch]


Processed batch of 16 rows. Total processed: 820.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 21/63 [31:57<1:03:23, 90.55s/batch]


Processed batch of 16 rows. Total processed: 836.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 22/63 [33:29<1:02:10, 91.00s/batch]


Processed batch of 16 rows. Total processed: 852.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 23/63 [35:00<1:00:38, 90.96s/batch]


Processed batch of 16 rows. Total processed: 868.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 24/63 [36:32<59:16, 91.18s/batch]


Processed batch of 16 rows. Total processed: 884.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 25/63 [38:03<57:41, 91.11s/batch]


Processed batch of 16 rows. Total processed: 900.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 26/63 [39:32<55:46, 90.45s/batch]


Processed batch of 16 rows. Total processed: 916.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 27/63 [41:02<54:13, 90.37s/batch]


Processed batch of 16 rows. Total processed: 932.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 28/63 [42:32<52:40, 90.31s/batch]


Processed batch of 16 rows. Total processed: 948.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 29/63 [44:03<51:22, 90.67s/batch]


Processed batch of 16 rows. Total processed: 964.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 30/63 [45:35<49:59, 90.89s/batch]


Processed batch of 16 rows. Total processed: 980.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 31/63 [47:07<48:40, 91.25s/batch]


Processed batch of 16 rows. Total processed: 996.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 32/63 [48:38<47:10, 91.31s/batch]


Processed batch of 16 rows. Total processed: 1012.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 33/63 [50:09<45:29, 91.00s/batch]


Processed batch of 16 rows. Total processed: 1028.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 34/63 [51:39<43:52, 90.78s/batch]


Processed batch of 16 rows. Total processed: 1044.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 35/63 [53:08<42:07, 90.28s/batch]


Processed batch of 16 rows. Total processed: 1060.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 36/63 [54:39<40:40, 90.38s/batch]


Processed batch of 16 rows. Total processed: 1076.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 37/63 [56:10<39:17, 90.67s/batch]


Processed batch of 16 rows. Total processed: 1092.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 38/63 [57:40<37:40, 90.43s/batch]


Processed batch of 16 rows. Total processed: 1108.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 39/63 [59:08<35:56, 89.86s/batch]


Processed batch of 16 rows. Total processed: 1124.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 40/63 [1:00:40<34:36, 90.30s/batch]


Processed batch of 16 rows. Total processed: 1140.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 41/63 [1:02:11<33:14, 90.66s/batch]


Processed batch of 16 rows. Total processed: 1156.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 42/63 [1:03:42<31:43, 90.62s/batch]


Processed batch of 16 rows. Total processed: 1172.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 43/63 [1:05:12<30:12, 90.62s/batch]


Processed batch of 16 rows. Total processed: 1188.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 44/63 [1:06:44<28:47, 90.90s/batch]


Processed batch of 16 rows. Total processed: 1204.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 45/63 [1:08:15<27:18, 91.00s/batch]


Processed batch of 16 rows. Total processed: 1220.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 46/63 [1:09:46<25:44, 90.85s/batch]


Processed batch of 16 rows. Total processed: 1236.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 47/63 [1:11:15<24:07, 90.47s/batch]


Processed batch of 16 rows. Total processed: 1252.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 48/63 [1:12:47<22:41, 90.78s/batch]


Processed batch of 16 rows. Total processed: 1268.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 49/63 [1:14:17<21:09, 90.70s/batch]


Processed batch of 16 rows. Total processed: 1284.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 50/63 [1:15:49<19:41, 90.86s/batch]


Processed batch of 16 rows. Total processed: 1300.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 51/63 [1:17:21<18:15, 91.27s/batch]


Processed batch of 16 rows. Total processed: 1316.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.▌         | 52/63 [1:18:53<16:47, 91.56s/batch]


Processed batch of 16 rows. Total processed: 1332.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.█▍        | 53/63 [1:20:24<15:14, 91.42s/batch]


Processed batch of 16 rows. Total processed: 1348.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.██▎       | 54/63 [1:21:56<13:44, 91.60s/batch]


Processed batch of 16 rows. Total processed: 1364.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.███▏      | 55/63 [1:23:28<12:13, 91.71s/batch]


Processed batch of 16 rows. Total processed: 1380.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.████      | 56/63 [1:25:00<10:42, 91.75s/batch]


Processed batch of 16 rows. Total processed: 1396.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.████▊     | 57/63 [1:26:31<09:09, 91.52s/batch]


Processed batch of 16 rows. Total processed: 1412.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.█████▋    | 58/63 [1:28:02<07:36, 91.27s/batch]


Processed batch of 16 rows. Total processed: 1428.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.██████▌   | 59/63 [1:29:33<06:05, 91.34s/batch]


Processed batch of 16 rows. Total processed: 1444.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.███████▍  | 60/63 [1:31:04<04:33, 91.24s/batch]


Processed batch of 16 rows. Total processed: 1460.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.████████▎ | 61/63 [1:32:35<03:02, 91.18s/batch]


Processed batch of 16 rows. Total processed: 1476.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.█████████▏| 62/63 [1:34:05<01:30, 90.92s/batch]


Processed batch of 16 rows. Total processed: 1492.


Generating Falcon: 100%|██████████████████████████████████████████████████████| 63/63 [1:34:59<00:00, 90.48s/batch]


Processed batch of 8 rows. Total processed: 1500.
Processing news dataset with Falcon
500 out of 1500 rows already generated for Falcon.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.                     | 0/63 [00:00<?, ?batch/s]
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 1/63 [04:24<4:33:11, 264.37s/batch]


Processed batch of 16 rows. Total processed: 516.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 2/63 [07:55<3:57:07, 233.23s/batch]


Processed batch of 16 rows. Total processed: 532.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 3/63 [12:27<4:10:59, 250.99s/batch]


Processed batch of 16 rows. Total processed: 548.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 4/63 [16:13<3:57:04, 241.09s/batch]


Processed batch of 16 rows. Total processed: 564.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 5/63 [20:38<4:01:24, 249.74s/batch]


Processed batch of 16 rows. Total processed: 580.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 6/63 [25:01<4:01:16, 253.98s/batch]


Processed batch of 16 rows. Total processed: 596.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 7/63 [28:52<3:50:16, 246.72s/batch]


Processed batch of 16 rows. Total processed: 612.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 8/63 [33:11<3:49:33, 250.42s/batch]


Processed batch of 16 rows. Total processed: 628.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 9/63 [36:04<3:23:36, 226.23s/batch]


Processed batch of 16 rows. Total processed: 644.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.         | 10/63 [40:08<3:24:47, 231.84s/batch]


Processed batch of 16 rows. Total processed: 660.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.         | 11/63 [44:35<3:30:09, 242.50s/batch]


Processed batch of 16 rows. Total processed: 676.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.         | 12/63 [48:55<3:30:35, 247.75s/batch]


Processed batch of 16 rows. Total processed: 692.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.         | 13/63 [52:35<3:19:38, 239.57s/batch]


Processed batch of 16 rows. Total processed: 708.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.         | 14/63 [57:00<3:21:45, 247.05s/batch]


Processed batch of 16 rows. Total processed: 724.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 15/63 [1:01:17<3:20:13, 250.29s/batch]


Processed batch of 16 rows. Total processed: 740.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 16/63 [1:04:33<3:03:01, 233.65s/batch]


Processed batch of 16 rows. Total processed: 756.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 17/63 [1:08:16<2:56:52, 230.70s/batch]


Processed batch of 16 rows. Total processed: 772.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 18/63 [1:12:18<2:55:24, 233.87s/batch]


Processed batch of 16 rows. Total processed: 788.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 19/63 [1:14:51<2:33:45, 209.67s/batch]


Processed batch of 16 rows. Total processed: 804.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 20/63 [1:19:17<2:42:27, 226.69s/batch]


Processed batch of 16 rows. Total processed: 820.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 21/63 [1:23:40<2:46:17, 237.56s/batch]


Processed batch of 16 rows. Total processed: 836.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 22/63 [1:28:09<2:48:42, 246.89s/batch]


Processed batch of 16 rows. Total processed: 852.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 23/63 [1:32:38<2:49:00, 253.52s/batch]


Processed batch of 16 rows. Total processed: 868.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 24/63 [1:37:00<2:46:25, 256.03s/batch]


Processed batch of 16 rows. Total processed: 884.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 25/63 [1:41:25<2:43:58, 258.90s/batch]


Processed batch of 16 rows. Total processed: 900.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 26/63 [1:44:06<2:21:26, 229.37s/batch]


Processed batch of 16 rows. Total processed: 916.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 27/63 [1:48:19<2:21:51, 236.43s/batch]


Processed batch of 16 rows. Total processed: 932.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 28/63 [1:52:39<2:22:11, 243.75s/batch]


Processed batch of 16 rows. Total processed: 948.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 29/63 [1:57:08<2:22:23, 251.29s/batch]


Processed batch of 16 rows. Total processed: 964.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 30/63 [2:01:31<2:20:01, 254.58s/batch]


Processed batch of 16 rows. Total processed: 980.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 31/63 [2:05:56<2:17:29, 257.80s/batch]


Processed batch of 16 rows. Total processed: 996.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 32/63 [2:10:26<2:15:07, 261.54s/batch]


Processed batch of 16 rows. Total processed: 1012.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 33/63 [2:14:45<2:10:23, 260.80s/batch]


Processed batch of 16 rows. Total processed: 1028.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 34/63 [2:18:25<2:00:03, 248.39s/batch]


Processed batch of 16 rows. Total processed: 1044.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 35/63 [2:21:18<1:45:20, 225.74s/batch]


Processed batch of 16 rows. Total processed: 1060.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 36/63 [2:25:35<1:45:54, 235.34s/batch]


Processed batch of 16 rows. Total processed: 1076.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 37/63 [2:30:03<1:46:07, 244.90s/batch]


Processed batch of 16 rows. Total processed: 1092.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 38/63 [2:34:29<1:44:43, 251.33s/batch]


Processed batch of 16 rows. Total processed: 1108.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 39/63 [2:38:31<1:39:27, 248.63s/batch]


Processed batch of 16 rows. Total processed: 1124.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 40/63 [2:42:46<1:35:57, 250.35s/batch]


Processed batch of 16 rows. Total processed: 1140.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 41/63 [2:47:12<1:33:36, 255.31s/batch]


Processed batch of 16 rows. Total processed: 1156.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 42/63 [2:51:32<1:29:46, 256.52s/batch]


Processed batch of 16 rows. Total processed: 1172.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 43/63 [2:55:54<1:26:05, 258.29s/batch]


Processed batch of 16 rows. Total processed: 1188.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 44/63 [3:00:08<1:21:20, 256.89s/batch]


Processed batch of 16 rows. Total processed: 1204.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 45/63 [3:03:47<1:13:41, 245.66s/batch]


Processed batch of 16 rows. Total processed: 1220.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 46/63 [3:08:09<1:10:58, 250.53s/batch]


Processed batch of 16 rows. Total processed: 1236.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 47/63 [3:12:14<1:06:23, 248.96s/batch]


Processed batch of 16 rows. Total processed: 1252.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.       | 48/63 [3:16:37<1:03:13, 252.93s/batch]


Processed batch of 16 rows. Total processed: 1268.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.         | 49/63 [3:19:25<53:05, 227.56s/batch]


Processed batch of 16 rows. Total processed: 1284.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.         | 50/63 [3:23:50<51:45, 238.88s/batch]


Processed batch of 16 rows. Total processed: 1300.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.         | 51/63 [3:28:14<49:16, 246.39s/batch]


Processed batch of 16 rows. Total processed: 1316.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.         | 52/63 [3:32:39<46:10, 251.86s/batch]


Processed batch of 16 rows. Total processed: 1332.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.▌        | 53/63 [3:36:41<41:30, 249.09s/batch]


Processed batch of 16 rows. Total processed: 1348.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.█▍       | 54/63 [3:40:59<37:45, 251.77s/batch]


Processed batch of 16 rows. Total processed: 1364.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.██▎      | 55/63 [3:45:27<34:11, 256.39s/batch]


Processed batch of 16 rows. Total processed: 1380.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.███      | 56/63 [3:48:45<27:53, 239.05s/batch]


Processed batch of 16 rows. Total processed: 1396.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.███▉     | 57/63 [3:53:01<24:23, 243.97s/batch]


Processed batch of 16 rows. Total processed: 1412.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.████▊    | 58/63 [3:57:27<20:53, 250.62s/batch]


Processed batch of 16 rows. Total processed: 1428.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.█████▋   | 59/63 [4:01:49<16:56, 254.13s/batch]


Processed batch of 16 rows. Total processed: 1444.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.██████▍  | 60/63 [4:06:03<12:41, 253.94s/batch]


Processed batch of 16 rows. Total processed: 1460.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.███████▎ | 61/63 [4:10:32<08:36, 258.48s/batch]


Processed batch of 16 rows. Total processed: 1476.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.████████▏| 62/63 [4:14:47<04:17, 257.47s/batch]


Processed batch of 16 rows. Total processed: 1492.


Generating Falcon: 100%|█████████████████████████████████████████████████████| 63/63 [4:16:23<00:00, 244.19s/batch]


Processed batch of 8 rows. Total processed: 1500.
Processing abstract dataset with Falcon
500 out of 1500 rows already generated for Falcon.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.                     | 0/63 [00:00<?, ?batch/s]
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 1/63 [01:24<1:27:43, 84.89s/batch]


Processed batch of 16 rows. Total processed: 516.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 2/63 [02:46<1:24:32, 83.15s/batch]


Processed batch of 16 rows. Total processed: 532.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 3/63 [04:11<1:23:55, 83.93s/batch]


Processed batch of 16 rows. Total processed: 548.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 4/63 [05:35<1:22:18, 83.70s/batch]


Processed batch of 16 rows. Total processed: 564.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 5/63 [06:55<1:19:43, 82.48s/batch]


Processed batch of 16 rows. Total processed: 580.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 6/63 [08:20<1:19:03, 83.23s/batch]


Processed batch of 16 rows. Total processed: 596.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 7/63 [09:43<1:17:45, 83.31s/batch]


Processed batch of 16 rows. Total processed: 612.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 8/63 [11:04<1:15:37, 82.49s/batch]


Processed batch of 16 rows. Total processed: 628.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.           | 9/63 [12:21<1:12:47, 80.89s/batch]


Processed batch of 16 rows. Total processed: 644.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 10/63 [13:46<1:12:30, 82.08s/batch]


Processed batch of 16 rows. Total processed: 660.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 11/63 [15:11<1:11:51, 82.91s/batch]


Processed batch of 16 rows. Total processed: 676.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 12/63 [16:35<1:10:51, 83.36s/batch]


Processed batch of 16 rows. Total processed: 692.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 13/63 [18:01<1:10:02, 84.05s/batch]


Processed batch of 16 rows. Total processed: 708.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 14/63 [19:20<1:07:35, 82.77s/batch]


Processed batch of 16 rows. Total processed: 724.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 15/63 [20:43<1:06:15, 82.82s/batch]


Processed batch of 16 rows. Total processed: 740.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 16/63 [22:06<1:04:45, 82.67s/batch]


Processed batch of 16 rows. Total processed: 756.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 17/63 [23:25<1:02:32, 81.58s/batch]


Processed batch of 16 rows. Total processed: 772.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 18/63 [24:48<1:01:26, 81.93s/batch]


Processed batch of 16 rows. Total processed: 788.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.          | 19/63 [26:12<1:00:41, 82.76s/batch]


Processed batch of 16 rows. Total processed: 804.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 20/63 [27:34<59:03, 82.41s/batch]


Processed batch of 16 rows. Total processed: 820.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 21/63 [28:59<58:10, 83.10s/batch]


Processed batch of 16 rows. Total processed: 836.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 22/63 [30:22<56:56, 83.32s/batch]


Processed batch of 16 rows. Total processed: 852.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 23/63 [31:47<55:45, 83.65s/batch]


Processed batch of 16 rows. Total processed: 868.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 24/63 [33:05<53:24, 82.16s/batch]


Processed batch of 16 rows. Total processed: 884.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 25/63 [34:27<51:55, 82.00s/batch]


Processed batch of 16 rows. Total processed: 900.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 26/63 [35:52<51:03, 82.80s/batch]


Processed batch of 16 rows. Total processed: 916.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 27/63 [37:16<49:54, 83.17s/batch]


Processed batch of 16 rows. Total processed: 932.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 28/63 [38:38<48:18, 82.82s/batch]


Processed batch of 16 rows. Total processed: 948.


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.            | 29/63 [40:01<46:58, 82.91s/batch]


Processed batch of 16 rows. Total processed: 964.


Generating Falcon:  46%|█████████████████████████▊                              | 29/63 [40:27<47:25, 83.70s/batch]


KeyboardInterrupt: 