# NLP

## Load Libraries

In [25]:
#! pip install sentence_transformers
#! pip install langchain
#! pip install -U langchain-community
# ! pip install langchain-huggingface
# ! pip install faiss-gpu

In [26]:
# General Libraries
import pandas as pd
import numpy as np
import time

import os

# To open Google's json
import gzip
import json

# Deep Learning
import torch
import transformers
from sentence_transformers import SentenceTransformer, util

# RAG
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForCausalLM,pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA

from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain_core.documents.base import Document

import sys
#sys.path.append("./docker-python/patches")
from huggingface_hub import login
#from kaggle_secrets import UserSecretsClient

# Paralelization
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

# Colab
from google.colab import drive
drive.mount('/content/drive')

# Working Directory
import os
os.chdir("/content/drive/MyDrive/nlp")
print(os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/nlp


## Global Variables

In [27]:
DOWNLOAD_GSUTIL = False
GET_FROM_RAW = False
LLAMA_NF = False

## User defined funtions

In [28]:
# Function to process a single file
def process_file(file_path):
    relevant_data = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            full_text = [i['token'] for i in data['document_tokens']]
            full_text = '<<tok>>'.join(full_text)
            annotations_text = data['annotations']
            short_answers = annotations_text[0]['short_answers']
            yes_no_answer = annotations_text[0]['yes_no_answer']
            if (yes_no_answer == 'NONE') & (len(short_answers) == 0):
                continue
            else:
                if len(short_answers) == 0:
                    short_answer = yes_no_answer
                    relevant_data.append([data['example_id'], full_text, data['question_text'], short_answer])
                elif len(short_answers) == 1:
                    short_answer = short_answers[0]
                    relevant_data.append([data['example_id'], full_text, data['question_text'], short_answer])
                else:
                    for short_answer in short_answers:
                        relevant_data.append([data['example_id'], full_text, data['question_text'], short_answer])
    return relevant_data

# Function to prepare file paths and parallelize processing
def parallel_process_files(file_names, dir_x = './v1.0/train/'):
    file_paths = [f'{dir_x}/{file_name}' for file_name in file_names]
    relevant_data = []
    with ProcessPoolExecutor(max_workers=64) as executor:
        results = executor.map(process_file, file_paths)
        for result in results:
            relevant_data.extend(result)  # Collect data from each processed file
    return relevant_data

def list_torch_devices():
    if torch.cuda.is_available():
        num_devices = torch.cuda.device_count()
        print(f"Number of CUDA devices: {num_devices}")

        for i in range(num_devices):
            total_memory_gib = torch.cuda.get_device_properties(i).total_memory / (1024**3)  # Convert to GiB
            allocated_memory_gib = torch.cuda.memory_allocated(i) / (1024**3)  # Convert to GiB
            reserved_memory_gib = torch.cuda.memory_reserved(i) / (1024**3)  # Convert to GiB
            available_memory_gib = total_memory_gib - allocated_memory_gib  # Calculate available memory

            print(f"Device {i}: {torch.cuda.get_device_name(i)}")
            print(f"  Total Memory: {total_memory_gib:.2f} GiB")
            print(f"  Memory Allocated: {allocated_memory_gib:.2f} GiB")
            print(f"  Memory Cached: {reserved_memory_gib:.2f} GiB")
            print(f"  Available Memory: {available_memory_gib:.2f} GiB")
            print()
    else:
        print("CUDA is not available.")


## Data Loading

In [29]:
if DOWNLOAD_GSUTIL:
  ! gsutil -m cp -R gs://natural_questions/v1.0 /home/dvillacreses/nlp

In [30]:
if GET_FROM_RAW:
    ##
    all_files = os.listdir('./v1.0/train')
    all_files = [i for i in all_files if 'jsonl.gz' in i]
    relevant_data = parallel_process_files(all_files,'./v1.0/train')
        # 1 hour 20 minutes
    df = pd.DataFrame(relevant_data,columns=['example_id',"text",'question_text',"annotations"])

    df['type'] = [str(type(i)) for i in df['annotations']]
    df['answer_text'] = ''
    for i in range(df.shape[0]):
        type_annotations = df.loc[i,'type']
        if type_annotations == "<class 'dict'>":
            annotations = df.loc[i,'annotations']
            start_token = annotations['start_token']
            end_token = annotations['end_token']
            tokenized_text = df.loc[i,'text'].split("<<tok>>")
            tokenized_text = " ".join(tokenized_text[start_token:end_token])
            df.loc[i,'answer_text'] = tokenized_text
            # 2 minutes
    df.to_pickle("nq-train.pkl")
        # 1 minute
    df = df.sample(10_000)
    df.to_pickle("nq-train-sample.pkl")

    ##
    all_files = os.listdir('./v1.0/dev')
    all_files = [i for i in all_files if 'jsonl.gz' in i]

    relevant_data = parallel_process_files(all_files,'./v1.0/dev')
        # 1 minute
    df = pd.DataFrame(relevant_data,columns=['example_id',"text",'question_text',"annotations"])

    df['type'] = [str(type(i)) for i in df['annotations']]
    df['answer_text'] = ''
    for i in range(df.shape[0]):
        type_annotations = df.loc[i,'type']
        if type_annotations == "<class 'dict'>":
            annotations = df.loc[i,'annotations']
            start_token = annotations['start_token']
            end_token = annotations['end_token']
            tokenized_text = df.loc[i,'text'].split("<<tok>>")
            tokenized_text = " ".join(tokenized_text[start_token:end_token])
            df.loc[i,'answer_text'] = tokenized_text
            # 2 secs

    df.to_pickle("nq-dev.pkl")
        # 1 sec


if not GET_FROM_RAW:
    df_train = pd.read_pickle("nq-train.pkl")
    df_test = pd.read_pickle("nq-dev.pkl")

## Data Wrangling

In [31]:
df_train = df_train.sample(30_000)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

## EDA

In [32]:
print(f"{df_train.shape=}")
print(f"{df_test.shape=}")

df_train.shape=(30000, 6)
df_test.shape=(3288, 6)


In [33]:
df_train.head(2)

Unnamed: 0,example_id,text,question_text,annotations,type,answer_text
0,5682907781875136555,List<<tok>>of<<tok>>the<<tok>>Canterbury<<tok>...,what is the host's name in canterbury tales,"{'end_byte': 62926, 'end_token': 418, 'start_b...",<class 'dict'>,Harry Bailey
1,-8558875372522180976,The<<tok>>Shadow<<tok>>of<<tok>>Your<<tok>>Smi...,what movie is the song the shadow of your smil...,"{'end_byte': 60120, 'end_token': 225, 'start_b...",<class 'dict'>,The Sandpiper


In [34]:
df_test.head(2)

Unnamed: 0,example_id,text,question_text,annotations,type,answer_text
0,6915606477668963399,Therefore<<tok>>sign<<tok>>-<<tok>>wikipedia<<...,what do the 3 dots mean in math,"{'end_byte': 66817, 'end_token': 837, 'start_b...",<class 'dict'>,the therefore sign ( ∴ ) is generally used bef...
1,-5004457603684974952,Super<<tok>>Bowl<<tok>>50<<tok>>halftime<<tok>...,who is playing the halftime show at super bowl...,"{'end_byte': 58456, 'end_token': 208, 'start_b...",<class 'dict'>,Coldplay with special guest performers Beyoncé...


In [35]:
print("Multiple answers per question, training set:")
display(df_train.groupby(['example_id']).size().value_counts())

print("Multiple answers per question, test set:")
display(df_test.groupby(['example_id']).size().value_counts())

Multiple answers per question, training set:


Unnamed: 0,count
1,26231
2,1195
3,294
4,90
5,21
6,4
8,1


Multiple answers per question, test set:


Unnamed: 0,count
1,2436
2,101
3,40
5,25
4,23
7,14
10,6
6,6
8,5
9,3


## RAG

### Llama no Fine-tuning

In [36]:
# Call the function to list devices
list_torch_devices()
device = torch.device("cuda")

Number of CUDA devices: 1
Device 0: NVIDIA A100-SXM4-40GB
  Total Memory: 39.56 GiB
  Memory Allocated: 15.09 GiB
  Memory Cached: 16.61 GiB
  Available Memory: 24.47 GiB



In [37]:
if LLAMA_NF:
    # Load your LLaMA model (for generation)
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    api_token = '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'

    model = transformers.AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

    # Create the pipeline manually specifying the device
    generation_pipeline = transformers.TextGenerationPipeline(
        model=model,
        tokenizer=tokenizer,
        device=device.index
    )


    # Load the Sentence Transformer model for embeddings
    embedder = SentenceTransformer('all-MiniLM-L6-v2').to(device)

    def retrieve_relevant_chunk(question, document_chunks, document_embeddings, top_k=1):
        with torch.no_grad():  # Disable gradient tracking to save memory
            # Embed the question in half-precision
            question_embedding = embedder.encode(question, convert_to_tensor=True).to(device).half()
            # Compute cosine similarities between the question and document chunks
            similarities = util.pytorch_cos_sim(question_embedding, document_embeddings).to(device)
            # Get the top k most similar chunks
            top_k_indices = torch.topk(similarities, k=top_k).indices.flatten()
            # Retrieve the corresponding document chunks
            relevant_chunks = [document_chunks[i] for i in top_k_indices]
            # Deallocate memory
            del question_embedding, similarities, top_k_indices
            torch.cuda.empty_cache()
        return "\n".join(relevant_chunks)

    t0 = time.time()
    all_res = []
    torch.cuda.empty_cache()
    for i in range(df_test.shape[0]):
        print("-"*100)
        print(i)
        try:
            # Create embeddings for each chunk
            document = df_test.loc[i, 'text'].replace("<<tok>>", " ")
            # Chunk the document into paragraphs (or sentences)
            document_chunks = document.split('\n\n')
            # Create embeddings in half-precision and load onto GPU
            document_embeddings = embedder.encode(document_chunks, convert_to_tensor=True).to(device).half()
            # Simulate a user question
            user_question = df_test.loc[i, 'question_text']
            # Retrieve the most relevant chunk(s)
            relevant_chunk = retrieve_relevant_chunk(user_question, document_chunks, document_embeddings)
            # Pass the relevant chunk and user question to the LLaMA model for generation
            messages = [
                {"role": "system", "content": "Act as a Retrieval Augmented Generation. Your answers must be short and precise. Do not repeat the question."},
                {"role": "user", "content": f"Based on the following context:\n\n{relevant_chunk}\n\nAnswer the question: {user_question}"},
            ]
            # Generate the response using the LLaMA model
            with torch.no_grad():
                outputs = generation_pipeline(messages, max_new_tokens=256)
            # Print the response
            print(user_question)
            print(outputs[0]["generated_text"][2]['content'])
            all_res.append(outputs[0]["generated_text"][2]['content'])
            # Clear memory after processing
            del outputs, document_embeddings
            torch.cuda.empty_cache()
        except Exception as e:
            print(user_question)
            print("Not computable", e)
            all_res.append("Not computable")
            torch.cuda.empty_cache()
    t1 = time.time()
    dev_answers = pd.Series(all_res)
    dev_answers.to_pickle('dev_answers_llama3_nf.pkl')

    del model
    del tokenizer, generation_pipeline, embedder
    torch.cuda.empty_cache()

In [38]:
if LLAMA_NF:
    print(f"Processing time in minutes: {(t1-t0)/60}")
if not LLAMA_NF:
    print(f"Processing time in minutes: {135.28400518894196}")

Processing time in minutes: 135.28400518894196


In [39]:
dev_answers = pd.read_pickle('dev_answers_llama3_nf.pkl')

In [40]:
df_test['llama_nf'] = dev_answers

In [41]:
df_test.head()

Unnamed: 0,example_id,text,question_text,annotations,type,answer_text,llama_nf
0,6915606477668963399,Therefore<<tok>>sign<<tok>>-<<tok>>wikipedia<<...,what do the 3 dots mean in math,"{'end_byte': 66817, 'end_token': 837, 'start_b...",<class 'dict'>,the therefore sign ( ∴ ) is generally used bef...,"In mathematics, the three dots ∴ are used to i..."
1,-5004457603684974952,Super<<tok>>Bowl<<tok>>50<<tok>>halftime<<tok>...,who is playing the halftime show at super bowl...,"{'end_byte': 58456, 'end_token': 208, 'start_b...",<class 'dict'>,Coldplay with special guest performers Beyoncé...,"Coldplay, with special guests Beyoncé and Brun..."
2,7478795216476346339,2017<<tok>>BBC<<tok>>Sports<<tok>>Personality<...,who won the 2017 sports personality of the year,"{'end_byte': 40758, 'end_token': 104, 'start_b...",<class 'dict'>,Mo Farah,Mo Farah won the 2017 BBC Sports Personality o...
3,-6227140278154131101,Sports<<tok>>in<<tok>>the<<tok>>Las<<tok>>Vega...,is there a basketball team in las vegas,YES,<class 'str'>,,Not computable
4,7217222058435937287,World<<tok>>Economic<<tok>>Forum<<tok>>-<<tok>...,where was the world economic forum held this year,"{'end_byte': 149783, 'end_token': 287, 'start_...",<class 'dict'>,"Davos , a mountain resort in Graubünden , in t...",The text does not provide information on the c...


In [42]:
print(f"Total of not computable texts: {(df_test['llama_nf']=='Not computable').sum()}")

Total of not computable texts: 1025


In [43]:
df_tmp = df_test[df_test['llama_nf']!='Not computable']
print(f"Proportion of answers correctly (exactly) answered: {(df_tmp['answer_text']==df_tmp['llama_nf']).mean()}")

Proportion of answers correctly (exactly) answered: 0.0547945205479452


### RAG with Llama

In [44]:
# https://www.kaggle.com/discussions/questions-and-answers/357399
# https://www.datacamp.com/tutorial/rag-vs-fine-tuning

In [45]:
df_train.head()

Unnamed: 0,example_id,text,question_text,annotations,type,answer_text
0,5682907781875136555,List<<tok>>of<<tok>>the<<tok>>Canterbury<<tok>...,what is the host's name in canterbury tales,"{'end_byte': 62926, 'end_token': 418, 'start_b...",<class 'dict'>,Harry Bailey
1,-8558875372522180976,The<<tok>>Shadow<<tok>>of<<tok>>Your<<tok>>Smi...,what movie is the song the shadow of your smil...,"{'end_byte': 60120, 'end_token': 225, 'start_b...",<class 'dict'>,The Sandpiper
2,645564879443758169,Stuck<<tok>>in<<tok>>a<<tok>>Moment<<tok>>You<...,who was stuck in the moment written for,"{'end_byte': 33619, 'end_token': 362, 'start_b...",<class 'dict'>,Michael Hutchence
3,-1462294026599658682,Uncle<<tok>>Grandpa<<tok>>-<<tok>>wikipedia<<t...,who plays the voice of pizza steve on uncle gr...,"{'end_byte': 48326, 'end_token': 1692, 'start_...",<class 'dict'>,Adam DeVine
4,2962896242960384607,Aleutian<<tok>>trench<<tok>>-<<tok>>wikipedia<...,what type of plate boundary is the aleutian is...,"{'end_byte': 55535, 'end_token': 152, 'start_b...",<class 'dict'>,convergent plate boundary


In [46]:
device_name = "cuda"
device = torch.device(device_name)
list_torch_devices()

Number of CUDA devices: 1
Device 0: NVIDIA A100-SXM4-40GB
  Total Memory: 39.56 GiB
  Memory Allocated: 15.09 GiB
  Memory Cached: 16.61 GiB
  Available Memory: 24.47 GiB



In [47]:
t0 = time.time()
# Load pre-trained models
## Login to Hugging Face
api_token = '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'
os.environ['HUGGINGFACE_TOKEN'] = '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'
hf_token = os.getenv('HUGGINGFACE_TOKEN')
login(token=hf_token, add_to_git_credential=True)

# From df to langchain document
google_data = []
for i in range(df_train.shape[0]):
    text,question_text,answer_text = df_train.loc[i,['text','question_text','answer_text']]
    text = text.replace('<<tok>>',' ')

    document = Document(
        #page_content=text,
        page_content=answer_text,
        metadata={
            "question": question_text,
            "answer": answer_text
        }
    )
    google_data.append(document)

# Specify the dataset name
dataset_name = "ruslanmv/ai-medical-chatbot"


# # Create a loader instance using dataset columns
# loader_doctor = HuggingFaceDatasetLoader(dataset_name,"Doctor")

# # Load the data
# doctor_data = loader_doctor.load()

# # Select the first 1000 entries
# doctor_data = doctor_data[:1000]

# Pre-trained embeddings
## Define the path to the embedding model
#modelPath = "sentence-transformers/all-MiniLM-L12-v2"
modelPath = "sentence-transformers/paraphrase-MiniLM-L6-v2"
## GPU acceleration
model_kwargs = {'device':device_name}
## Create a dictionary with encoding options
encode_kwargs = {'normalize_embeddings': False}
## Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# From langchain data to vector database
vector_db = FAISS.from_documents(google_data, embeddings)
#vector_db = FAISS.from_documents(doctor_data, embeddings)
vector_db.save_local("/google_data")


# Retriever
retriever = vector_db.as_retriever()
base_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#base_model = 'meta-llama/Llama-2-7b-hf'
#base_model = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(base_model)

tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=120
)

llm = HuggingFacePipeline(pipeline=pipe)

rag_prompt = hub.pull("rlm/rag-prompt")


qa_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)


# # Question and answers
# question = "Hi Doctor, I have a headache, help me."
# result = qa_chain.invoke(question)
# print(result.split("Answer: ")[1])

t1 = time.time()

Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



In [48]:
print(f"Training time in minutes: {(t1-t0)/60}")

Training time in minutes: 1.0771641651789348


In [None]:
df_test = df_test.sample(int(df_test.shape[0]*0.5), random_state = 0).reset_index(drop=True)
t0 = time.time()
all_res = []
for i in range(df_test.shape[0]):
    print(i)
    question = df_test.loc[i,'question_text']
    result = qa_chain.invoke(question)
    #print(result.split("Answer: ")[1])
    all_res.append(result)

t1 = time.time()

dev_answers = pd.Series(all_res)
dev_answers.to_pickle('dev_answers_llama3_rag.pkl')

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


450


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


451


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


452


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


453


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


454


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


455


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


456


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


457


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


458


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


459


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


460


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


461


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


462


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


463


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


464


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


465


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


466


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


467


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


468


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


469


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


470


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


471


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


472


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


473


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


474


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


475


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


476


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


477


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


478


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


479


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


480


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


481


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


482


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


483


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


484


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


485


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


486


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


487


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


488


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


489


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


490


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


491


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


492


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


493


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


494


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


495


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


496


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


497


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


498


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


499


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


500


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


501


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


502


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


503


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


504


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


505


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


506


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


507


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


508


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


509


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


510


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


511


In [None]:
print(f"Inference time in minutes: {(t1-t0)/60}")