In [1]:
# Import all the required libraries.
import gdown
import os
import urllib
import torch
import time
import datetime
import math
import numpy as np
import faiss
import textwrap
from datasets import Dataset
import pandas as pd

In [2]:
!pip install faiss-cpu datasets



In [3]:
# Import the drive module from the google.colab library
from google.colab import drive

# Mount the Google Drive to the Colab environment
# This allows access to files stored in Google Drive from within the Colab notebook
drive.mount('/content/drive')

# Define the path to the "Files" directory in your Google Drive
files_dir = '/content/drive/My Drive/Files'

# Print a message indicating that the files are being read
print('The files are being read.\n')

# Initialize lists to store titles and articles
titles = []
articles = []

# Initialize a counter to keep track of the number of processed files
i = 0

# Scan each file in the directory named "Files"
for filename in os.listdir(files_dir):
    # Skip files that do not have a .txt extension
    if not filename.endswith('.txt'):
        continue

    # Open the file in text read mode
    with open(os.path.join(files_dir, filename), "r") as f:
        # Decode the filename (without the .txt extension) to a readable title
        title = urllib.parse.unquote(filename[:-4])  # Decode any characters not allowed in URLs
        title = title.replace('_', ' ')  # Replace underscores with spaces

        # Check if the title is empty after stripping whitespace
        if not title.strip():
            print('Empty title for', filename)  # Print a warning message for empty titles
            continue  # Skip this file and proceed to the next one

        # Add the title to the titles list
        titles.append(title)

        # Read the file content
        article = f.read()

        # Add the article to the articles list
        articles.append(article)

    # Increment the counter for each processed file
    i += 1

    # Print progress for every 500 processed files
    if (i % 500) == 0:
        print('Processed {:,}'.format(i))

# Print a message indicating that the reading process is done
print('DONE.\n')

# Print the total number of articles read
print('There are {:,} articles.'.format(len(articles)))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
The files are being read.

DONE.

There are 2 articles.


In [4]:
print('Let us retrieve the all the titles.')
print(titles[0:])
print('\nLet us retrieve all the content.')
print(articles[0:])

Let us retrieve the all the titles.
['Generative pre-trained transformers Text', 'Retrieval Augmented Generation Text']

Let us retrieve all the content.
['Generative pre-trained transformers (GPT) are a type of large language model (LLM)[1][2][3] and a prominent framework for generative artificial intelligence.[4][5] They are artificial neural networks that are used in natural language processing tasks.[6] GPTs are based on the transformer architecture, pre-trained on large data sets of unlabelled text, and able to generate novel human-like content.[2][3] As of 2023, most LLMs have these characteristics[7] and are sometimes referred to broadly as GPTs.[8]\n\nThe first GPT was introduced in 2018 by OpenAI.[9] OpenAI has released very influential GPT foundation models that have been sequentially numbered, to comprise its "GPT-n" series.[10] Each of these was significantly more capable than the previous, due to increased size (number of trainable parameters) and training. The most recent

In [5]:
# Print a message indicating the number of articles before splitting
print('Before splitting, {:,} articles.\n'.format(len(titles)))

# Initialize lists to store passage titles and passages
passage_titles = []
passages = []

# Print a message indicating the start of splitting into chunks
print('Splitting into chunks.')

# Iterate over each title and article in the titles and articles lists
for i in range(len(titles)):
    title = titles[i]
    article = articles[i]

    # Check if the article is empty
    if len(article) == 0:
        # Print a message indicating that an empty article is being skipped
        print('Skipping empty article:', title)
        continue

    # Split the article into words
    words = article.split()

    # Iterate over the words in chunks of 100 words
    for i in range(0, len(words), 100):
        chunk_words = words[i : i + 100]
        chunk = " ".join(chunk_words)
        chunk = chunk.strip()

        # Check if the chunk is empty
        if len(chunk) == 0:
            continue

        # Append the title and chunk to the respective lists
        passage_titles.append(title)
        passages.append(chunk)

# Print a message indicating that the splitting process is done
print('Splitting done.\n')

# Create a dictionary to store the chunked corpus with titles and passages
chunked_corpus = {'title': passage_titles, 'text': passages}

# Print a message indicating the number of passages after splitting
print('After splitting, {:,} "passages".'.format(len(chunked_corpus['title'])))

Before splitting, 2 articles.

Splitting into chunks.
Splitting done.

After splitting, 21 "passages".


In [6]:
# Import the DPRQuestionEncoderTokenizer class from the transformers library
from transformers import DPRQuestionEncoderTokenizer

# Initialize the tokenizer for context encoding from the pretrained 'facebook/dpr-ctx_encoder-multiset-base' model
ctx_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-multiset-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# Get the number of passages in the chunked corpus
num_passages = len(chunked_corpus['title'])

# Print a message indicating the start of tokenizing passages for DPR
print('Tokenizing {:,} passages for DPR...'.format(num_passages))

# Tokenize the passage titles and texts using the DPRQuestionEncoderTokenizer
# Note: This tokenizer takes both titles and texts for context encoding
outputs = ctx_tokenizer(
    chunked_corpus["title"],  # List of passage titles
    chunked_corpus["text"],   # List of passage texts
    truncation=True,          # Truncate the sequences to the maximum length supported by the model
    padding="longest",        # Pad sequences to the longest sequence in the batch
    return_tensors="pt"       # Return PyTorch tensors
)

# Print a message indicating the completion of the tokenization process
print('Tokenization process completed.')

# Retrieve the input IDs from the tokenization outputs
input_ids = outputs["input_ids"]

# Print the shape of the input_ids tensor
print(input_ids.shape)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Tokenizing 21 passages for DPR...
Tokenization process completed.
torch.Size([21, 238])


In [8]:
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU is available.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [9]:
# Import the DPRContextEncoder class from the transformers library
from transformers import DPRContextEncoder

# Initialize the DPRContextEncoder model from the pretrained 'facebook/dpr-ctx_encoder-multiset-base' model
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-multiset-base")

# Move the DPRContextEncoder model to the specified device (GPU if available, otherwise CPU)
ctx_encoder = ctx_encoder.to(device=device)

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-multiset-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Define a function to format elapsed time into a human-readable string
def format_time(elapsed):
    # Round the elapsed time to the nearest second
    elapsed_rounded = int(round(elapsed))

    # Convert the rounded elapsed time from seconds to a string in the format HH:MM:SS
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [11]:
import time
torch.set_grad_enabled(False)

t0 = time.time()
step = 0
batch_size = 16
num_passages = input_ids.size()[0]  # Assuming input_ids is defined earlier as the tokenized passage inputs
num_batches = math.ceil(num_passages / batch_size)
embeds_batches = []

print('Generating embeddings for {:,} passages...'.format(num_passages))

for i in range(0, num_passages, batch_size):
    batch_ids = input_ids[i:i + batch_size].to(device)  # Get the batch input IDs and move them to the device

    outputs = ctx_encoder(batch_ids, return_dict=True)
    embeddings = outputs["pooler_output"].cpu().numpy()  # Move embeddings to CPU and convert to numpy array
    embeds_batches.append(embeddings)  # Collect embeddings in batches

    step += 1

    if step % 100 == 0:
        elapsed = format_time(time.time() - t0)
        print('Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, num_batches, elapsed))

print('Done.')

# Concatenate all the embedding batches into a single array
embeddings = np.concatenate(embeds_batches, axis=0)

# Print the size of the dataset embeddings to verify the shape of the concatenated array
print('Size of dataset embeddings:', embeddings.shape)

Generating embeddings for 21 passages...
Done.
Size of dataset embeddings: (21, 768)


In [12]:
# Concatenate all the embedding batches into a single array
# The concatenation is done along the first axis (rows) to create a single array of embeddings
embeddings = np.concatenate(embeds_batches, axis=0)

# Print the size of the dataset embeddings to verify the shape of the concatenated array
print('Size of dataset embeddings:', embeddings.shape)

Size of dataset embeddings: (21, 768)


In [13]:
# Set the dimensionality of the vectors to be indexed
dim = 768

# Set the number of neighbors for the HNSW (Hierarchical Navigable Small World) index
m = 128

# Create a HNSW index for dense vectors using inner product as the similarity metric
# This index type is useful for approximate nearest neighbor search
index = faiss.IndexHNSWFlat(dim, m, faiss.METRIC_INNER_PRODUCT)

In [14]:
# Print a message indicating the start of building the FAISS index
print('Building of the FAISS index is in progress.')

# Record the current time to measure the duration of the indexing process
t0 = time.time()

# Train the FAISS index with the embeddings
index.train(embeddings)

# Add the embeddings to the FAISS index
index.add(embeddings)

# Print a message indicating the completion of the indexing process
print('Done.')

# Print the time taken to add the embeddings to the index
print('Adding embeddings to index took', format_time(time.time() - t0))

Building of the FAISS index is in progress.
Done.
Adding embeddings to index took 0:00:00


In [15]:
# Import the DPRQuestionEncoder class from the transformers library
from transformers import DPRQuestionEncoder

# Initialize the question encoder with the pretrained 'facebook/dpr-question_encoder-multiset-base' model
q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-multiset-base")

# Move the question encoder model to the specified device (GPU or CPU)
q_encoder = q_encoder.to(device=device)

Some weights of the model checkpoint at facebook/dpr-question_encoder-multiset-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
from transformers import DPRQuestionEncoderTokenizer, DPRQuestionEncoder

# Initialize the question tokenizer with the pretrained 'facebook/dpr-question_encoder-multiset-base' model
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-multiset-base")

# Initialize the question encoder with the pretrained 'facebook/dpr-question_encoder-multiset-base' model
q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-multiset-base")

# Move the question encoder model to the specified device (GPU or CPU)
q_encoder = q_encoder.to(device)

# Encode the query into input IDs using the question tokenizer, returning PyTorch tensors
input_ids = q_tokenizer.encode("How many models have been created by Cerebras?", return_tensors = "pt")

# Move the input IDs tensor to the specified device (GPU or CPU)
input_ids = input_ids.to(device)

# Pass the input IDs through the question encoder to obtain the outputs
outputs = q_encoder(input_ids)

# Extract the query embedding from the outputs
q_embed = outputs['pooler_output']

# Move the query embedding tensor to the CPU and convert it to a NumPy array
q_embed = q_embed.cpu().numpy()

# Print the shape of the query embedding
print("Query embedding:", q_embed.shape)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-multiset-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Query embedding: (1, 768)


In [18]:
# Search the FAISS index with the query embedding to find the k closest matches
D, I = index.search(q_embed, k=3)

# Print the indices of the closest matching passages
print('Closest matching indices:', I)

# Print the inner product scores of the closest matches
print('Inner products:', D)

Closest matching indices: [[1 6 2]]
Inner products: [[67.11247 65.05115 64.29021]]


In [19]:
# Initialize a text wrapper to format the passage text to a specified width
wrapper = textwrap.TextWrapper(width=80)

# Iterate over the indices of the closest matching passages
for i in I[0]:
  # Print the index of the matching passage
  print('Index:', i)

  # Retrieve the title of the matching passage from the chunked corpus
  title = chunked_corpus['title'][i]

  # Retrieve the text of the matching passage from the chunked corpus
  passage = chunked_corpus['text'][i]

  # Print the title of the matching article
  print('Article Title: ', title, '\n')

  # Print the formatted passage text
  print('Passage:')
  print(wrapper.fill(passage))
  print('')

Index: 1
Article Title:  Generative pre-trained transformers Text 

Passage:
these was significantly more capable than the previous, due to increased size
(number of trainable parameters) and training. The most recent of these, GPT-4,
was released in March 2023.[11] Such models have been the basis for their more
task-specific GPT systems, including models fine-tuned for instruction
following—which in turn power the ChatGPT chatbot service.[1] The term "GPT" is
also used in the names and descriptions of such models developed by others. For
example, other GPT foundation models include a series of models created by
EleutherAI,[12] and seven models created by Cerebras in 2023.[13] Also,
companies in different industries have developed

Index: 6
Article Title:  Generative pre-trained transformers Text 

Passage:
whereas ChatGPT is further trained for conversational interaction with a human
user.[30][31] OpenAI's most recent GPT foundation model, GPT-4, was released on
March 14, 2023. It can

In [20]:
chunked_corpus = {'title': passage_titles, 'text': passages}

In [21]:
# Create a DataFrame from the chunked_corpus dictionary
# This converts the chunked_corpus (a dictionary with titles and texts) into a pandas DataFrame
df = pd.DataFrame(chunked_corpus)

# Convert the pandas DataFrame into a Dataset object from the datasets library
# This allows for easy handling and manipulation of the data in the dataset format
dataset = Dataset.from_pandas(df)

# Print the dataset to verify its contents
# This displays the structure and contents of the dataset
print(dataset)

Dataset({
    features: ['title', 'text'],
    num_rows: 21
})


In [22]:
# Initialize an empty list to store the embeddings
embs = []

# Iterate over each row in the embeddings matrix
for i in range(embeddings.shape[0]):
  # Append the i-th embedding (a row from the matrix) to the embs list
  embs.append(embeddings[i, :])

In [23]:
dataset = dataset.add_column("embeddings", embs) # Add the embeddings as a new column to the dataset
dataset # Display the dataset contents.

Dataset({
    features: ['title', 'text', 'embeddings'],
    num_rows: 21
})

In [24]:
# Initialize the FAISS index with the specified dimensions and metric
index = faiss.IndexHNSWFlat(dim, m, faiss.METRIC_INNER_PRODUCT)

# Add the FAISS index to the dataset for the embeddings column
dataset.add_faiss_index(
    column="embeddings",       # Column name in the dataset to index
    index_name="embeddings",   # Name of the index to be created
    custom_index=index,        # Custom FAISS index to use
    faiss_verbose=True         # Verbose output from FAISS (shows progress)
)

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['title', 'text', 'embeddings'],
    num_rows: 21
})

In [25]:
from transformers import RagRetriever
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",
    use_dummy_dataset = False,
    indexed_dataset = dataset,
    index_name = "embeddings"
)



config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

(…)_encoder_tokenizer/tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

question_encoder_tokenizer/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)ncoder_tokenizer/special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.


(…)enerator_tokenizer/tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

generator_tokenizer/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

generator_tokenizer/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)erator_tokenizer/special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizerFast'.


In [26]:
from transformers import RagTokenizer
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [27]:
# Import the RagSequenceForGeneration class from the transformers library
from transformers import RagSequenceForGeneration

# Load the pre-trained RAG sequence model from Facebook's "rag-sequence-nq"
# Specify the retriever to be used with the model
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)

pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/rag-sequence-nq were not used when initializing RagSequenceForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.bias', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing RagSequenceForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagSequenceForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:
# Record the start time for measuring the response time
t0 = time.time()

# Define the question to be asked
question = "How many models have been created by Cerebras?"

# Tokenize the question using the question encoder tokenizer
# Convert the question into input IDs, which are numerical representations of the tokens
input_ids = tokenizer.question_encoder(question, return_tensors="pt")["input_ids"]

# Generate an answer using the RAG model
# The model generates a response based on the input IDs of the question
generated = model.generate(input_ids)

# Decode the generated answer from the model's output tokens to a readable string
generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

# Print the question and the generated answer
print("Q: " + question)
print("A: " + generated_string)

# Print the time taken to generate the response
print('\nResponse took %.2f seconds' % (time.time() - t0))



Q: How many models have been created by Cerebras?
A:  seven

Response took 177.28 seconds


In [29]:
# Define a function to ask a question and get a response from the model
def ask_question(question):
  # Record the start time to measure the response time
  t0 = time.time()

  # Tokenize the question using the question encoder tokenizer
  # Convert the question into input IDs, which are numerical representations of the tokens
  input_ids = tokenizer.question_encoder(question, return_tensors="pt")["input_ids"]

  # Generate an answer using the RAG model
  # The model generates a response based on the input IDs of the question
  generated = model.generate(input_ids)

  # Decode the generated answer from the model's output tokens to a readable string
  generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

  # Print the question and the generated answer
  print("Q: " + question)
  print("A: '{:}'".format(generated_string))

  # Print the time taken to generate the response
  print('\nResponse took %.2f seconds' % (time.time() - t0))

In [30]:
ask_question("How many models have been created by Cerebras?")

Q: How many models have been created by Cerebras?
A: ' seven'

Response took 176.93 seconds


In [31]:
ask_question("What is Hierarchical RAG?")



Q: What is Hierarchical RAG?
A: ' multi-level retrieval'

Response took 235.99 seconds
