In [None]:
import torch 
import os
import requests 
import fitz
from tqdm import tqdm 
import numpy as np 

In [None]:
# Expose only the GPU with index 1 to this process; adjust or remove on machines with a different GPU layout.
os.environ['CUDA_VISIBLE_DEVICES']='1'
# Use CUDA when it is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
device

In [None]:
pdf_path = 'visual para.pdf'

In [None]:
def text_formatter(text: str) -> str:
    """Turn every newline into a single space and trim surrounding whitespace.

    Args:
        text: Raw text as extracted from a PDF page.

    Returns:
        The text on one line, without leading or trailing whitespace.
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()


#     return pages_and_texts
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Read a PDF and collect the cleaned text and simple statistics of each page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One dict per page, in page order, with the keys ``page_number``
        (0-based), ``page_char_count``, ``page_word_count``,
        ``page_sentence_count_raw``, ``page_token_count`` and ``text``.
    """
    pages_and_texts = []
    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as doc:
        n = len(doc)
        print(n)
        # enumerate() hides the length from tqdm, so pass the total explicitly.
        for page_number, page in tqdm(enumerate(doc), total=n):
            text = text_formatter(page.get_text())  # plain text of the page, newlines removed
            pages_and_texts.append({"page_number": page_number,
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                    "text": text})
    return pages_and_texts


In [None]:
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:15]

In [None]:
import random 
random.sample(pages_and_texts, k = 3)

In [None]:
import pandas as pd 

df = pd.DataFrame(pages_and_texts)
df.head()

In [None]:
df.describe().round(2)

In [None]:
from sentence_transformers import SentenceTransformer

# Embedding model used for the rest of the notebook (chunk embeddings and query embeddings).
model = SentenceTransformer("Alibaba-NLP/gte-Qwen2-1.5B-instruct", trust_remote_code=True)
# Maximum sequence length used by the model; reduce this value if shorter inputs suffice.
model.max_seq_length = 8192

# Sanity check with two sample queries and two sample documents.
queries = [
    "how much protein should a female eat",
    "summit define",
]
documents = [
    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.",
]

# Queries are encoded with the "query" prompt; documents are encoded without a prompt.
query_embeddings = model.encode(queries, prompt_name="query")
document_embeddings = model.encode(documents)

# Pairwise query/document dot products, scaled by 100.
scores = (query_embeddings @ document_embeddings.T) * 100
print(scores.tolist())

In [None]:
# import torch
# import torch.nn.functional as F

# from torch import Tensor
# from transformers import AutoTokenizer, AutoModel


# def last_token_pool(last_hidden_states: Tensor,
#                  attention_mask: Tensor) -> Tensor:
#     left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
#     if left_padding:
#         return last_hidden_states[:, -1]
#     else:
#         sequence_lengths = attention_mask.sum(dim=1) - 1
#         batch_size = last_hidden_states.shape[0]
#         return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]


# def get_detailed_instruct(task_description: str, query: str) -> str:
#     return f'Instruct: {task_description}\nQuery: {query}'


# # Each query must come with a one-sentence instruction that describes the task
# task = 'Given a web search query, retrieve relevant passages that answer the query'
# queries = [
#     get_detailed_instruct(task, 'how much protein should a female eat'),
#     get_detailed_instruct(task, 'summit define')
# ]
# # No need to add instruction for retrieval documents
# documents = [
#     "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
#     "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments."
# ]
# input_texts = queries + documents

# tokenizer = AutoTokenizer.from_pretrained('Alibaba-NLP/gte-Qwen2-1.5B-instruct', trust_remote_code=True)
# model = AutoModel.from_pretrained('Alibaba-NLP/gte-Qwen2-1.5B-instruct', trust_remote_code=True)

# max_length = 8192

# # Tokenize the input texts
# batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
# outputs = model(**batch_dict)
# embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# # normalize embeddings
# embeddings = F.normalize(embeddings, p=2, dim=1)
# scores = (embeddings[:2] @ embeddings[2:].T) * 100
# print(scores.tolist())


In [None]:
from spacy.lang.en import English 

# English pipeline with only the 'sentencizer' component added, used here to try out sentence splitting.
nlp = English()
nlp.add_pipe('sentencizer')

doc = nlp("Hi, I saw you standing there. What were you doing?")

# assert(len(list(doc.sents))) == 3

# Materialise the sentences as a list so the next cells can inspect their types.
s = list(doc.sents)

In [None]:
type(s)

In [None]:
type(s[0])

In [None]:
import nltk

# Same sample text as the spaCy cell, split with NLTK; this is the splitter applied to the PDF pages below.
txt = "Hi, I saw you standing there. What were you doing?"
l = nltk.tokenize.sent_tokenize(txt, language='english')
l

In [None]:
type(l)

In [None]:
type(l[0])

In [None]:
pages_and_texts[6]

In [None]:
# Attach the NLTK sentence list and its length to every page record.
for page_record in tqdm(pages_and_texts):
    page_sentences = nltk.tokenize.sent_tokenize(page_record['text'], language='english')
    page_record["sentences"] = page_sentences
    page_record['page_sentence_count_nltk'] = len(page_sentences)

In [None]:
pages_and_texts[9]

In [None]:
df = pd.DataFrame(pages_and_texts)
df.head()

In [None]:
df.describe().round(2)

In [None]:
"""We will chunk sentences into groups of 5 """

In [None]:
# Number of sentences per chunk. The note above this cell mentions 5; the value actually used is 10.
chunk_size = 10


def chunking(input_list, chunk_size):
    """Split ``input_list`` into consecutive slices of at most ``chunk_size`` items.

    The last slice is shorter when the length is not a multiple of ``chunk_size``.
    """
    pieces = []
    for start in range(0, len(input_list), chunk_size):
        pieces.append(input_list[start:start + chunk_size])
    return pieces


# Quick check: 21 items with a chunk size of 10 give slices of 10, 10 and 1.
test = list(range(21))

chunking(test, chunk_size)


In [None]:
# Group each page's sentences into chunks and record how many chunks the page produced.
for page_record in tqdm(pages_and_texts):
    sentence_groups = chunking(page_record['sentences'], chunk_size)
    page_record["chunks"] = sentence_groups
    page_record['num_chunks'] = len(sentence_groups)

In [None]:
pages_and_texts[6]['chunks']

In [None]:
pages_and_texts[6]['num_chunks']

In [None]:
df = pd.DataFrame(pages_and_texts)
df.head()

In [None]:
df.describe().round(2)

In [None]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for chunk in item["chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # Join the sentences into one paragraph-like string.
        # They are joined with a space: joining with "" glued neighbouring sentences together
        # ("...there?What..."), and the regex below only repairs the ".<Capital>" case.
        joined_sentence_chunk = " ".join(chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo
        chunk_dict["chunks"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split(" "))
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters

        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

In [None]:
pages_and_texts[6]['chunks']

In [None]:
pages_and_chunks[6]

In [None]:
pages_and_texts[6]['num_chunks']

In [None]:
chunk_dict

In [None]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

In [None]:
random.sample(pages_and_chunks, k = 1 )

Filter out very short chunks, since they may not contain much information.

In [None]:
"""Removing very short chunks """

In [None]:
min_token_len = 20

In [None]:
df[df["chunk_token_count"] <= min_token_len]["chunks"]

In [None]:
# Preview up to two randomly chosen chunks at or below the token threshold.
# The sample size is capped so the cell does not raise when fewer than two chunks qualify.
short_chunks = df[df["chunk_token_count"] <= min_token_len]
for _, row in short_chunks.sample(min(2, len(short_chunks))).iterrows():
    print(f'CHunk token count : {row["chunk_token_count"]} | text : {row["chunks"]}')


In [None]:
# Keep only chunks above the token threshold, as a list of dicts (one per chunk).
# The embeddings created later are computed for this filtered list, so retrieval indices refer to it.
pages_and_chunks_over_threshold = df[df["chunk_token_count"] > min_token_len].to_dict(orient="records")
pages_and_chunks_over_threshold[:2]

In [None]:
random.sample(pages_and_chunks_over_threshold, k =2)

Embedding chunks 

In [None]:
""" 
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Alibaba-NLP/gte-Qwen2-1.5B-instruct", trust_remote_code=True)
# In case you want to reduce the maximum length:
model.max_seq_length = 8192

queries = [
    "how much protein should a female eat",
    "summit define",
]
documents = [
    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments.",
]

query_embeddings = model.encode(queries, prompt_name="query")
document_embeddings = model.encode(documents)

scores = (query_embeddings @ document_embeddings.T) * 100
print(scores.tolist())


"""

In [None]:
"""Testing """

test_sentences = ["Testing a local rag system.", " I hope this works.", "I am wasting too much time on this."]

test_embeddings = model.encode(test_sentences)
embeddings_dict = dict(zip(test_sentences, test_sentences))

In [None]:
# See the embeddings
for test_sentences, test_sentences in embeddings_dict.items():
    print("Sentence:", test_sentences)
    print("Embedding:", test_embeddings)
    print("Embedding size :", test_embeddings.shape)
    print("")

In [None]:
test_embeddings[0].shape

In [None]:
text_chunks = [item["chunks"] for item in pages_and_chunks_over_threshold]
text_chunks[10]

In [None]:
len(text_chunks)

In [None]:
# Embed every retained chunk individually and store the resulting NumPy vector on its record
for chunk_record in tqdm(pages_and_chunks_over_threshold):
    chunk_text = chunk_record["chunks"]
    chunk_record["embedding"] = model.encode(chunk_text, batch_size=32, convert_to_numpy=True)

In [None]:
# text_chunk_embeddings = model.encode(text_chunks,
#                                                batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
#                                                convert_to_tensor=True) # optional to return embeddings as tensor instead of array

# text_chunk_embeddings

In [None]:
pages_and_chunks_over_threshold[0]

In [None]:
# text_chunk_embeddings.shape

In [None]:
# Write the chunk records (text, statistics and embeddings) to a CSV file.
# NOTE(review): the "embedding" column presumably comes back as text when this CSV is read again — confirm;
# the pickle file written further below is what the retrieval cells load.
text_chunk_embeddings_df = pd.DataFrame(pages_and_chunks_over_threshold)
embedding_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunk_embeddings_df.to_csv(embedding_df_save_path, index=False, escapechar='\\')


In [None]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embedding_df_save_path)
text_chunks_and_embedding_df_load.head()

In [None]:
# pages_and_chunks_over_threshold is a list of dicts; each dict carries an 'embedding' entry added by the embedding loop above
text_chunk_embeddings_df = pd.DataFrame(pages_and_chunks_over_threshold)

# Save the entire DataFrame including embeddings using pickle
text_chunk_embeddings_df.to_pickle("text_chunks_and_embeddings.pkl")

In [None]:
# Load the entire DataFrame including embeddings using pickle
# (only load pickle files you created yourself: unpickling untrusted files can execute arbitrary code)
text_chunks_and_embedding_df = pd.read_pickle("text_chunks_and_embeddings.pkl")

# Stack the per-chunk embeddings into one float32 tensor with one row per chunk and move it to `device`
embeddings_tensor = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
print(embeddings_tensor.shape)

RAG search and answer 

In [None]:
import pandas as pd
import numpy as np
import torch

# Load DataFrame from pickle file (same file and same steps as the previous loading cell)
text_chunks_and_embedding_df = pd.read_pickle("text_chunks_and_embeddings.pkl")

# Stack the 'embedding' column into one float32 tensor.
# Unlike the previous cell, the tensor is not moved to `device` here; a later cell does that.
embeddings_tensor = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32)

# Build a DataFrame from the loaded one
loaded_df = pd.DataFrame(text_chunks_and_embedding_df)

# Ensure 'embedding' column remains as numpy arrays
loaded_df["embedding"] = loaded_df["embedding"].apply(lambda x: np.array(x))

# Now you have a DataFrame 'loaded_df' which should be structurally similar to 'text_chunk_embeddings_df'


In [None]:
embeddings_tensor[0].shape

In [None]:
embeddings_tensor.shape

In [None]:
loaded_df.head()

In [None]:
# # import random

# # import torch
# import numpy as np 
# import pandas as pd

# # device = "cuda" if torch.cuda.is_available() else "cpu"

# # Import texts and embedding df
# text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# # Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
# # text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# # Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
# # text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep="  "))

# # embeddings = torch.tensor(np.stack(text_chunks_and_embedding_df["embedding"].to_list(), axis=0))


# embeddings_from_df = text_chunk_embeddings_df["embedding"].tolist()
# # embeddings_from_df

# embeddings = torch.tensor(embeddings_from_df, dtype=torch.float32).to(device)

# # # Convert texts and embedding df to list of dicts
# # pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")



In [None]:
# import numpy as np
# import pandas as pd
# import ast
# import torch

# # device = "cuda" if torch.cuda.is_available() else "cpu"

# # Import texts and embedding df
# text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# def safe_literal_eval(val):
#     try:
#         return np.array(ast.literal_eval(val))
#     except (SyntaxError, ValueError):
#         # Handle cases where the string might not be properly formatted
#         return np.array([])

# # Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
# text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(safe_literal_eval)

# # Check for and remove any empty arrays resulting from parsing errors
# text_chunks_and_embedding_df = text_chunks_and_embedding_df[text_chunks_and_embedding_df["embedding"].apply(len) > 0]

# # Stack the numpy arrays into a single numpy array
# embeddings = np.stack(text_chunks_and_embedding_df["embedding"].to_list(), axis=0)

# # Convert the numpy array to a torch tensor
# embeddings = torch.tensor(embeddings, dtype=torch.float32).to(device)

# # # Convert texts and embedding df to list of dicts
# # pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")


In [None]:
# embeddings 

In [None]:
# embeddings = (embeddings_from_df).to(device)

# Convert texts and embedding df to list of dicts
# pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")



In [None]:
loaded_df

In [None]:
embeddings_tensor

In [None]:
embeddings_tensor.shape

In [None]:
loaded_df.head()

In [None]:
pages_and_chunks

In [None]:
# text_chunks_and_embedding_df["embedding"]

Query embedding and stored embedding matching

In [None]:
from sentence_transformers import util 

In [None]:
embeddings_tensor = embeddings_tensor.to(device)

In [None]:
query = "ZoDiac Watermarking"
print(f"query : {query}")

# Encode the query with the model's "query" prompt, as in the sanity-check cell near the top;
# the stored chunk embeddings were encoded without a prompt.
query_embeddings = model.encode(query, prompt_name="query", convert_to_tensor=True).to(device)

# One dot-product score per stored chunk embedding.
dot_scores = util.dot_score(a= query_embeddings, b=embeddings_tensor)[0]

In [None]:
top_k_dot_results = torch.topk(dot_scores,k=5)

In [None]:
top_k_dot_results

In [None]:
# Rows of embeddings_tensor correspond to pages_and_chunks_over_threshold (the filtered list that was embedded),
# so retrieval indices must be looked up there rather than in the unfiltered pages_and_chunks.
pages_and_chunks_over_threshold[11]

In [None]:
query_embeddings.dtype

In [None]:
# query_embeddings

In [None]:
# embeddings[0].dtype

In [None]:
import textwrap

def print_wrapped(text, wrap_length=80):
    """Print ``text`` re-flowed so that no line is longer than ``wrap_length`` characters."""
    print(textwrap.fill(text, width=wrap_length))

In [None]:
query = "Tree-Ring Watermarking"
print(f"query : {query}")

# Score this query before printing; otherwise the results shown would still belong to the previous query.
query_embeddings = model.encode(query, prompt_name="query", convert_to_tensor=True).to(device)
dot_scores = util.dot_score(a=query_embeddings, b=embeddings_tensor)[0]
top_k_dot_results = torch.topk(dot_scores, k=5)

for value, index in zip(top_k_dot_results[0], top_k_dot_results[1]):
    print(f"Score: {value:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    # Indices refer to the filtered list that was embedded, not to the unfiltered pages_and_chunks.
    print("Text:")
    print_wrapped(pages_and_chunks_over_threshold[index]["chunks"])
    # Print the page number too so we can reference the source document further (and check the results)
    print(f"Page number: {pages_and_chunks_over_threshold[index]['page_number']}")
    print("\n")

Functions for semantic search

In [None]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.

    Args:
        query: Text to search for.
        embeddings: 2-D tensor with one row per stored chunk embedding.
        model: Sentence-transformers model used to embed the query.
        n_resources_to_return: Maximum number of results; capped at the number of rows in embeddings.
        print_time: When True, print how long the scoring step took.

    Returns:
        (scores, indices): two 1-D tensors in descending score order; indices are row positions in embeddings.
    """
    from time import perf_counter  # local import so this cell does not rely on another cell's imports

    # Use the model's "query" prompt when it defines one (the sanity-check cell encodes queries this way);
    # models without such a prompt are encoded as before.
    encode_kwargs = {"convert_to_tensor": True}
    if "query" in (getattr(model, "prompts", None) or {}):
        encode_kwargs["prompt_name"] = "query"
    query_embedding = model.encode(query, **encode_kwargs)

    # The dot product needs both operands on the same device and with the same dtype.
    query_embedding = query_embedding.to(device=embeddings.device, dtype=embeddings.dtype)

    # Get dot product scores on embeddings
    start_time = perf_counter()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    elapsed = perf_counter() - start_time
    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {elapsed:.5f} seconds.")

    # torch.topk raises when k exceeds the number of scores, so cap it.
    k = min(n_resources_to_return, dot_scores.shape[0])
    scores, indices = torch.topk(input=dot_scores,
                                 k=k)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks_over_threshold,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Args:
        query: Text to search for.
        embeddings: 2-D tensor with one row per stored chunk embedding.
        pages_and_chunks: Chunk records in the same order as the rows of embeddings; each needs the keys
            "chunks" and "page_number". Defaults to the filtered list the embeddings were computed from
            (the unfiltered list would map indices to the wrong chunks).
        n_resources_to_return: Maximum number of results to print.
    """

    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)

    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indices
    for score, index in zip(scores, indices):
        record = pages_and_chunks[int(index)]
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(record["chunks"])
        # Print the page number too so we can reference the source document further and check the results
        print(f"Page number: {record['page_number']}")
        print("\n")

In [None]:
# `embeddings` is never defined in this notebook; the stored chunk embeddings live in `embeddings_tensor`.
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings_tensor)
scores, indices

In [None]:
# Print out the texts of the top scores
# (`embeddings` is never defined in this notebook; the stored chunk embeddings live in `embeddings_tensor`)
print_top_results_and_scores(query=query,
                             embeddings=embeddings_tensor)

LLMs

GPT-2

In [None]:
# # https://github.com/huggingface/transformers/blob/25245ec26dc29bcf6102e1b4ddd0dfd02e720cf5/src/transformers/generation/logits_process.py#L411
# from transformers.generation.logits_process import LogitsWarper

In [None]:
# class TopPLogitsWarper(LogitsWarper):
#     """
#     [`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off. Often
#     used together with [`TemperatureLogitsWarper`] and [`TopKLogitsWarper`].

#     Args:
#         top_p (`float`):
#             If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
#             higher are kept for generation.
#         filter_value (`float`, *optional*, defaults to -inf):
#             All filtered values will be set to this float value.
#         min_tokens_to_keep (`int`, *optional*, defaults to 1):
#             Minimum number of tokens that cannot be filtered.
#     """

#     def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
#         top_p = float(top_p)
#         if top_p < 0 or top_p > 1.0:
#             raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
#         if not isinstance(min_tokens_to_keep, int) or (min_tokens_to_keep < 1):
#             raise ValueError(f"`min_tokens_to_keep` has to be a positive integer, but is {min_tokens_to_keep}")

#         self.top_p = top_p
#         self.filter_value = filter_value
#         self.min_tokens_to_keep = min_tokens_to_keep

#     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
#         s_sorted_vals, s_sorted_indices = torch.sort(scores, descending=True, dim = -1)
#         softmax_outputs_cumsum = s_sorted_vals.softmax(dim = -1, ).cumsum(dim = -1)
#         indices_to_remove = softmax_outputs_cumsum <= self.top_p
#         indices_to_remove = indices_to_remove.scatter(1, s_sorted_indices, indices_to_remove)
#         indices_to_remove = ~indices_to_remove
#         scores_processed = scores.masked_fill(indices_to_remove, self.filter_value)
#         #print(scores[0], scores_processed[0], '11')
#         return scores_processed

In [None]:
# from transformers import GPT2Tokenizer, GPT2LMHeadModel, LogitsProcessorList
# from transformers import MaxLengthCriteria, StoppingCriteriaList  # Correct import path
# import torch

# # Initialize the tokenizer and model
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# model = GPT2LMHeadModel.from_pretrained("gpt2")

# # Setup the prompt and other beam search settings
# x = 'The capital of India?'
# input_ids = tokenizer(x, return_tensors='pt').input_ids.to(model.device)
# print(f'input_ids = {input_ids}')
# y = 'Delhi'
# output_ids = tokenizer(y, return_tensors='pt').input_ids.to(model.device)
# print(f'output_ids = {output_ids}')

# # Number of beams
# num_beams = 10

# logits_top_p = TopPLogitsWarper(top_p=0.9)

# # Logits processor and stopping criteria
# logits_processor = LogitsProcessorList([logits_top_p])


# #Processing logits
# Temp_scale = 2
# with torch.no_grad():
#     logits = model(input_ids).logits[0]
#     processed_logits = logits_top_p(input_ids = input_ids, scores = logits)
#     probabilities = (processed_logits / Temp_scale).softmax(dim = -1)
#     probabilities_final = probabilities[-1, :]
#     print(probabilities.shape, torch.argmax(probabilities_final), probabilities_final[13856])
#     #fx_y = probabilities_final[]


# # Generate text using beam search
# output_sequences = model.generate(
#     input_ids,
#     max_length=12,
#     num_beams=num_beams,
#     num_return_sequences=2,
#     logits_processor=logits_processor,
# )

# # Decode and print the output beams
# for index, output_sequence in enumerate(output_sequences):
#     output_text = tokenizer.decode(output_sequence, skip_special_tokens=True)
#     print(f'beam {index}: {output_text}')


Gemma-2-9b-it

In [None]:
from transformers import BitsAndBytesConfig
# 4-bit loading configuration with float16 compute.
# NOTE(review): this object is not passed to the from_pretrained call that loads the LLM below, so it currently has no effect.
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

In [None]:
from transformers.utils import is_flash_attn_2_available 

# Choose "flash_attention_2" when it is available and device 0 has compute capability 8 or higher, otherwise "sdpa".
# NOTE(review): attn_implementation is only printed; it is not passed to the from_pretrained call that loads the LLM below.
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

In [None]:
# from transformers import pipeline 

# model_id  = "google/gemma-2-9b"


In [None]:
# from transformers import pipeline
# import torch

# pipe = pipeline(
#     "text-generation",
#     model="google/gemma-2-9b-it",
#     model_kwargs={"torch_dtype": torch.bfloat16},
#     device="cuda",
# )

# messages = [
#     {"role": "user", "content": "Who are you? Please, answer in pirate-speak."},
# ]
# outputs = pipe(
#     messages,
#     max_new_tokens=1024,
#     do_sample=False,
# )
# assistant_response = outputs[0]["generated_text"][-1]["content"]
# print(assistant_response)

In [None]:
# pip install accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Tokenizer and generation model used for the answer step of the RAG pipeline.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
# Weights are loaded in bfloat16 with automatic device placement.
# The quantization_config and attn_implementation built in earlier cells are not passed here.
llm = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    device_map="auto",
    torch_dtype=torch.bfloat16
)

# llm.to(device)

In [None]:
# Smoke test: generate from a raw prompt (no chat template applied).
input_text = "Write me a poem about Machine Learning."
# The tokenized inputs are moved to "cuda", so this cell requires a CUDA device.
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = llm.generate(**input_ids, max_new_tokens=512)
print(tokenizer.decode(outputs[0]))

In [None]:
# input = "Write a poem about time in 50 words"
# input_ids = tokenizer(input, return_tensors="pt").to("cuda")

# outputs = model.generate(**input_ids)
# print(tokenizer.decode(outputs[0]))

In [None]:
# tokenizer.decode(outputs[0])

In [None]:
# input_text = "Write a poem about time"
# dialogue_template = [{"role": "user", "content": input_text}]

# # Assuming input_data is a tensor, directly move it to the GPU
# input_data = tokenizer.apply_chat_template(conversation=dialogue_template, return_tensors="pt").to("cuda")

# # Generate outputs directly using input_data
# outputs = model.generate(input_ids=input_data, max_new_tokens=256)

# # Decode and print the output
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
input_text = "Write a poem about time"
dialogue_template = [{"role": "user", "content": input_text}]

# Render the conversation through the tokenizer's chat template.
# tokenize=False returns the prompt as text; it is tokenized in the next cell.
prompt = tokenizer.apply_chat_template(dialogue_template, tokenize=False, add_generation_prompt=True)


In [None]:
# Tokenize the chat-formatted prompt and move the inputs to the GPU
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate up to 256 new tokens from the prompt
outputs = llm.generate(**input_ids, max_new_tokens=256)

# Decode and print the output
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
pages_and_chunks[5]

In [None]:
# def prompt_formatter(query, context_items ):
#     """
#     Augments query with text-based context from context_items.
#     """
#     # Join context items into one dotted paragraph
#     context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

#     # Create a base prompt with examples to help the model
#     # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
#     # We could also write this in a txt file and import it in if we wanted.
#     base_prompt = """Based on the following context items, please answer the query.
# Give yourself room to think by extracting relevant passages from the context before answering the query.
# Don't return the thinking, only return the answer.
# Make sure your answers are as explanatory as possible.
# Use the following examples as reference for the ideal answer style.
# \nExample 1:
# Query: What are the fat-soluble vitamins?
# Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
# \nExample 2:
# Query: What are the causes of type 2 diabetes?
# Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
# \nExample 3:
# Query: What is the importance of hydration for physical performance?
# Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
# \nNow use the following context items to answer the user query:
# {context}
# \nRelevant passages: <extract relevant passages from the context here>
# User query: {query}
# Answer:"""

#     # Update base prompt with context items and query   
#     base_prompt = base_prompt.format(context=context, query=query)

#     # Create prompt template for instruction-tuned model
#     dialogue_template = [
#         {"role": "user",
#         "content": base_prompt}
#     ]

#     # Apply the chat template
#     prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
#                                           tokenize=False,
#                                           add_generation_prompt=True)
#     return prompt


In [None]:
def prompt_formatter(query, context_items, use_dialogue_template=True):
    """
    Augments query with text-based context from context_items.

    Args:
        query: The user question.
        context_items: Iterable of dicts; the "chunks" text of each item is used as context.
        use_dialogue_template: When true, wrap the prompt as a single user turn and render it with the
            tokenizer's chat template. When false, return the plain prompt text.

    Returns:
        The prompt as a string.
    """
    # Join the context chunks into one paragraph
    context = " ".join([item["chunks"] for item in context_items])

    # Base prompt; kept byte-for-byte so generated prompts do not change
    base_prompt = """
        Based on the following context items, please answer the query.
        Context item : 
        {context}
        User query: {query}
        Answer:
        """

    # Update base prompt with context items and query
    base_prompt = base_prompt.format(context=context, query=query)

    if not use_dialogue_template:
        # The chat template expects a list of role/content messages, not a bare string,
        # so the untemplated variant is simply the plain prompt text.
        return base_prompt

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Apply the chat template
    return tokenizer.apply_chat_template(conversation=dialogue_template,
                                         tokenize=False,
                                         add_generation_prompt=True)


In [None]:
query = "Explain the black-box visual paraphrase"
print(f"Query: {query}")

# Get relevant resources
# (`embeddings` is never defined in this notebook; the stored chunk embeddings live in `embeddings_tensor`)
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings_tensor)

# Create a list of context items
# Indices refer to the filtered list that was embedded, not to the unfiltered pages_and_chunks.
context_items = [pages_and_chunks_over_threshold[int(i)] for i in indices]

# Format prompt with context items
prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)


In [None]:
# Tokenize the RAG prompt built in the previous cell and move the inputs to the GPU
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens
outputs = llm.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True, # whether or not to use sampling, see https://huyenchip.com/2024/01/16/sampling.html for more
                             max_new_tokens=256) # how many new tokens to generate from prompt 

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])

print(f"Query: {query}")
# Remove the prompt text from the decoded output so only the generated part is shown
print(f"RAG answer:\n{output_text.replace(prompt, '')}")


In [None]:
def ask(query, 
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.

    Args:
        query: The user question.
        temperature: Sampling temperature passed to generation.
        max_new_tokens: Maximum number of tokens to generate.
        format_answer_text: When True, strip the prompt, the <bos>/<eos> markers and a boilerplate
            lead-in sentence from the decoded output.
        return_answer_only: When True return only the answer text, otherwise (answer text, context_items).

    Returns:
        The answer string, or a tuple (answer string, list of context item dicts with an added "score").
    """

    # Get just the scores and indices of top related results.
    # `embeddings` is never defined in this notebook; the stored chunk embeddings live in `embeddings_tensor`.
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings_tensor)

    # Build the context items from the filtered list the embeddings were computed from (indices refer to it).
    # Each record is copied so that adding "score" does not mutate the shared notebook data, and the bulky
    # "embedding" vector is left out of the returned items.
    context_items = []
    for score, index in zip(scores, indices):
        record = {key: value
                  for key, value in pages_and_chunks_over_threshold[int(index)].items()
                  if key != "embedding"}
        record["score"] = score.cpu() # return score back to CPU
        context_items.append(record)

    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate an output of tokens
    outputs = llm.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text

    return output_text, context_items


In [None]:
# query = random.choice(query_list)
query = "What is the aim of this paper?"
print(f"Query: {query}")

# Answer query with context and return context 
# return_answer_only=False makes ask() return the answer text together with the context items it used
answer, context_items = ask(query=query, 
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
print_wrapped(answer)
print(f"Context items:")
# Last expression of the cell: displays the retrieved context items
context_items
