In [1]:
# code modified from https://github.com/mrdbourke/simple-local-rag
# Perform Google Colab installs (if running in Google Colab)
# The presence of the COLAB_GPU environment variable is used as the "running in Colab" signal;
# outside Colab nothing is installed and the packages below must already be available.
# NOTE(review): versions are unpinned, and spacy/transformers (imported later in this notebook)
# are not installed here - presumably they come preinstalled in Colab; confirm.
import os

if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, installing requirements.")
    !pip install -U torch # requires torch 2.1.1+ (for efficient sdpa implementation)
    !pip install PyMuPDF # for reading PDFs with Python
    !pip install tqdm # for progress bars
    !pip install sentence-transformers # for embedding models
    !pip install accelerate # for quantization model loading
    !pip install bitsandbytes # for quantizing models (less storage space)
    !pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference

[INFO] Running in Google Colab, installing requirements.
Collecting torch
  Downloading torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl (755.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cuf

In [7]:
import random
import torch
import numpy as np
import pandas as pd
import re
import textwrap
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
import os
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm
from spacy.lang.en import English # see https://spacy.io/usage for install instructions
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig


# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [8]:
# Hugging Face access token, needed to download the gated Gemma model.
# Never hardcode credentials in a notebook: the token is read from the HF_TOKEN
# environment variable instead (e.g. run `os.environ["HF_TOKEN"] = getpass.getpass()`
# in a scratch cell, or export it before starting the kernel).
# NOTE: the token that was previously committed in this cell must be treated as
# compromised and revoked at https://huggingface.co/settings/tokens
token_access = os.environ.get("HF_TOKEN") or None  # treat an empty string as "not set"
if token_access is None:
    print("[WARN] HF_TOKEN is not set; downloading gated models such as Gemma will fail unless you are already logged in.")

### Import PDF Document

In [9]:
# Requires !pip install PyMuPDF, see: https://github.com/pymupdf/pymupdf

def text_formatter(text: str) -> str:
    """Flatten newlines into spaces and trim leading/trailing whitespace.

    The right clean-up differs from document to document, so any further
    formatting steps found by experimenting can be chained on here.
    """
    return text.replace("\n", " ").strip()

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Only the text layer is read; images and figures are ignored.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: One dictionary per page with the keys
        "page_number" (1-based), "page_char_count", "page_word_count"
        (split on single spaces), "page_sentence_count_raw" (split on ". "),
        "page_token_count" (estimated as characters / 4) and "text"
        (the formatted page text).
    """
    pages_and_texts = []
    # Use the document as a context manager so the file handle is released
    # even if reading a page raises.
    with fitz.open(pdf_path) as doc:
        # enumerate() hides the length from tqdm, so pass total explicitly;
        # otherwise the progress bar only ever shows "0it".
        for page_number, page in tqdm(enumerate(doc), total=len(doc)):
            text = page.get_text()  # get plain text encoded as UTF-8
            text = text_formatter(text)
            pages_and_texts.append({"page_number": page_number+1,
                                    "page_char_count": len(text),
                                    "page_word_count": len(text.split(" ")),
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                    "text": text})
    return pages_and_texts

In [10]:
# Get PDF document
pdf_path = "/content/NHL Players(one per page).pdf"

# Fail fast with a clear message instead of printing a warning and then
# crashing inside the PDF reader on the very next line.
if not os.path.exists(pdf_path):
    raise FileNotFoundError(f"File {pdf_path} doesn't exist.")
print(f"File {pdf_path} exists.")

# One record (page number, simple stats and text) per PDF page
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)


File /content/NHL Players(one per page).pdf exists.


0it [00:00, ?it/s]

In [11]:
# Peek at the first two page records
pages_and_texts[:2]

[{'page_number': 1,
  'page_char_count': 236,
  'page_word_count': 39,
  'page_sentence_count_raw': 2,
  'page_token_count': 59.0,
  'text': 'Leon Draisaitl (Edmonton Oilers) participated in 74 games, scoring 39 goals and contributing 60 assists, amassing a total of 99 points. He logged an average ice time of 20:45 minutes per game and maintained a shooting accuracy of 19.0%.'},
 {'page_number': 2,
  'page_char_count': 191,
  'page_word_count': 33,
  'page_sentence_count_raw': 3,
  'page_token_count': 47.75,
  'text': 'J.T. Miller (Vancouver Canucks) appeared in 76 games, with 35 goals and 61 assists, leading to 96 points. He averaged 19:32 minutes on the ice per game, and his shooting percentage was 19.6%.'}]

In [12]:
# Load the per-page records into a DataFrame to inspect the page statistics
df = pd.DataFrame(pages_and_texts)
df.head(1)


Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,1,236,39,2,59.0,Leon Draisaitl (Edmonton Oilers) participated ...


In [13]:
# Summary statistics of the numeric per-page columns, rounded to 2 decimals
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,68.0,68.0,68.0,68.0,68.0
mean,34.5,202.63,34.91,2.03,50.66
std,19.77,11.2,2.2,0.17,2.8
min,1.0,181.0,30.0,2.0,45.25
25%,17.75,194.0,33.0,2.0,48.5
50%,34.5,202.0,35.0,2.0,50.5
75%,51.25,210.0,36.25,2.0,52.5
max,68.0,236.0,39.0,3.0,59.0


### Chunking

In [14]:
# Create a function that splits a list into consecutive slices of a desired size
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
    """
    Splits input_list into consecutive sublists of at most slice_size items.

    Every sublist has exactly slice_size items except possibly the last one,
    which holds the remainder. An empty input returns an empty list.

    For example, a list of 17 sentences with slice_size=10 is split into two
    lists holding 10 and 7 sentences.

    Raises:
        ValueError: if slice_size is not positive (a negative step would
        otherwise silently drop every item).
    """
    if slice_size <= 0:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

def chunk(pages, chunk_size=10):
    """Annotate every page dict in ``pages`` in place with sentences and sentence chunks.

    Each item gains the keys "sentences" (list of str), "page_sentence_count_spacy",
    "sentence_chunks" (groups of up to ``chunk_size`` sentences) and "num_chunks".
    Nothing is returned; the input list is modified.
    """
    # English pipeline with a sentencizer component added, see https://spacy.io/api/sentencizer/
    sentence_splitter = English()
    sentence_splitter.add_pipe("sentencizer")

    # First pass: split each page's text into sentences stored as plain strings
    for page in tqdm(pages):
        page_sentences = [str(span) for span in sentence_splitter(page["text"]).sents]
        page["sentences"] = page_sentences
        page["page_sentence_count_spacy"] = len(page_sentences)

    # Second pass: group the sentences of every page into chunks
    for page in tqdm(pages):
        grouped_sentences = split_list(input_list=page["sentences"],
                                       slice_size=chunk_size)
        page["sentence_chunks"] = grouped_sentences
        page["num_chunks"] = len(grouped_sentences)

def split_chunks(pages_list):
    """Flatten the per-page sentence chunks into one record per chunk.

    Parameters:
        pages_list: list of page dicts carrying "page_number" and
        "sentence_chunks" (a list of lists of sentence strings).

    Returns:
        list[dict]: one dict per chunk with the keys "page_number",
        "sentence_chunk" (the sentences joined into one string),
        "chunk_char_count", "chunk_word_count" (split on single spaces) and
        "chunk_token_count" (estimated as characters / 4).
    """
    pages_and_chunks = []
    for item in tqdm(pages_list):
        for sentence_chunk in item["sentence_chunks"]:
            chunk_dict = {}
            chunk_dict["page_number"] = item["page_number"]

            # Join the sentences with a single space so they form one paragraph-like string.
            # (Joining with "" and then patching ".A" -> ". A" with a regex corrupted
            # initials such as "J.T. Miller" -> "J. T. Miller" and missed sentences
            # ending in "?" or "!".)
            joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()
            chunk_dict["sentence_chunk"] = joined_sentence_chunk

            # Get stats about the chunk
            chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
            chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split(" "))
            chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters

            pages_and_chunks.append(chunk_dict)

    return pages_and_chunks


In [15]:
# Add sentences and sentence chunks to every page record (modifies pages_and_texts in place)
chunk(pages_and_texts, chunk_size=10)

# Inspect an example
print(pages_and_texts[0])

# Split each chunk into its own item
pages_and_chunks = split_chunks(pages_and_texts)

# Show the first chunk record
pages_and_chunks[0]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

{'page_number': 1, 'page_char_count': 236, 'page_word_count': 39, 'page_sentence_count_raw': 2, 'page_token_count': 59.0, 'text': 'Leon Draisaitl (Edmonton Oilers) participated in 74 games, scoring 39 goals and contributing 60 assists, amassing a total of 99 points. He logged an average ice time of 20:45 minutes per game and maintained a shooting accuracy of 19.0%.', 'sentences': ['Leon Draisaitl (Edmonton Oilers) participated in 74 games, scoring 39 goals and contributing 60 assists, amassing a total of 99 points.', 'He logged an average ice time of 20:45 minutes per game and maintained a shooting accuracy of 19.0%.'], 'page_sentence_count_spacy': 2, 'sentence_chunks': [['Leon Draisaitl (Edmonton Oilers) participated in 74 games, scoring 39 goals and contributing 60 assists, amassing a total of 99 points.', 'He logged an average ice time of 20:45 minutes per game and maintained a shooting accuracy of 19.0%.']], 'num_chunks': 1}


  0%|          | 0/68 [00:00<?, ?it/s]

{'page_number': 1,
 'sentence_chunk': 'Leon Draisaitl (Edmonton Oilers) participated in 74 games, scoring 39 goals and contributing 60 assists, amassing a total of 99 points. He logged an average ice time of 20:45 minutes per game and maintained a shooting accuracy of 19.0%.',
 'chunk_char_count': 236,
 'chunk_word_count': 39,
 'chunk_token_count': 59.0}

In [16]:
# # Show random chunks with under 30 tokens in length
# df = pd.DataFrame(pages_and_chunks)
# min_token_length = 30
# for row in df[df["chunk_token_count"] <= min_token_length].iterrows():
#     print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

In [17]:
# pages_and_chunks_min = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
# pages_and_chunks_min[:2]

### Embedding

In [18]:
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device=device) # choose the device to load the model to

# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks]

# Embed all texts once, in batches. Previously every chunk was embedded twice
# (one-by-one in a loop, then again in a batch whose result was never used).
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               show_progress_bar=True,
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

# Attach each embedding to its chunk record as a NumPy array on the CPU,
# which is the form the later DataFrame / np.array conversion expects.
for item, embedding in zip(pages_and_chunks, text_chunk_embeddings.cpu().numpy()):
    item["embedding"] = embedding

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


  0%|          | 0/68 [00:00<?, ?it/s]

### Save embeddings to file

In [19]:
# Collect the chunk records (including their embeddings) into a DataFrame.
# The CSV export below is left disabled, so the embeddings only live in memory.
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks)
# embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
# text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [20]:
# # Import saved file and view
# text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
# text_chunks_and_embedding_df_load.head()

## Setting up Search

In [21]:
# Work from the in-memory DataFrame built above (the CSV round-trip is disabled)
chunk_df = text_chunks_and_embeddings_df

# Only needed when the DataFrame was loaded from CSV, where the embedding column comes back as a string:
# chunk_df["embedding"] = chunk_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = chunk_df.to_dict(orient="records")

# Stack the per-chunk embeddings into a single float32 torch tensor and send it to device
embeddings = torch.tensor(np.array(chunk_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([68, 768])

In [22]:
# Define helper function to print wrapped text
def print_wrapped(text, wrap_length=80):
    """Print ``text`` re-flowed by ``textwrap.fill`` to a width of ``wrap_length`` columns."""
    print(textwrap.fill(text, wrap_length))

### Search function

In [23]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.Tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.

    Parameters:
        query: the search text to embed.
        embeddings: tensor of candidate embeddings, one row per chunk.
        model: embedding model used to encode the query.
        n_resources_to_return: maximum number of results; clamped to the
            number of rows in embeddings.

    Returns:
        (scores, indices): the dot-product scores of the best matches in
        descending order, and the row indices of those matches in embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query,
                                   convert_to_tensor=True)

    # Get dot product scores on embeddings
    dot_scores = util.dot_score(query_embedding, embeddings)[0]

    # torch.topk raises if k is larger than the number of candidates,
    # so never ask for more results than there are embeddings.
    k = min(n_resources_to_return, dot_scores.shape[0])

    scores, indices = torch.topk(input=dot_scores,
                                 k=k)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Looks up the chunks most relevant to a query and prints them, best match first.

    For every hit the score, the wrapped chunk text and the page number are printed.

    Note: each entry of pages_and_chunks must provide the keys "sentence_chunk"
    and "page_number".
    """

    top_scores, top_indices = retrieve_relevant_resources(query=query,
                                                          embeddings=embeddings,
                                                          n_resources_to_return=n_resources_to_return)

    print(f"Query: {query}\n")
    print("Results:")
    # Scores come back in descending order, so the most relevant chunk is printed first
    for hit_score, hit_index in zip(top_scores, top_indices):
        print(f"Score: {hit_score:.4f}")
        hit = pages_and_chunks[hit_index]
        # The chunk text itself, wrapped for readability
        print_wrapped(hit["sentence_chunk"])
        # The page number lets us go back to the source document and check the result
        print(f"Page number: {hit['page_number']}")
        print("\n")

## Get LLM for Generation

In [24]:
# Get the memory size of GPU 0
# NOTE(review): this reads the device property total_memory, so the value printed as
# "Available" is presumably the card's total capacity rather than currently free memory - confirm.
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))  # bytes -> GiB, rounded to a whole number
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 40 GB


In [34]:
# Pick a Gemma variant and quantization setting from the GPU memory detected above.
# Every branch assigns both use_quantization_config and model_id, so the prints
# below (and the model loading cell) can never hit an undefined name.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
    # Best-effort fallback: the smallest model, quantized
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
else:
    # 19 GB and above (a plain "> 19.0" test skipped exactly 19 GB, leaving both names unset)
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"
print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 40 | Recommend model: Gemma 7B in 4-bit or float16 precision.
use_quantization_config set to: False
model_id set to: google/gemma-7b-it


In [35]:
# 1. Create quantization config for smaller model loading (optional)
# Requires !pip install bitsandbytes accelerate, see: https://github.com/TimDettmers/bitsandbytes, https://huggingface.co/docs/accelerate/
# For models that require 4-bit quantization (use this if you have low GPU memory available)
# Note: the config is always built here but only passed to the model when use_quantization_config is True.
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

# Bonus: Setup Flash Attention 2 for faster inference, default to "sdpa" or "scaled dot product attention" if it's not available
# Flash Attention 2 requires NVIDIA GPU compute capability of 8.0 or above, see: https://developer.nvidia.com/cuda-gpus
# Requires !pip install flash-attn, see: https://github.com/Dao-AILab/flash-attention
# The check looks at the major compute capability of GPU 0 only.
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# 2. Pick a model we'd like to use (this will depend on how much GPU memory you have available)
#model_id = "google/gemma-7b-it"
model_id = model_id # no-op self-assignment: keeps the model_id chosen in the GPU memory cell
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id, token=token_access)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                 quantization_config=quantization_config if use_quantization_config else None, # None = load unquantized
                                                 low_cpu_mem_usage=False, # use full memory
                                                 attn_implementation=attn_implementation, # which attention version to use
                                                 token=token_access)

# NOTE(review): the target is hardcoded to "cuda" rather than the notebook-level `device` variable.
if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU
    llm_model.to("cuda")

[INFO] Using attention implementation: flash_attention_2
[INFO] Using model_id: google/gemma-7b-it


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [36]:
# Display the loaded model's module tree
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 3072, padding_idx=0)
    (layers): ModuleList(
      (0-27): 28 x GemmaDecoderLayer(
        (self_attn): GemmaFlashAttention2(
          (q_proj): Linear(in_features=3072, out_features=4096, bias=False)
          (k_proj): Linear(in_features=3072, out_features=4096, bias=False)
          (v_proj): Linear(in_features=3072, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=3072, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=3072, out_features=24576, bias=False)
          (up_proj): Linear(in_features=3072, out_features=24576, bias=False)
          (down_proj): Linear(in_features=24576, out_features=3072, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): Gem

In [37]:
def get_model_num_params(model: torch.nn.Module):
    """Return the total number of elements across all parameters of ``model``."""
    total = 0
    for parameter in model.parameters():
        total += parameter.numel()
    return total

# Total parameter count of the loaded LLM
get_model_num_params(llm_model)

8537680896

## Generating text with our LLM

In [38]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    """
    Builds the final chat-formatted prompt: the query augmented with the text
    of the retrieved context_items and three worked examples of the answer style.

    Each context item must provide a "sentence_chunk" entry. The returned value
    is the untokenized prompt string produced by the tokenizer's chat template.
    """
    # Render the retrieved chunks as a "- " bulleted list, one chunk per line
    chunk_texts = (item["sentence_chunk"] for item in context_items)
    context = "- " + "\n- ".join(chunk_texts)

    # Few-shot instruction template; {context} and {query} are filled in below.
    # This is very customizable and could also live in a separate txt file.
    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: How many goals did Tim score?
Context: Tim (Vancouver Canucks) played in 69 games, scoring 2 goals and providing 420 assists, resulting in 422 points. He averaged 31:31 minutes on the ice per game with a shooting percentage of 2.20%.
Answer: Tim from the Vancouver Canucks scored 2 goals in 69 games.
\nExample 2:
Query: What team does Dhun play on?
Context: Dhun (Toronto Maple Leafs) played in 2 games, scoring 5 goals and providing 5 assists, resulting in 10 points. He averaged 10:00 minutes on the ice per game with a shooting percentage of 50.20%.
Answer: Dhun plays for the Toronto Maple Leafs.
\nExample 3:
Query: How many points did Jacob score?
Context: Jacob has a shooting percentage of 99.99%, scoring 12 goals and providing 0 assists, resulting in 12 points. He has played 5 games and averaged 5:00 minutes per game.
Answer: Jacob scored 12 points with 12 goals and 0 assists.
\nNow use the following context items to answer the user query:
{context}
User query: {query}
Answer:"""

    filled_prompt = base_prompt.format(context=context, query=query)

    # Single-turn conversation for the instruction-tuned model
    conversation = [{"role": "user", "content": filled_prompt}]

    # Let the tokenizer wrap the message in the model's chat markup (kept as raw text)
    return tokenizer.apply_chat_template(conversation=conversation,
                                         tokenize=False,
                                         add_generation_prompt=True)

In [39]:
def ask(query,
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True,
        return_answer_only=True,
        n_resources=5):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.

    Parameters:
        query: the user question.
        temperature: sampling temperature passed to generate().
        max_new_tokens: upper bound on the number of generated tokens.
        format_answer_text: when True, strip the prompt, the <bos>/<eos>
            markers and a known boilerplate lead-in from the decoded text.
        return_answer_only: when True return just the answer text, otherwise
            return (answer text, context items).
        n_resources: number of context chunks to retrieve.

    Returns:
        str, or (str, list[dict]) when return_answer_only is False. Each
        context item is a copy of the matching pages_and_chunks record with an
        extra "score" entry (a CPU tensor).
    """

    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources)

    # Build the context items as shallow copies with the score attached, so the
    # shared pages_and_chunks records are not mutated by every call.
    context_items = [{**pages_and_chunks[index], "score": score.cpu()} # return score back to CPU
                     for index, score in zip(indices, scores)]

    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text

    return output_text, context_items

In [40]:
# Baseline: ask the LLM directly, without retrieving any context

test = 'How many goals did J.T. Miller score?'

# Create prompt template for instruction-tuned model
dialogue_template = [
    {"role": "user",
     "content": test}
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False, # keep as raw text (not tokenized)
                                       add_generation_prompt=True)

# Tokenize the input text (turn it into numbers) and send it to GPU
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate outputs based on the tokenized input
# See generate docs: https://huggingface.co/docs/transformers/v4.38.2/en/main_classes/text_generation#transformers.GenerationConfig
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256) # define the maximum number of new tokens to create

# Decode the output tokens to text, then strip the prompt and the <bos>/<eos> markers before printing
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Input text: {test}\n")
print(f"Output text:\n{outputs_decoded.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')}")


Input text: How many goals did J.T. Miller score?

Output text:
I do not have access to real-time information, therefore I cannot answer the question of how many goals J.T. Miller scored.


In [41]:
# Sample questions to run through the retrieval + generation pipeline
query_list = [
    "How many goals did Connor McDavid score?",
    "What team does Mitch Marner play for?",
    "How many games did Mika Zibanejad play?",
    "What's Dhun's golf handicap?",
    "Who are the top Toronto Maple Leafs scorers?",
    "Name 3 Toronto Maple Leafs players"
]

for query in query_list:
    # Answer query with context and return context
    answer, context_items = ask(query=query,
                                temperature=0.7,
                                max_new_tokens=512,
                                return_answer_only=False)

    # Show the generated answer followed by the chunks it was grounded on
    print(f"\nAnswer:")
    print_wrapped(answer)
    print(f"Context items:")
    print([item['sentence_chunk'] for item in context_items])


Answer:
Sure, here is the answer to the query:  Connor McDavid (Edmonton Oilers) scored
63 goals in 80 games.
Context items:
['Connor McDavid (Edmonton Oilers) appeared in 80 games, scoring 63 goals and accumulating 90 assists, leading to 153 points. His ice time averaged 22:57 minutes per game with a shooting percentage of 17.2%.', 'Kyle Connor (Winnipeg Jets) appeared in 82 games, scoring 47 goals and assisting 40 times, totaling 87 points. He spent an average of 21:58 minutes on ice per game with a shooting percentage of 16.7%.', 'Dylan Larkin (Detroit Red Wings) participated in 81 games, scoring 32 goals and 45 assists for 77 points. His average ice time was 21:34 minutes per game, and his shooting accuracy was 14.8%.', 'Nathan MacKinnon (Colorado Avalanche) appeared in 71 games, scoring 35 goals and 68 assists for 103 points. He was on the ice for an average of 22:07 minutes per game, with a shooting percentage of 14.2%.', 'Jonathan Huberdeau (Calgary Flames) appeared in 82 games