In [1]:
import os 
import requests 

### Get PDF document

In [2]:
pdf_path = "kech.pdf"
print(f"File {pdf_path} exists.")

File kech.pdf exists.


In [3]:
!pip install pyMuPDF



In [4]:
import fitz 
from tqdm.auto import tqdm #for progress bars 

def text_formatter(text: str) -> str : 
    cleaned_text = text.replace("\n", " ").strip()
    
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number,  
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:2]

0it [00:00, ?it/s]

[{'page_number': 0,
  'page_char_count': 1510,
  'page_word_count': 252,
  'page_sentence_count_raw': 10,
  'page_token_count': 377.5,
  'text': 'Marrakech is one of Morocco’s most fascinating places to visit. It is the fourth largest city. Known as the red city and the pearl of the Moroccan South. It was a particular imperial capital of a vast empire during the reign of Almoravids that covered most of the “Grand Maghreb” and extended well into Europe. It was the protector of the Islamic civilization in that part of the world and a celebrated intellectual and commercial center. Visiting the city means visiting a particular place rich in monuments of varied cultures from different ages. The successive sovereignties left an extraordinary religious and historical heritage: the majestic defensive walls with nine amazing gates, as well as uncountable memorials and monuments, Kasbas, palaces, and roads…apart from this city benefits from a natural variety. The Bahia Palace Monument The Bahia 

In [5]:
!pip install pandas numpy



* exploring our data 

In [6]:
import pandas as pd
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,0,1510,252,10,377.5,Marrakech is one of Morocco’s most fascinating...
1,1,1524,260,13,381.0,The Koutoubia Mosque historical monuments It i...
2,2,1161,200,14,290.25,popular artists. “The cultural space of Jemaa ...
3,3,316,54,4,79.0,"Dar Si Said Museum: located in Marrakech, is d..."


In [7]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,4.0,4.0,4.0,4.0,4.0
mean,1.5,1127.75,191.5,10.25,281.94
std,1.29,566.62,95.45,4.5,141.65
min,0.0,316.0,54.0,4.0,79.0
25%,0.75,949.75,163.5,8.5,237.44
50%,1.5,1335.5,226.0,11.5,333.88
75%,2.25,1513.5,254.0,13.25,378.38
max,3.0,1524.0,260.0,14.0,381.0


our average token count per page is 452

### Splitting pages into sentences 

In [8]:
!pip install spacy



In [9]:
from spacy.lang.en import English 
import random 

nlp = English()
nlp.add_pipe("sentencizer")

for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/4 [00:00<?, ?it/s]

* Get PDF document

In [10]:
random.sample(pages_and_texts, k=1)

[{'page_number': 3,
  'page_char_count': 316,
  'page_word_count': 54,
  'page_sentence_count_raw': 4,
  'page_token_count': 79.0,
  'text': 'Dar Si Said Museum: located in Marrakech, is dedicated to Moroccan craft wood. and it’s an amazing Historical Monument in Morocco. Formerly a mansion built in the late nineteenth century by Si Said, brother of Ahmed Ba, Moulay Abdelaziz Grand Vizier of the Sultan Moulay Hassan. Dar Si Said museum was built in 1932.',
  'sentences': ['Dar Si Said Museum: located in Marrakech, is dedicated to Moroccan craft wood.',
   'and it’s an amazing Historical Monument in Morocco.',
   'Formerly a mansion built in the late nineteenth century by Si Said, brother of Ahmed Ba, Moulay Abdelaziz Grand Vizier of the Sultan Moulay Hassan.',
   'Dar Si Said museum was built in 1932.'],
  'page_sentence_count_spacy': 4}]

### Chunking our sentences together 


In [11]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 15 

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
random.sample(pages_and_texts, k=1)

[{'page_number': 3,
  'page_char_count': 316,
  'page_word_count': 54,
  'page_sentence_count_raw': 4,
  'page_token_count': 79.0,
  'text': 'Dar Si Said Museum: located in Marrakech, is dedicated to Moroccan craft wood. and it’s an amazing Historical Monument in Morocco. Formerly a mansion built in the late nineteenth century by Si Said, brother of Ahmed Ba, Moulay Abdelaziz Grand Vizier of the Sultan Moulay Hassan. Dar Si Said museum was built in 1932.',
  'sentences': ['Dar Si Said Museum: located in Marrakech, is dedicated to Moroccan craft wood.',
   'and it’s an amazing Historical Monument in Morocco.',
   'Formerly a mansion built in the late nineteenth century by Si Said, brother of Ahmed Ba, Moulay Abdelaziz Grand Vizier of the Sultan Moulay Hassan.',
   'Dar Si Said museum was built in 1932.'],
  'page_sentence_count_spacy': 4,
  'sentence_chunks': [['Dar Si Said Museum: located in Marrakech, is dedicated to Moroccan craft wood.',
    'and it’s an amazing Historical Monument 

In [13]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,1.5,1127.75,191.5,10.25,281.94,10.25,1.0
std,1.29,566.62,95.45,4.5,141.65,4.5,0.0
min,0.0,316.0,54.0,4.0,79.0,4.0,1.0
25%,0.75,949.75,163.5,8.5,237.44,8.5,1.0
50%,1.5,1335.5,226.0,11.5,333.88,11.5,1.0
75%,2.25,1513.5,254.0,13.25,378.38,13.25,1.0
max,3.0,1524.0,260.0,14.0,381.0,14.0,1.0


now we have an average number of chunks of 1.0

### Splitting each chunk into its own item 

We'd like to embed each chunk of sentences into its own numerical representation.

So to keep things clean, let's create a new list of dictionaries each containing a single chunk of sentences with relative information such as page number as well statistics about each chunk.

In [14]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/4 [00:00<?, ?it/s]

4

* Get PDF document

In [15]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 1,
  'sentence_chunk': 'The Koutoubia Mosque historical monuments It is the largest mosque in Morocco, built-in 1150 and a height of 65 meters, and to this day it remains a religious center of worship. The Koutoubia Mosque is one of the Islamic monuments firmly in the history of Morocco. Koutoubia Mosque is in the middle of Marrakesh, near Jemaa El Fna Square. The name of the mosque is derived from “the Ketbiyen”, which is the name of a bookstore that was believed to be near mosques. The Koutoubia Mosque is one of the most important mosques in Morocco. It has exceptional large dimensions, occupies 5300 square meters and consists of 17 wings and 11 domes with relief space. In it the majestic decisions of the sultans and major events took place. The mosque and its minaret, decorated in its upper parts with a ceramic frieze painted in turquoise, became a symbol of the city. The great Koutoubia platform, it is equipped with an automatic movement system that is considered o

In [16]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,4.0,4.0,4.0,4.0
mean,1.5,1127.0,190.75,281.75
std,1.29,566.84,95.66,141.71
min,0.0,315.0,53.0,78.75
25%,0.75,948.75,162.5,237.19
50%,1.5,1335.0,225.5,333.75
75%,2.25,1513.25,253.75,378.31
max,3.0,1523.0,259.0,380.75


### Embedding text chunks 

In [17]:
!pip install sentence-transformers



In [18]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks]

#### Embed all texts in batches

In [19]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="sentence-transformers/distilbert-base-nli-stsb-quora-ranking", 
                                      device="cuda")
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings



tensor([[-0.6082,  0.1504, -0.0437,  ..., -0.3059,  0.1822, -0.6947],
        [-0.4137,  0.2858, -0.3256,  ..., -0.0716,  0.1634, -0.2357],
        [-0.7899,  0.3205, -0.3877,  ...,  0.1164, -0.1339, -0.4176],
        [-0.1174, -0.0208, -0.5185,  ...,  0.1183,  0.1990, -0.2115]],
       device='cuda:0')

### let's save the embeddings to a file : 

In [20]:
import pandas as pd

text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks)
text_chunks_and_embeddings_df['embeddings'] = text_chunk_embeddings.tolist()
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)


#### Embed all texts in batches

In [21]:
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embeddings
0,0,Marrakech is one of Morocco’s most fascinating...,1510,252,377.5,"[-0.6082422137260437, 0.1504398137331009, -0.0..."
1,1,The Koutoubia Mosque historical monuments It i...,1523,259,380.75,"[-0.41371166706085205, 0.2858239710330963, -0...."
2,2,popular artists. “The cultural space of Jemaa ...,1160,199,290.0,"[-0.7898679971694946, 0.32051485776901245, -0...."
3,3,"Dar Si Said Museum: located in Marrakech, is d...",315,53,78.75,"[-0.11736156791448593, -0.02078251913189888, -..."


### Turning embeddings to tensors 

In [22]:
import random 
import torch 
import numpy as np 
import pandas as pd 
import ast 

device = "cuda" if torch.cuda.is_available() else "cpu"

# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array
text_chunks_and_embedding_df["embeddings"] = text_chunks_and_embedding_df["embeddings"].apply(lambda x: ast.literal_eval(x))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embeddings"].tolist()), dtype=torch.float32).to(device)


In [23]:
embeddings.shape

torch.Size([4, 768])

In [24]:
text_chunks_and_embedding_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embeddings
0,0,Marrakech is one of Morocco’s most fascinating...,1510,252,377.5,"[-0.6082422137260437, 0.1504398137331009, -0.0..."
1,1,The Koutoubia Mosque historical monuments It i...,1523,259,380.75,"[-0.41371166706085205, 0.2858239710330963, -0...."
2,2,popular artists. “The cultural space of Jemaa ...,1160,199,290.0,"[-0.7898679971694946, 0.32051485776901245, -0...."
3,3,"Dar Si Said Museum: located in Marrakech, is d...",315,53,78.75,"[-0.11736156791448593, -0.02078251913189888, -..."


### create another model instance for usablity 

In [25]:
from sentence_transformers import util, SentenceTransformer 

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device="cuda")



* Let's define a query 

In [26]:
query = "what is jamaa el fana"
# we should embed the query with the same model 
query_embedding = embedding_model.encode(query, convert_to_tensor = True)

* let's get similarity scores with the dot product 

In [27]:
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]

top_results_dot_product = torch.topk(dot_scores, k=2)
top_results_dot_product

torch.return_types.topk(
values=tensor([0.2899, 0.2410], device='cuda:0'),
indices=tensor([2, 1], device='cuda:0'))

In [28]:
# Define helper function to print wrapped text 
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [29]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Query: 'what is jamaa el fana'

Results:
Score: 0.2899
Text:
popular artists. “The cultural space of Jemaa el-Fna” was included by UNESCO in
2001 as an immaterial cultural heritage of humanity. Jamaa El Fna Square is one
of the historical landmarks in morocco. Menara Garden Menara Garden: is a large
garden planted with olive trees about a 45-minute walk from the Jamaa El Fna. At
the heart of this garden, a large basin at the foot of a pavilion serves as a
reservoir of water to irrigate crops. It is a very peaceful place, away from the
bustle of the city. It is, therefore, an ideal place for walking. Saadian Tombs
Saadian Tombs: Dates from the time of the great sultan Ahmad al-Mansur Saadi
(1578-1603). These tombs were not discovered until about 1917, then restored by
the Department of Fine Arts. They are constantly since then to impress visitors
with the beauty of their decoration.and it’s one of Morocco Historical Monuments
Madrasa Ben Youssef Madrasa Ben Youssef: is a gem of Moorish 

In [30]:
import time 
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=2,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query, 
                                   convert_to_tensor=True) 

    # Get dot product scores on embeddings
    
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    print(dot_scores.size())
    scores, indices = torch.topk(input=dot_scores, 
                                 k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """
    
    scores, indices = torch.topk(input=dot_scores, 
                                 k=min(n_resources_to_return, dot_scores.size(0))) 
    
    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

In [31]:
query = "what is koutoubia"

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

torch.Size([4])


(tensor([-0.3621, -0.4093], device='cuda:0'), tensor([1, 2], device='cuda:0'))

#### Print out the texts of the top scores

In [32]:
print_top_results_and_scores(query=query,
                             embeddings=embeddings)

Query: what is koutoubia

Results:
Score: 0.2899
popular artists. “The cultural space of Jemaa el-Fna” was included by UNESCO in
2001 as an immaterial cultural heritage of humanity. Jamaa El Fna Square is one
of the historical landmarks in morocco. Menara Garden Menara Garden: is a large
garden planted with olive trees about a 45-minute walk from the Jamaa El Fna. At
the heart of this garden, a large basin at the foot of a pavilion serves as a
reservoir of water to irrigate crops. It is a very peaceful place, away from the
bustle of the city. It is, therefore, an ideal place for walking. Saadian Tombs
Saadian Tombs: Dates from the time of the great sultan Ahmad al-Mansur Saadi
(1578-1603). These tombs were not discovered until about 1917, then restored by
the Department of Fine Arts. They are constantly since then to impress visitors
with the beauty of their decoration.and it’s one of Morocco Historical Monuments
Madrasa Ben Youssef Madrasa Ben Youssef: is a gem of Moorish architecture

### Setup of LLM 


In [33]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [34]:
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes

Looking in indexes: https://pypi.org/simple/


In [35]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available 


model_id = "google/gemma-7b-it"
model_id = model_id 
print(f"[INFO] Using model_id: {model_id}")

# Instantiate tokenizer (tokenizer turns text into numbers ready for the model) 
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, 
                                                 torch_dtype=torch.float16,
                                                 low_cpu_mem_usage=False 
                                                 ) 

[INFO] Using model_id: google/gemma-7b-it


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [36]:
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 3072, padding_idx=0)
    (layers): ModuleList(
      (0-27): 28 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=4096, bias=False)
          (k_proj): Linear(in_features=3072, out_features=4096, bias=False)
          (v_proj): Linear(in_features=3072, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=3072, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=3072, out_features=24576, bias=False)
          (up_proj): Linear(in_features=3072, out_features=24576, bias=False)
          (down_proj): Linear(in_features=24576, out_features=3072, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): Gemm

#### getting answers 

In [37]:
input_text = "what is koutoubia? "
print(f"Input text:\n{input_text}")

# Create prompt template for instruction-tuned model
dialogue_template = [
    {"role": "user",
     "content": input_text}
]
# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False, # keep as raw text (not tokenized)
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

Input text:
what is koutoubia? 

Prompt (formatted):
<bos><start_of_turn>user
what is koutoubia?<end_of_turn>
<start_of_turn>model



In [38]:
input_ids = tokenizer(prompt, return_tensors="pt",max_length=512, truncation=True, padding=True, return_overflowing_tokens=False, return_attention_mask=True).to("cuda")
print(f"Model input (tokenized):\n{input_ids}\n")
llm_model.to("cuda")

outputs = llm_model.generate(**input_ids, max_new_tokens=256)
print(f"Model output (tokens):\n{outputs[0]}\n")

Model input (tokenized):
{'input_ids': tensor([[     2,      2,    106,   1645,    108,   5049,    603,    638,    745,
            507,  13394, 235336,    107,    108,    106,   2516,    108]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

Model output (tokens):
tensor([     2,      2,    106,   1645,    108,   5049,    603,    638,    745,
           507,  13394, 235336,    107,    108,    106,   2516,    108, 235333,
           745,    507,  13394,    603,    476,   5168,   1671,    575,    573,
         12884,   5571,    577,  12637,    476,   2301, 235269,  35656,  26247,
           576,   1461,    575,    476,  31682,    578,   9376,  13795, 235265,
          1165,    603,   3695,   1671,    577,  12637,  71828,    576,   4078,
        235269,   2730, 235269,    689,  27574, 235265,      1],
       device='cuda:0')



In [39]:
# Decode the output tokens to text
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

Model output (decoded):
<bos><bos><start_of_turn>user
what is koutoubia?<end_of_turn>
<start_of_turn>model
Koutoubia is a term used in the Middle East to describe a small, intimate gathering of people in a relaxed and friendly atmosphere. It is often used to describe gatherings of friends, family, or neighbors.<eos>



In [40]:
print(f"Input text: {input_text}\n")
print(f"Output text:\n{outputs_decoded.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')}")

Input text: what is koutoubia? 

Output text:
Koutoubia is a term used in the Middle East to describe a small, intimate gathering of people in a relaxed and friendly atmosphere. It is often used to describe gatherings of friends, family, or neighbors.


* as we can see the result is not accurate using the base model.

In [41]:
# list of queries 

query_list = ["where is koutoubia",
              "when jamaa alfana was built",
              "what does koutoubia mean",
              "when menara was built",
              "what are the best places to visit in marrakech"]

In [42]:
import random
query = random.choice(query_list)

print(f"Query: {query}")

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

Query: where is koutoubia
torch.Size([4])


(tensor([-0.2792, -0.3412], device='cuda:0'), tensor([1, 2], device='cuda:0'))

In [94]:
def prompt_formatter(query: str, context_items: list[dict]) -> str:
   
    # Join context items into one dotted paragraph
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])
    # we are using a guided prompt 
    base_prompt = """Based on the following context items {context}, please answer the query{query}. 
    Give yourself room to think by extracting relevant passages from the context before answering
    the query.
    Don't return the thinking, only return the answer.
    start with the sentence : you are now approching the : 
    Make sure your answers are as if you are a tour guide. 
    Use the following examples as reference for the ideal answer style.
    
    \n example1: 
    Query : what is koutoubia ?
    Answer: The Koutoubia Mosque historical monuments It is the largest mosque in Morocco,
            built-in 1150 and a height of 65 meters, and to this day it remains a religious
            center of worship.
    
    \n example2: 
    Query : when jamaa alfna was created ? 
    Answer : L'espace culturel de la place Jemaa el-Fna » est inscrit patrimoine culturel immatériel depuis 2008 (proclamation en 2001) et au patrimoine mondial depuis 1985 par l'Unesco.
    
    \n example3: 
    Query : tell me more about marrakech?
    answer : Marrakech is one of Morocco’s most fascinating places to visit. It is the fourth largest city. Known as the red city and the pearl of the Moroccan South.
It was a particular imperial capital of a vast empire during the reign of Almoravids that covered most of the “Grand Maghreb” and extended well into Europe. It was the protector of the Islamic civilization in that part of the world and a celebrated intellectual and commercial center.
    """

    # Update base prompt with context items and query   
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

In [95]:
query = "what is the best places to discover in marrakech"
print(f"Query: {query}")

# Get relevant resources
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
    
# Create a list of context items
context_items = [pages_and_chunks[i] for i in indices]

# Format prompt with context items
prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)

Query: what is the best places to discover in marrakech
torch.Size([4])
<bos><start_of_turn>user
Based on the following context items - popular artists. “The cultural space of Jemaa el-Fna” was included by UNESCO in 2001 as an immaterial cultural heritage of humanity. Jamaa El Fna Square is one of the historical landmarks in morocco. Menara Garden Menara Garden: is a large garden planted with olive trees about a 45-minute walk from the Jamaa El Fna. At the heart of this garden, a large basin at the foot of a pavilion serves as a reservoir of water to irrigate crops. It is a very peaceful place, away from the bustle of the city. It is, therefore, an ideal place for walking. Saadian Tombs Saadian Tombs: Dates from the time of the great sultan Ahmad al-Mansur Saadi (1578-1603). These tombs were not discovered until about 1917, then restored by the Department of Fine Arts. They are constantly since then to impress visitors with the beauty of their decoration.and it’s one of Morocco Histori

In [96]:
%%time

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens
outputs = llm_model.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True, # whether or not to use sampling, see https://huyenchip.com/2024/01/16/sampling.html for more
                             max_new_tokens=256) # how many new tokens to generate from prompt 

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])

print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

Query: what is the best places to discover in marrakech
RAG answer:
<bos>You are now approaching the vibrant city of Marrakech, a place steeped in history and adorned with breathtaking architectural marvels. Here are the top attractions you can explore in this captivating city:

**Jemaa el-Fna:** A UNESCO World Heritage Site, Jamaa El Fna Square is one of the most iconic landmarks in Morocco. It's a bustling marketplace where traditional music, dance, and storytelling come alive.

**Menara Garden:** Escape the city bustle and immerse yourself in the tranquility of Menara Garden, a haven of olive trees and pristine water basins.

**Saadian Tombs:** Dating back to the 17th century, Saadian Tombs are adorned with intricate carvings and vibrant colors.

**Madsara Ben Youssef:** Discover the exquisite architecture and cultural heritage of Madrasa Ben Youssef, a historic Koranic school.

**Dar Si Said Museum:** Immerse yourself in the rich history of Moroccan craft wood at Dar Si Said Museum

In [97]:
def ask(query, 
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.
    """
    
    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)
    
    # Create a list of context items
    context_items = [pages_and_chunks[i] for i in indices]

    # Add score to context item
    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu() # return score back to CPU 
        
    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)
    
    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)
    
    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text
    
    return output_text, context_items

In [98]:
query = "what is koutoubia "
print(f"Query: {query}")

# Answer query with context and return context 
answer, context_items = ask(query=query, 
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
print_wrapped(answer)



print(f"\n Context items:")
for item in context_items:
    print(item["sentence_chunk"])

# When jamaa al fna was built

Query: what is koutoubia 
torch.Size([4])
Answer:

You are now approaching the Koutoubia Mosque historical monuments, the largest
mosque in Morocco, built-in 1150 and standing tall at 65 meters. It's not just a
religious center of worship but also one of the Islamic monuments firmly etched
in the history of Morocco.  The Koutoubia Mosque is a place steeped in history,
a symbol of the city, and a testament to the city's rich cultural heritage. Its
captivating architecture, delicate ornamentation, and vibrant history make it a
must-see attraction for visitors from all walks of life.

 Context items:
The Koutoubia Mosque historical monuments It is the largest mosque in Morocco, built-in 1150 and a height of 65 meters, and to this day it remains a religious center of worship. The Koutoubia Mosque is one of the Islamic monuments firmly in the history of Morocco. Koutoubia Mosque is in the middle of Marrakesh, near Jemaa El Fna Square. The name of the mosque is derived from “the Ketbiyen”, w

In [63]:
!pip install -U WhisperSpeech

Collecting WhisperSpeech
  Downloading WhisperSpeech-0.8-py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 580 kB/s eta 0:00:011
Collecting torchaudio
  Downloading torchaudio-2.3.0-cp39-cp39-manylinux1_x86_64.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.1 MB/s eta 0:00:01
[?25hCollecting soundfile
  Downloading soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 129.6 MB/s eta 0:00:01
Collecting fastprogress
  Downloading fastprogress-1.0.3-py3-none-any.whl (12 kB)
Collecting fastcore
  Downloading fastcore-1.5.37-py3-none-any.whl (71 kB)
[K     |████████████████████████████████| 71 kB 44.4 MB/s  eta 0:00:01
[?25hCollecting speechbrain<1.0
  Downloading speechbrain-0.5.16-py3-none-any.whl (630 kB)
[K     |████████████████████████████████| 630 kB 125.6 MB/s eta 0:00:01
[?25hCollecting vocos
  Downloading vocos-0.1.0-py3-none-any.whl (24 kB)
Collecting hyperpyyaml
  Downloadin

In [99]:
from whisperspeech.pipeline import Pipeline

pipe = Pipeline(torch_compile=True)




#### TTS streaming 

we are going to split the output data to reduce latency 

In [100]:
import re
sentences = re.split(r'[.!?]', answer)

# Remove empty strings from the list
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

# Print the sentences
for sentence in sentences:
    print(sentence)
    out= pipe.generate_to_notebook(sentence)

You are now approaching the Koutoubia Mosque historical monuments, the largest mosque in Morocco, built-in 1150 and standing tall at 65 meters
 |----------------------------------------| 0.00% [0/748 00:00<?]



 |----------------------------------------| 0.00% [0/683 00:00<?]<00:49]



 |████████████████████████████████████████| 100.00% [683/683 00:32<00:00]

It's not just a religious center of worship but also one of the Islamic monuments firmly etched in the history of Morocco
 |----------------------------------------| 0.67% [5/748 00:00<00:00]



 |████████████████████████████████████████| 100.00% [596/596 00:01<00:00]

The Koutoubia Mosque is a place steeped in history, a symbol of the city, and a testament to the city's rich cultural heritage
 |████████████████████████████████████████| 100.00% [623/623 00:01<00:00]

Its captivating architecture, delicate ornamentation, and vibrant history make it a must-see attraction for visitors from all walks of life
 |████████████████████████████████████████| 100.00% [689/689 00:01<00:00]

In [67]:
!pip install Flask

Collecting Flask
  Downloading flask-3.0.3-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 5.1 MB/s ta 0:00:011
Collecting blinker>=1.6.2
  Downloading blinker-1.8.2-py3-none-any.whl (9.5 kB)
Collecting Werkzeug>=3.0.0
  Downloading werkzeug-3.0.3-py3-none-any.whl (227 kB)
[K     |████████████████████████████████| 227 kB 4.3 MB/s eta 0:00:01
[?25hCollecting itsdangerous>=2.1.2
  Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Installing collected packages: Werkzeug, itsdangerous, blinker, Flask
Successfully installed Flask-3.0.3 Werkzeug-3.0.3 blinker-1.8.2 itsdangerous-2.2.0


In [68]:
from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/location', methods=['GET'])
def get_location():
    location_data = {
        "latitude": 40.7128,
        "longitude": -74.0060,
        "city": "Marrakech",
        "country": "MAR"
    }
    return jsonify(location_data)

if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
 * Restarting with stat
Traceback (most recent call last):
  File "/home/user/miniconda/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/user/miniconda/lib/python3.9/site-packages/traitlets/config/application.py", line 1074, in launch_instance
    app.initialize(argv)
  File "/home/user/miniconda/lib/python3.9/site-packages/traitlets/config/application.py", line 118, in inner
    return method(app, *args, **kwargs)
  File "/home/user/miniconda/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 692, in initialize
    self.init_sockets()
  File "/home/user/miniconda/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 331, in init_sockets
    self.shell_port = self._bind_socket(self.shell_socket, self.shell_port)
  File "/home/user/miniconda/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 253, in _bind_socket
    return self._try_bind

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
