### this project to use GEN LLM Model and apply RAG technique
to help junior ML engineer to learn the Designing of Machine Learning System

### 1- first step to read the data from the pdf file

In [None]:
import fitz
import re
import os
from tqdm.auto import tqdm


def clean_the_text(text: str) -> str:
    """Clean PDF page text by removing headers, footers, page numbers, and fixing hyphenations."""
    # Remove newlines and normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove common page numbers (optional, tweak as needed)
    text = re.sub(r'Page \d+|\d{1,3}$', '', text)

    # Fix hyphenation at line breaks: "exam-\nple" => "example"
    text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)

    # Optional: remove figure/table labels
    text = re.sub(r'Figure\s*\d+[^.]*\.?', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Table\s*\d+[^.]*\.?', '', text, flags=re.IGNORECASE)

    return text.strip()


def get_pdf_files_name(folder_path:str) -> list[str] :
    """Get all pdf file paths in the given folder"""
    return [os.path.join(folder_path,f) for f in os.listdir(folder_path) if f.endswith(".pdf")]

def getDataFromPdf(pdfbook:str) ->list[dict] :
    """This function used to get the book pages contains"""
    #  open the pdf file to get the pages and page number
    doc = fitz.open(pdfbook)
    pdfData= []
    for page_number, page in tqdm(enumerate(doc)):
       # get the data from the page
        text_in_apage = page.get_text()
        #clean the text
        cleanedText = clean_the_text(text_in_apage)
        pdfData.append({
            "book name": pdfbook.replace("./bookpdf\\",""),
            "page_number" : page_number,
             "numberOfCharPerPage" :len(cleanedText),
            "numberOfWordPerPage":len(cleanedText.split(" ")),
            "numberOfTokenPerPageExpected":len(re.sub(r"[\s]", "", cleanedText))/4,
            "numberOfSentences":len(cleanedText.split(". ")),
            "text":cleanedText
                   })
    return pdfData




In [None]:
# === Main execution ===
folder_path = "./bookpdf"  # replace with your folder path
pdf_files = get_pdf_files_name(folder_path)
print(pdf_files)

all_data = []

for pdf_file in pdf_files:
    book_data = getDataFromPdf(pdf_file)
    print(len(book_data))
    all_data.extend(book_data)


['./bookpdf\\Computer-Vision-Algorithms-and-Applications-2nd Edition, Richard Szeliski.pdf', './bookpdf\\Data-Science-and-Machine-Learning.pdf', './bookpdf\\designing-machine-learning-systems.pdf', './bookpdf\\Information-Theory,-Inference,-and-Learning-Algorithms.pdf', './bookpdf\\Introduction-to-Machine-Learning-with-Python.pdf']


0it [00:00, ?it/s]

1232


0it [00:00, ?it/s]

533


0it [00:00, ?it/s]

470


0it [00:00, ?it/s]

640


0it [00:00, ?it/s]

392


In [None]:
len(all_data)

3267

In [None]:
all_data[500]

{'book name': 'Computer-Vision-Algorithms-and-Applications-2nd Edition, Richard Szeliski.pdf',
 'page_number': 500,
 'numberOfCharPerPage': 2010,
 'numberOfWordPerPage': 334,
 'numberOfTokenPerPageExpected': 419.25,
 'numberOfSentences': 12,
 'text': '7.3 Contour tracking 475 -1 +1 ϕ = 0 ϕ∆ g(I) 44 Level set evolution for a geodesic active contour. The embedding function φ is updated based on the curvature of the underlying surface modulated by the edge/speed function g(I), as well as the gradient of g(I), thereby attracting it to strong edges. Kimmel, and Sapiro (1997) and Yezzi, Kichenassamy et al. (1997), dφ dt = |∇φ|div \x12 g(I) ∇φ |∇φ| \x13 = g(I)|∇φ|div \x12 ∇φ |∇φ| \x13 + ∇g(I) · ∇φ, (7.38) where g(I) is a generalized version of the snake edge potential. To get an intuitive sense of the curve’s behavior, assume that the embedding function φ is a signed distance function away from the curve (44), in which case |φ| = 1. The ﬁrst term in Equation (7.38) moves the curve in the dire

### 2- Data preparation


we need to get some information about the data like the maximum number of words, token, ... which will help us to select the proper LLM Model

In [None]:
# !pip install numpy==1.26.4 pandas==2.2.2



In [None]:
import pandas as pd

df = pd.DataFrame(all_data)


In [None]:
df.describe().round(2)

Unnamed: 0,page_number,numberOfCharPerPage,numberOfWordPerPage,numberOfTokenPerPageExpected,numberOfSentences
count,3267.0,3267.0,3267.0,3267.0,3267.0
mean,395.29,2144.34,356.56,447.2,29.39
std,305.14,810.24,150.24,169.73,75.17
min,0.0,0.0,1.0,0.0,1.0
25%,163.0,1692.5,276.0,350.0,11.0
50%,326.0,2175.0,361.0,451.0,17.0
75%,529.0,2704.0,433.0,562.75,25.0
max,1231.0,11232.0,2436.0,2199.25,1082.0


 Note: the mean of number of token ~= 450


### 2.1 divide the page into sentences for easier to handle
use SpaCy or NLTK library

In [None]:
# !pip install spacy


In [None]:
# Use nltk.tokenize.PunktSentenceTokenizer
# This is often better than spaCy's sentencizer if you want lightweight segmentation without full parsing
# from nltk.tokenize import sent_tokenize
# item["sentences"] = sent_tokenize(item["text"])


In [None]:
from spacy.lang.en import English

nlp = English()

# use Spacy for divide pages to sentences then add it to the bookData

nlp.add_pipe("sentencizer")

for item in tqdm(all_data):
    #convert the page text into sentences and make sure that the sentences datatupe is string
    item["sentences"] = [str(sent) for sent in nlp(item["text"]).sents]

    # item["sentences"] = list(nlp(item["text"]).sents)
    # item["sentences"] =[str(sentence) for sentence in item["sentences"]]
    # Count the sentences
    item["sentences_per_page_spacy"] = len(item["sentences"])


  0%|          | 0/3267 [00:00<?, ?it/s]

In [None]:
all_data[45]

{'book name': 'Computer-Vision-Algorithms-and-Applications-2nd Edition, Richard Szeliski.pdf',
 'page_number': 45,
 'numberOfCharPerPage': 1726,
 'numberOfWordPerPage': 264,
 'numberOfTokenPerPageExpected': 365.75,
 'numberOfSentences': 15,
 'text': '20 Computer Vision: Algorithms and Applications, 2nd ed. (ﬁnal draft, Sept. 2021) (a) (b) (c) (d) (e) (f) 11 Examples of computer vision algorithms from the 2010s: (a) the SuperVision deep neural network © Krizhevsky, Sutskever, and Hinton (2012); (b) object instance segmentation (He, Gkioxari et al. 2017) © 2017 IEEE; (c) whole body, expression, and gesture ﬁtting from a single image (Pavlakos, Choutas et al. 2019) © 2019 IEEE; (d) fusing multiple color depth images using the KinectFusion real-time system (Newcombe, Izadi et al. 2011) © 2011 IEEE; (e) smartphone augmented reality with real-time depth occlusion effects (Valentin, Kowdle et al. 2018) © 2018 ACM; (f) 3D map computed in real-time on a fully autonomous Skydio R1 drone (Cross 2

#### 2.2 Check and study the data

In [None]:
df2 = pd.DataFrame(all_data)

In [None]:
df2.describe().round(2)

Unnamed: 0,page_number,numberOfCharPerPage,numberOfWordPerPage,numberOfTokenPerPageExpected,numberOfSentences,sentences_per_page_spacy
count,3267.0,3267.0,3267.0,3267.0,3267.0,3267.0
mean,395.29,2144.34,356.56,447.2,29.39,19.08
std,305.14,810.24,150.24,169.73,75.17,14.11
min,0.0,0.0,1.0,0.0,1.0,0.0
25%,163.0,1692.5,276.0,350.0,11.0,11.0
50%,326.0,2175.0,361.0,451.0,17.0,17.0
75%,529.0,2704.0,433.0,562.75,25.0,24.0
max,1231.0,11232.0,2436.0,2199.25,1082.0,175.0


#### 2.3 Divide the list of sentences per page into smaller, discrete units for easier processing

- Enhanced Manageability: Dividing text into smaller, evenly sized chunks ensures easier handling and processing, especially when dealing with large datasets.

- Avoiding Information Loss: Embedding models often have a token capacity limit (e.g., 384 tokens). If a sequence exceeds this limit, information loss may occur due to truncation, which compromises the quality of embeddings.

- Optimal Utilization of LLM Context Window: Large Language Models (LLMs) typically have a restricted context window, which dictates the number of tokens they can process at once. Exceeding this capacity not only leads to inefficiencies but also requires additional computational resources. By chunking text appropriately, we maximize the utility of the context window while minimizing unnecessary computational overhead.



In [None]:
chunk_size = 10  # number of sentences in one chunck (e.g [18 sentences]->[[10sents],[8sents]])

def split_list(input_list : list, chunk_size:int) ->list[list] :

   """
    Splits a list into smaller sublists of a specified maximum size.

    Parameters:
        input_list (List[str]): The list to be split.
        chunk_size (int): The maximum size of each sublist.

    Returns:
        List[List[str]]: A list of sublists, each containing up to `chunk_size` elements.
   """
   return [input_list[i : i + chunk_size] for i in range(0 , len(input_list), chunk_size)]




In [None]:
all_data[55]['sentences']

['30 Computer Vision: Algorithms and Applications, 2nd ed. (',
 'ﬁnal draft, Sept. 2021) Week Chapter Topics 1.',
 'Chapters 1–2 Introduction and image formation 2.',
 'Chapter 3 Image processing 3.',
 'Chapters 4–5 Optimization and learning 4.',
 'Chapter 5 Deep learning 5.',
 'Chapter 6 Recognition 6.',
 'Chapter 7 Feature detection and matching 7.',
 'Chapter 8 Image alignment and stitching 8.',
 'Chapter 9 Motion estimation 9.',
 'Chapter 10 Computational photography 10.',
 'Chapter 11 Structure from motion 11.',
 'Chapter 12 Depth estimation 12.',
 'Chapter 13 3D reconstruction 13.',
 'Chapter 14 Image-based rendering 1 Sample syllabus for a one semester 13-week course.',
 'A 10-week quarter could go into lesser depth or omit some topics.',
 '1.4 Sample syllabus Teaching all of the material covered in this book in a single quarter or semester course is a Herculean task and likely one not worth attempting.11 It is better to simply pick and choose topics related to the lecturer’s pr

In [None]:
# Loop through pages and texts and split sentences into chunks
for item in tqdm(all_data):
    item["sentence_chunks"] = split_list(item["sentences"],
                                         chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/3267 [00:00<?, ?it/s]

In [None]:
all_data[52]

{'book name': 'Computer-Vision-Algorithms-and-Applications-2nd Edition, Richard Szeliski.pdf',
 'page_number': 52,
 'numberOfCharPerPage': 3125,
 'numberOfWordPerPage': 459,
 'numberOfTokenPerPageExpected': 666.75,
 'numberOfSentences': 18,
 'text': '1.3 Book overview 27 and recognition techniques are built on extracting and matching feature points (Section 7.1), so this is a fundamental technique required by many subsequent chapters (Chapters 8 and 11) and even in instance recognition (Section 6.1). We also cover edge and straight line detection in Sections 7.2 and 7.4, contour tracking in Section 7.3, and low-level segmentation techniques in Section 7.5. Feature detection and matching are used in Chapter 8 to perform image alignment (or registration) and image stitching. We introduce the basic techniques of feature-based alignment and show how this problem can be solved using either linear or non-linear least squares, depending on the motion involved. We also introduce additional con

In [None]:
df3 = pd.DataFrame(all_data)

In [None]:
df3.describe().round(2)

Unnamed: 0,page_number,numberOfCharPerPage,numberOfWordPerPage,numberOfTokenPerPageExpected,numberOfSentences,sentences_per_page_spacy,num_chunks
count,3267.0,3267.0,3267.0,3267.0,3267.0,3267.0,3267.0
mean,395.29,2144.34,356.56,447.2,29.39,19.08,2.38
std,305.14,810.24,150.24,169.73,75.17,14.11,1.41
min,0.0,0.0,1.0,0.0,1.0,0.0,0.0
25%,163.0,1692.5,276.0,350.0,11.0,11.0,2.0
50%,326.0,2175.0,361.0,451.0,17.0,17.0,2.0
75%,529.0,2704.0,433.0,562.75,25.0,24.0,3.0
max,1231.0,11232.0,2436.0,2199.25,1082.0,175.0,18.0


### 2.4 prepare the chunk for the LLM Model
 now the chunk contains a 10 separated sentences we need to  merge to create a one paragraph to be suitable for LLM Model

In [None]:

pages_and_chunks = []
for item in tqdm(all_data):
   if "sentence_chunks" in item:
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["book name"] = item['book name']
        chunk_dict['page_number'] = item['page_number']
# join the sentences in one chunk to create a paragragh
        join_sentence_chunk = "".join(sentence_chunk).replace("  "," ").strip()
        join_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', join_sentence_chunk)
        chunk_dict["sentence_chunk"] = join_sentence_chunk

        chunk_dict["chunk_char_count"] = len(join_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in join_sentence_chunk.strip(" ")])
        chunk_dict["chunk_token_count"] = len(join_sentence_chunk)/4

        pages_and_chunks.append(chunk_dict)
else:
     print(f"Missing 'sentence_chunks' in item: {item}")



  0%|          | 0/3267 [00:00<?, ?it/s]

Missing 'sentence_chunks' in item: {'book name': 'Introduction-to-Machine-Learning-with-Python.pdf', 'page_number': 391, 'numberOfCharPerPage': 642, 'numberOfWordPerPage': 110, 'numberOfTokenPerPageExpected': 133.25, 'numberOfSentences': 7, 'text': 'activity near their habitat means greater amounts of sediment and chemicals in the water. In an effort to save this endangered species, biologists have begun to raise the amphibians in captivity and release them when they reach a less vulnerable age. Many of the animals on O’Reilly covers are endangered; all of them are important to the world. To learn more about how you can help, go to animals.oreilly.com. The cover image is from Wood’s Animate Creation. The cover fonts are URW Type‐ writer and Guardian Sans. The text font is Adobe Minion Pro; the heading font is Adobe Myriad Condensed; and the code font is Dalton Maag’s Ubuntu Mono.', 'sentences': ['activity near their habitat means greater amounts of sediment and chemicals in the water.'

In [None]:
df4 = pd.DataFrame(pages_and_chunks)

In [None]:
df4.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,7780.0,7780.0,7780.0,7780.0
mean,467.39,898.34,898.34,224.58
std,353.32,517.58,517.58,129.39
min,0.0,2.0,2.0,0.5
25%,184.0,517.0,517.0,129.25
50%,373.0,847.0,847.0,211.75
75%,664.0,1231.0,1231.0,307.75
max,1231.0,10415.0,10415.0,2603.75


### note:
The entire textbook has been segmented into manageable chunks, each containing up to 10 sentences, with the corresponding page number recorded.

This structured approach enables precise referencing of any text segment, ensuring clear traceability to its original source.

### 2.5 Data cleaning
 #### General Guideline:
- < 10 tokens: Likely headers, footers, or non-informative text.

- 10–32 tokens: Could be short but valid sentences (e.g., bullet points, definitions).

- more than 32 tokens: More likely to contain meaningful and context-rich content.

In [None]:
min_token_length = 32

# Print 5 sample chunks that are shorter than or equal to the minimum token threshold
for index, row in df4[df4["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row["chunk_token_count"]} | Text: {row["sentence_chunk"]}\n')



Chunk token count: 0.75 | Text: 602

Chunk token count: 1.5 | Text: 23.16)

Chunk token count: 10.0 | Text: At the ith iteration, probabilities r at

Chunk token count: 2.5 | Text: 1805–1809.

Chunk token count: 24.0 | Text: 60] A. K. Jain. Fundamentals of Digital Image Processing. Prentice Hall, Englewood Cliffs, 1989.



In [None]:
cleaned_chunk= df4[df4["chunk_token_count"]>=min_token_length].to_dict(orient = "records")
cleaned_chunk[:2]

[{'book name': 'Computer-Vision-Algorithms-and-Applications-2nd Edition, Richard Szeliski.pdf',
  'page_number': 0,
  'sentence_chunk': 'Computer Vision: Algorithms and Applications 2nd Edition Richard Szeliski Final draft, September 30, 2021 © 2022 Springer This electronic draft was downloaded Dec_27,_2022 for the personal use of _________________________???_________________________ ________jacky870810@icloud.com________ and may not be posted or re-distributed in any form. Please refer interested readers to the book’s Web site at https://szeliski.org/Book, where you can also provide feedback.',
  'chunk_char_count': 463,
  'chunk_word_count': 463,
  'chunk_token_count': 115.75},
 {'book name': 'Computer-Vision-Algorithms-and-Applications-2nd Edition, Richard Szeliski.pdf',
  'page_number': 3,
  'sentence_chunk': '1 Introduction 1 What is computer vision? •A brief history • Book overview • Sample syllabus • Notation n^ 2 Image formation 33 Geometric primitives and transformations • Pho

## 2.6 save chunk

In [None]:
import torch
# torch.save(cleaned_chunk, "cleaned_chunk_readyforembedding.pt")

In [None]:
import torch
cleaned_chunk = torch.load("cleaned_chunk_readyforembedding.pt")
cleaned_chunk[:2]

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_chunk_readyforembedding.pt'

### 3. Embedding Our Text Chunks
While humans interpret and understand text naturally, machines process information most effectively in the form of numbers.

#### Summary of the Process
* * Words are meaningless to machines until we map them to numbers.

* * Instead of assigning random numbers, we train a model to learn meaning from large text corpora.

* * The result: embeddings — vectors that capture semantic meaning.

* * Similar words are close together in this vector space.
### Why Is This Important?
Embedding learning was a major step forward in:

- Search engines

- Recommendation systems

- Chatbots

- Translation

- Voice assistants

- Language generation

### Why Are Embeddings Important?
* Traditional methods (like one-hot encoding or simple token IDs) treat words as isolated symbols. For example:

* "car" and "automobile" would be completely different and unrelated.

But embeddings solve this by placing similar words closer together in vector space.

### Why Choose all-mpnet-base-v2?
* 📌 1. State-of-the-art accuracy
It’s based on Microsoft’s MPNet, which improves over BERT and RoBERTa.

Offers strong semantic similarity performance — great for:

- Question-answering

- Semantic search

- Clustering

- Duplicate detection

* 📌 2. Built for Sentences
It’s fine-tuned using Siamese & Triplet networks on sentence pairs.

That means embeddings from this model are meaningfully comparable — distances directly reflect semantic similarity.

* 📌 3. Efficient
Despite being accurate, it's still reasonably fast and lightweight for practical use.



In [None]:
!pip install sentence_transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence_transformers)
 

In [5]:
%%time
from sentence_transformers import SentenceTransformer , util

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

CPU times: user 4.06 s, sys: 1.18 s, total: 5.24 s
Wall time: 9.89 s


In [None]:

# # Extract all sentences
# sentences = [item["sentence_chunk"] for item in cleaned_chunk]

# # Batch encode
# embeddings = embedding_model.encode(sentences, batch_size=32, show_progress_bar=True)

# # Attach embeddings back to items
# for item, emb in zip(cleaned_chunk, embeddings):
#     item['embedding'] = emb


In [None]:
from tqdm.auto import tqdm

for item in tqdm(cleaned_chunk):
    item['embedding'] = embedding_model.encode(item["sentence_chunk"])


  0%|          | 0/7475 [00:00<?, ?it/s]

In [None]:
import torch
torch.save(cleaned_chunk,"cleaned_embedded_chunked2.pt")
print("done")

done


In [1]:
import torch
cleaned_chunk= torch.load("cleaned_embedded_chunked2.pt")
print("done")

  cleaned_chunk= torch.load("cleaned_embedded_chunked2.pt")


done


In [None]:
# loaded_cleaned = torch.load("cleaned_chunk.pt")

In [None]:
# list_of_chunk = [item["sentence_chunk"] for item in cleaned_chunk]
# list_of_chunk[:2]

In [None]:
# %%time
# chuncked_text_embedding = embedding_model.encode(list_of_chuck ,
#                                                  batch_size=32 , convert_to_tensor=True)


### Extract and creat list of embedding from the ccleaned_chunk data

In [2]:
embedding = [item["embedding"]  for item in cleaned_chunk]


In [None]:
# embedding[0]

array([-3.22471596e-02,  1.00584119e-04, -1.83501712e-03,  5.88923395e-02,
       -1.45165594e-02, -7.36929197e-03,  7.02884980e-03,  4.17751782e-02,
        8.59931391e-03,  1.62962321e-02,  5.32661937e-02,  4.79736514e-02,
        5.10932803e-02, -6.71845209e-03, -2.33528372e-02, -7.21996352e-02,
       -6.19428121e-02, -1.44209685e-02, -4.70060110e-02, -1.91253610e-02,
        1.12584075e-02, -2.87137274e-02,  6.10022293e-03, -1.22142897e-03,
       -8.42429325e-02,  2.82193366e-02, -5.27978968e-03, -2.68212110e-02,
        4.57868204e-02,  1.06517570e-02, -2.46156752e-02,  1.27255861e-02,
       -8.83830525e-03,  1.29647609e-02,  2.11044767e-06, -7.65771642e-02,
       -7.20729725e-03, -1.42689813e-02,  7.50446618e-02, -7.20035518e-03,
        3.30601446e-02,  4.91423649e-04,  1.63715705e-02, -3.68691608e-02,
       -2.09438540e-02,  2.77584977e-02,  6.08712770e-02,  1.47208916e-02,
        1.60253011e-02,  1.52521050e-02, -2.29642931e-02,  4.95672401e-04,
       -2.35068444e-02,  

### 4.0 Similarity

### 4. Pass the search query to the model


In [3]:
from sentence_transformers import SentenceTransformer , util

In [6]:
%%time
# 1. Define the search Query
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


query = "How can infrastructure help reduce engineering effort in ML deployment?"

# 2. Embed the search query to the same model
query_embedded = embedding_model.encode(query , convert_to_tensor=True)
# 3. find the similarity in the given text using dot product
embeddings_tensor = torch.tensor(embedding)

a = query_embedded.to(device)
b = embeddings_tensor.to(device)

dot_score = util.dot_score(a=a , b=b)[0]

print(f"embedding length : {len(embeddings_tensor)}")
top_search_result_dot_product =torch.topk(dot_score, k=5)


top_search_result_dot_product





embedding length : 7475
CPU times: user 1.03 s, sys: 188 ms, total: 1.22 s
Wall time: 1.9 s


torch.return_types.topk(
values=tensor([1.8159, 1.8129, 1.7371, 1.6693, 1.6620], device='cuda:0'),
indices=tensor([4871, 4872, 5017, 4943, 4653], device='cuda:0'))

In [7]:
cleaned_chunk[6958]['sentence_chunk']

'• What question(s) am I trying to answer?Do I think the data collected can answer that question? •What is the best way to phrase my question(s) as a machine learning problem? •Have I collected enough data to represent the problem I want to solve? •What features of the data did I extract, and will these enable the right predictions? •How will I measure success in my application? •How will the machine learning solution interact with other parts of my research or business product?In a larger context, the algorithms and methods in machine learning are only one part of a greater process to solve a particular problem, and it is good to keep the big picture in mind at all times. Many people spend a lot of time building complex machine learning solutions, only to find out they don’t solve the right problem. When going deep into the technical aspects of machine learning (as we will in this book), it is easy to lose sight of the ultimate goals.'

In [8]:
context=""
for indx in top_search_result_dot_product.indices.tolist():
  context += cleaned_chunk[indx]['sentence_chunk'] + "\n -"

print(context)


Chapter 10. Infrastructure and Tooling for MLOps In Chapters 4 to 6, we discussed the logic for developing ML systems. In Chapters 7 to 9, we discussed the considerations for deploying, monitoring, and continually updating an ML system. Up until now, we’ve assumed that ML practitioners have access to all the tools and infrastructure they need to implement that logic and carry out these considerations. However, that assumption is far from being true. Many data scientists have told me that they know the right things to do for their ML systems, but they can’t do them because their infrastructure isn’t set up in a way that enables them to do so. ML systems are complex. The more complex a system, the more it can benefit from good infrastructure. Infrastructure, when set up right, can help automate processes, reducing the need for specialized knowledge and engineering time. This, in turn, can speed up the development and delivery of ML applications, reduce the surface area for bugs, and enab

### 5. Gerative LLM model

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

model.to(device)





tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [None]:
model


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [11]:
def generate_answer(context, question=query, max_length=512):
    prompt = (
    f"As an expert, answer the following question use the Context provided.\n"
    # f"Be accurate, concise, give examples and steps .\n\n"
    f"Context:\n{context}\n\n"
    f"Question:\n{question}\n\n"
    f"Answer:"
)

    # prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_new_tokens=max_length, do_sample=True, temperature=0.7)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [12]:
# context = context.to(device)
# query = query.to(device)
answer = generate_answer(context, query, max_length=512)

In [13]:
print(answer)

As an expert, answer the following question use the Context provided.
Context:
Chapter 10. Infrastructure and Tooling for MLOps In Chapters 4 to 6, we discussed the logic for developing ML systems. In Chapters 7 to 9, we discussed the considerations for deploying, monitoring, and continually updating an ML system. Up until now, we’ve assumed that ML practitioners have access to all the tools and infrastructure they need to implement that logic and carry out these considerations. However, that assumption is far from being true. Many data scientists have told me that they know the right things to do for their ML systems, but they can’t do them because their infrastructure isn’t set up in a way that enables them to do so. ML systems are complex. The more complex a system, the more it can benefit from good infrastructure. Infrastructure, when set up right, can help automate processes, reducing the need for specialized knowledge and engineering time. This, in turn, can speed up the developm

In [None]:
#  # from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# from transformers import pipeline

# # model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# # tokenizer = AutoTokenizer.from_pretrained(model_id)
# # model = AutoModelForCausalLM.from_pretrained(model_id)

# generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# question = query
# def generate_answer(question, context):
#     prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
#     response = generator(prompt, max_new_tokens=512, temperature=0.7)
#     return response[0]["generated_text"]


In [None]:
# answer = generate_answer(context, query)

In [None]:
# answer