# Imports

In [5]:
# Imports 

# General imports
import numpy as np
import re
import pandas as pd

# Pytorch and transformers (for LLM)
import transformers, torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoModel
transformers.logging.set_verbosity_info()

# For loading documents from a path
from pathlib import Path

# For the embedding module
from sentence_transformers import SentenceTransformer

# %%

# Select the compute device: Apple GPU (MPS) first, then CUDA, otherwise CPU

device = torch.device(
    "mps" if torch.backends.mps.is_available()   # MPS is the GPU backend on Mac hardware
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# Sanity check: allocate a small tensor on the selected device and show it
print (torch.ones(1, device=device))

  from .autonotebook import tqdm as notebook_tqdm


tensor([1.], device='mps:0')


# The different Modules of the RAG

## The foundation model

We are going to work with foundation models, that is, models that have been pre-trained on large data sets; we want to work with open-source and hosted models. To do so we are going to fetch models from the [HuggingFace platform](https://huggingface.co/).

In [6]:
class FoundationModel():
    """Causal language model from the Hugging Face Hub wrapped in a chat `text-generation` pipeline.

    Attributes:
        model: the `AutoModelForCausalLM` instance, moved to the global `device`.
        tokenizer: the matching `AutoTokenizer`.
        llm: the `text-generation` pipeline built from `model` and `tokenizer`.
        num_parameters: value returned by `model.num_parameters()`.
    """

    def __init__(self,FOUND_MODEL_PATH,TEMPERATURE=None,MAX_NEW_TOKENS=10000):
        """Load the model and tokenizer found at FOUND_MODEL_PATH and build the pipeline.

        Args:
            FOUND_MODEL_PATH: model id or local path handed to `from_pretrained`.
            TEMPERATURE: value written to `generation_config.temperature`.
            MAX_NEW_TOKENS: `max_new_tokens` given to the pipeline.
        """

        # NOTE(review): trust_remote_code=True lets the model repository run its own
        # Python code at load time; only use it with repositories you trust.
        self.model=AutoModelForCausalLM.from_pretrained(FOUND_MODEL_PATH,
                                             torch_dtype="auto",
                                             trust_remote_code=True,
                                             ).to(device)

        self.tokenizer= AutoTokenizer.from_pretrained(FOUND_MODEL_PATH)

        self.model.generation_config.temperature=TEMPERATURE # Config of the temperature
        self.model.generation_config.top_p=None              # Config parameter related to the type of generation (like greedy decoding for instance)

        # return_full_text=False: the pipeline returns only the newly generated text
        self.llm = pipeline("text-generation",
                     model=self.model,
                     tokenizer=self.tokenizer,
                     return_full_text=False,
                     max_new_tokens=MAX_NEW_TOKENS,
                     do_sample=True
                     )

        self.num_parameters = self.model.num_parameters()

        print('Number of parameters in my model','{:.2e}'.format(self.num_parameters))


    @staticmethod
    def _build_messages(prompt, context=None):
        """Build the chat message list: one system message per context item, then the user prompt."""

        # A bare string would otherwise be enumerated character by character,
        # producing one system message per character.
        if isinstance(context, str):
            context = [context]

        messages = []

        if context:
            for i, ctx in enumerate(context):
                messages.append({'role': 'system','content': f"context {i+1}: {ctx}"})

        messages.append({'role': 'user', 'content': prompt})

        return messages


    def generate_response(self,prompt):
        """Send `prompt` as a single user message and return the raw pipeline output.

        The output is a list of len 1 whose element is a dict with key 'generated_text'.
        """

        return self.llm(self._build_messages(prompt))


    # We anticipate the use of RAG and create a generate response taking into account the context

    def generate_response_with_context(self, prompt, context):
        """Same as `generate_response`, with the retrieved context prepended as system messages.

        Args:
            prompt: the user query (str).
            context: a list of str (a single str is treated as a one-element list);
                an empty or None context sends the prompt alone.

        Returns:
            The raw pipeline output.
        """

        return self.llm(self._build_messages(prompt, context))

As you will see, the output of the model is a list of length 1 whose single element is a dict with key ```generated_text```. We reformat this output with a small helper.

In [7]:
def extract_response(output):
    """Return the value stored under 'generated_text' in the first element of `output`.

    `output` is expected to be the list (of len 1) of dicts returned by the pipeline.
    """

    first_item = output[0]
    return first_item['generated_text']

Another component of the answer is the reasoning part (delimited by the tags ```<think> ...</think>```). We write a function which extracts only the final answer.

In [8]:
def short_response(output):
    """Return the generated text with the reasoning section removed.

    Every well-formed ``<think> ... </think>`` span is deleted. If a closing
    ``</think>`` is still present afterwards (the opening tag was not part of the
    generated text), everything up to and including the last one is dropped too.
    The result is stripped of surrounding whitespace.
    """

    response=extract_response(output=output)
    #text = "Before <think>to delete</think> After"
    short = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)

    # Orphan closing tag: keep only what follows it
    _, closing_tag, answer = short.rpartition("</think>")
    if closing_tag:
        short = answer

    return short.strip()

### Unit test for the foundation model

In [9]:
# Unit test
# Start with a model
# Here we list some models and choose a small model Qwen 0.6

# Candidate model ids; only Path_Q_06 is loaded in this cell
Path_SDS="HuggingFaceTB/SmolLM3-3B"
Path_Qwen_4B = "Qwen/Qwen3-4B"
Path_DSR1="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
Path_Q_06="Qwen/Qwen3-0.6B"

# Builds the model, tokenizer and pipeline, and prints the number of parameters
f_model = FoundationModel(FOUND_MODEL_PATH=Path_Q_06)

loading configuration file config.json from cache at /Users/reveilla/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/c1899de289a04d12100db370d81485cdf75e47ca/config.json
Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full

Number of parameters in my model 5.96e+08


Have a look at the different elements while loading the model; do you recognize some constitutive elements of Transformers ? Look also the number of parameters of the model.

Try several prompts, look at the type of the output directly from the llm or simply the output produced with `generate_response`methods (standard or short).

In [10]:
# Launch a query

# print the output and look to the format

We are going to study a specific example. Generate and print the answer to the query below. Which natural remark does one have to keep in mind while using open-source models?

In [11]:
# A specific prompt
# The second half of the prompt asks the model to answer with [Yes] or [No] only

prompt_RF='Is Robert Redford alive ?; Answer must be [Yes] or [No]'

In [12]:
# Query the foundation model alone (no retrieved context)
print("Prompt:",prompt_RF)

output = f_model.generate_response(prompt=prompt_RF)

# Keep only the final answer: the <think>...</think> part is removed
response = short_response(output=output)

print(response)

Prompt: Is Robert Redford alive ?; Answer must be [Yes] or [No]
[No]


## The Embedding model

Our foundation model will be mostly used as a decoder; even though it includes an encoder, one usually chooses a separate embedding model. Indeed, the embedding model of the decoder is designed for the generation process but fails to produce the most adequate embeddings for a semantic study (like computing similarity). We make use here of an open-source embedding model from the library `SentenceTransformer` (in particular we will create embeddings for sentences, or more precisely for *chunks*).

In [13]:
class EmbeddingModel():
    """Wrapper around a `SentenceTransformer` model used to embed chunks and queries.

    Attributes:
        Embedmodel: the loaded SentenceTransformer, moved to the global `device`.
        dim: sentence-embedding dimension reported by the model.
    """

    def __init__(self,EMBEDD_MODEL_PATH):
        """Load the embedding model.

        EMBEDD_MODEL_PATH is the name of the embedding model used within the SentenceTransformer lib.
        """

        self.Embedmodel=SentenceTransformer(EMBEDD_MODEL_PATH).to(device)
        # Read the dimension from the instance just loaded instead of loading the model a second time
        self.dim=self.Embedmodel.get_sentence_embedding_dimension()


    def get_embeddings(self,texts):
        """Embed a list of strings.

        texts is a list of strings (which is supposed to be the list of chunks; without the source).
        Returns the normalized embeddings as a torch tensor of shape (len(texts),self.dim) on `device`.
        """

        embeddings=self.Embedmodel.encode(texts,convert_to_tensor=True,normalize_embeddings=True).to(device)
        return embeddings


    def compute_cos_sim_embed(self,embed1,embed2):
        """Cosine similarity of two embeddings, returned as a float.

        embed1, embed2 are two embeddings holding the same number of values
        (e.g. shape (1,dim) or (dim,)); both are flattened first.
        Returns 0.0 when either embedding has zero norm instead of dividing by zero.
        """

        embed1=embed1.reshape(-1)
        embed2=embed2.reshape(-1)

        norm1=torch.norm(embed1,p=2,dim=0).item()
        norm2=torch.norm(embed2,p=2,dim=0).item()

        # The cosine is undefined for a null vector: report "no similarity"
        if norm1 == 0.0 or norm2 == 0.0:
            return 0.0

        scal = torch.dot(embed1,embed2)

        return scal.item()/(norm1*norm2)


    def compute_cos_sim_texts(self,text_1,text_2):
        """Cosine similarity of two texts (two str), returned as a float."""

        embeds = self.get_embeddings(texts=[text_1,text_2])

        return self.compute_cos_sim_embed(embeds[0],embeds[1])


### Unit test for the embedding model

In [14]:
# Unit test

Embed_mini="all-MiniLM-L6-v2"
EmbedModel=EmbeddingModel(EMBEDD_MODEL_PATH=Embed_mini)

# Once again have a look at the parameters of the model.

sentences=['Hello World','How is the weather ?']

embeddings = EmbedModel.get_embeddings(texts=sentences)

# NOTE(review): type(embeddings[0].dtype) prints the class of the dtype object,
# not the dtype itself; embeddings.dtype may have been intended - confirm.
print(type(embeddings),embeddings.shape,type(embeddings[0].dtype))

# Cosine similarity between the two sentence embeddings (a float)
em = EmbedModel.compute_cos_sim_embed(embeddings[0],embeddings[1])

print(em)

loading configuration file config.json from cache at /Users/reveilla/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config.json
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at /Users/reveilla/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f

<class 'torch.Tensor'> torch.Size([2, 384]) <class 'torch.dtype'>
0.3035542964935303


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Notion of Chunk and Splitter

We decide to create a class for managing chunks. A *chunk* will be defined by:

-  ```source``` the name of the .txt file from which the chunk comes
-  ```content``` which is the str that composes the chunk
-  ```embedding``` which is the embedding associated to ```content```.

In [15]:
class Chunk():
    """A piece of text together with its origin and its embedding.

    Attributes:
        embedding_model: the embedding model used to embed the content.
        source: name of the document the chunk comes from (str).
        content: the text of the chunk (str).
        embedding: torch tensor of shape (1,dim) associated to `content`.
    """

    def __init__(self,source,content,embed_model: "EmbeddingModel"):
        """Store `source` and `content` as str and compute the embedding of the content."""

        self.embedding_model=embed_model

        #dim is the common dimension of the embeddings
        dim = self.embedding_model.dim

        self.source=str(source)
        self.content=str(content)

        # Embed the stored string so that `content` and `embedding` always describe the same text
        self.embedding=self.embedding_model.get_embeddings(texts=[self.content]).reshape(1,dim)


    def print_chunk(self):
        """Print the source, the content and the shape of the embedding."""

        print('source:',self.source,'content:',self.content,'embedding shape:',self.embedding.shape)

## Towards the index: the splitter

We are going to chunk resources and then compare the similarity of each of these chunks with the one of the query. We follow the plan of the lecture and start with the splitter.

This class gets the documents stored in the folder at ```path_doc``` and, through the method ```get_chunks```, builds the chunks associated to these resources (they are stored in the ```chunks``` attribute). Note that we keep track of the document each chunk comes from (via the ```source``` feature).

In [16]:
class Splitter():
    """Reads .txt documents from a folder and cuts them into `Chunk` objects.

    Attributes:
        embedding_model: embedding model handed to every created Chunk.
        docs: list of documents, each of the form {"source":'File_name',"content_page":(str)}.
        chunks: list of the chunks created so far (it grows at each call of `get_chunks`).
    """

    def __init__(self,embed_model: "EmbeddingModel"):

        self.embedding_model=embed_model

        self.docs = []
        # We store the original documents as a list of .txt files (format is {"source":'File_name',"content_page":(str)})
        self.chunks=[]
        # This will be the list of chunks

    def get_documents(self,path_doc):
        """Load every non-empty .txt file found (recursively) under `path_doc` into `self.docs`.

        The previous content of `self.docs` is replaced. Each file is read as UTF-8
        and stripped; files that are empty after stripping are skipped.
        """
        docs=[]

        for txt_path in Path(path_doc).rglob("*.txt"):
            resource=txt_path.read_text(encoding="utf-8").strip()
            if resource:
                docs.append({"source":txt_path.name,"content_page":resource})

        self.docs=docs


    def get_chunks_contents_from_1_doc(self,file_name,content_page,chunk_size,overlap,sentence_split=False):
        """Cut one document into chunks and append them to `self.chunks`.

        Args:
            file_name: stored as the `source` of every chunk.
            content_page: the text of the document (str).
            chunk_size: number of characters per chunk (character mode only).
            overlap: number of characters shared by two consecutive chunks (character mode only).
            sentence_split: if True, split on "." and ignore chunk_size/overlap for the cutting.

        Raises:
            ValueError: if overlap is larger than chunk_size, or equal to it in
                character mode (the window would never move forward).
        """

        if chunk_size < overlap:
            raise ValueError('Careful overlap must be smaller than chunk_size')

        if sentence_split:

            for text in content_page.split("."):

                text = text.lstrip()

                if text != "":
                    self.chunks.append(Chunk(source=file_name,
                      content=text,embed_model=self.embedding_model))

            return

        # Character mode: the window moves by chunk_size - overlap, which must be positive
        if chunk_size <= overlap:
            raise ValueError('Careful overlap must be smaller than chunk_size')

        step = chunk_size - overlap
        current = 0

        while current < len(content_page):
            end = min(len(content_page),current+chunk_size)

            self.chunks.append(Chunk(source=file_name,
                  content=content_page[current:end],embed_model=self.embedding_model))

            current += step


    def get_chunks(self,path_doc,chunk_size,overlap,sentence_split=False):
        """Reload the documents under `path_doc` and append their chunks to `self.chunks`.

        Existing chunks are kept: call `reset_splitter` first to start from scratch.
        """

        self.get_documents(path_doc=path_doc)

        for doc in self.docs:

            self.get_chunks_contents_from_1_doc(file_name=doc["source"],
                                                content_page=doc["content_page"],
                                                chunk_size=chunk_size,
                                                overlap=overlap,
                                                sentence_split=sentence_split)

    def reset_splitter(self):
        """Forget the loaded documents and the chunks."""

        self.docs=[]
        self.chunks=[]


### Unit test for the splitting module

In [17]:
# Unit test

# Documents are read from the "Docs" folder of the current working directory
this_path = Path.cwd()/"Docs"
embed_model=EmbeddingModel(EMBEDD_MODEL_PATH=Embed_mini)

Split=Splitter(embed_model)

Split.reset_splitter()
# get_chunks reloads the documents itself, so this explicit call is not required
Split.get_documents(path_doc=this_path)
# With sentence_split=True the text is cut on "."; chunk_size and overlap are only validated
Split.get_chunks(path_doc=this_path,chunk_size=30,overlap=15,sentence_split=True)

print(len(Split.chunks))

loading configuration file config.json from cache at /Users/reveilla/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config.json
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at /Users/reveilla/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f

6


In [18]:
# Do your own tests, with overlaps, without, what is the impact ? like for example :

# One block per chunk: source file, text, shape of the embedding
for current_chunk in Split.chunks:
    print(current_chunk.source,"\n",current_chunk.content,"\n",current_chunk.embedding.shape)

Robert_Redford.txt 
 Robert Redford passed away last month 
 torch.Size([1, 384])
Personalities.txt 
 Albert Einstein proposed the theory of relativity, which transformed our understanding of time,space, and gravity 
 torch.Size([1, 384])
Personalities.txt 
 Marie Curie was a physicist and chemist who conducted pioneering research on radioactivity and won two Nobel Prizes 
 torch.Size([1, 384])
Personalities.txt 
 Isaac Newton formulated the laws of motion and universal gravitation, laying the foundation for classical mechanics 
 torch.Size([1, 384])
Personalities.txt 
 Charles Darwin introduced the theory of evolution by natural selection in his book 'On the Origin of Species' 
 torch.Size([1, 384])
Personalities.txt 
 Ada Lovelace is regarded as the first computer programmer for her work on Charles Babbage's early mechanical computer, the Analytical Engine 
 torch.Size([1, 384])


## Index, Database and Retriever

We are now in position to define the retriever. In our setting with very few documents we will build our own index as a plain list. When the number of resources is pretty high (over $10^4$) one rather makes use of a dedicated index (which is trained for that purpose); one can for example make use of the FAISS library.

In [None]:
class Retriever():
    """Brute-force retriever: scores every indexed chunk against the query by cosine similarity.

    Attributes:
        embedding_model: model used to embed the query and to compute similarities.
        index: list of [Id(int),chunk] entries.
    """

    def __init__(self,embed_model: "EmbeddingModel"):

        self.embedding_model=embed_model

        # The index is a list of (Id(int),chunk); chunk needs the size DIM for the Embeddings
        self.index=[]


    def add_elements_to_index(self,chunks):
        """Append `chunks` (a list of chunk) to the index, numbering them after the existing entries."""

        for chunk_id, chunk in enumerate(chunks, start=len(self.index)):
            self.index.append([chunk_id,chunk])

    def search_best(self,query,number_of_hits=3,adapt=False,adapt_ratio=0.5):
        """Return the best matches for `query`, ordered by decreasing similarity.

        Args:
            query: the query (str).
            number_of_hits: maximal number of results; zero or a negative value gives [].
            adapt: if True, cut the list at the first hit whose similarity is smaller
                than `adapt_ratio` times the similarity of the hit just before it.
            adapt_ratio: ratio used by `adapt` (0.5 reproduces the historical rule).

        Returns:
            A list of (Id, chunk, sim) tuples.
        """

        query_embed = self.embedding_model.get_embeddings(texts=[query]).reshape(1,self.embedding_model.dim)

        scores=[]

        for chunk_id,chunk in self.index:

            sim = self.embedding_model.compute_cos_sim_embed(embed1=query_embed,embed2=chunk.embedding)

            scores.append((chunk_id,chunk,sim))

        # Clamp at 0: a negative bound would otherwise slice from the end of the list
        top_k = max(0,number_of_hits)

        results=sorted(scores,key=lambda hit:hit[2],reverse=True)[:top_k]

        # We can also add a criterion to exclude the worst hits; here we choose an arbitrary criterion
        # (we exclude a hit if the similarity is smaller than adapt_ratio times the previous one)

        if adapt:

            keep=1
            while keep<len(results) and results[keep][2] >= results[keep-1][2]*adapt_ratio:
                keep+=1

            results=results[:keep]

        return results

    def reset_Retriever_index(self):
        """Empty the index."""

        self.index=[]

### Unit test of the Retriever module

In [61]:
# Unit test

# Recall that we have already in our previous unit tests defined an embedding model and a splitter

        # embed_model=EmbeddingModel(EMBEDD_MODEL_PATH=Embed_mini)

        # Split=Splitter(embed_model)
        # Split.chunks=[]
        # Split.get_chunks(path_doc=this_path,chunk_size=30,overlap=15,sentence_split=True)

# This cell relies on Split and embed_model already being defined in the kernel
print(Split.docs)

chunks=Split.chunks

print(len(chunks))

retriever = Retriever(embed_model)


# Add the chunks to the index

# Get the best results using your retriever to the query 

        #prompt_RF='Is Robert Redford alive ?; Answer must be [Yes] or [No]'


retriever.add_elements_to_index(chunks=chunks)

query='Is Robert Redford alive'

# adapt=True drops the hits whose similarity falls below half of the previous one
results=retriever.search_best(query=query,number_of_hits=3,adapt=True)

print("results",results)

[{'source': 'Robert_Redford.txt', 'content_page': 'Robert Redford passed away last month.'}, {'source': 'Personalities.txt', 'content_page': "Albert Einstein proposed the theory of relativity, which transformed our understanding of time,space, and gravity. \nMarie Curie was a physicist and chemist who conducted pioneering research on radioactivity and won two Nobel Prizes.\nIsaac Newton formulated the laws of motion and universal gravitation, laying the foundation for classical mechanics.\nCharles Darwin introduced the theory of evolution by natural selection in his book 'On the Origin of Species'.\nAda Lovelace is regarded as the first computer programmer for her work on Charles Babbage's early mechanical computer, the Analytical Engine."}]
6
1
results [(0, <__main__.Chunk object at 0x380cf5370>, 0.8707782626152039)]


In [62]:
# Unit test follow up

# Empty the chunk list by hand so that get_chunks does not append to the previous chunks
Split.chunks=[]

Split.get_chunks(path_doc=this_path,chunk_size=30,overlap=15,sentence_split=True)

print(Split.docs)

chunks=Split.chunks

print(len(chunks))

# A fresh Retriever starts with an empty index
retriever = Retriever(embed_model)

retriever.add_elements_to_index(chunks=chunks)

query='Is Robert Redford alive'

# Without adapt the two best hits are returned whatever their similarity
results=retriever.search_best(query=query,number_of_hits=2)

print("results",results)


[{'source': 'Robert_Redford.txt', 'content_page': 'Robert Redford passed away last month.'}, {'source': 'Personalities.txt', 'content_page': "Albert Einstein proposed the theory of relativity, which transformed our understanding of time,space, and gravity. \nMarie Curie was a physicist and chemist who conducted pioneering research on radioactivity and won two Nobel Prizes.\nIsaac Newton formulated the laws of motion and universal gravitation, laying the foundation for classical mechanics.\nCharles Darwin introduced the theory of evolution by natural selection in his book 'On the Origin of Species'.\nAda Lovelace is regarded as the first computer programmer for her work on Charles Babbage's early mechanical computer, the Analytical Engine."}]
6
results [(0, <__main__.Chunk object at 0x37fc430e0>, 0.8707782626152039), (1, <__main__.Chunk object at 0x380cf5130>, 0.08921658992767334)]


# The RAG Architecture

In [22]:
class RAG():
    """Glues the foundation model, the embedding model, the splitter and the retriever together.

    CONFIG is a dict with keys 'FOUND_MODEL_PATH', 'EMBEDD_MODEL_PATH', 'CHUNK_SIZE',
    'OVERLAP' and, optionally, 'DIM_EMBED'.
    """

    def __init__(self,CONFIG):

        self.foundation_model=FoundationModel(FOUND_MODEL_PATH=CONFIG['FOUND_MODEL_PATH'])
        self.Embedding_model=EmbeddingModel(EMBEDD_MODEL_PATH=CONFIG['EMBEDD_MODEL_PATH'])
        self.splitter=Splitter(self.Embedding_model)
        self.retriever=Retriever(self.Embedding_model)

        # When 'DIM_EMBED' is not given, use the dimension reported by the embedding model
        self.dim_embed = CONFIG.get('DIM_EMBED',self.Embedding_model.dim)
        self.chunk_size = CONFIG['CHUNK_SIZE']
        self.overlap = CONFIG['OVERLAP']


    def reset_index(self):
        """Empty the retriever index and forget the documents and chunks of the splitter."""

        self.retriever.reset_Retriever_index()
        self.splitter.reset_splitter()


    def load_documents_and_get_chunks(self,path,sentence_split=False):
        """Chunk the documents found under `path` and add the new chunks to the index.

        Only the chunks created by this call are indexed, so chunks produced by an
        earlier call are not added to the index a second time.
        """

        # The splitter appends to its list: remember where the new chunks start
        first_new = len(self.splitter.chunks)

        self.splitter.get_chunks(path_doc=path,
                                 chunk_size=self.chunk_size,
                                 overlap=self.overlap,
                                 sentence_split=sentence_split)

        new_chunks = self.splitter.chunks[first_new:]

        self.retriever.add_elements_to_index(chunks=new_chunks)


    def get_retrieval(self,query,number_of_hits):
        """Return the contents (list of str, without repetition) of the best chunks for `query`."""

        retrieved_info = self.retriever.search_best(query=query,number_of_hits=number_of_hits)

        # It is the full information of the form (Id, chunk, sim)

        retrieved=[chunk.content for _, chunk, _ in retrieved_info]

        # We get rid of repeated items (the order of first appearance is kept)
        return list(dict.fromkeys(retrieved))

    def generate_response_rag(self,query,number_of_hits=3):
        """Answer `query` with the foundation model, using the `number_of_hits` best chunks as context."""

        retrieved=self.get_retrieval(query=query,
                                     number_of_hits=number_of_hits)

        return self.foundation_model.generate_response_with_context(prompt=query,
                                                                   context=retrieved)


# Experiments

In [23]:
# Settings read by RAG.__init__: model ids, embedding dimension and chunking parameters
CONFIG = {
    'FOUND_MODEL_PATH':Path_Q_06,
    #'FOUND_MODEL_PATH':"Qwen/Qwen3-4B",
    'EMBEDD_MODEL_PATH':"all-MiniLM-L6-v2",
    'DIM_EMBED':384,
    'CHUNK_SIZE':300,
    'OVERLAP':30
        }

# Loads the foundation model and the embedding model
rag = RAG(CONFIG=CONFIG)

loading configuration file config.json from cache at /Users/reveilla/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/c1899de289a04d12100db370d81485cdf75e47ca/config.json
Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full

Number of parameters in my model 5.96e+08


loading configuration file config.json from cache at /Users/reveilla/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config.json
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at /Users/reveilla/.cache/huggingface/hub/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/c9745ed1d9f207416be6d2e6f8de32d1f

In [24]:
# We initialize our RAG with our documents

# Start from an empty index and an empty splitter
rag.reset_index()

rag.load_documents_and_get_chunks(path=Path.cwd()/"Docs",sentence_split=True)

chunks=rag.splitter.chunks

## The Robert Redford experiment

In [26]:
# Once again for the prompt below compare the answer with and without RAG
        # prompt_RF='Is Robert Redford alive ?; Answer must be [Yes] or [No]'

# Print what is the retrieved context

query = prompt_RF

response_no_RAG = rag.foundation_model.generate_response(prompt=query)

# `context` is computed here but not printed in this cell;
# generate_response_rag below runs its own retrieval
context = rag.get_retrieval(query=query,number_of_hits=3)

response_RAG = rag.generate_response_rag(query=query)

# Both values are raw pipeline outputs (the <think> part is not removed)
print(response_no_RAG,"\n",response_RAG)

[{'generated_text': "<think>\nOkay, the user is asking if Robert Redford is alive. I need to check if there's any information I might be missing. First, I remember that Redford was born in 1930 and died in 2002. So, he was indeed alive at the time of his death. But wait, maybe there's a trick here. Let me think. Some people might confuse his death with his early life, but no, he was born in 1930 and died in 2002. So the answer should be [Yes].\n</think>\n\n[Yes]"}] 
 [{'generated_text': "<think>\nOkay, the user is asking if Robert Redford is alive. Let me check the contexts provided.\n\nContext 1 says Robert Redford passed away last month. So the answer should be [Yes] because he did pass away. But wait, maybe there's a trick here. The user might be testing if I can recognize that passing away means he's not alive. But according to general knowledge, Redford did pass away. So the correct answer is Yes.\n</think>\n\n[Yes]"}]


### Reranking

You noticed that since you asked for a fixed ```number_of_hits```, some of the retrieved context might be pointless. We aim at preventing this by selecting among the retrieved context. We provide two ways for that.

**1st we plot the similarity and try to find an accurate split out of it.**

In [60]:
# In both methods we start with our retrieved information

# Rebuild the index from scratch so that the chunks are not indexed twice
rag.reset_index()

rag.load_documents_and_get_chunks(path=Path.cwd()/"Docs",sentence_split=True)

# `query` must already be defined in the kernel by an earlier cell
retrieved_info = rag.retriever.search_best(query=query,number_of_hits=3)
# It is (Id,chunk,sim) and it is ordered by sim
# It is (Id,chunk,sim) and it is ordered by sim


In [61]:
# Keep a hit only while its similarity stays above 90% of the last kept similarity
all_rows=[]
select_data=[]

temp_sim = retrieved_info[0][2] # The highest sim

for Id,chunk,sim in retrieved_info:

    row = (Id,chunk.content,sim)
    all_rows.append(row)

    # the threshold is the current reference sim - 10%
    if sim > temp_sim*0.9:

        temp_sim = sim
        select_data.append(row)


column_names = ["Id","chunk","sim"]

data = pd.DataFrame(all_rows,columns=column_names)
selected_data = pd.DataFrame(select_data,columns=column_names)

print(data)
print(selected_data)

   Id                                              chunk       sim
0   0              Robert Redford passed away last month  0.719682
1   1  Albert Einstein proposed the theory of relativ...  0.094111
2   3  Isaac Newton formulated the laws of motion and...  0.045098
   Id                                  chunk       sim
0   0  Robert Redford passed away last month  0.719682


**2nd method : *LLM as a judge***

This time we give the query and the retrieved information to an LLM called the *judge* (*LLM as a judge*) and ask it to select the most suitable chunks.

In [62]:
# We define the judge. It is always better to take another LLM than the one that generates the answers,
# and one usually chooses the judge to be a more powerful model.
# NOTE(review): the original comment said a weaker model was selected for VRAM reasons, but the judge
# loaded here is Path_SDS (SmolLM3-3B) while the RAG uses Path_Q_06 (Qwen3-0.6B) - confirm the intent.

judge_llm = FoundationModel(FOUND_MODEL_PATH=Path_SDS)


loading configuration file config.json from cache at /Users/reveilla/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM3-3B/snapshots/a07cc9a04f16550a088caea529712d1d335b0ac1/config.json
Model config SmolLM3Config {
  "architectures": [
    "SmolLM3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "dtype": "bfloat16",
  "eos_token_id": 128012,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_atte

Number of parameters in my model 3.08e+09


In [63]:
# Texts of all the retrieved chunks (before any filtering), as a Python list
context_chunks = data["chunk"].tolist()

#print(context_chunks)

#prompt = print(f"Given [{query}] tell me among the elements of [{context_chunks}] which ones are the more relevant to [{query}]; give me the answer as a list of elements of [{context_chunks}]")

# The list is interpolated through its Python repr (brackets and quotes included)
prompt=f""" You are a clever assistant. Given the query: "{query}"; here is a list of resources "{context_chunks}". For each of those say which is useful to answer the query. Give your result by only listing the useful results.
"""

# Prints the raw pipeline output of the judge
print(judge_llm.generate_response(prompt=prompt))

[{'generated_text': '<think>\nOkay, let\'s tackle this query. The user is asking if Robert Redford is alive, and they want the answer to be Yes or No. The provided resources are three sentences, and I need to determine which ones are useful for answering the question.\n\nFirst, let me recall who Robert Redford is. He\'s an American actor, director, and producer known for his roles in films like "Butch Cassidy and the Sundance Kid" and "All the President\'s Men." I need to check if he\'s still alive.\n\nLooking at the first resource: "Robert Redford passed away last month." If this is true, then the answer would be No. The second resource talks about Albert Einstein\'s theory of relativity, which isn\'t relevant to Redford\'s current status. The third resource is about Newton\'s laws of motion, also unrelated. \n\nSince the first resource directly states that Redford passed away, that\'s the key information needed here. The other two sentences don\'t provide any information about his cu

## The personalities experiment

In [None]:
# You can extend the study by now looking to questions that still address one chunk but which are related semantically. 
# We introduce 5 queries; each one is related to one and only one personality

# The queries are not listed in the same order as the sentences of Personalities.txt
queries = [
"Who introduced the theory of relativity?",
"Who was the first computer programmer?",
"What did Isaac Newton contribute to science?",
"Who won two Nobel Prizes for research on radioactivity?",
"What is the theory of evolution by natural selection?"
]

print(queries)

['Who introduced the theory of relativity?', 'Who was the first computer programmer?', 'What did Isaac Newton contribute to science?', 'Who won two Nobel Prizes for research on radioactivity?', 'What is the theory of evolution by natural selection?']
['Albert Einstein proposed the theory of relativity, which transformed our understanding of time, space and gravity.', "Ada Lovelace is regarded as the first computer programmer for her work on Charles Babbage's early mechanical computer, the Analytical Engine.", 'Isaac Newton formulated the laws of motion and universal gravitation, laying the foundation for classical mechanics.', 'Marie Curie was a physicist and chemist who conducted pioneering research on radioactivity and won two Nobel Prizes.', "Charles Darwin introduced the theory of evolution by natural selection in his book 'On the Origin of Species'."]


In [None]:
# Similarly to before, test the index and the impact of the RAG on the queries above based on the reference (contrary to the previous situation the Open Source model already has a sufficient knowledge on these questions).