In [19]:
import os

# Vérifie si le code est exécuté sur Google Colab
if 'COLAB_GPU' in os.environ:
    # Commandes à exécuter uniquement sur Google Colab
    if os.path.isdir('tp-rag'):
        %cd tp-rag
    if os.path.isdir('.git'):
        # Already in the git repository, just pull
        # Pull updates; only check/install if no updates
        !git pull | grep -q 'Already up to date.' || pip install -r requirements.txt
    else:
        # Clone the repository
        !git clone https://github.com/Florian-Audouard/tp-rag
        %cd tp-rag
        !pip install -r requirements.txt
else:
    # Commandes à exécuter si ce n'est pas sur Google Colab
    print("Pas sur Google Colab, ces commandes ne seront pas exécutées.")

remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (1/1), done.[K
remote: Total 4 (delta 3), reused 4 (delta 3), pack-reused 0 (from 0)[K
Unpacking objects: 100% (4/4), 1.33 KiB | 1.33 MiB/s, done.
From https://github.com/Florian-Audouard/tp-rag
   1ebb0ad..244c6d1  master     -> origin/master


In [29]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from chromadb import Client
from chromadb.config import Settings
from langgraph.checkpoint import Checkpointer


EMBESSINGS_MODEL_NAME = "intfloat/multilingual-e5-base"
DATA_FOLDER = "data/"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = CHUNK_SIZE // 5

ImportError: cannot import name 'Checkpointer' from 'langgraph.checkpoint' (unknown location)

In [3]:
embeddings = HuggingFaceEmbeddings(model_name=EMBESSINGS_MODEL_NAME)
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


In [4]:
loader = DirectoryLoader(DATA_FOLDER)
documents = loader.load()
print(f"Number of documents loaded: {len(documents)}")

Number of documents loaded: 63


In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True
)
all_splits = text_splitter.split_documents(documents)
print(f"Number of paragraphs created: {len(all_splits)}")

Number of paragraphs created: 8847


In [None]:
# Add documents in batches to avoid exceeding max batch size
BATCH_SIZE = 5000
for i in range(0, len(all_splits), BATCH_SIZE):
    batch = all_splits[i : i + BATCH_SIZE]
    vector_store.add_documents(documents=batch)
    print(f"Added batch {i//BATCH_SIZE + 1}: {len(batch)} documents")
print(f"All {len(all_splits)} documents added to the vector store.")

In [26]:
def generate_query(query, k=3, score=False):
    if score:
        return vector_store.similarity_search_with_score(query, k=k)
    return vector_store.similarity_search(query, k=k)


print()
generate_query("what is Video-Panda ?", k=1)




[Document(id='1f56e887-9255-470c-b68e-002f38192e9b', metadata={'start_index': 52738, 'source': 'data/autres_articles/2412.18609v1.pdf'}, page_content='F. Broader Impact\n\nWe introduce Video-Panda, an encoder-free Video Lan- guage Model for video understanding. Our model addresses key ethical and practical challenges in large-scale AI de- ployment. While many VLMs raise concerns about data bias, privacy, and computational costs, Video-Panda miti- gates these issues through two key design choices: training exclusively on publicly available datasets and eliminating the need for a pretrained encoder. This approach not only reduces ethical concerns but also significantly lowers com- putational requirements and deployment costs, making the model more accessible and environmentally sustainable.')]

In [None]:
llm = ChatGroq(
    api_key="",
    model="llama-3.1-8b-instant",
    temperature=0,
)

llm.invoke("Hello, world!").content

"Hello, world. It's nice to meet you. Is there something I can help you with or would you like to chat?"

In [28]:
prompt_template = """
You are a knowledgeable assistant. Use the following context to answer the question at the end.
DOCUMENTS:
{context}

QUESTION:
{question}

ANSWER:"""


def generate_answer(question):
    results = generate_query(question, k=3)
    context = ""
    for i, documents in enumerate(results):
        context += f"DOCUMENT {i}" + ":\n"
        context += documents.page_content + "\n\n"

    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=prompt_template,
    )

    chain = prompt | llm
    answer = chain.invoke({"context": context, "question": question}).content
    return answer


generate_answer("What is Video-Panda?")

'Video-Panda is an encoder-free Video Language Model for video understanding.'