In [85]:
import subprocess
import time
import os

def fetch_ollama():
    """Download the Ollama Linux (amd64) release archive and unpack it under /usr/local.

    Every step is an IPython ``!`` shell escape, so this function only works
    inside an IPython/Jupyter kernel. The exit status of the commands is not
    inspected here.
    """
    # Save the release tarball into the current working directory.
    !curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
    # Make sure the directory expected to hold the binary exists.
    !mkdir -p /usr/local/bin
    # Extract relative to /usr/local; install_ollama() looks for the binary at /usr/local/bin/ollama.
    !tar -C /usr/local -xzf ollama-linux-amd64.tgz
    !chmod +x /usr/local/bin/ollama

def install_ollama():
    """Make sure the Ollama binary exists, then start ``ollama serve`` in the background.

    The binary is downloaded with ``fetch_ollama()`` only when
    ``/usr/local/bin/ollama`` is missing. The server is started with
    ``OLLAMA_HOST=0.0.0.0:11434`` added to a copy of the current environment.

    Returns:
        None. The child process is left running after this function returns.
    """
    if not os.path.isfile('/usr/local/bin/ollama'):
        fetch_ollama()

    # Discard the server's output instead of piping it: nothing ever reads
    # these streams, and an unread PIPE blocks the child once the OS pipe
    # buffer is full.
    subprocess.Popen(
        ['/usr/local/bin/ollama', 'serve'],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        env={**os.environ, 'OLLAMA_HOST': '0.0.0.0:11434'},
    )

    # Give the server a moment to start before callers talk to it.
    time.sleep(5)

def check_ollama():
    """Start Ollama through ``install_ollama()`` when nothing answers on localhost:11434.

    Any HTTP response counts as "server is up": the status code and body are
    not inspected. A connection failure triggers ``install_ollama()``.

    Raises:
        requests.Timeout: if a connection is accepted but no response arrives
            within the timeout.
    """
    import requests

    try:
        # Bound the wait; without a timeout an unresponsive socket blocks the cell forever.
        requests.get("http://localhost:11434/ping", timeout=5)
        print("Ollama est d√©j√† install√©.")
    except requests.ConnectionError:
        print("Ollama n'est pas install√©. Installation en cours...")
        install_ollama()
        print("Ollama a √©t√© install√© avec succ√®s.")

# Check whether the code is running on Google Colab (detected through the COLAB_GPU environment variable)
if 'COLAB_GPU' in os.environ:
    # Commands executed only on Google Colab
    # Enter the checkout when it sits in the current directory (e.g. after an earlier clone).
    if os.path.isdir('tp-rag'):
        %cd tp-rag
    if os.path.isdir('.git'):
        # Already inside the git repository: pull, and reinstall the requirements
        # unless git reported 'Already up to date.' (grep -q fails, so the `||` branch runs).
        !git pull | grep -q 'Already up to date.' || pip install -r requirements.txt
    else:
        # Clone the repository
        !git clone https://github.com/Florian-Audouard/tp-rag
        %cd tp-rag
        !pip install -r requirements.txt
    # Make sure an Ollama server is reachable, then fetch the model used by ChatOllama below.
    check_ollama()
    !/usr/local/bin/ollama pull qwen3:8b

else:
    # Commands executed when not running on Google Colab
    print("Pas sur Google Colab, ces commandes ne seront pas ex√©cut√©es.")

remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (2/2), done.[K
Unpacking objects: 100% (4/4), 9.85 KiB | 4.92 MiB/s, done.
remote: Total 4 (delta 2), reused 4 (delta 2), pack-reused 0 (from 0)[K
From https://github.com/Florian-Audouard/tp-rag
   e689dd8..26f98c7  master     -> origin/master
Ollama est d√©j√† install√©.
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l


In [78]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import ChatOllama
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder


# Hugging Face identifier of the embedding model used for both vector stores.
EMBEDDINGS_MODEL_NAME = "intfloat/multilingual-e5-base"
# Misspelled original name, kept as an alias so existing cells keep working.
EMBESSINGS_MODEL_NAME = EMBEDDINGS_MODEL_NAME
# Directory scanned for the source documents.
DATA_FOLDER = "data/"
# Chunk length handed to the text splitter.
CHUNK_SIZE = 1000
# Overlap between consecutive chunks: one fifth of the chunk size.
CHUNK_OVERLAP = CHUNK_SIZE // 5

In [50]:
# One embedding model instance shared by both collections below.
embeddings = HuggingFaceEmbeddings(model_name=EMBESSINGS_MODEL_NAME)
# Collection that receives the chunked documents (`all_splits`); queried by generate_answer().
vector_store_splits = Chroma(
    collection_name="split_data_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_split_db",  # Where to save data locally, remove if not necessary
)
# Collection that receives the unsplit documents; queried by create_sumarry().
vector_store_full = Chroma(
    collection_name="full_data_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_full_db",  # Where to save data locally, remove if not necessary
)

In [51]:
# Load whatever DirectoryLoader finds under DATA_FOLDER; `documents` feeds both the
# splitter and the full-document collection.
loader = DirectoryLoader(DATA_FOLDER)
documents = loader.load()
print(f"Number of documents loaded: {len(documents)}")

Number of documents loaded: 63


In [52]:
# Split the loaded documents into overlapping chunks. add_start_index=True asks the
# splitter to record where each chunk starts (the 'start_index' metadata visible in
# the query results).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True
)
all_splits = text_splitter.split_documents(documents)
print(f"Number of paragraphs created: {len(all_splits)}")

Number of paragraphs created: 8847


In [53]:
# The collection is persisted on disk and add_documents() stores new records on
# every call, so re-running this cell used to insert each document again.
# Populate the collection only while it is still empty; delete the persist
# directory to force a rebuild.
if not vector_store_full.get(limit=1)["ids"]:
    _ = vector_store_full.add_documents(documents=documents)

In [54]:
# Add documents in batches to avoid exceeding max batch size
BATCH_SIZE = 5000
# The collection is persisted on disk, so re-running this cell used to store
# every chunk again (duplicated search hits, repeated embedding work). Insert
# only while the collection is still empty; delete the persist directory to
# force a rebuild.
if vector_store_splits.get(limit=1)["ids"]:
    print("Vector store already populated; skipping insertion.")
else:
    for i in range(0, len(all_splits), BATCH_SIZE):
        batch = all_splits[i : i + BATCH_SIZE]
        vector_store_splits.add_documents(documents=batch)
        print(f"Added batch {i//BATCH_SIZE + 1}: {len(batch)} documents")
    print(f"All {len(all_splits)} documents added to the vector store.")

Added batch 1: 5000 documents
Added batch 2: 3847 documents
All 8847 documents added to the vector store.


In [None]:
def generate_query(vector_store, query, k=3, score=False):
    """Run a similarity search for ``query`` against ``vector_store``.

    Args:
        vector_store: Object exposing ``similarity_search`` and
            ``similarity_search_with_score``.
        query: Text to search for.
        k: Number of results requested.
        score: When true, use the scored search variant instead of the plain one.

    Returns:
        Whatever the selected search method returns.
    """
    search = (
        vector_store.similarity_search_with_score
        if score
        else vector_store.similarity_search
    )
    return search(query, k=k)


# Sanity check: fetch the single closest chunk for a sample question.
generate_query(vector_store_splits, "what is Video-Panda ?", k=1)




[Document(id='5753a59f-f862-4b2a-8eb6-0b2f8796f1b6', metadata={'start_index': 52738, 'source': 'data/autres_articles/2412.18609v1.pdf'}, page_content='F. Broader Impact\n\nWe introduce Video-Panda, an encoder-free Video Lan- guage Model for video understanding. Our model addresses key ethical and practical challenges in large-scale AI de- ployment. While many VLMs raise concerns about data bias, privacy, and computational costs, Video-Panda miti- gates these issues through two key design choices: training exclusively on publicly available datasets and eliminating the need for a pretrained encoder. This approach not only reduces ethical concerns but also significantly lowers com- putational requirements and deployment costs, making the model more accessible and environmentally sustainable.')]

In [None]:
# API_KEY is not assigned anywhere in the visible notebook, so this cell only
# worked with a variable left over in the kernel. Keep honouring such a
# variable, otherwise read the key from the environment so a fresh kernel works
# and the secret stays out of the notebook.
API_KEY = globals().get("API_KEY") or os.environ.get("GROQ_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Groq API key missing: set the GROQ_API_KEY environment variable before running this cell."
    )

# Hosted model served by Groq.
llm = ChatGroq(api_key=API_KEY, model="llama-3.1-8b-instant", temperature=0)

# Local model served by the Ollama server.
llm_ollama = ChatOllama(
    model="qwen3:8b",
    temperature=0,
)

# Smoke test: one round trip per backend.
res1 = llm.invoke("Hello, world!").content
res2 = llm_ollama.invoke("Hello, world!").content

# The first backend is Groq (ChatGroq); the label previously read "grok".
print("Response from groq :", res1)
print("Response from ollama :", res2)

Response from grok : Hello, world. It's nice to meet you. Is there something I can help you with or would you like to chat?
Response from ollama : Hello! üòä How can I assist you today? Whether you have questions, need help with something, or just want to chat, I'm here for you! What's on your mind?


In [None]:
# System instructions for the Q&A chain: answer from the supplied DOCUMENTS and admit when the answer is unknown.
SYSTEM_PROMPT = """You are a helpful AI assistant that helps people find information. Use the provided DOCUMENTS to answer the question at the end. If you don't know the answer, just say you don't know, don't try to make up an answer."""
# Human-turn template; {context} and {question} are filled in by generate_answer().
USER_PROMPT = """DOCUMENTS:
{context}
QUESTION: {question}
Answer:"""


def generate_answer(
    agent,
    question,
    get_session_history=lambda x: InMemoryChatMessageHistory(),
    session_id="user-1",
):
    """Answer ``question`` with ``agent`` using chunks retrieved from ``vector_store_splits``.

    Args:
        agent: Chat model (or other runnable) piped after the prompt.
        question: User question; also used as the retrieval query.
        get_session_history: Callable mapping a session id to a chat history
            object. The default returns a new empty history on every call, so
            nothing is remembered between calls.
        session_id: Key handed to ``get_session_history``. Defaults to
            ``"user-1"``, the value that used to be hard-coded.

    Returns:
        The ``content`` attribute of the chain's response.
    """
    results = generate_query(vector_store_splits, question, k=3)
    # Label each retrieved chunk so the model can tell them apart.
    context = "".join(
        f"DOCUMENT {i}:\n{doc.page_content}\n\n" for i, doc in enumerate(results)
    )

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", SYSTEM_PROMPT),
            MessagesPlaceholder(variable_name="history"),
            ("human", USER_PROMPT),
        ]
    )
    chain = prompt | agent
    # The wrapper loads the stored messages into "history" and records the
    # "question" input together with the model's reply.
    chain_with_memory = RunnableWithMessageHistory(
        chain,
        get_session_history,
        input_messages_key="question",
        history_messages_key="history",
    )

    response = chain_with_memory.invoke(
        {"context": context, "question": question},
        config={"configurable": {"session_id": session_id}},
    )
    return response.content


# Example call; with the default get_session_history every call starts from an empty history.
generate_answer(llm, "What is Video-Panda?")

'Video-Panda is an **encoder-free Video Language Model (VLM)** designed for video understanding. It addresses ethical and practical challenges in AI deployment by training exclusively on publicly available datasets and eliminating the need for a pretrained encoder. This approach reduces computational costs, improves environmental sustainability, and enhances accessibility. Video-Panda demonstrates competitive performance compared to models like VideoChat-GPT and Video-LLaVA, with faster inference speeds (processing videos in ~41ms) and stronger results in correctness, context, and temporal understanding, despite using fewer parameters (45M) and fewer video frames (8 vs. 100).'

In [81]:
# Chat histories created so far, keyed by session id.
store = {}


def get_session_history(session_id: str):
    """Return the chat history stored for ``session_id``, creating it on first use."""
    try:
        return store[session_id]
    except KeyError:
        fresh_history = InMemoryChatMessageHistory()
        store[session_id] = fresh_history
        return fresh_history


# Both calls pass the same get_session_history, and generate_answer() always uses the
# session id "user-1", so the second question should run with the first exchange in its history.
ans1 = generate_answer(llm, "What is Video-Panda?", get_session_history)
ans2 = generate_answer(llm, "Tell me what we discussed earlier?", get_session_history)
print("Answer 1:", ans1)
print("Answer 2:", ans2)

Answer 1: Video-Panda is an **encoder-free Video Language Model (VLM)** designed for video understanding. It addresses ethical and practical challenges in AI deployment by training exclusively on publicly available datasets and eliminating the need for a pretrained encoder. This design reduces computational costs, improves accessibility, and enhances environmental sustainability. Video-Panda demonstrates competitive performance compared to models like VideoChat-GPT and Video-LLaVA, with faster inference speeds (processing videos in ~41ms) and efficient parameter usage (45M parameters for its visual component). It excels in correctness, context, and temporal understanding while using fewer video frames (8 vs. 100) than competing models.
Answer 2: Based on the provided documents, there is no information available about previous discussions or conversation history. The documents focus on technical content related to AI models, dialogue systems, and academic references, but none contain re

In [93]:
# System instructions for the summarization chain.
SYSTEM_PROMPT_SUMMARY = """You are a helpful AI assistant that helps people summarize documents. Use the provided DOCUMENT to create a concise summary."""

# Human-turn template; {document} receives the text to summarize.
USER_PROMPT_SUMMARY = """DOCUMENT:{document}"""


def create_sumarry(document, debug=False):
    """Summarize the stored full document that best matches ``document``.

    Args:
        document: Query text used to look up one entry in ``vector_store_full``.
        debug: When true, print the ``source`` metadata of the matched entry.

    Returns:
        The ``content`` of the ``llm_ollama`` response.
    """
    # Only the top hit is summarized.
    best_match = generate_query(vector_store_full, document, k=1)[0]
    if debug:
        print("Document to summarize:", best_match.metadata["source"])

    summary_prompt = ChatPromptTemplate.from_messages(
        [("system", SYSTEM_PROMPT_SUMMARY), ("human", USER_PROMPT_SUMMARY)]
    )
    reply = (summary_prompt | llm_ollama).invoke(
        {"document": best_match.page_content}
    )
    return reply.content


# Example: summarize the stored document closest to the topic "Video-Panda".
summary = create_sumarry("Video-Panda", debug=True)
print("Summary:", summary)

Document to summarize: data/autres_articles/2412.18609v1.pdf
Summary: The document presents **Video-Panda**, an encoder-free video language model (VLM) designed for video understanding, with a focus on balancing performance, efficiency, and ethical considerations. Below is a structured summary of its key components and findings:

---

### **1. Model Architecture & Training Strategy**
- **Encoder-Free Design**: Video-Panda eliminates the need for a pre-trained encoder, reducing computational costs and ethical concerns (e.g., data bias, privacy).
- **Staged Training**: 
  - **Stage 1**: Uses 702K video-text pairs for initial alignment, with gradual complexity scaling to avoid overfitting.
  - **Stage 2**: Uses half the dataset (351K samples) to refine representations.
  - **Stage 3**: Freezes parameters for fine-tuning, ensuring robustness.
- **Learnable Selective Downsampling (LSD)**: 
  - Applied after **Local Spatial-Temporal Encoding (LSTE)** to preserve temporal context.
  - Outperf

In [None]:
import uuid
import gradio as gr


def summarize_doc(topic):
    """Gradio callback for the "Summarize" tab: return create_sumarry(topic) without debug output."""
    return create_sumarry(topic, debug=False)


def chat_answer(message, history, request: gr.Request = None):
    """Answer one chat message for the Gradio chat tab.

    Args:
        message: Text typed by the user.
        history: Chat transcript supplied by Gradio; unused, because the
            conversation memory is kept in the history store behind
            ``get_session_history``.
        request: Filled in by Gradio when the handler declares it; its
            ``session_hash`` identifies the browser session. Defaults to None
            so direct calls keep working.

    Returns:
        The answer text produced by ``generate_answer``.
    """
    # generate_answer() is called without a session id of its own, so every
    # visitor of the shared link would read and write one common history.
    # Route the lookup to a per-browser-session key instead; direct calls
    # without a request keep using "user-1".
    session_key = getattr(request, "session_hash", None) or "user-1"

    def history_for_session(_session_id):
        return get_session_history(session_key)

    return generate_answer(llm, message, history_for_session)


# Close a demo left over from a previous run of this cell before relaunching.
try:
    if "demo" in globals() and hasattr(demo, "close"):
        demo.close()
        time.sleep(1)  # Give it a moment to close
        print("Closed existing Gradio server")
except Exception as exc:
    # Best effort: a failed shutdown must not block the relaunch below, but it
    # should not disappear silently either.
    print(f"Could not close existing Gradio server: {exc!r}")

# Two-tab UI: a chat tab backed by chat_answer() and a summary tab backed by summarize_doc().
with gr.Blocks() as demo:
    gr.Markdown(
        "## RAG assistant\nInteract with the vector store for Q&A or quick summaries."
    )
    with gr.Tab("Q&A chat"):
        gr.ChatInterface(
            fn=chat_answer,
            title="Ask the indexed documents",
            description="Conversational Q&A using your vector store.",
        )
    with gr.Tab("Summarize"):
        topic_box = gr.Textbox(label="Document/topic", placeholder="e.g., Video-Panda")
        summary_box = gr.Textbox(label="Summary", lines=6)
        sum_btn = gr.Button("Summarize")
        # Clicking the button sends the topic text to summarize_doc() and shows the result.
        sum_btn.click(fn=summarize_doc, inputs=topic_box, outputs=summary_box)

# share=True requests a public share link (a gradio.live URL appears in the cell output).
demo.launch(server_name="0.0.0.0", share=True)

Closing server running on port: 7861
Closed existing Gradio server


  self.chatbot = Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ddeacaaf5c66f156da.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/gradio/queueing.py", line 759, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/route_utils.py", line 354, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 2191, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 1696, in call_function
    prediction = await fn(*processed_input)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/utils.py", line 882, in async_wrapper
    response = await f(*args, **kwargs)
               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-