In [1]:
import os

# Check whether the code is running on Google Colab.
# The presence of the COLAB_GPU environment variable is used as the signal.
if 'COLAB_GPU' in os.environ:
    # Commands to run only on Google Colab
    # If a previous run already cloned the project, step into the checkout first
    # so that the '.git' test below looks at the right directory.
    if os.path.isdir('tp-rag'):
        %cd tp-rag
    if os.path.isdir('.git'):
        # Already inside the git repository: just pull.
        # Requirements are reinstalled only when the pull output does NOT contain
        # 'Already up to date.' (grep -q fails, so the `||` branch runs).
        # NOTE(review): the same branch also runs if `git pull` itself fails or
        # prints a localized message; `grep -q` also hides the pull output.
        !git pull | grep -q 'Already up to date.' || pip install -r requirements.txt
    else:
        # Clone the repository
        !git clone https://github.com/Florian-Audouard/tp-rag
        %cd tp-rag
        !pip install -r requirements.txt
else:
    # Commands to run when not on Google Colab
    print("Pas sur Google Colab, ces commandes ne seront pas exécutées.")

/content/tp-rag


In [2]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from uuid import uuid4


# Hugging Face model id handed to the embedding wrapper.
EMBEDDINGS_MODEL_NAME = "intfloat/multilingual-e5-base"
# Backward-compatible alias: the original, misspelled name is kept so that
# cells still referring to it continue to work.
EMBESSINGS_MODEL_NAME = EMBEDDINGS_MODEL_NAME
# Folder scanned by the document loader, relative to the current working directory.
DATA_FOLDER = "data/"
# Maximum chunk size given to the text splitter.
CHUNK_SIZE = 1000
# Overlap between consecutive chunks: one fifth (20 %) of the chunk size.
CHUNK_OVERLAP = CHUNK_SIZE // 5

In [3]:
# Embedding wrapper built from the model id configured in EMBESSINGS_MODEL_NAME.
embeddings = HuggingFaceEmbeddings(model_name=EMBESSINGS_MODEL_NAME)
# Chroma collection "example_collection" that embeds through `embeddings`.
# NOTE(review): because the collection is persisted on disk, re-running the
# ingestion cell presumably appends the same chunks again — confirm, and
# clear ./chroma_langchain_db before a full re-run if so.
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Local on-disk location of the collection; remove to keep it in memory only
)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


In [4]:
# Load everything DirectoryLoader finds under DATA_FOLDER and report how many
# documents were read.
loader = DirectoryLoader(DATA_FOLDER)
documents = loader.load()
print(f"Number of documents loaded: {len(documents)}")

Number of documents loaded: 63


In [5]:
# Split the loaded documents into chunks sized by CHUNK_SIZE, with CHUNK_OVERLAP
# shared between neighbouring chunks.
# add_start_index=True presumably records each chunk's start offset in its
# metadata — see the splitter documentation.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True
)
all_splits = text_splitter.split_documents(documents)
# NOTE(review): the message says "paragraphs", but the count is the number of
# splitter chunks.
print(f"Number of paragraphs created: {len(all_splits)}")

Number of paragraphs created: 8847


In [None]:
# Feed the chunks to the vector store slice by slice so that a single call
# never receives more than BATCH_SIZE documents.
BATCH_SIZE = 5000
for batch_number, start in enumerate(range(0, len(all_splits), BATCH_SIZE), start=1):
    batch = all_splits[start : start + BATCH_SIZE]
    vector_store.add_documents(documents=batch)
    # batch_number is 1-based, matching the human-readable progress message.
    print(f"Added batch {batch_number}: {len(batch)} documents")
print(f"All {len(all_splits)} documents added to the vector store.")

In [None]:
def tmp_wtf(query, k=3, score=False):
    """Search the notebook's vector store for the chunks closest to ``query``.

    Args:
        query: Search text passed unchanged to the vector store.
        k: Number of results requested from the store.
        score: When true, call ``similarity_search_with_score`` instead of
            ``similarity_search``.

    Returns:
        Whatever the selected vector-store search method returns.
    """
    # `vector_store` is the store object itself (it is assigned a Chroma
    # instance earlier in the notebook), so the search methods are called on it
    # directly; going through a nested `.vector_store` attribute raised
    # AttributeError.
    if score:
        return vector_store.similarity_search_with_score(query, k=k)
    return vector_store.similarity_search(query, k=k)


# Emit an empty line before the cell's result is displayed.
print()
# Last expression of the cell, so its return value is rendered as the cell output.
# NOTE(review): the output saved with this cell shows formatted "Result N"
# headings and more than one result although k=1 is requested — it looks stale;
# re-run the cell to refresh it.
tmp_wtf("How does Diffusion Models work?", k=1)

Result 1 document title: data/autres_articles/2412.18604v1.pdf
4 2 0 2 c e D 4 2

]

V C . s c [

1 v 4 0 6 8 1 . 2 1 4 2 : v i X r a

0.99

0.41

Perceived Younger

Perceived Older

MoreTraditional

MoreModern

0.0

0.96

0.95

0.92

HealthyFood

JunkFood

FormalFit

CasualFit

0.88

0.94

MoreDog

MoreCat

MoreGray-Crowned

LessGray-Crowned

0.31

0.50

Healthy Retina

Unhealthy Retina

SickLeaf

HealthyLeaf

0.99

0.18

0.64

0.70

0.83

0.12

Explaining in Diffusion: Explaining a Classifier Through Hierarchical Semantics with Text-to-Image Diffusion Models

Tahira Kazimi† Ritika Allada† Pinar Yanardag Virginia Tech {tahirakazimi, ritika88, pinary}@vt.edu explain-in-diffusion.github.io

Result 2 document title: data/autres_articles/2412.18604v1.pdf
this method has several

Recent approaches have begun using diffusion mod- els to generate counterfactual examples. One method utilizes shortcut learning to generate counterfactual im- ages but fails to make semantically meaningful edits 