Importations

In [1]:
!pip install sentence-transformers qdrant-client pandas

Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting qdrant-client
  Downloading qdrant_client-1.11.3-py3-none-any.whl.metadata (10 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant-client)
  Downloading grpcio_tools-1.66.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
Collecting httpx>=0.20.0 (from httpx[http2]>=0.20.0->qdrant-client)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-tools>=1.41.0->qdrant-client)
  Downloading protobuf-5.28.2-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting grpcio>=1.41.0 (from qdrant-client)
  Downloading grpcio-1.66.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting httpcore==1.* (from httpx>=0.20.0->httpx[http2]>=

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models

**Chargement des données**

In [None]:
# Location of the Bible CSV file; replace it with the path to your own file.
BIBLE_CSV_PATH = 'votre_fichier_bible.csv'
df = pd.read_csv(BIBLE_CSV_PATH)

**Génération des embeddings**

In [None]:
encoder = SentenceTransformer("BAAI/bge-m3")  # Swap in another sentence-transformers model if desired


def _split_into_chunks(text):
    """Split one 'Text' cell on '. ' and return its non-blank pieces.

    Non-string values (e.g. NaN for a missing cell) yield an empty list
    instead of raising AttributeError, and blank fragments (such as the one
    left after a trailing '. ') are dropped so they never reach the encoder.
    """
    if not isinstance(text, str):
        return []
    return [piece for piece in text.split('. ') if piece.strip()]


# One list of text chunks per row, stored alongside the original columns.
df['text_chunks'] = df['Text'].apply(_split_into_chunks)

**Stockage des embeddings**

In [None]:
# Address of the Qdrant server; change it if yours does not run locally.
QDRANT_URL = "http://localhost:6333"
client = QdrantClient(url=QDRANT_URL)

**Création d'une collection dans Qdrant**

In [None]:
# Create the collection only when it is missing, so re-running this cell
# (e.g. after a kernel restart) does not fail because "bible" already exists.
if not client.collection_exists(collection_name="bible"):
    client.create_collection(
        collection_name="bible",
        vectors_config=models.VectorParams(
            size=encoder.get_sentence_embedding_dimension(),  # vector width reported by the encoder
            distance=models.Distance.COSINE,
        ),
    )

**Créer les points et les téléverser**

In [None]:

# Build the points and upload them.
# Flatten every (row, chunk) pair first so all chunks can be encoded in one
# batched call instead of one encoder call per chunk.
payloads = [
    {
        'Book Name': row['Book Name'],
        'Chapter': row['Chapter'],
        'Verse': row['Verse'],
        'Text': chunk,  # the payload stores the chunk, not the full row text
    }
    for _, row in df.iterrows()
    for chunk in row['text_chunks']
]

# encode() accepts a list of sentences; skip the call when there is nothing to embed.
vectors = encoder.encode([payload['Text'] for payload in payloads]) if payloads else []

# IDs are sequential and start at 1, in the same order as the chunks.
points = [
    models.PointStruct(
        id=point_id,
        vector=vector.tolist(),  # convert the embedding to a plain list
        payload=payload,
    )
    for point_id, (vector, payload) in enumerate(zip(vectors, payloads), start=1)
]

# Upload the points into the collection.
client.upload_points(
    collection_name="bible",
    points=points,
)

print("Les embeddings ont été générés et sauvegardés dans Qdrant.")

**Recherche des voisins les plus proches**

In [None]:
def search_bible(question, k=5):
    """Return the passages of the "bible" collection closest to a question.

    Args:
        question: Free-text query; it is embedded with the notebook-level
            ``encoder``.
        k: Maximum number of hits to return.

    Returns:
        A list of dicts in the order returned by Qdrant, each with the keys
        'distance', 'Book Name', 'Chapter', 'Verse' and 'Text'. 'distance'
        holds the score Qdrant reports for the hit (the key name is kept for
        existing callers).
    """
    question_embedding = encoder.encode(question)  # embed the question

    # client.search returns a flat list of scored points, not a FAISS-style
    # (distances, indices) pair. Each hit already carries its score and its
    # payload, so no second lookup per point is needed.
    hits = client.search(
        collection_name="bible",
        query_vector=question_embedding.tolist(),
        limit=k,
        with_payload=True,
    )

    return [
        {
            'distance': hit.score,
            'Book Name': hit.payload['Book Name'],
            'Chapter': hit.payload['Chapter'],
            'Verse': hit.payload['Verse'],
            'Text': hit.payload['Text'],
        }
        for hit in hits
    ]

In [None]:
# Example query: look up the closest passages and print one line per hit.
question = "Quel est le sens de la vie selon la Bible ?"
results = search_bible(question)

formatted_hits = [
    f"Book: {result['Book Name']}, Chapter: {result['Chapter']}, Verse: {result['Verse']}, Text: {result['Text']}, Distance: {result['distance']:.4f}"
    for result in results
]
for line in formatted_hits:
    print(line)