In [1]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from rank_bm25 import BM25Okapi
from nltk.corpus import wordnet

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the JSON data from a file.
# The whole file is read as UTF-8 text and parsed in one go; the parsed
# object is kept in `json_data` for the cells that follow.
json_file_path = "actions.json"
with open(json_file_path, mode="r", encoding="utf-8") as file:
    json_data = json.loads(file.read())

In [3]:
# Extract the texts and their identifiers.
# `json_data` is used as a mapping: its keys are the texts that get indexed
# and its values are the identifiers returned alongside them.
documents = list(json_data.keys())
doc_ids = list(json_data.values())
# Fail early with a readable message instead of an obscure error further down
# (the embedding matrix would have no second dimension to size the index with).
assert documents, "json_data is empty: there is nothing to index"

# Initialise the SBERT bi-encoder and the cross-encoder re-ranker.
# NOTE(review): both checkpoints look English-centric while the example query
# in this notebook is French -- confirm whether multilingual models are needed.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Compute the document embeddings, L2-normalised by the model itself so that
# `embeddings` holds unit vectors regardless of the device it lives on.
embeddings = sbert_model.encode(
    documents, convert_to_tensor=True, normalize_embeddings=True
)

# Index with FAISS.
# FAISS needs a C-contiguous float32 NumPy matrix on the host; `.cpu()` makes
# the conversion work when the model runs on a GPU (a bare `.numpy()` raises
# for CUDA tensors).
doc_vectors = np.ascontiguousarray(
    embeddings.detach().cpu().numpy(), dtype=np.float32
)
# With unit-length vectors, ranking by L2 distance is equivalent to ranking by
# cosine similarity.
index = faiss.IndexFlatL2(doc_vectors.shape[1])
index.add(doc_vectors)

# Initialise BM25 on whitespace-tokenised documents (case is left untouched,
# so query tokens must match the corpus casing exactly).
tokenized_corpus = [doc.split() for doc in documents]
bm25 = BM25Okapi(tokenized_corpus)

In [4]:
def expand_query(query, lang="eng"):
    """Expand a query with WordNet synonyms of each of its words.

    The query is split on whitespace. Every word is looked up in WordNet and
    the names of all lemmas of all matching synsets are appended to the query.

    Args:
        query: Free-text query string.
        lang: WordNet language code used for the lookup and for the returned
            lemma names. Defaults to ``"eng"``, which matches the behaviour of
            calling ``wordnet.synsets(word)`` without a language. Languages
            other than English presumably require the Open Multilingual
            WordNet data to be installed -- verify before passing e.g.
            ``"fra"`` for French queries.

    Returns:
        A single space-separated string: the original words first (in their
        original order, de-duplicated), followed by the new synonym tokens in
        the order WordNet yields them. Returns ``""`` for an empty query.
    """
    # A dict is used as an insertion-ordered set: joining a plain `set` gives
    # a token order that changes between interpreter runs (string hashing is
    # randomised), which makes the embedded query text non-reproducible.
    expanded = dict.fromkeys(query.split())

    # Iterate over a snapshot because `expanded` grows inside the loop.
    for word in list(expanded):
        for syn in wordnet.synsets(word, lang=lang):
            for lemma in syn.lemmas(lang=lang):
                # WordNet joins multi-word lemmas with "_" (e.g. "town_house");
                # split them into plain words so they can match tokens of a
                # whitespace-tokenised corpus.
                for token in lemma.name().replace("_", " ").split():
                    expanded.setdefault(token)

    return " ".join(expanded)

def search(query, top_k=10):
    """Hybrid search: dense (FAISS) + lexical (BM25) retrieval, then re-ranking.

    Relies on the module-level objects built earlier in the notebook:
    ``documents``, ``doc_ids``, ``sbert_model``, ``index``, ``bm25`` and
    ``cross_encoder``.

    Args:
        query: Free-text query string.
        top_k: Number of candidates fetched from each retriever and maximum
            number of results returned.

    Returns:
        A list of at most ``top_k`` ``(document, doc_id)`` tuples, ordered from
        the highest to the lowest cross-encoder score. Empty when ``top_k`` is
        not positive or when there are no documents.
    """
    if top_k <= 0 or not documents:
        return []

    query_expanded = expand_query(query)

    # --- Dense retrieval -------------------------------------------------
    # NOTE(review): the *expanded* query is what gets embedded (unchanged from
    # the previous behaviour); a bag of synonyms may dilute the sentence
    # embedding -- consider embedding the raw `query` instead.
    # Encode straight to a host NumPy array: calling `.numpy()` on a tensor
    # raises when the model runs on a GPU.
    query_vec = np.ascontiguousarray(
        sbert_model.encode([query_expanded], convert_to_numpy=True),
        dtype=np.float32,
    )
    faiss.normalize_L2(query_vec)  # in place, on an array we own
    n_neighbours = min(top_k, len(documents))
    _, faiss_results = index.search(query_vec, n_neighbours)
    # FAISS pads missing neighbours with -1; without this filter a -1 would
    # silently select the *last* document through negative indexing.
    dense_hits = [int(i) for i in faiss_results[0] if i >= 0]

    # --- Lexical retrieval -----------------------------------------------
    # Rank document positions directly from the BM25 scores instead of mapping
    # returned texts back to positions with a linear `documents.index` scan.
    bm25_scores = bm25.get_scores(query_expanded.split())
    lexical_hits = [int(i) for i in np.argsort(bm25_scores)[::-1][:top_k]]

    # --- Merge -----------------------------------------------------------
    # De-duplicate on document position, keeping first-seen order: this is
    # deterministic and does not require the ids to be hashable.
    candidate_positions = list(dict.fromkeys(dense_hits + lexical_hits))
    candidates = [(documents[i], doc_ids[i]) for i in candidate_positions]
    if not candidates:
        return []

    # --- Re-rank ---------------------------------------------------------
    # The cross-encoder scores the ORIGINAL query against each candidate text.
    cross_scores = cross_encoder.predict([(query, doc) for doc, _ in candidates])
    # Sort on the score only, so ties never fall back to comparing the
    # (document, id) tuples themselves.
    order = sorted(
        range(len(candidates)), key=lambda i: cross_scores[i], reverse=True
    )
    return [candidates[i] for i in order[:top_k]]

In [5]:
# Example search: run the hybrid search on a sample query and print each
# returned title together with its code.
query = "Comment isoler ma maison"
results = search(query)
for hit in results:
    title, code = hit
    print(f"Titre: {title}, Code: {code}")

Titre:  Hydrolienne flottante, Code: 1604
Titre:  Campagne de mesures, Code: 955
Titre:  Haute pression flottante, Code: 724
Titre:  Autopartage, Code: 1508
Titre:  Oxycombustion, Code: 360
Titre:  Chauffage par induction, Code: 496
Titre:  Sensibilisation du personnel aux coûts de l'énergie, Code: 932
Titre:  Communication sur les bénéfices d'une politique d'économie d'énergie, Code: 450
Titre:  Four à oxycombustion, Code: 935
Titre:  Formation du personnel pour éviter les pertes de temps et de production, Code: 342
