# Mettre en œuvre et tester différentes approches pour la phase de récupération (basées sur des modèles vectoriels, des statistiques BM25, ou des embeddings).

## Modèle basé sur embeddings

In [2]:
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document


# Charger les guides depuis un fichier JSON
def load_guides(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        guides = json.load(f)
    return guides


# Convertir les guides en vecteurs et créer un retriever LangChain
def index_guides_embeddings(
    guides, model_name="sentence-transformers/all-mpnet-base-v2"
):
    # Construire les textes et les objets Document
    documents = [
        Document(
            page_content=f"{g['dataType']} - {g['type']} {g['subject']} : {g['title']} {(g['url'])}",
            metadata={
                "url": g["url"],
                "type": g["type"],
                "subject": g["subject"],
                "title": g["title"],
            },
        )
        for g in guides
    ]

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(documents)

    # Créer des embeddings avec LangChain
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    vector_store = FAISS.from_documents(splits, embedding_model)

    return vector_store.as_retriever()

In [3]:
guides = load_guides("./data/guides.json")

print("Nombre de guides : ", len(guides))

Nombre de guides :  54949


In [4]:
retriever = index_guides_embeddings(guides)

In [5]:
question = "How to repair a samsung TV ?"

# Requête au retriever pour obtenir les documents les plus pertinents
results = retriever.get_relevant_documents(question)

# Afficher les documents trouvés
for result in results:
    print(f"Document : {result.page_content}")

Document : guide - technique  : Repairing Samsung 60" LED TV UN60FH6003FXZA T-Con failure https://www.ifixit.com/Guide/Repairing+Samsung+60-Inch+LED+TV+UN60FH6003FXZA+T-Con+failure/39642
Document : guide - technique  : Causes of Samsung LED TVs with No Picture and How to Fix Them https://www.ifixit.com/Guide/Causes+of+Samsung+LED+TVs+with+No+Picture+and+How+to+Fix+Them/156208
Document : guide - technique  : Repairing Samsung SyncMaster SA300 Flicker and Vertical Lines https://www.ifixit.com/Guide/Repairing+Samsung+SyncMaster+SA300+Flicker+and+Vertical+Lines/50926
Document : guide - replacement T-Con Board : Samsung LED TV LE37A659 (and similar models) T-Con Board Replacement https://www.ifixit.com/Guide/Samsung+LED+TV+LE37A659+(and+similar+models)+T-Con+Board+Replacement/127143


  results = retriever.get_relevant_documents(question)


## Modèle basé sur statistiques

In [6]:
from langchain.retrievers import BM25Retriever


def index_guides_bm25(guides):
    # Construire les textes et les objets Document
    documents = [
        Document(
            page_content=f"{g['dataType']} - {g['type']} {g['subject']} : {g['title']} {(g['url'])}",
            metadata={
                "url": g["url"],
                "type": g["type"],
                "subject": g["subject"],
                "title": g["title"],
            },
        )
        for g in guides
    ]

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(documents)

    # Création du retriever BM25
    retriever = BM25Retriever.from_documents(splits)
    retriever.k = 5  # Nombre de documents retournés

    return retriever

In [7]:
retriever = index_guides_bm25(guides)

In [8]:
question = "How to repair a samsung TV ?"

# Requête au retriever pour obtenir les documents les plus pertinents
results = retriever.get_relevant_documents(question)

# Afficher les documents trouvés
for result in results:
    print(f"Document : {result.page_content}")

Document : guide - replacement Power button (video) : How to repair LG G5 Power button ? https://www.ifixit.com/Guide/How+to+repair+LG+G5+Power+button+-/76112
Document : guide - replacement  : How  to ajust the chain tension ? https://www.ifixit.com/Guide/How++to+ajust+the+chain+tension+-/129920
Document : guide - technique  : How to repair a bikini underwire https://www.ifixit.com/Guide/How+to+repair+a+bikini+underwire/169533
Document : guide - technique  : How to repair a taped seam https://www.ifixit.com/Guide/How+to+repair+a+taped+seam/169521
Document : guide - technique  : How to repair a broken seam https://www.ifixit.com/Guide/How+to+repair+a+broken+seam/169520


# Posts Reddit Tech Support

In [9]:
# Charger les posts depuis un fichier JSON
def load_posts(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        posts = json.load(f)
    return posts


# Convertir les posts en vecteurs et créer un retriever LangChain
def index_posts_embeddings(posts, model_name="sentence-transformers/all-mpnet-base-v2"):
    # Construire les textes et les objets Document
    documents = []
    for p in posts:
        text_comments = ""
        for comment in p["comments"]:
            text_comments += comment + "\n"
        documents.append(
            Document(
                page_content=f"{p['titre']} - {p['contenu']}",
                metadata={
                    "comments": text_comments,
                    "url": p["url"],
                    "titre": p["titre"],
                    "contenu": p["contenu"],
                },
            )
        )

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(documents)

    # Créer des embeddings avec LangChain
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    vector_store = FAISS.from_documents(splits, embedding_model)

    return vector_store.as_retriever()

In [10]:
posts = load_posts("./data/techsupport_posts.json")

retriever = index_posts_embeddings(posts)

In [11]:
question = "Blue screen PC ?"

# Requête au retriever pour obtenir les documents les plus pertinents
results = retriever.get_relevant_documents(question)

# Afficher les documents trouvés
for result in results:
    print(
        f"Document : {result.page_content} --COMMENTS-- : {result.metadata['comments']}"
    )

Document : Probably a virus - Yesterday, I was looking where to buy some specific shoes, and I remember clicking on a website that immediately said, host unexpectedly cut connection. In the evening nothong was wrong. Now I go to turn on my computer and it starts pretty normally but it suddenly runs out of battery. 

I plug in the cable, turn it back on and it's really slow, the mouse has a big delay and it doesn't feel 100% accurate, but it could be just me. After around a minute the blue screen of death, as it's apparently called shows up, but the text is all weird.

It restarts, things look normal, still running kinda slow and I start looking for something on my phone, go back to my laptop and the mouse isn't moving. I see it's been frozen for about 3 minutes, cause the clock hasn't moved. From then on I tried turning it off, trying again, but it just freezes. I tried closing the laptop, the same screen is still there. The weird blue screen comes again.

What do I do? --COMMENTS-- : 