# 1️⃣ Ingest PDF documents.

In [1]:
from langchain_community.document_loaders import PyPDFLoader
import copy
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings
from typing import List
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from transformers import AutoModelForCausalLM, AutoTokenizer
import requests
import json
from dotenv import load_dotenv
import os

In [2]:
# Load environment variables from the ".env.txt" file, then read the API key.
load_dotenv(".env.txt")
# api_key is None when the API_KEY variable is not set in the environment.
api_key = os.getenv("API_KEY")

In [3]:
def loderDocument(Document):
    """Load a PDF and return its whole text as a single string.

    Args:
        Document: Path of the PDF file, handed as-is to ``PyPDFLoader``.
            NOTE(review): inside this function the parameter name shadows the
            ``Document`` class imported at file level; the name is kept only
            so that existing keyword callers keep working.

    Returns:
        The ``page_content`` of every loaded page, joined with newlines.
    """
    pages_pdf = PyPDFLoader(Document).load()
    # The pages are only read, never mutated, so no defensive deep copy of
    # the whole document is needed before extracting the text.
    return "\n".join(page.page_content for page in pages_pdf)

# 2️⃣ Split the text into chunks and embed them using SentenceTransformer.

In [4]:
def divise_chunk(text, chunk_size=530, chunk_overlap=50, separator="."):
    """Split ``text`` into chunks with LangChain's ``CharacterTextSplitter``.

    Args:
        text: The full text to split.
        chunk_size: Target chunk size passed to the splitter (default 530,
            the value previously hard-coded here).
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter (default 50, as before).
        separator: String on which the text is cut (default ".", as before).

    Returns:
        The list of chunk strings produced by ``split_text``.
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

In [5]:
# ChromaDB does not accept a SentenceTransformer directly as its embedding model.
# This class is an adapter that lets a SentenceTransformer be used with ChromaDB.
# It turns documents into numeric vectors that ChromaDB can work with.

class CustomEmbeddings(Embeddings):
    """Adapter exposing a SentenceTransformer through the ``Embeddings`` interface."""

    def __init__(self, model_name: str):
        """Load the SentenceTransformer model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, documents: List[str]) -> List[List[float]]:
        """Return one embedding vector per document, in input order."""
        if not documents:
            return []
        # Encode the whole list in a single call so the model can batch the
        # work internally instead of running one encode call per document.
        return self.model.encode(list(documents)).tolist()

    def embed_query(self, query: str) -> List[float]:
        """Return the embedding vector of a single query string."""
        return self.model.encode([query])[0].tolist()

# 3️⃣ Store embeddings in a vector database (ChromaDB).

In [6]:
def creeVectorstore(texts, embedding_model, persist_directory="./store_embeding"):
    """Build a Chroma vector store from text chunks.

    Args:
        texts: Iterable of chunk strings to index.
        embedding_model: Embedding object passed to Chroma as ``embedding``.
        persist_directory: Directory where Chroma persists the store. The
            default is the path that was previously hard-coded here.

    Returns:
        The ``Chroma`` vector store returned by ``Chroma.from_documents``.
    """
    # Wrap each chunk in a Document, which is the input type that
    # Chroma.from_documents expects.
    documents = [Document(page_content=text) for text in texts]
    # NOTE(review): every call writes to the same persist_directory by
    # default, so chunks from previously indexed PDFs presumably stay in the
    # store and can be returned by later searches — confirm whether that
    # accumulation is intended, or pass a distinct directory per document.
    return Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        persist_directory=persist_directory,
    )

    

# 4️⃣ Retrieve relevant information and generate an AI-powered response.

In [7]:
# Select the chunks that will be used to answer the question.
def retrieve_documents(query, vectorstore, k=5):
    """Return the ``k`` entries of ``vectorstore`` most similar to ``query``.

    Args:
        query: The user question to search for.
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
        k: Number of results to request (default 5, the value previously
            hard-coded here).

    Returns:
        Whatever ``vectorstore.similarity_search`` returns for this query.
    """
    return vectorstore.similarity_search(query, k=k)

In [8]:
def query_deepseek(prompt, timeout=60):
    """Send ``prompt`` to the Together chat-completions API and return the reply.

    Args:
        prompt: Text sent as the single user message.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        The content of the first choice's message on HTTP 200; otherwise a
        string starting with "Erreur:" that describes the failure. Network
        failures are reported the same way as non-200 responses.
    """
    api_url = "https://api.together.xyz/v1/chat/completions"

    # api_key is the module-level value read from the environment.
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    payload = {
        "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.3,
        "max_tokens": 500,
    }

    try:
        response = requests.post(
            api_url, headers=headers, json=payload, timeout=timeout
        )
    except requests.RequestException as exc:
        # Keep the function's "errors come back as text" contract for
        # connection problems and timeouts too.
        return f"Erreur: {exc}"

    if response.status_code != 200:
        return f"Erreur: {response.status_code} - {response.text}"
    return response.json()["choices"][0]["message"]["content"]

In [9]:
def generate_answer(user_query, vectorestore):
    """Answer ``user_query`` using chunks retrieved from ``vectorestore``.

    The retrieved chunks' ``page_content`` is joined with newlines, embedded
    in the prompt as context, and the prompt is sent to ``query_deepseek``,
    whose return value is passed back unchanged.
    """
    matches = retrieve_documents(user_query, vectorestore)
    context = "\n".join(match.page_content for match in matches)
    return query_deepseek(
        f"donner la reponse en 200 mots au max ,Voici quelques informations de contexte :\n{context}\n\nQuestion : {user_query}\nRéponds en utilisant ces informations."
    )

In [10]:
def agent(document):
    """Index the PDF at ``document`` and return the resulting vector store.

    Steps: load the PDF text, split it into chunks, build the
    "all-MiniLM-L6-v2" embedding adapter, then create the vector store.
    """
    chunks = divise_chunk(loderDocument(document))
    embedder = CustomEmbeddings(model_name="all-MiniLM-L6-v2")
    return creeVectorstore(chunks, embedder)


In [11]:
def main():
    """Run the interactive loop: pick a PDF, then answer questions about it.

    Typing 'changer' at the question prompt goes back to the file prompt.
    The loop has no exit condition of its own.
    """
    while True:
        path = input("Veuillez entrer le chemin du fichier .pdf: ")
        vectorestore = agent(path)
        while True:
            question = input("Tapez 'changer' pour changer le fichier sinon poser votre  question ")
            if question == 'changer':
                # Leave the question loop and ask for a new file.
                break
            print(generate_answer(question, vectorestore))
                        

In [13]:
# Start the interactive loop only when this file is executed as a script.
if __name__ == "__main__":
    main()

Veuillez entrer le chemin du fichier .pdf: 3-Naissance_IA.pdf
Tapez 'changer' pour changer le fichier sinon poser votre  question changer
Veuillez entrer le chemin du fichier .pdf: 3-Naissance_IA.pdf


KeyboardInterrupt: Interrupted by user