## lecture PDF

In [3]:
from pathlib import Path
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.builders import PromptBuilder
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret
import os

# Configurez vos clés d'API ici
os.environ["HF_TOKEN_API"] = "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"
os.environ["GROQ_API_KEY"] = "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"

class Indexing:
    def __init__(self, document_store):
        self.converter = PyPDFToDocument()
        self.cleaner = DocumentCleaner()
        self.splitter = DocumentSplitter(split_by="sentence", split_length=10, split_overlap=2)
        self.writer = DocumentWriter(document_store=document_store)
        self.pipeline = Pipeline()
        self.pipeline.add_component("converter", self.converter)
        self.pipeline.add_component("cleaner", self.cleaner)
        self.pipeline.add_component("splitter", self.splitter)
        self.pipeline.add_component("writer", self.writer)

        self.pipeline.connect("converter", "cleaner")
        self.pipeline.connect("cleaner", "splitter")
        self.pipeline.connect("splitter", "writer")

    def get_pipeline(self) -> Pipeline:
        return self.pipeline

    def run_index_pipeline(self, path):
        self.pipeline.run({"converter": {"sources": [Path(path)]}})

class Query:
    def __init__(self, document_store, generator):
        self.template = """
                        {% for document in documents %}
                            {{ document.content }}
                        {% endfor %}

                        Please answer the question based on the given information.

                        {{question}}
                        """
        self.prompt_builder = PromptBuilder(template=self.template)
        self.rag_pipeline = Pipeline()
        self.retriever = InMemoryBM25Retriever(document_store)

        self.rag_pipeline.add_component("retriever", self.retriever)
        self.rag_pipeline.add_component("prompt_builder", self.prompt_builder)
        self.rag_pipeline.add_component("llm", generator)

        self.rag_pipeline.connect("retriever", "prompt_builder.documents")
        self.rag_pipeline.connect("prompt_builder", "llm")

    def run_pipeline(self, query):
        res = self.rag_pipeline.run(
            {
                "retriever": {"query": query},
                "prompt_builder": {"question": query},
            }
        )
        return res

# Initialisation du document store et des composants
doc_store = InMemoryDocumentStore()
idx = Indexing(doc_store)

# Spécifiez ici le chemin du fichier PDF que vous voulez indexer
pdf_path = "C:/Users/sarra/Downloads/SyllabusDuProjet.pdf"  # Remplacez par le chemin réel du fichier
idx.run_index_pipeline(pdf_path)

# Initialisation du générateur
generator = OpenAIGenerator(
    api_key=Secret.from_env_var("GROQ_API_KEY"),
    api_base_url="https://api.groq.com/openai/v1",
    model="gemma2-9b-it",
    generation_kwargs={"max_tokens": 4096},
)

# Création de l'objet de requête
query = Query(doc_store, generator)

# Spécifiez ici la requête que vous souhaitez exécuter
question = "quel est le sujet de ce document?"  # Remplacez par votre question

# Exécution de la requête et affichage de la réponse
response = query.run_pipeline(question)
print(response["llm"]["replies"][0])


Ce document décrit le syllabus d'un projet de Deep Learning intitulé "Kaggle Dataset Analysis with Keras" pour les étudiants en S1.  

Le projet vise à familiariser les étudiants avec le framework keras_core (ou TensorFlow) en leur permettant de construire et d'évaluer différents modèles de Deep Learning sur des jeux de données du site Kaggle.  



## lecture PDF+TXT


In [1]:
from pathlib import Path
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.builders import PromptBuilder
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret
import os

# Configurez vos clés d'API ici
os.environ["HF_TOKEN_API"] = "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"
os.environ["GROQ_API_KEY"] = "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"

class Indexing:
    def __init__(self, document_store):
        self.document_store = document_store

    def create_pdf_pipeline(self):
        """Pipeline for indexing PDF files"""
        pdf_pipeline = Pipeline()
        pdf_converter = PyPDFToDocument()
        cleaner = DocumentCleaner()
        splitter = DocumentSplitter(split_by="sentence", split_length=10, split_overlap=2)
        writer = DocumentWriter(document_store=self.document_store)

        pdf_pipeline.add_component("pdf_converter", pdf_converter)
        pdf_pipeline.add_component("cleaner", cleaner)
        pdf_pipeline.add_component("splitter", splitter)
        pdf_pipeline.add_component("writer", writer)

        pdf_pipeline.connect("pdf_converter", "cleaner")
        pdf_pipeline.connect("cleaner", "splitter")
        pdf_pipeline.connect("splitter", "writer")
        return pdf_pipeline

    def create_txt_pipeline(self):
        """Pipeline for indexing TXT files"""
        txt_pipeline = Pipeline()
        text_converter = TextFileToDocument()
        cleaner = DocumentCleaner()
        splitter = DocumentSplitter(split_by="sentence", split_length=10, split_overlap=2)
        writer = DocumentWriter(document_store=self.document_store)

        txt_pipeline.add_component("text_converter", text_converter)
        txt_pipeline.add_component("cleaner", cleaner)
        txt_pipeline.add_component("splitter", splitter)
        txt_pipeline.add_component("writer", writer)

        txt_pipeline.connect("text_converter", "cleaner")
        txt_pipeline.connect("cleaner", "splitter")
        txt_pipeline.connect("splitter", "writer")
        return txt_pipeline

    def run_index_pipeline(self, path):
        file_extension = Path(path).suffix.lower()
        if file_extension == ".pdf":
            pdf_pipeline = self.create_pdf_pipeline()
            pdf_pipeline.run({"pdf_converter": {"sources": [Path(path)]}})
        elif file_extension == ".txt":
            txt_pipeline = self.create_txt_pipeline()
            txt_pipeline.run({"text_converter": {"sources": [Path(path)]}})
        else:
            raise ValueError("Type de fichier non pris en charge. Veuillez utiliser un fichier PDF ou TXT.")

class Query:
    def __init__(self, document_store, generator):
        self.template = """
                        {% for document in documents %}
                            {{ document.content }}
                        {% endfor %}

                        Please answer the question based on the given information.

                        {{question}}
                        """
        self.prompt_builder = PromptBuilder(template=self.template)
        self.rag_pipeline = Pipeline()
        self.retriever = InMemoryBM25Retriever(document_store)

        self.rag_pipeline.add_component("retriever", self.retriever)
        self.rag_pipeline.add_component("prompt_builder", self.prompt_builder)
        self.rag_pipeline.add_component("llm", generator)

        self.rag_pipeline.connect("retriever", "prompt_builder.documents")
        self.rag_pipeline.connect("prompt_builder", "llm")

    def run_pipeline(self, query):
        res = self.rag_pipeline.run(
            {
                "retriever": {"query": query},
                "prompt_builder": {"question": query},
            }
        )
        return res

# Initialisation du document store et des composants
doc_store = InMemoryDocumentStore()
idx = Indexing(doc_store)

# Spécifiez ici le chemin du fichier que vous voulez indexer
file_path = "C:/Users/sarra/Desktop/hachathon.txt"  # Remplacez par le chemin réel du fichier
#file_path = "C:/Users/sarra/Downloads/SyllabusDuProjet.pdf" 
idx.run_index_pipeline(file_path)

# Initialisation du générateur
generator = OpenAIGenerator(
    api_key=Secret.from_env_var("GROQ_API_KEY"),
    api_base_url="https://api.groq.com/openai/v1",
    model="gemma2-9b-it",
    generation_kwargs={"max_tokens": 4096},
)

# Création de l'objet de requête
query = Query(doc_store, generator)

# Spécifiez ici la requête que vous souhaitez exécuter
question = "quel est le resumé de ce document?"  # Remplacez par votre question

# Exécution de la requête et affichage de la réponse
response = query.run_pipeline(question)
print(response["llm"]["replies"][0])




This document outlines the code for a genetic algorithm to evolve neural network architectures. 

Here's a breakdown:

* **Data Preprocessing:**
    * Loads CSV data and separates features (X) from target variable (y).
    * Encodes categorical target variables.
    * Splits data into training and testing sets.
    * Scales features using StandardScaler.

* **Genetic Algorithm Setup:**
    * Defines a custom toolbox using DEAP library for genetic operations.
    * Registers functions for individual creation (`generate_individual`), population creation (`generate_diverse_population`), attribute initialization (`attr_int`), and fitness evaluation (`eval_nn`).

* **Neural Network Definition:**
    *  `eval_nn` function builds and trains a simple sequential neural network based on the decoded individual's structure and weights.
    *  Fitness is evaluated based on the model's performance (loss and accuracy) on the test set.

* **Individual Representation:**
    *  Individuals are represent

## lecture PDF+TXT+CSV

In [2]:
from pathlib import Path
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.builders import PromptBuilder
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret
import os
import pandas as pd
from haystack import Document  # Ajout de l'import pour le type Document

# Configurez vos clés d'API ici
os.environ["HF_TOKEN_API"] = "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"
os.environ["GROQ_API_KEY"] = "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"

class Indexing:
    def __init__(self, document_store):
        self.document_store = document_store

    def create_pdf_pipeline(self):
        """Pipeline pour indexer les fichiers PDF"""
        pdf_pipeline = Pipeline()
        pdf_converter = PyPDFToDocument()
        cleaner = DocumentCleaner()
        splitter = DocumentSplitter(split_by="sentence", split_length=10, split_overlap=2)
        writer = DocumentWriter(document_store=self.document_store)

        pdf_pipeline.add_component("pdf_converter", pdf_converter)
        pdf_pipeline.add_component("cleaner", cleaner)
        pdf_pipeline.add_component("splitter", splitter)
        pdf_pipeline.add_component("writer", writer)

        pdf_pipeline.connect("pdf_converter", "cleaner")
        pdf_pipeline.connect("cleaner", "splitter")
        pdf_pipeline.connect("splitter", "writer")
        return pdf_pipeline

    def create_txt_pipeline(self):
        """Pipeline pour indexer les fichiers TXT"""
        txt_pipeline = Pipeline()
        text_converter = TextFileToDocument()
        cleaner = DocumentCleaner()
        splitter = DocumentSplitter(split_by="sentence", split_length=10, split_overlap=2)
        writer = DocumentWriter(document_store=self.document_store)

        txt_pipeline.add_component("text_converter", text_converter)
        txt_pipeline.add_component("cleaner", cleaner)
        txt_pipeline.add_component("splitter", splitter)
        txt_pipeline.add_component("writer", writer)

        txt_pipeline.connect("text_converter", "cleaner")
        txt_pipeline.connect("cleaner", "splitter")
        txt_pipeline.connect("splitter", "writer")
        return txt_pipeline

    def create_csv_pipeline(self, path):
        """Pipeline pour indexer les fichiers CSV"""
        # Lecture du CSV
        data = pd.read_csv(path)

        # Création d'une liste de documents Haystack
        documents = []
        for _, row in data.iterrows():
            content = ' '.join(f"{col}: {val}" for col, val in row.items())  # Conversion d'une ligne en texte
            documents.append(Document(content=content))

        # Ajout des documents au Document Store avec gestion des doublons
        self.document_store.write_documents(documents, policy="SKIP")  # Ignorer les doublons

    def run_index_pipeline(self, path):
        file_extension = Path(path).suffix.lower()
        if file_extension == ".pdf":
            pdf_pipeline = self.create_pdf_pipeline()
            pdf_pipeline.run({"pdf_converter": {"sources": [Path(path)]}})
        elif file_extension == ".txt":
            txt_pipeline = self.create_txt_pipeline()
            txt_pipeline.run({"text_converter": {"sources": [Path(path)]}})
        elif file_extension == ".csv":
            self.create_csv_pipeline(path)
        else:
            raise ValueError("Type de fichier non pris en charge. Veuillez utiliser un fichier PDF, TXT ou CSV.")

class Query:
    def __init__(self, document_store, generator):
        self.template = """
                        {% for document in documents %}
                            {{ document.content }}
                        {% endfor %}

                        Please answer the question based on the given information.

                        {{question}}
                        """
        self.prompt_builder = PromptBuilder(template=self.template)
        self.rag_pipeline = Pipeline()
        self.retriever = InMemoryBM25Retriever(document_store)

        self.rag_pipeline.add_component("retriever", self.retriever)
        self.rag_pipeline.add_component("prompt_builder", self.prompt_builder)
        self.rag_pipeline.add_component("llm", generator)

        self.rag_pipeline.connect("retriever", "prompt_builder.documents")
        self.rag_pipeline.connect("prompt_builder", "llm")

    def run_pipeline(self, query):
        res = self.rag_pipeline.run(
            {
                "retriever": {"query": query},
                "prompt_builder": {"question": query},
            }
        )
        return res

# Initialisation du document store et des composants
doc_store = InMemoryDocumentStore()
idx = Indexing(doc_store)

# Spécifiez ici le chemin du fichier que vous voulez indexer
file_path = "C:/Users/sarra/Desktop/iris.csv"  # Remplacez par le chemin réel du fichier CSV
idx.run_index_pipeline(file_path)

# Initialisation du générateur
generator = OpenAIGenerator(
    api_key=Secret.from_env_var("GROQ_API_KEY"),
    api_base_url="https://api.groq.com/openai/v1",
    model="gemma2-9b-it",
    generation_kwargs={"max_tokens": 4096},
)

# Création de l'objet de requête
query = Query(doc_store, generator)

# Spécifiez ici la requête que vous souhaitez exécuter
question = "quel est le resumé de ce document?"  # Remplacez par votre question

# Exécution de la requête et affichage de la réponse
response = query.run_pipeline(question)
print(response["llm"]["replies"][0])


Please provide me with the document so I can summarize it for you. 😊  

I need the text of the document to be able to understand it and create a summary. 



In [6]:
from pathlib import Path
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.builders import PromptBuilder
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret
import os
import pandas as pd
from haystack import Document

# Configurez vos clés d'API ici
os.environ["HF_TOKEN_API"] = "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"
os.environ["GROQ_API_KEY"] = "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"

class Indexing:
    def __init__(self, document_store):
        self.document_store = document_store

    def create_pdf_pipeline(self):
        """Pipeline pour indexer les fichiers PDF"""
        pdf_pipeline = Pipeline()
        pdf_converter = PyPDFToDocument()
        cleaner = DocumentCleaner()
        splitter = DocumentSplitter(split_by="sentence", split_length=10, split_overlap=2)
        writer = DocumentWriter(document_store=self.document_store)

        pdf_pipeline.add_component("pdf_converter", pdf_converter)
        pdf_pipeline.add_component("cleaner", cleaner)
        pdf_pipeline.add_component("splitter", splitter)
        pdf_pipeline.add_component("writer", writer)

        pdf_pipeline.connect("pdf_converter", "cleaner")
        pdf_pipeline.connect("cleaner", "splitter")
        pdf_pipeline.connect("splitter", "writer")
        return pdf_pipeline

    def create_txt_pipeline(self):
        """Pipeline pour indexer les fichiers TXT"""
        txt_pipeline = Pipeline()
        text_converter = TextFileToDocument()
        cleaner = DocumentCleaner()
        splitter = DocumentSplitter(split_by="sentence", split_length=10, split_overlap=2)
        writer = DocumentWriter(document_store=self.document_store)

        txt_pipeline.add_component("text_converter", text_converter)
        txt_pipeline.add_component("cleaner", cleaner)
        txt_pipeline.add_component("splitter", splitter)
        txt_pipeline.add_component("writer", writer)

        txt_pipeline.connect("text_converter", "cleaner")
        txt_pipeline.connect("cleaner", "splitter")
        txt_pipeline.connect("splitter", "writer")
        return txt_pipeline

    def create_csv_pipeline(self, path):
        """Pipeline pour indexer les fichiers CSV"""
        # Lecture du CSV
        data = pd.read_csv(path)

        # Création d'une liste de documents Haystack
        documents = []
        for index, row in data.iterrows():
            content = ' '.join(f"{col}: {val}" for col, val in row.items())  # Conversion d'une ligne en texte
            documents.append(Document(content=content, id=f"csv_row_{index}"))  # Ajout d'un ID unique

        # Ajout des documents au Document Store avec gestion des doublons
        self.document_store.write_documents(documents, policy="SKIP")  # Ignorer les doublons

    def run_index_pipeline(self, path):
        file_extension = Path(path).suffix.lower()
        if file_extension == ".pdf":
            pdf_pipeline = self.create_pdf_pipeline()
            pdf_pipeline.run({"pdf_converter": {"sources": [Path(path)]}})
        elif file_extension == ".txt":
            txt_pipeline = self.create_txt_pipeline()
            txt_pipeline.run({"text_converter": {"sources": [Path(path)]}})
        elif file_extension == ".csv":
            self.create_csv_pipeline(path)
        else:
            raise ValueError("Type de fichier non pris en charge. Veuillez utiliser un fichier PDF, TXT ou CSV.")

        # Vérification : Afficher les documents dans le document_store
        all_docs = self.document_store.get_all_documents()
        print(f"Documents ajoutés au document_store : {len(all_docs)}")
        for doc in all_docs[:5]:  # Afficher les 5 premiers documents pour vérification
            print(doc.content)

class Query:
    def __init__(self, document_store, generator):
        self.template = """
                        {% for document in documents %}
                            {{ document.content }}
                        {% endfor %}

                        Please answer the question based on the given information.

                        {{question}}
                        """
        self.prompt_builder = PromptBuilder(template=self.template)
        self.rag_pipeline = Pipeline()
        self.retriever = InMemoryBM25Retriever(document_store)

        self.rag_pipeline.add_component("retriever", self.retriever)
        self.rag_pipeline.add_component("prompt_builder", self.prompt_builder)
        self.rag_pipeline.add_component("llm", generator)

        self.rag_pipeline.connect("retriever", "prompt_builder.documents")
        self.rag_pipeline.connect("prompt_builder", "llm")

    def run_pipeline(self, query):
        # Vérification : Exécuter une récupération de documents
        docs = self.retriever.retrieve(query=query, top_k=5)
        print(f"Documents récupérés pour la question '{query}':")
        for doc in docs:
            print(doc.content)

        # Exécuter le pipeline RAG complet
        res = self.rag_pipeline.run(
            {
                "retriever": {"query": query},
                "prompt_builder": {"question": query},
            }
        )
        return res

# Initialisation du document store et des composants
doc_store = InMemoryDocumentStore()
idx = Indexing(doc_store)

# Spécifiez ici le chemin du fichier que vous voulez indexer
file_path = "C:/Users/sarra/Desktop/iris.csv"  # Remplacez par le chemin réel du fichier CSV
idx.run_index_pipeline(file_path)

# Initialisation du générateur
generator = OpenAIGenerator(
    api_key=Secret.from_env_var("GROQ_API_KEY"),
    api_base_url="https://api.groq.com/openai/v1",
    model="gemma2-9b-it",
    generation_kwargs={"max_tokens": 4096},
)

# Création de l'objet de requête
query = Query(doc_store, generator)

# Spécifiez ici la requête que vous souhaitez exécuter
question = "quel est le resumé de ce document?"  # Remplacez par votre question

# Exécution de la requête et affichage de la réponse
response = query.run_pipeline(question)
print(response["llm"]["replies"][0])


AttributeError: 'InMemoryDocumentStore' object has no attribute 'get_all_documents'