Skip to content

Commit ea27b24

Browse files
committed
add empty nodes
1 parent d14fb54 commit ea27b24

8 files changed

+178
-165
lines changed

README.md

+5-14
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,9 @@ Additional dependencies can be added while installing the library:
3838

3939
- <b>More Language Models</b>: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.
4040

41-
42-
This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
43-
```bash
44-
pip install scrapegraphai[other-language-models]
41+
This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
42+
```bash
43+
pip install scrapegraphai[other-language-models]
4544
```
4645
- <b>Semantic Options</b>: this group includes tools for advanced semantic processing, such as Graphviz.
4746

@@ -55,23 +54,15 @@ pip install scrapegraphai[other-language-models]
5554
pip install scrapegraphai[more-browser-options]
5655
```
5756

58-
- <b>faiss Options</b>: this group includes faiss integration
57+
- <b>Qdrant Options</b>: this group includes qdrant integration for RAGnode and DeepScraperGraph.
5958

6059
```bash
61-
pip install scrapegraphai[faiss-cpu]
60+
pip install scrapegraphai[qdrant]
6261
```
6362

6463
</details>
6564

6665

67-
68-
### Installing "More Browser Options"
69-
70-
This group includes an ocr scraper for websites
71-
```bash
72-
pip install scrapegraphai[screenshot_scraper]
73-
```
74-
7566
## 💻 Usage
7667
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
7768

pyproject.toml

+3-2
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,9 @@ screenshot_scraper = [
100100
]
101101

102102
# Group 5: Qdrant
103-
faiss-cpu = [
104-
"faiss-cpu>=1.8.0",
103+
qdrant = [
104+
"qdrant-client>=1.11.3",
105+
"fastembed>=0.3.6"
105106
]
106107

107108
[build-system]

scrapegraphai/nodes/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,6 @@
2828
from .generate_code_node import GenerateCodeNode
2929
from .search_node_with_context import SearchLinksWithContext
3030
from .reasoning_node import ReasoningNode
31+
from .fetch_node_level_k import FetchNodelevelK
32+
from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
33+
from .description_node import DescriptionNode
+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""
2+
DescriptionNode Module
3+
"""
4+
from typing import List, Optional
5+
from .base_node import BaseNode
6+
7+
class DescriptionNode(BaseNode):
    """
    Placeholder node intended to generate/store document descriptions for retrieval.

    NOTE(review): the original docstring was copy-pasted from a RAG-style node
    (token compression + vector storage); ``execute`` is currently an empty
    stub, so this node performs no work yet — confirm intended behavior.

    Attributes:
        llm_model: A language model client instance (required in node_config).
        embedder_model: Optional embedder client, or None when not configured.
        verbose (bool): Whether to show print statements during execution.
        cache_path: Optional cache path, or False when not configured.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (Optional[dict]): Additional configuration for the node;
            must contain the "llm_model" key.
        node_name (str): The unique identifier name for the node, defaulting
            to "RAG" (kept as-is for compatibility, though misleading here).
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "RAG",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # Bug fix: the original subscripted node_config["llm_model"] BEFORE
        # the `False if node_config is None` guard on verbose, so that guard
        # was dead code. Normalize once, then read every key uniformly.
        node_config = node_config or {}
        self.llm_model = node_config["llm_model"]  # required key
        self.embedder_model = node_config.get("embedder_model", None)
        self.verbose = node_config.get("verbose", False)
        self.cache_path = node_config.get("cache_path", False)

    def execute(self, state: dict) -> dict:
        """Placeholder: does nothing and implicitly returns None.

        TODO: implement description generation; until then callers receive
        None instead of the updated state dict.
        """
        pass
+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""
2+
FetchNodelevelK Module
3+
"""
4+
from typing import List, Optional
5+
from .base_node import BaseNode
6+
7+
class FetchNodelevelK(BaseNode):
    """
    Placeholder node intended to fetch documents up to K levels of depth.

    NOTE(review): the original docstring was copy-pasted from a RAG-style node
    (token compression + vector storage); ``execute`` is currently an empty
    stub, so this node performs no work yet — confirm intended behavior.

    Attributes:
        llm_model: A language model client instance (required in node_config).
        embedder_model: Optional embedder client, or None when not configured.
        verbose (bool): Whether to show print statements during execution.
        cache_path: Optional cache path, or False when not configured.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (Optional[dict]): Additional configuration for the node;
            must contain the "llm_model" key.
        node_name (str): The unique identifier name for the node, defaulting
            to "RAG" (kept as-is for compatibility, though misleading here).
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "RAG",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # Bug fix: the original subscripted node_config["llm_model"] BEFORE
        # the `False if node_config is None` guard on verbose, so that guard
        # was dead code. Normalize once, then read every key uniformly.
        node_config = node_config or {}
        self.llm_model = node_config["llm_model"]  # required key
        self.embedder_model = node_config.get("embedder_model", None)
        self.verbose = node_config.get("verbose", False)
        self.cache_path = node_config.get("cache_path", False)

    def execute(self, state: dict) -> dict:
        """Placeholder: does nothing and implicitly returns None.

        TODO: implement level-K fetching; until then callers receive None
        instead of the updated state dict.
        """
        pass
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""
2+
GenerateAnswerNodeKLevel Module
3+
"""
4+
from typing import List, Optional
5+
from .base_node import BaseNode
6+
7+
class GenerateAnswerNodeKLevel(BaseNode):
    """
    A node that queries the vector database held in the graph state and stores
    the query result in the state as the answer.

    NOTE(review): the original docstring was copy-pasted from a RAG-style node
    (token compression + vector storage) and did not describe this behavior.

    Attributes:
        llm_model: A language model client instance (required in node_config).
        embedder_model: Optional embedder client, or None when not configured.
        verbose (bool): Whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (Optional[dict]): Additional configuration for the node;
            must contain the "llm_model" key.
        node_name (str): The unique identifier name for the node, defaulting to "GANLK".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "GANLK",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # Bug fix: the original subscripted node_config["llm_model"] BEFORE
        # the `False if node_config is None` guard on verbose, so that guard
        # was dead code. Normalize once, then read every key uniformly.
        node_config = node_config or {}
        self.llm_model = node_config["llm_model"]  # required key
        self.embedder_model = node_config.get("embedder_model", None)
        self.verbose = node_config.get("verbose", False)

    def execute(self, state: dict) -> dict:
        """Query the vector DB client found in the state and record the answer.

        Args:
            state (dict): Must contain "vectorial_db", a client exposing
                query(collection_name=..., query_text=...).

        Returns:
            dict: The same state dict with "answer" set to the query result.
        """
        client = state["vectorial_db"]

        # NOTE(review): collection name and query text are hard-coded
        # placeholders — presumably they should come from node_config or the
        # user prompt in the state; confirm before relying on this node.
        answer = client.query(
            collection_name="demo_collection",
            query_text="This is a query document"
        )

        state["answer"] = answer

        return state

scrapegraphai/nodes/generate_code_node.py

+9-10
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
from .base_node import BaseNode
2727
from jsonschema import validate, ValidationError
2828

29-
3029
class GenerateCodeNode(BaseNode):
3130
"""
3231
A node that generates Python code for a function that extracts data
@@ -96,7 +95,7 @@ def execute(self, state: dict) -> dict:
9695
Raises:
9796
KeyError: If the input keys are not found in the state, indicating
9897
that the necessary information for generating an answer is missing.
99-
RuntimeError: If the maximum number of iterations is
98+
RuntimeError: If the maximum number of iterations is
10099
reached without obtaining the desired code.
101100
"""
102101

@@ -170,7 +169,7 @@ def overall_reasoning_loop(self, state: dict) -> dict:
170169
self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---")
171170
state = self.semantic_comparison_loop(state)
172171
if state["errors"]["semantic"]:
173-
continue
172+
continue
174173
break
175174

176175
if state["iteration"] == self.max_iterations["overall"] and \
@@ -195,9 +194,9 @@ def syntax_reasoning_loop(self, state: dict) -> dict:
195194
state["errors"]["syntax"] = [syntax_message]
196195
self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---")
197196
analysis = syntax_focused_analysis(state, self.llm_model)
198-
self.logger.info(f"""--- (Regenerating Code
197+
self.logger.info(f"""--- (Regenerating Code
199198
to fix the Error) ---""")
200-
state["generated_code"] = syntax_focused_code_generation(state,
199+
state["generated_code"] = syntax_focused_code_generation(state,
201200
analysis, self.llm_model)
202201
state["generated_code"] = extract_code(state["generated_code"])
203202
return state
@@ -217,14 +216,14 @@ def execution_reasoning_loop(self, state: dict) -> dict:
217216
self.logger.info(f"--- (Code Execution Error: {execution_result}) ---")
218217
analysis = execution_focused_analysis(state, self.llm_model)
219218
self.logger.info(f"--- (Regenerating Code to fix the Error) ---")
220-
state["generated_code"] = execution_focused_code_generation(state,
219+
state["generated_code"] = execution_focused_code_generation(state,
221220
analysis, self.llm_model)
222221
state["generated_code"] = extract_code(state["generated_code"])
223222
return state
224223

225224
def validation_reasoning_loop(self, state: dict) -> dict:
226225
for _ in range(self.max_iterations["validation"]):
227-
validation, errors = self.validate_dict(state["execution_result"],
226+
validation, errors = self.validate_dict(state["execution_result"],
228227
self.output_schema.schema())
229228
if validation:
230229
state["errors"]["validation"] = []
@@ -240,7 +239,7 @@ def validation_reasoning_loop(self, state: dict) -> dict:
240239

241240
def semantic_comparison_loop(self, state: dict) -> dict:
242241
for _ in range(self.max_iterations["semantic"]):
243-
comparison_result = self.semantic_comparison(state["execution_result"],
242+
comparison_result = self.semantic_comparison(state["execution_result"],
244243
state["reference_answer"])
245244
if comparison_result["are_semantically_equivalent"]:
246245
state["errors"]["semantic"] = []
@@ -342,7 +341,7 @@ def create_sandbox_and_execute(self, function_code):
342341
if not extract_data:
343342
raise NameError("Function 'extract_data' not found in the generated code.")
344343

345-
result = extract_data(self.raw_html)
344+
result = extract_data(self.raw_html)
346345
return True, result
347346
except Exception as e:
348347
return False, f"Error during execution: {str(e)}"
@@ -357,5 +356,5 @@ def validate_dict(self, data: dict, schema):
357356
validate(instance=data, schema=schema)
358357
return True, None
359358
except ValidationError as e:
360-
errors = e.errors()
359+
errors = [e.message]
361360
return False, errors

0 commit comments

Comments
 (0)