Skip to content

Commit ea27b24

Browse files
committed
add empty nodes
1 parent d14fb54 commit ea27b24

8 files changed

+178
-165
lines changed

README.md

+5-14
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,9 @@ Additional dependencies can be added while installing the library:
3838

3939
- <b>More Language Models</b>: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.
4040

41-
42-
This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
43-
```bash
44-
pip install scrapegraphai[other-language-models]
41+
This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
42+
```bash
43+
pip install scrapegraphai[other-language-models]
4544
```
4645
- <b>Semantic Options</b>: this group includes tools for advanced semantic processing, such as Graphviz.
4746

@@ -55,23 +54,15 @@ pip install scrapegraphai[other-language-models]
5554
pip install scrapegraphai[more-browser-options]
5655
```
5756

58-
- <b>faiss Options</b>: this group includes faiss integration
57+
- <b>Qdrant Options</b>: this group includes qdrant integration for RAGnode and DeepScraperGraph.
5958

6059
```bash
61-
pip install scrapegraphai[faiss-cpu]
60+
pip install scrapegraphai[qdrant]
6261
```
6362

6463
</details>
6564

6665

67-
68-
### Installing "More Browser Options"
69-
70-
This group includes an ocr scraper for websites
71-
```bash
72-
pip install scrapegraphai[screenshot_scraper]
73-
```
74-
7566
## 💻 Usage
7667
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
7768

pyproject.toml

+3-2
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,9 @@ screenshot_scraper = [
100100
]
101101

102102
# Group 5: Qdrant
103-
faiss-cpu = [
104-
"faiss-cpu>=1.8.0",
103+
qdrant = [
104+
"qdrant-client>=1.11.3",
105+
"fastembed>=0.3.6"
105106
]
106107

107108
[build-system]

scrapegraphai/nodes/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,6 @@
2828
from .generate_code_node import GenerateCodeNode
2929
from .search_node_with_context import SearchLinksWithContext
3030
from .reasoning_node import ReasoningNode
31+
from .fetch_node_level_k import FetchNodelevelK
32+
from .generate_answer_node_k_level import GenerateAnswerNodeKLevel
33+
from .description_node import DescriptionNode
+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""
2+
DescriptionNode Module
3+
"""
4+
from typing import List, Optional
5+
from .base_node import BaseNode
6+
7+
class DescriptionNode(BaseNode):
    """
    Placeholder node intended to generate/store document descriptions for retrieval.

    NOTE(review): the original docstring was copy-pasted from a RAG-style node
    (token compression + vector storage); ``execute`` is currently an empty
    stub, so this node performs no work yet — confirm intended behavior.

    Attributes:
        llm_model: A language model client instance (required in node_config).
        embedder_model: Optional embedder client, or None when not configured.
        verbose (bool): Whether to show print statements during execution.
        cache_path: Optional cache path, or False when not configured.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (Optional[dict]): Additional configuration for the node;
            must contain the "llm_model" key.
        node_name (str): The unique identifier name for the node, defaulting
            to "RAG" (kept as-is for compatibility, though misleading here).
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "RAG",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # Bug fix: the original subscripted node_config["llm_model"] BEFORE
        # the `False if node_config is None` guard on verbose, so that guard
        # was dead code. Normalize once, then read every key uniformly.
        node_config = node_config or {}
        self.llm_model = node_config["llm_model"]  # required key
        self.embedder_model = node_config.get("embedder_model", None)
        self.verbose = node_config.get("verbose", False)
        self.cache_path = node_config.get("cache_path", False)

    def execute(self, state: dict) -> dict:
        """Placeholder: does nothing and implicitly returns None.

        TODO: implement description generation; until then callers receive
        None instead of the updated state dict.
        """
        pass
+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""
2+
FetchNodelevelK Module
3+
"""
4+
from typing import List, Optional
5+
from .base_node import BaseNode
6+
7+
class FetchNodelevelK(BaseNode):
    """
    Placeholder node intended to fetch documents up to K levels of depth.

    NOTE(review): the original docstring was copy-pasted from a RAG-style node
    (token compression + vector storage); ``execute`` is currently an empty
    stub, so this node performs no work yet — confirm intended behavior.

    Attributes:
        llm_model: A language model client instance (required in node_config).
        embedder_model: Optional embedder client, or None when not configured.
        verbose (bool): Whether to show print statements during execution.
        cache_path: Optional cache path, or False when not configured.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (Optional[dict]): Additional configuration for the node;
            must contain the "llm_model" key.
        node_name (str): The unique identifier name for the node, defaulting
            to "RAG" (kept as-is for compatibility, though misleading here).
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "RAG",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # Bug fix: the original subscripted node_config["llm_model"] BEFORE
        # the `False if node_config is None` guard on verbose, so that guard
        # was dead code. Normalize once, then read every key uniformly.
        node_config = node_config or {}
        self.llm_model = node_config["llm_model"]  # required key
        self.embedder_model = node_config.get("embedder_model", None)
        self.verbose = node_config.get("verbose", False)
        self.cache_path = node_config.get("cache_path", False)

    def execute(self, state: dict) -> dict:
        """Placeholder: does nothing and implicitly returns None.

        TODO: implement level-K fetching; until then callers receive None
        instead of the updated state dict.
        """
        pass
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""
2+
GenerateAnswerNodeKLevel Module
3+
"""
4+
from typing import List, Optional
5+
from .base_node import BaseNode
6+
7+
class GenerateAnswerNodeKLevel(BaseNode):
    """
    A node that queries the vector database held in the graph state and stores
    the query result in the state as the answer.

    NOTE(review): the original docstring was copy-pasted from a RAG-style node
    (token compression + vector storage) and did not describe this behavior.

    Attributes:
        llm_model: A language model client instance (required in node_config).
        embedder_model: Optional embedder client, or None when not configured.
        verbose (bool): Whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (Optional[dict]): Additional configuration for the node;
            must contain the "llm_model" key.
        node_name (str): The unique identifier name for the node, defaulting to "GANLK".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "GANLK",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # Bug fix: the original subscripted node_config["llm_model"] BEFORE
        # the `False if node_config is None` guard on verbose, so that guard
        # was dead code. Normalize once, then read every key uniformly.
        node_config = node_config or {}
        self.llm_model = node_config["llm_model"]  # required key
        self.embedder_model = node_config.get("embedder_model", None)
        self.verbose = node_config.get("verbose", False)

    def execute(self, state: dict) -> dict:
        """Query the vector DB client found in the state and record the answer.

        Args:
            state (dict): Must contain "vectorial_db", a client exposing
                query(collection_name=..., query_text=...).

        Returns:
            dict: The same state dict with "answer" set to the query result.
        """
        client = state["vectorial_db"]

        # NOTE(review): collection name and query text are hard-coded
        # placeholders — presumably they should come from node_config or the
        # user prompt in the state; confirm before relying on this node.
        answer = client.query(
            collection_name="demo_collection",
            query_text="This is a query document"
        )

        state["answer"] = answer

        return state

scrapegraphai/nodes/generate_code_node.py

+9-10
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
from .base_node import BaseNode
2727
from jsonschema import validate, ValidationError
2828

29-
3029
class GenerateCodeNode(BaseNode):
3130
"""
3231
A node that generates Python code for a function that extracts data
@@ -96,7 +95,7 @@ def execute(self, state: dict) -> dict:
9695
Raises:
9796
KeyError: If the input keys are not found in the state, indicating
9897
that the necessary information for generating an answer is missing.
99-
RuntimeError: If the maximum number of iterations is
98+
RuntimeError: If the maximum number of iterations is
10099
reached without obtaining the desired code.
101100
"""
102101

@@ -170,7 +169,7 @@ def overall_reasoning_loop(self, state: dict) -> dict:
170169
self.logger.info(f"--- (Checking if the informations exctrcated are the ones Requested) ---")
171170
state = self.semantic_comparison_loop(state)
172171
if state["errors"]["semantic"]:
173-
continue
172+
continue
174173
break
175174

176175
if state["iteration"] == self.max_iterations["overall"] and \
@@ -195,9 +194,9 @@ def syntax_reasoning_loop(self, state: dict) -> dict:
195194
state["errors"]["syntax"] = [syntax_message]
196195
self.logger.info(f"--- (Synax Error Found: {syntax_message}) ---")
197196
analysis = syntax_focused_analysis(state, self.llm_model)
198-
self.logger.info(f"""--- (Regenerating Code
197+
self.logger.info(f"""--- (Regenerating Code
199198
to fix the Error) ---""")
200-
state["generated_code"] = syntax_focused_code_generation(state,
199+
state["generated_code"] = syntax_focused_code_generation(state,
201200
analysis, self.llm_model)
202201
state["generated_code"] = extract_code(state["generated_code"])
203202
return state
@@ -217,14 +216,14 @@ def execution_reasoning_loop(self, state: dict) -> dict:
217216
self.logger.info(f"--- (Code Execution Error: {execution_result}) ---")
218217
analysis = execution_focused_analysis(state, self.llm_model)
219218
self.logger.info(f"--- (Regenerating Code to fix the Error) ---")
220-
state["generated_code"] = execution_focused_code_generation(state,
219+
state["generated_code"] = execution_focused_code_generation(state,
221220
analysis, self.llm_model)
222221
state["generated_code"] = extract_code(state["generated_code"])
223222
return state
224223

225224
def validation_reasoning_loop(self, state: dict) -> dict:
226225
for _ in range(self.max_iterations["validation"]):
227-
validation, errors = self.validate_dict(state["execution_result"],
226+
validation, errors = self.validate_dict(state["execution_result"],
228227
self.output_schema.schema())
229228
if validation:
230229
state["errors"]["validation"] = []
@@ -240,7 +239,7 @@ def validation_reasoning_loop(self, state: dict) -> dict:
240239

241240
def semantic_comparison_loop(self, state: dict) -> dict:
242241
for _ in range(self.max_iterations["semantic"]):
243-
comparison_result = self.semantic_comparison(state["execution_result"],
242+
comparison_result = self.semantic_comparison(state["execution_result"],
244243
state["reference_answer"])
245244
if comparison_result["are_semantically_equivalent"]:
246245
state["errors"]["semantic"] = []
@@ -342,7 +341,7 @@ def create_sandbox_and_execute(self, function_code):
342341
if not extract_data:
343342
raise NameError("Function 'extract_data' not found in the generated code.")
344343

345-
result = extract_data(self.raw_html)
344+
result = extract_data(self.raw_html)
346345
return True, result
347346
except Exception as e:
348347
return False, f"Error during execution: {str(e)}"
@@ -357,5 +356,5 @@ def validate_dict(self, data: dict, schema):
357356
validate(instance=data, schema=schema)
358357
return True, None
359358
except ValidationError as e:
360-
errors = e.errors()
359+
errors = [e.message]
361360
return False, errors

0 commit comments

Comments
 (0)