robots_node.py
"""
RobotsNode Module
"""
from typing import List, Optional
from urllib.parse import urlparse
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import AsyncChromiumLoader
from ..helpers import robots_dictionary
from ..prompts import TEMPLATE_ROBOT
from .base_node import BaseNode


class RobotsNode(BaseNode):
    """
    A node responsible for checking whether a website is scrapeable, based on its robots.txt file.
    It uses a language model to determine whether the website allows scraping of the provided path.

    This node acts as a starting point in many scraping workflows, preparing the state
    with the necessary HTML content for further processing by subsequent nodes in the graph.

    Attributes:
        llm_model: An instance of the language model client used for checking scrapeability.
        force_scraping (bool): A flag indicating whether scraping should be enforced even
            if disallowed by robots.txt.
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        force_scraping (bool): A flag indicating whether scraping should be enforced even
            if disallowed by robots.txt. Defaults to False.
        node_name (str): The unique identifier name for the node, defaulting to "RobotNode".

    An illustrative usage sketch appears at the bottom of this file.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "RobotNode",
    ):
        super().__init__(node_name, "node", input, output, 1)

        self.llm_model = node_config["llm_model"]
        self.force_scraping = (
            False if node_config is None else node_config.get("force_scraping", False)
        )
        self.verbose = (
            True if node_config is None else node_config.get("verbose", False)
        )

    def execute(self, state: dict) -> dict:
        """
        Checks if a website is scrapeable based on the robots.txt file and updates the state
        with the scrapeability status. The method constructs a prompt for the language model,
        submits it, and parses the output to determine if scraping is allowed.

        Args:
            state (dict): The current state of the graph. The input keys will be used to fetch
                the correct data from the state.

        Returns:
            dict: The updated state with the output key containing the scrapeability status.

        Raises:
            KeyError: If the input keys are not found in the state, indicating that the
                necessary information for checking scrapeability is missing.
            ValueError: If the website is not scrapeable based on the robots.txt file and
                scraping is not enforced.
        """

        self.logger.info(f"--- Executing {self.node_name} Node ---")

        input_keys = self.get_input_keys(state)
        input_data = [state[key] for key in input_keys]
        source = input_data[0]
        output_parser = CommaSeparatedListOutputParser()

        if not source.startswith("http"):
            raise ValueError("Operation not allowed")

        parsed_url = urlparse(source)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        # AsyncChromiumLoader expects a list of URLs
        loader = AsyncChromiumLoader([f"{base_url}/robots.txt"])
        document = loader.load()

        if "ollama" in self.llm_model.model:
            # strip the provider prefix, e.g. "ollama/llama3" -> "llama3"
            self.llm_model.model = self.llm_model.model.split("/")[-1]
        model = self.llm_model.model

        # map the model name to its user agent; fall back to the model name itself
        try:
            agent = robots_dictionary[model]
        except KeyError:
            agent = model

        prompt = PromptTemplate(
            template=TEMPLATE_ROBOT,
            input_variables=["path"],
            partial_variables={"context": document, "agent": agent},
        )

        chain = prompt | self.llm_model | output_parser
        is_scrapable = chain.invoke({"path": source})[0]

        if "no" in is_scrapable:
            self.logger.warning("\033[31m(Scraping this website is not allowed)\033[0m")
            if not self.force_scraping:
                raise ValueError("The website you selected is not scrapable")
            self.logger.warning(
                "\033[33m(WARNING: Scraping this website is "
                "not allowed but you decided to force it)\033[0m"
            )
        else:
            self.logger.warning("\033[32m(Scraping this website is allowed)\033[0m")

        state.update({self.output[0]: is_scrapable})
        return state
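

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# It shows how RobotsNode might be instantiated and run on its own; in a real
# ScrapeGraphAI graph the node is created and wired by the graph builder.
# Assumptions: a locally running Ollama server exposing a "llama3" model, and
# Playwright/Chromium installed so AsyncChromiumLoader can fetch robots.txt.
# The URL below is a hypothetical placeholder.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from langchain_community.chat_models import ChatOllama

    llm = ChatOllama(model="llama3", temperature=0)

    robots_node = RobotsNode(
        input="url",
        output=["is_scrapable"],
        node_config={
            "llm_model": llm,
            "force_scraping": False,  # raise ValueError if robots.txt disallows scraping
            "verbose": True,
        },
    )

    # The node reads the source URL from the state and writes back the first
    # element of the comma-separated "yes"/"no" answer parsed from the model.
    state = robots_node.execute({"url": "https://example.com/some/path"})
    print(state["is_scrapable"])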