-
-
Notifications
You must be signed in to change notification settings - Fork 1.6k
/
Copy pathgenerate_code_node.py
235 lines (182 loc) · 8.45 KB
/
generate_code_node.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
"""
GenerateCodeNode Module
"""
from typing import List, Optional
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel
from langchain_core.utils.pydantic import is_basemodel_subclass
from langchain_openai import ChatOpenAI, AzureChatOpenAI
from langchain_mistralai import ChatMistralAI
from langchain_community.chat_models import ChatOllama
import ast
import sys
from io import StringIO
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from .base_node import BaseNode
from pydantic import ValidationError
from ..utils import transform_schema
from jsonschema import validate, ValidationError
class GenerateCodeNode(BaseNode):
    """
    Node that asks the configured LLM to write an ``extract_data(html)`` function
    and then checks the code it produced.

    The generated code is syntax-checked, executed against the reduced HTML and
    its result is validated against the JSON schema of the configured output
    model. The outcome of each check is printed. The generated code is written to
    the state whether or not the checks pass.

    Attributes:
        llm_model: An instance of a language model client, used to generate the code.
        verbose (bool): Value of ``"verbose"`` in ``node_config`` (default ``False``).
        force (bool): Value of ``"force"`` in ``node_config`` (default ``False``).
        script_creator (bool): Value of ``"script_creator"`` in ``node_config``
            (default ``False``).
        is_md_scraper (bool): Value of ``"is_md_scraper"`` in ``node_config``
            (default ``False``).
        additional_info: Value of ``"additional_info"`` in ``node_config``, or ``None``.
        output_schema (dict): JSON schema of the output model; set by ``execute``.
        simplefied_schema: ``transform_schema(output_schema)``; set by ``execute``.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node. Must contain
            ``"llm_model"``.
        node_name (str): The unique identifier name for the node, defaulting to
            "GenerateCode".

    Raises:
        KeyError: If ``node_config`` does not provide ``"llm_model"``.
    """

    # Prompt sent to the LLM. The text is kept flush-left so that the string the
    # model receives carries no indentation from this source file.
    _CODE_GENERATOR_TEMPLATE = """
**Task**: Create a Python function named `extract_data(html: str) -> dict()` using BeautifulSoup that extracts relevant information from the given HTML code string and returns it in a dictionary matching the Desired JSON Output Schema.
**User's Request**:
{user_input}
**Desired JSON Output Schema**:
```json
{json_schema}
```
**Initial Task Analysis**:
{initial_analysis}
**HTML Code**:
```html
{html_code}
```
**HTML Structure Analysis**:
{html_analysis}
Based on the above analyses, generate the `extract_data(html: str) -> dict()` function that:
1. Efficiently extracts the required data from the given HTML structure.
2. Processes and structures the data according to the specified JSON schema.
3. Returns the structured data as a dictionary.
Your code should be well-commented, explaining the reasoning behind key decisions and any potential areas for improvement or customization.
Use only the following pre-imported libraries:
- BeautifulSoup from bs4
- re
**Output ONLY the Python code of the extract_data function, WITHOUT ANY IMPORTS OR ADDITIONAL TEXT.**
**Response**:
"""

    # Matches the first fenced code block. The optional language tag accepts
    # "python" or "py" in any letter case, followed by optional trailing
    # whitespace and an LF or CRLF line break.
    _CODE_BLOCK_RE = re.compile(
        r"```(?:python|py)?[ \t]*\r?\n(.*?)```", re.DOTALL | re.IGNORECASE
    )

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "GenerateCode",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # Read every setting through one mapping so that a missing config is
        # handled the same way for all of them.
        config = {} if node_config is None else node_config

        # The model is mandatory: a config without it fails here with KeyError.
        self.llm_model = config["llm_model"]

        # NOTE(review): JSON output mode is forced for Ollama models although this
        # node asks the model for Python code - confirm this is intended.
        if isinstance(self.llm_model, ChatOllama):
            self.llm_model.format = "json"

        self.verbose = config.get("verbose", False)
        self.force = config.get("force", False)
        self.script_creator = config.get("script_creator", False)
        self.is_md_scraper = config.get("is_md_scraper", False)
        self.additional_info = config.get("additional_info")

    def execute(self, state: dict) -> dict:
        """
        Generate the ``extract_data`` function with the LLM and check it.

        Args:
            state (dict): The current state of the graph. The input keys are used to
                fetch, in this order: the user prompt, the refined prompt, the HTML
                analysis and the reduced HTML code. Any further input is ignored.

        Returns:
            dict: The updated state, with the first output key set to the generated
                code (stored even when a check fails).

        Raises:
            KeyError: If the input keys are not found in the state.
            IndexError: If fewer than four inputs are resolved from the state.
            ValueError: If ``node_config`` does not provide a ``"schema"``.
        """
        self.logger.info(f"--- Executing {self.node_name} Node ---")

        input_keys = self.get_input_keys(state)
        input_data = [state[key] for key in input_keys]

        user_prompt = input_data[0]
        refined_prompt = input_data[1]
        html_info = input_data[2]
        reduced_html = input_data[3]

        # Both the prompt and the result validation need the schema, so fail
        # with a clear message before spending an LLM call.
        schema_model = self.node_config.get("schema")
        if schema_model is None:
            raise ValueError(
                f"{self.node_name} requires a 'schema' entry in node_config."
            )
        self.output_schema = schema_model.schema()
        self.simplefied_schema = transform_schema(self.output_schema)

        prompt = PromptTemplate(
            template=self._CODE_GENERATOR_TEMPLATE,
            partial_variables={
                "user_input": user_prompt,
                "json_schema": str(self.simplefied_schema),
                "initial_analysis": refined_prompt,
                "html_code": reduced_html,
                "html_analysis": html_info,
            },
        )
        chain = prompt | self.llm_model | StrOutputParser()

        # Every template variable is already bound, hence the empty input.
        generated_code = self.extract_code(chain.invoke({}))

        self._report_checks(generated_code, reduced_html)

        state.update({self.output[0]: generated_code})
        return state

    def _report_checks(self, generated_code: str, html_doc: str) -> None:
        """Run the syntax, execution and schema checks in order, printing the outcome.

        Stops at the first failing check: a later check is meaningless once an
        earlier one has failed.
        """
        print("\nChecking code syntax...")
        syntax_valid, syntax_message = self.syntax_check(generated_code)
        if not syntax_valid:
            print(f"Syntax not valid: {syntax_message}")
            return

        print("\nExecuting code in sandbox...")
        execution_success, execution_result = self.create_sandbox_and_execute(
            generated_code, html_doc
        )
        if not execution_success:
            print(f"Execution failed: {execution_result}")
            return

        print("Code executed successfully.")
        print(f"Execution result:\n{execution_result}")

        validation, errors = self.validate_dict(execution_result, self.output_schema)
        if not validation:
            print(f"Output does not match the schema: {errors}")

    def syntax_check(self, code) -> tuple:
        """
        Check whether ``code`` parses as Python source.

        Args:
            code (str): The source code to parse.

        Returns:
            tuple: ``(True, "Syntax is correct.")`` when the code parses, otherwise
                ``(False, "Syntax error: <details>")``.
        """
        try:
            ast.parse(code)
        except (SyntaxError, ValueError) as e:
            # ValueError covers source that cannot be parsed at all (for
            # example null bytes on older interpreters).
            return False, f"Syntax error: {str(e)}"
        return True, "Syntax is correct."

    def create_sandbox_and_execute(self, function_code, html_doc) -> tuple:
        """
        Execute the generated code and call its ``extract_data`` function.

        Anything the executed code prints is captured and discarded.

        Args:
            function_code (str): Source code expected to define ``extract_data``.
            html_doc (str): The HTML passed to ``extract_data``.

        Returns:
            tuple: ``(True, result)`` with the value returned by ``extract_data``,
                or ``(False, "Error during execution: <details>")`` when the code
                raises or does not define a callable ``extract_data``.
        """
        # SECURITY: this is NOT an isolating sandbox. The code runs in-process via
        # exec() with the full set of builtins (including __import__ and open), so
        # LLM-generated code can reach the file system and the network. Only run
        # this where that is acceptable, or isolate the process externally.
        sandbox_globals = {
            "BeautifulSoup": BeautifulSoup,
            "re": re,
            "__builtins__": __builtins__,
        }

        # Swap stdout so prints from the generated code do not reach the console.
        old_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            exec(function_code, sandbox_globals)

            extract_data = sandbox_globals.get("extract_data")
            if not callable(extract_data):
                raise NameError(
                    "Function 'extract_data' not found in the generated code."
                )

            return True, extract_data(html_doc)
        except Exception as e:
            # Deliberately broad: the generated code may fail in any way, and
            # the failure is reported to the caller instead of propagating.
            return False, f"Error during execution: {str(e)}"
        finally:
            # Always restore stdout, including on the early returns above.
            sys.stdout = old_stdout

    def validate_dict(self, data: dict, schema) -> tuple:
        """
        Validate ``data`` against a JSON schema.

        Args:
            data (dict): The value to validate.
            schema: The JSON schema to validate against.

        Returns:
            tuple: ``(True, None)`` when ``data`` conforms to the schema, otherwise
                ``(False, errors)`` where ``errors`` is a list holding the
                validation error message.
        """
        # The module imports ValidationError from both pydantic and jsonschema, so
        # the module-level name depends on import order. Bind the jsonschema class
        # explicitly: it is the one that validate() raises.
        from jsonschema.exceptions import ValidationError as SchemaValidationError

        try:
            validate(instance=data, schema=schema)
        except SchemaValidationError as e:
            # jsonschema errors expose `.message`; `.errors()` exists only on
            # pydantic's ValidationError.
            return False, [e.message]
        return True, None

    def extract_code(self, code: str) -> str:
        """
        Return the contents of the first fenced code block in ``code``.

        Args:
            code (str): Raw LLM output, possibly wrapping the code in a Markdown fence.

        Returns:
            str: The text inside the first code fence, or ``code`` unchanged when no
                fence is found.
        """
        match = self._CODE_BLOCK_RE.search(code)
        return match.group(1) if match else code