Description
Bug Description
I'm using SubQuestionQueryEngine with multiple tools, but it crashes with the following exception:
Traceback (most recent call last):
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\llama_index\core\output_parsers\utils.py", line 45, in parse_json_markdown
json_obj = json.loads(json_string)
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Program Files\Python312\Lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Program Files\Python312\Lib\json\decoder.py", line 340, in decode
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 3 column 5 (char 8)

During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\llama_index\core\output_parsers\utils.py", line 52, in parse_json_markdown
json_obj = yaml.safe_load(json_string)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\yaml\__init__.py", line 125, in safe_load
return load(stream, SafeLoader)
^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\yaml\__init__.py", line 81, in load
return loader.get_single_data()
^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\yaml\constructor.py", line 49, in get_single_data
node = self.get_single_node()
^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\yaml\composer.py", line 39, in get_single_node
if not self.check_event(StreamEndEvent):
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\yaml\parser.py", line 98, in check_event
self.current_event = self.state()
^^^^^^^^^^^^
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\yaml\parser.py", line 171, in parse_document_start
raise ParserError(None, None,
yaml.parser.ParserError: expected '&lt;document start&gt;', but found '&lt;scalar&gt;'
in "&lt;unicode string&gt;", line 4, column 5:
semantic_search = pipeline('sema ...
^

During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "c:\AI\llama\rag2.py", line 119, in &lt;module&gt;
response = top_query_engine.query(query)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\llama_index\core\instrumentation\dispatcher.py", line 274, in wrapper
result = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\llama_index\core\base\base_query_engine.py", line 53, in query
query_result = self._query(str_or_query_bundle)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\llama_index\core\query_engine\sub_question_query_engine.py", line 145, in _query
sub_questions = self._question_gen.generate(self._metadatas, query_bundle)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\llama_index\core\question_gen\llm_generators.py", line 81, in generate
parse = self._prompt.output_parser.parse(prediction)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\llama_index\core\question_gen\output_parser.py", line 11, in parse
json_dict = parse_json_markdown(output)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\sergii.vashchyshchuk\AppData\Roaming\Python\Python312\site-packages\llama_index\core\output_parsers\utils.py", line 54, in parse_json_markdown
raise OutputParserException(
llama_index.core.output_parsers.base.OutputParserException: Got invalid JSON object. Error: Extra data: line 3 column 5 (char 8) expected '&lt;document start&gt;', but found '&lt;scalar&gt;'
in "&lt;unicode string&gt;", line 4, column 5:
semantic_search = pipeline('sema ...
^. Got JSON string: []# Create a semantic search model semantic_search = pipeline('semantic-search') # Iterate over each tool for tool_name, tool_description in tools.items(): # Perform semantic search to find relevant phrases in the user question relevance_scores = semantic_search(user_question, [tool_description]) relevance_score = relevance_scores[0]['score']
After debugging, I found out that when the model (llama3:70b-instruct) is asked to generate sub-questions for the tool metadata, it sometimes generates a Python script that would produce the questions, instead of the questions themselves:
> Here is a Python solution that generates the desired output:
> ```
> import json
>
> def generate_sub_questions(tools, user_question):
> sub_questions = []
>
> # Extract relevant keywords from the user question
> keywords = [word.lower() for word in user_question.split()]
>
> # Iterate over each tool
> for tool_name, tool_description in tools.items():
> # Check if any keyword is present in the tool description
> for keyword in keywords:
> if keyword in tool_description.lower():
> # Generate sub-questions based on the tool and user question
> sub_questions.extend(generate_sub_questions_for_tool(tool_name, user_question))
>
> return {"items": sub_questions}
>
> def generate_sub_questions_for_tool(tool_name, user_question):
> sub_questions = []
>
> # Example 1: Uber/Lyft financials
> if "uber" in tool_name or "lyft" in tool_name:
> sub_questions.extend([
> {"sub_question": f"What is the revenue growth of {tool_name.split('_')[0].capitalize()}", "tool_name": tool_name},
> {"sub_question": f"What is the EBITDA of {tool_name.split('_')[0].capitalize()}", "tool_name": tool_name}
> ])
>
> # Example 2: CargoWise Wiki
> elif "cargowise" in tool_name.lower():
> sub_questions.extend([
> {"sub_question": f"How to migrate data from old column to new one in {tool_name.split('_')[0].capitalize()}?", "tool_name": tool_name},
> {"sub_question": f"What transformations of which type should I write for migrating data in {tool_name.split('_')[0].capitalize()}?", "tool_name": tool_name}
> ])
>
> return sub_questions
>
> # Example 1
> tools = {
> "uber_10k": "Provides information about Uber financials for year 2021",
> "lyft_10k": "Provides information about Lyft financials for year 2021"
> }
> user_question = "Compare and contrast the revenue growth and EBITDA of Uber and Lyft for year 2021"
> print(json.dumps(generate_sub_questions(tools, user_question), indent=4))
which fails the parser.
I believe the question generation prompt (question_gen_prompt) needs to be made clearer, so that the LLM understands we want the result itself, not a script that generates the result.
Version
0.10.37
Steps to Reproduce
It is inconsistent. For some queries/tools/metadata it works, for some it doesn't. Even for the same query sometimes it works, sometimes doesn't.
Relevant Logs/Tracebacks
No response