# LLM_JSON_Repair

* For GenAI purposes we use LLMs.
* We need the LLM outputs in JSON format.
* LLMs sometimes return responses that are not valid JSON.
* To address this, the `parse_json_output` function is meant to handle all sorts of broken-JSON issues.
* We also use the `json_repair` package as a further fallback.

In [1]:
!pip install --upgrade -r "/home/ec2-user/SageMaker/15. Essential Code/requirements.txt" -q

In [5]:
import json, re, pprint, ast
import pprint
from json_repair import repair_json

In [6]:
# FIX LLM JSON OUTPUTS | best-effort recovery of a JSON value from raw LLM text
#=============================================================================
# Control characters (C0 range, DEL and the C1 range) that are removed when the
# text cannot be parsed as-is.
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x1F\x7F-\x9F]')
_FENCE_OPEN = "```json"
_FENCE_CLOSE = "```"


def _strip_control_chars(text):
    """Return *text* with every character matched by ``_CONTROL_CHARS_RE`` removed."""
    return _CONTROL_CHARS_RE.sub('', text)


def _loads_lenient(text):
    """Parse *text* as JSON, tolerating raw control characters inside strings.

    ``strict=False`` only relaxes the control-character rule, so valid JSON
    decodes exactly as it would with the default settings.
    """
    return json.loads(text, strict=False)


def _extract_braced_json(text):
    """Parse the span running from the first ``{`` to the last ``}`` of *text*.

    Raises:
        ValueError: if either brace is missing or the span is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    start = text.index('{')
    end = text.rindex('}')
    return _loads_lenient(text[start:end + 1])


def _repair_with_library(text):
    """Last resort: hand *text* to ``repair_json`` and decode what comes back.

    Returns:
        The decoded object when the repaired payload can be parsed; ``None``
        when ``repair_json`` produced an empty string; the repaired string
        itself when neither ``json.loads`` nor ``ast.literal_eval`` accepts it.
    """
    repaired = repair_json(text)
    if not isinstance(repaired, str):
        # Already a Python object (e.g. a dict): nothing left to decode.
        return repaired
    if not repaired.strip():
        # Nothing was recovered; decoding an empty string would only raise.
        return None
    try:
        return json.loads(repaired)
    except json.JSONDecodeError:
        pass
    try:
        return ast.literal_eval(repaired)
    except (ValueError, TypeError, SyntaxError):
        # literal_eval reports malformed input with any of these; keep the
        # repaired text rather than crashing.
        return repaired


def parse_json_output(input_str: str):
    """Recover a JSON value from raw LLM output.

    The text is normalised (newlines become spaces, every run of four spaces
    is removed, surrounding whitespace is trimmed) and then pushed through
    progressively more forgiving stages. The first stage that succeeds wins:

    1. Unwrap a leading "```json" / trailing "```" fence, if both are present.
    2. Parse the text directly.
    3. If it contains control characters, parse a copy with them removed and
       with one stray leading/trailing double quote dropped.
    4. Parse only the span between the first ``{`` and the last ``}``.
    5. Fall back to ``repair_json``.

    A diagnostic line is printed for every stage that fails.

    NOTE: the four-space removal also collapses such runs inside string
    values. It is kept because existing callers may rely on it.

    Args:
        input_str: Raw text returned by the model.

    Returns:
        The decoded value (usually a dict), or ``None`` when the input is
        blank or nothing JSON-like could be recovered.
    """
    text = input_str.replace('\n', ' ').replace('    ', '').strip()

    # Label used in the failure message of the direct-parse stage.
    step = "3rd"
    if text.startswith(_FENCE_OPEN) and text.endswith(_FENCE_CLOSE):
        text = text[len(_FENCE_OPEN):-len(_FENCE_CLOSE)].strip()
        step = "1st"

    printable = _strip_control_chars(text).strip()
    if not printable:
        # Blank input, an empty fence, or nothing but control characters.
        print("No valid JSON-like content found")
        return None

    candidates = [(step, text)]
    if _CONTROL_CHARS_RE.search(text):
        # Drop one stray wrapping quote on each side, if present.
        if printable.endswith('"'):
            printable = printable[:-1]
        if printable.startswith('"'):
            printable = printable[1:]
        candidates.append(("2nd", printable))

    # Stage A: parse each candidate as a complete JSON document.
    for label, candidate in candidates:
        try:
            return _loads_lenient(candidate)
        except json.JSONDecodeError as exc:
            print(f"Failed at {label} step:", exc)

    # Stage B: ignore any chatter around the outermost pair of braces.
    for _, candidate in candidates:
        try:
            return _extract_braced_json(candidate)
        except ValueError:
            continue
    print("Failed at semifinal_resolve")

    # Stage C: library-assisted repair, available to every kind of input.
    try:
        result = _repair_with_library(text)
    except ValueError:
        result = None
    if result is None:
        print("No valid JSON-like content found")
    return result

## Test the `parse_json_output`

In [7]:
# Deliberately malformed payload: the values `budy` and `nothing` are not
# quoted, so the text is not valid JSON as written.
bad_json_string = """
            {
                "Loss": {
                        "Hi there": budy,
                        "whats up": nothing
                        }
            }
"""

# Run the payload through parse_json_output and keep the recovered value.
fixed = parse_json_output(bad_json_string)
# Bare expression so the notebook cell displays the result.
fixed

Failed at 3rd step: Expecting value: line 1 column 26 (char 25)
Failed at semifinal_resolve


{'Loss': {'Hi there': 'budy', 'whats up': 'nothing'}}

In [8]:
# Sample of a badly wrapped response: a "```{json}" fence, a doubled opening
# brace ("{{") and a stray "{}" after the closing brace.
output = """
```{json}
{{
    "Summary": "The call involved a customer following up on a request regarding account extensions and splits. The agent provided updates and escalated the issue for further investigation.",
    "Primary topic": "Account extensions and splits",
    "Primary topic explanation": "Customer inquired about extending account terms and splitting an account.",
    "Secondary topic": "Escalation and investigation",
    "Secondary topic explanation": "Agent escalated the issue for further investigation and provided updates to the customer.",
    "Issue resolution": "Partially resolved",
    "Issue resolution explanation": "The agent raised the issue for investigation and provided the customer with a reference number for tracking. The resolution is pending further updates from the team."
}{}
```
"""

# Call repair_json directly (not via parse_json_output) and pretty-print
# whatever it returns.
pprint.pp(repair_json(output))

('{"Summary": "The call involved a customer following up on a request '
 'regarding account extensions and splits. The agent provided updates and '
 'escalated the issue for further investigation.", "Primary topic": "Account '
 'extensions and splits", "Primary topic explanation": "Customer inquired '
 'about extending account terms and splitting an account.", "Secondary topic": '
 '"Escalation and investigation", "Secondary topic explanation": "Agent '
 'escalated the issue for further investigation and provided updates to the '
 'customer.", "Issue resolution": "Partially resolved", "Issue resolution '
 'explanation": "The agent raised the issue for investigation and provided the '
 'customer with a reference number for tracking. The resolution is pending '
 'further updates from the team."}')
