In [21]:
import os
import openai
import json
import re
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.chat_models import ChatOpenAI

chatllm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key="EMPTY", openai_api_base="http://localhost:8000/v1")

In [64]:
def strict_text(system_prompt, user_prompt, output_format, delimiter = '###', num_tries = 3, verbose = False):
    ''' Ensures that OpenAI will always adhere to the desired output json format. 
    Uses rule-based iterative feedback to ask GPT to self-correct.
    Keeps trying up to num_tries it it does not. Returns empty json if unable to after num_tries iterations.'''

    # start off with no error message
    error_msg = ''
    
    for i in range(num_tries):
        
        # make the output format keys with a unique identifier
        new_output_format = {}
        for key in output_format.keys():
            new_output_format[f'{delimiter}{key}{delimiter}'] = output_format[key]
        output_format_prompt = f'''\nYou are to output the following in json format: {new_output_format}
You must use "{delimiter}{{key}}{delimiter}" to enclose the each {{key}}.
Don't return anything else except the json!!!'''

        response = chatllm.invoke(
            [
              SystemMessage(content = system_prompt + output_format_prompt + error_msg),
              HumanMessage(content = str(user_prompt)),
            ]
        )
        
        # res = response['choices'][0]['message']['content']
        res = response.content

        if verbose:
            print('System prompt:', system_prompt + output_format_prompt + error_msg)
            print('\nUser prompt:', str(user_prompt))
            print('\nGPT response:', res)
        
        # try-catch block to ensure output format is adhered to
        try:
            # check key appears for each element in the output
            for key in new_output_format.keys():
                # if output field missing, raise an error
                if key not in res: raise Exception(f"{key} not in json output")
                
            # if all is good, we then extract out the fields
            # Use regular expressions to extract keys and values
            pattern = fr",*\s*['|\"]{delimiter}([^#]*){delimiter}['|\"]: "

            matches = re.split(pattern, res[1:-1])

            # remove null matches
            my_matches = [match for match in matches if match !='']

            # remove the ' or " from the value matches
            curated_matches = [match[1:-1] if match[0] in '\'"' else match for match in my_matches]

            # create a dictionary
            end_dict = {}
            for i in range(0, len(curated_matches), 2):
                end_dict[curated_matches[i]] = curated_matches[i+1]

            return end_dict

        except Exception as e:
            error_msg = f"\n\nResult: {res}\n\nError message: {str(e)}\nYou must use \"{delimiter}{{key}}{delimiter}\" to enclose the each {{key}}."
            print("An exception occurred:", str(e))
            print("Current invalid json format:", res)
         
    return {}

In [65]:
res = strict_text(system_prompt = 'You are a classifier',
                    user_prompt = 'It is a beautiful day',
                    output_format = {"Sentiment": "Type of Sentiment",
                                    "Tense": "Type of Tense"})

print(res)

{'Sentiment': 'Positive', 'Tense': 'Present'}


In [71]:
questions = ["What is the main idea of this paper?",
             "What is the definition of the problem?",
             "What are the difficulties of the problem?",
             "What the papers in the past do on the problem?",
             "What downsides the past papers have?",
             "Summarize the main steps of the method and the advantage of the steps."]

In [72]:
sections = [
  "1 Introduction",
  "2 Problem Definition",
  "3 Framework Of Search",
  "4 Top-K Star Matching",
  "5 Top-K Join & Pattern Decomposition",
  "6 Evaluation",
  "7 Related Work",
  "8 Conclusion",
  "Acknowledgments",
  "References"
]

section_summaries = [
  "The text discusses the use of graphs to represent real-life data, with an emphasis on multi-modal graphs that contain both structured and unstructured data. The authors propose a new graph data model called the neural-symbolic graph database (NSGD) that extends property graph models with content and structural embeddings in every node and edge. They also introduce a novel algorithm called NSMatch for subgraph search over NSGDs, which uses sophisticated ranking functions to generate top answers in a monotonic decreasing order of matching score. The authors provide experimental results that demonstrate the effectiveness and efficiency of their algorithms.",
  "The text discusses the problem of Neural-Symbolic Subgraph Matching (NSMatch) in Neural-Symbolic Graph Databases (NSGDs). NSGDs are directed labeled graphs that consist of nodes, edges, labels, attributes, and vectors representing unstructured data. NSMatch aims to find a subgraph match in an NSGD based on a given graph pattern and a similarity function. The problem is to find the top-k subgraph matches in the NSGD that meet a certain matching score. NSMatch is NP-hard due to its special case being NP-complete.",
  "The text discusses an algorithmic framework for attacking the hard problem of NSMatch. The framework consists of three main steps: pattern decomposition, star matching, and top-k join.",
  "The text discusses the SMat algorithm for computing top-k star matches in a star pattern. The algorithm consists of three phases: Phase 1 identifies candidate node matches and top-h leaf matches; Phase 2 selects the best k star matches to form a pseudo top-k star matches and sorts the leaf matches; and Phase 3 maintains the priority queue and pops the best match until the desired top-k matches are obtained. The algorithm uses the content vectors Csq and Cg, as well as the structural vector Sg, to identify edges and compute the similarity function δ(·) between Csq and Cg. The value of h, which determines the number of leaf matches to be scanned, is pre-determined in the system. The algorithm is efficient and can accurately complete the missing edges of top-k star matches.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         5 Top-K Join & Pattern Decomposition",
  "",
  "",
  "The text discusses the related work on subgraph search and approximate nearest neighbors search (ANNS) for graph-based methods. It categorizes the subgraph search methods into symbolic methods and neural approaches, and provides examples of each category. It also focuses on graph-based ANNS methods, particularly HNSW and NSG, and explains how NSMatch differs from traditional subgraph matching and existing ANNS algorithms.",
  "The text discusses a new method called neural-symbolic subgraph matching (NSMatch) that uses a neuralsymbolic graph database (NSGD) model to support applications of multi-modal knowledge graphs and multi-modal social medias. The method includes a general and efficient algorithmic framework to process NSMatch, strategies of edge deletion to speed up graph-based ANNS, and a neural-symbolic learning model to complete the missing edges of the NSGD. The text also mentions future work, such as studying incremental NSMatch and investigating more pattern types over large NSGDs.",
  "The text acknowledges the financial support provided by various organizations to the authors Ye Yuan and Jianbin Qin for their research on the National Key R&D Program of China, the NSFC, and the DITDP.",
  "The text discusses various approaches and techniques for efficient and accurate subgraph matching, similarity search, and knowledge graph completion. It covers techniques such as using approximate nearest neighbor queries, translating embeddings, and graph-based methods. The paper also discusses the use of knowledge graphs for text-centric information retrieval and efficient k-nearest neighbor graph construction."
]



In [73]:
for question in questions:
  print("###########################")
  prompt = f"For answering the question: {question}, you need to choose which 1-2 sections to read and return the indexes of the section titles you want to read. The section titles are as follows:\n"
  for i, section_title in enumerate(sections):
    prompt += str(i)
    prompt += f"  ##{section_title}##\n"

  res = strict_text(system_prompt = 'You are a paper reader',
                    user_prompt = prompt,
                    output_format = {"indexes": ["the indexes of the section titles you want to read, only include 1-2 of them"]})
  
  print("@@@prompt:@@@")
  print(prompt)
  print("@@@res@@@")
  print(res)
  print("@@@choosed sections@@@")


  for index in res['indexes']:
    

###########################
@@@prompt:@@@
For answering the question: What is the main idea of this paper?, you need to choose which 1-2 sections to read and return the indexes of the section titles you want to read. The section titles are as follows:
0  ##1 Introduction##
1  ##2 Problem Definition##
2  ##3 Framework Of Search##
3  ##4 Top-K Star Matching##
4  ##5 Top-K Join & Pattern Decomposition##
5  ##6 Evaluation##
6  ##7 Related Work##
7  ##8 Conclusion##
8  ##Acknowledgments##
9  ##References##

@@@res@@@
{'indexes': '["2", "4", "5", "6"]'}
###########################
@@@prompt:@@@
For answering the question: What is the definition of the problem?, you need to choose which 1-2 sections to read and return the indexes of the section titles you want to read. The section titles are as follows:
0  ##1 Introduction##
1  ##2 Problem Definition##
2  ##3 Framework Of Search##
3  ##4 Top-K Star Matching##
4  ##5 Top-K Join & Pattern Decomposition##
5  ##6 Evaluation##
6  ##7 Related Work#

In [70]:
res = strict_text(system_prompt = 'You are a paper reader',
                    user_prompt = prompt,
                    output_format = {"indexes": ["the indexes of the section titles you want to read, only include 1-2 of them"]})

print(res)

{'indexes': "['2', '4', '6']"}


In [75]:
from jsonformer import Jsonformer
from transformers import AutoModelForCausalLM, AutoTokenizer



In [76]:
model = chatllm
tokenizer = chatllm

json_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "number"},
        "is_student": {"type": "boolean"},
        "courses": {
            "type": "array",
            "items": {"type": "string"}
        }
    }
}

prompt = "Generate a person's information based on the following schema:"
jsonformer = Jsonformer(model, tokenizer, json_schema, prompt)
generated_data = jsonformer()

print(generated_data)

TypeError: Got unknown type G