# 使用LangChain加载PDF，并对文本进行切割。

In [1]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import NLTKTextSplitter

In [3]:
# Source PDF, given relative to the notebook's working directory.
pdf_path = "../data/sentic-gcn.pdf"

# Load the PDF into a list of documents; later cells read each item's
# `.page_content` (presumably one document per page -- confirm against the loader docs).
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()

In [3]:
# Splitter with a target chunk size of 500. The size is a target rather than a hard cap:
# the split cell's recorded output warns about chunks of 516, 1120 and 1758.
# NOTE(review): chunk_size is presumably measured in characters -- confirm.
text_splitter = NLTKTextSplitter(chunk_size=500)

In [4]:
# Flatten the per-document chunks into one list of strings, preserving
# document order and the order of chunks within each document.
texts = [
    chunk
    for doc in docs
    for chunk in text_splitter.split_text(doc.page_content)
]

Created a chunk of size 1120, which is longer than the specified 500
Created a chunk of size 1758, which is longer than the specified 500
Created a chunk of size 516, which is longer than the specified 500


# 通过LangChain的Prompt和OpenAI，将文本转换为CPM-Bee可用的数据集。

In [5]:
from langchain.llms import OpenAI
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain import PromptTemplate, LLMChain


In [6]:
# Fields the LLM is asked to return. Each name becomes a JSON key and each
# description is echoed verbatim in the generated format instructions.
input_schema = ResponseSchema(
    name="input",
    description="the text inputed",
)
question_schema = ResponseSchema(
    name="question",
    description="the question generated by llms",
)
answer_schema = ResponseSchema(
    name="<ans>",
    description="the anwser generated by llms",
)

# List order is the order in which the fields appear in the format instructions.
response_schemas = [input_schema, question_schema, answer_schema]

In [7]:
# Build a parser from the schemas. It is used twice below: to produce the format
# instructions embedded in the prompt, and to parse each LLM response.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [8]:
# Instruction text telling the LLM how to format its reply; it is passed to the
# prompt as the `format_instructions` variable.
format_instructions = output_parser.get_format_instructions()
# print() rather than a bare expression so newlines render instead of a repr.
print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "\`\`\`json" and "\`\`\`":

```json
{
	"input": string  // the text inputed
	"question": string  // the question generated by llms
	"<ans>": string  // the anwser generated by llms
}
```


In [9]:
# Prompt template with three placeholders: {numbers} (how many questions to ask),
# {text} (the source chunk, wrapped in triple backticks) and {format_instructions}
# (the parser-generated output format description).
# The trailing backslashes join the first three source lines into one prompt line.
# NOTE(review): the format instructions describe a single question/answer object,
# while this prompt requests {numbers} questions -- confirm the intended output shape.
template_string = """Ask {numbers} questions and generate the answers based on the text \
that is delimited by triple backticks. \
text: ```{text}```

{format_instructions}
"""

In [10]:
# Bind the template text to the three placeholders it contains.
prompt = PromptTemplate(
    template=template_string,
    input_variables=["numbers", "text", "format_instructions"],
)

In [11]:
# OpenAI LLM created with the library's default settings. No credentials are
# passed here -- presumably they are read from the environment (confirm).
llm = OpenAI()

# Chain that fills the prompt template and sends the result to the LLM.
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
# Generate question/answer records for every text chunk.
# One LLM call is made per chunk, so this is the slow cell of the notebook.
qa = []
for text in texts:
    response = llm_chain.run({
        "numbers": 5,
        "text": text,
        "format_instructions": format_instructions
    })
    try:
        output_dict = output_parser.parse(response)
    except Exception as exc:
        # Best effort: a reply that cannot be parsed is reported and skipped so
        # the remaining chunks are still processed. Catching Exception (not a
        # bare except) lets KeyboardInterrupt/SystemExit stop the loop.
        print(f"parse error ({exc}): {response}")
    else:
        qa.append(output_dict)

In [14]:
# Save the collected QA records to a JSON file (a single JSON array).
import json

# Explicit UTF-8 so the file encoding does not depend on the platform's locale default.
with open("../data/sentic-gcn.json", "w", encoding="utf-8") as f:
    json.dump(qa, f)