In [4]:
"""
Use of openai plus langchain for processing information in a pdf
Generated using chatGPT for incorporating asyncio for concurrent running of prompts
Generated by pasting my code from the analysis_v3 script with the following question:
Can you modify the below python code to incorporate asyncio to allow concurrent running of the paper_search() function?
"""
from pathlib import Path  # directory setting
import asyncio # For async asking of prompts
from dotenv import load_dotenv, find_dotenv  # loading in API keys
from langchain.document_loaders import PyPDFLoader
from langchain.chat_models import ChatOpenAI  # LLM import
from langchain import LLMChain  # Agent import
from langchain.output_parsers import (  # Structuring the output format from the LLM questions
    StructuredOutputParser,
    ResponseSchema
)
from langchain.chains import LLMMathChain
from langchain.prompts import PromptTemplate

from langchain.prompts.chat import ( # prompts for designing inputs
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate
)


import pydantic
from langchain.agents import AgentExecutor, initialize_agent, AgentType
from langchain.schema import AgentFinish
from langchain.agents.tools import Tool
from langchain.chains import LLMMathChain

# Make the local PDFDataExtractor checkout importable so that `demo` resolves.
# NOTE(review): this is an absolute, machine-specific path; on a machine where
# this directory does not exist the `from demo import ...` below will
# presumably fail — consider deriving it from a configurable project root.
#from ..Server.PDFDataExtractor.pdfdataextractor.demo import read_single
import sys 
import os
sys.path.append(os.path.abspath("/Users/desot1/Dev/automating-metadata/Server/PDFDataExtractor/pdfdataextractor"))

# Found through the sys.path entry appended above, so this import must stay after it.
from demo import read_single
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
import pyalex
import requests
import json


# Load API keys from a .env file into the process environment.
# find_dotenv() locates the file; as the last expression of the cell, the
# return value of load_dotenv() is what the cell displays (the bare `True`).
load_dotenv(find_dotenv())


True

TODO:
- Make a checker that can evaluate the LLM outputs.
- Make an agent that can use the tools (search over the document, check the result) and return the ORCID iD.

STEP ONE: Agent that searches over a document. 

In [None]:
async def async_paper_search(query, docs, chain, output_parser):
    """
    Ask one question about the document and return the parsed answer.

    This is a coroutine so that several questions can be awaited together
    (e.g. with asyncio.gather) instead of running one after another.

    Args:
        query: The question to ask about the document.
        docs: The document content; passed to the chain as `doc_text`.
        chain: Object exposing an awaitable
            `arun(doc_text=..., query=..., format_instructions=...)` that
            returns the raw model output.
        output_parser: Object exposing `get_format_instructions()` and
            `parse(text)`.

    Returns:
        Whatever `output_parser.parse` returns for the raw model output.
    """
    format_instructions = output_parser.get_format_instructions()
    # chain.arun is a coroutine, so it has to be awaited to get the text back
    raw_answer = await chain.arun(
        doc_text=docs,
        query=query,
        format_instructions=format_instructions,
    )
    return output_parser.parse(raw_answer)


async def langchain_paper_search(file_path):
    """
    Extract metadata from a PDF by asking an LLM questions about its text.

    Loads the PDF at `file_path`, builds one chat chain, and runs every enabled
    (query, schemas) pair concurrently through it. Only the author query is
    currently enabled; the other queries and schemas are kept commented out.

    Args:
        file_path: Path (str or path-like) of the PDF to analyse.

    Returns:
        A list with one parsed answer per enabled query, in query order. With
        only the author query enabled this is a one-element list; its entry is
        the parser output for the "author" schema.
    """
    #%% Setup, defining framework of paper info extraction
    # Define language model to use
    llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)

    # Structured Output Schema
    #motivation_schema = ResponseSchema(name="motivation", description="This is the question or challenge that the work of this paper seeks to address.")
    #methods_schema = ResponseSchema(name="methods", description="This is the experimental methods and characterization techniques used by the authors in this paper.")
    #results_schema = ResponseSchema(name="results", description="This is a summary of the major results and conclusions obtained in the paper.")
    #figure_schema = ResponseSchema(name="figures", description="This is a comma separated list of descriptions for each figure in the paper.")
    #future_work_schema = ResponseSchema(name="future", description="This is any remaining questions or future work described by the authors in the Conclusions section of the paper.")
    author_schema = ResponseSchema(name="author", description="This is a list of the authors of this paper.")

    # Defining system and human prompts with variables
    system_template = "You are a world class research assistant who produces answers based on facts. \
                        You are tasked with reading the following publication text and answering questions based on the information: {doc_text}.\
                        You do not make up information that cannot be found in the text of the provided paper."

    system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)  # providing the system prompt

    human_template = "{query}. {format_instructions}"
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

    chain = LLMChain(llm=llm, prompt=chat_prompt)

    #%% Extracting info from paper
    # Define the PDF document, load it in
    loader = PyPDFLoader(str(file_path))  # convert path to string to work with loader
    pages = loader.load()

    # Give the model the plain page text. Formatting the list of Document
    # objects straight into the prompt would insert their repr (metadata and
    # escaped newlines) instead of readable text.
    document = "\n\n".join(page.page_content for page in pages)

    # Every enabled query together with the response schemas used to parse it
    queries_schemas = [
        #("What are the experimental methods and techniques used by the authors? This can include ways that data was collected as well as ways the samples were synthesized.", [methods_schema]),
        #("What is the scientific question, challenge, or motivation that the authors are trying to address?", [motivation_schema]),
        #("Provide a summary of the results and discussions in the paper. What results were obtained and what conclusions were reached?", [results_schema]),
        #("Provide a summary of each figure described in the paper. Your response should be a one sentence summary of the figure description, \
        # beginning with 'Fig. #  - description...', with each figure description separated by a comma. For example:'Fig. 1 - description..., Fig. 2 - description..., Fig. 3 - description...'", [figure_schema]),
        #("What future work or unanswered questions are mentioned by the authors?", [future_work_schema]),
        ("Who is(are) the author(s) of this paper?", [author_schema]),
    ]

    # One coroutine per query; asyncio.gather runs them concurrently and
    # returns the answers in the same order as the queries.
    tasks = [
        async_paper_search(
            query,
            document,
            chain,
            StructuredOutputParser.from_response_schemas(schemas),
        )
        for query, schemas in queries_schemas
    ]
    summary = await asyncio.gather(*tasks)

    return summary

#def quality_check(input, llm): 
    
def get_orchid(authors):
    """
    Look up an ORCID iD for each author via the OpenAlex autocomplete API.

    Args:
        authors: Output of langchain_paper_search — a sequence whose first
            element is a mapping with an "author" entry. That entry may be a
            list of names or a single comma-separated string of names.

    Returns:
        dict mapping each author name that produced at least one suggestion to
        {"orcid": <external_id of the top suggestion>, "hint": <its hint>}.
        Authors with no suggestion, or whose lookup failed, are omitted.
        The mapping is also printed, as before.
    """
    author_list = (authors[0].get("author") if authors else None) or []
    if isinstance(author_list, str):
        # The LLM may answer with "A, B, C" instead of a JSON list; iterating
        # the raw string would query OpenAlex one character at a time.
        author_list = author_list.split(",")
    names = [str(name).strip() for name in author_list]
    names = [name for name in names if name]

    url = "https://api.openalex.org/autocomplete/authors"
    author_info = {}
    for name in names:
        try:
            # `params` URL-encodes the name; the timeout keeps a stalled
            # connection from hanging the notebook.
            response = requests.get(url, params={"q": name}, timeout=10)
            response.raise_for_status()
            results = response.json().get("results") or []
        except (requests.RequestException, ValueError) as error:
            # ValueError covers a response body that is not valid JSON.
            print(f"OpenAlex lookup failed for {name}: {error}")
            continue

        if not results:
            # TODO: create a test so we can check if the return is valid.
            print("There are no ORCHID suggestions for this author")
            continue

        # TODO: when several suggestions come back, rank them by similarity to
        # the paper instead of always taking the first one.
        top_match = results[0]
        author_info[name] = {
            "orcid": top_match.get("external_id"),
            "hint": top_match.get("hint"),
        }

    print(author_info)
    return author_info
    

llm_output = get_orchid(await langchain_paper_search("/Users/desot1/Dev/automating-metadata/app/uploads/Navahas2018.pdf"))


In [5]:
from pydantic import BaseModel, Field

from langchain.chat_models import ChatOpenAI
from langchain.agents import Tool
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA

In [7]:
# Argument schema handed to each retrieval Tool via `args_schema`: a tool call
# carries a single free-text question.
# NOTE(review): documented with comments rather than a class docstring, since
# pydantic would presumably copy a docstring into the generated schema — confirm.
class DocumentInput(BaseModel):
    question: str = Field()  # the question to ask about the document


# Chat model used by the RetrievalQA chain behind each tool.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)

# PDFs to expose to the agent; one retrieval tool is built per entry.
files = [
    {
        "name": "navahas-research",
        "path": "/Users/desot1/Dev/automating-metadata/app/uploads/Navahas2018.pdf",
    },
]

# The splitter and the embedding client do not depend on the file, so they are
# built once and shared by every iteration instead of being re-created per file.
# The recorded run of this cell failed with a ValueError because the `tiktoken`
# package, which OpenAIEmbeddings needs, was not installed.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
embeddings = OpenAIEmbeddings()

tools = []
for file in files:
    loader = PyPDFLoader(file["path"])
    pages = loader.load_and_split()
    docs = text_splitter.split_documents(pages)
    # Index the chunks in FAISS and expose the index as a retriever
    retriever = FAISS.from_documents(docs, embeddings).as_retriever()

    # Wrap retrievers in a Tool
    tools.append(
        Tool(
            args_schema=DocumentInput,
            name=file["name"],
            description=f"useful when you want to answer questions about {file['name']}",
            func=RetrievalQA.from_chain_type(llm=llm, retriever=retriever),
        )
    )

ValueError: Could not import tiktoken python package. This is needed in order to for OpenAIEmbeddings. Please install it with `pip install tiktoken`.

In [8]:
from langchain.agents import initialize_agent
from langchain.agents import AgentType

In [10]:
# Model that drives the agent (rebinds `llm`, replacing the earlier 16k model).
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-3.5-turbo-0613",
)

# Agent that answers by calling the retrieval tools collected in `tools`.
# NOTE(review): the recorded run failed with `AttributeError: OPENAI_FUNCTIONS`,
# i.e. the installed langchain's AgentType has no such member; presumably a
# newer langchain release is required — confirm and pin the version.
agent = initialize_agent(
    agent=AgentType.OPENAI_FUNCTIONS,
    tools=tools,
    llm=llm,
    verbose=True,
)

# Ask something the registered tool can answer. The previous question compared
# Alphabet and Tesla revenue, but the only tool covers the Navahas2018 paper.
agent({"input": "Who are the authors of the navahas-research paper?"})

AttributeError: OPENAI_FUNCTIONS