In [8]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI


# Pull settings from a local .env file into the process environment.
load_dotenv()

# Fail fast when no OpenAI API key is configured (unset or empty).
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is None or openai_api_key == "":
    raise ValueError("OPENAI_API_KEY environment variable is not set")

# Chat model to use; change this string to switch models.
model_name = "gpt-4o-mini"

# Chat client (large language model) used by the later cells of this notebook.
llm = ChatOpenAI(
    model=model_name,
    api_key=openai_api_key,
)

In [9]:
# Loader and splitter used by this cell
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the guide on writing review articles
loader = PyPDFLoader("./Basics of Writing Review Articles.pdf")
docs = loader.load()

# Split the loaded documents into overlapping chunks for indexing
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=25,
)
all_splits = text_splitter.split_documents(docs)

In [None]:
# Index
import shutil

from langchain_chroma import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings

# Directory where the review-guide vector store is persisted.
REVIEW_VECTORSTORE_DIR = "data/review/vectorstore"

# Start from an empty store by deleting whatever a previous run persisted.
# The directory removed here must be the same one handed to Chroma below;
# clearing a different path leaves the old store in place, and add_documents
# would then add the same chunks again on top of it.
# NOTE(review): deleting the directory while a Chroma client created earlier in
# the same kernel is still alive may cause errors - restart the kernel before
# re-running this cell. TODO confirm against the installed chromadb version.
shutil.rmtree(REVIEW_VECTORSTORE_DIR, ignore_errors=True)

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=OpenAIEmbeddings(),
    persist_directory=REVIEW_VECTORSTORE_DIR,
)

vector_store.add_documents(all_splits)

In [10]:
# Retrieve
# Retriever over the indexed guide chunks using MMR search with k=1 and fetch_k=5.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=1, fetch_k=5),
)

In [None]:
# Create papers retriever
def papers_retriever(urls):
    """Build an MMR retriever over the PDFs listed in ``urls``.

    The vector store directory left by a previous call is deleted first. Every
    PDF is then loaded, split into chunks, embedded with ``OpenAIEmbeddings``
    and added to a Chroma collection persisted in that directory.

    Args:
        urls: Iterable of PDF locations; each one is passed to ``PyPDFLoader``.

    Returns:
        The retriever produced by ``as_retriever`` on the new store, configured
        with ``search_type="mmr"``, ``k=1`` and ``fetch_k=5``.
    """
    import shutil  # function-scope import keeps this cell self-contained

    persist_directory = "data/paper/vectorstore"

    # Clear the directory that is actually used below. Removing a different
    # path would leave the old store in place, so chunks from earlier calls
    # would still be returned alongside the new ones.
    shutil.rmtree(persist_directory, ignore_errors=True)

    # Load every PDF and flatten the per-file document lists into one list.
    pages = [page for url in urls for page in PyPDFLoader(url).load()]

    paper_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
    paper_splits = paper_splitter.split_documents(pages)

    papers_vectorstore = Chroma(
        collection_name="example_collection",
        embedding_function=OpenAIEmbeddings(),
        persist_directory=persist_directory,
    )
    papers_vectorstore.add_documents(paper_splits)

    # Use a distinct local name so the function's own name is not shadowed.
    retriever_over_papers = papers_vectorstore.as_retriever(
        search_type="mmr", search_kwargs={"k": 1, "fetch_k": 5}
    )

    return retriever_over_papers

In [11]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough


# The prompt takes the writing tips as {context}, the retrieved paper excerpts
# as {papers} and the user-provided {topic}. Without a {papers} placeholder the
# "papers" input supplied by the chain would be dropped and never reach the model.
template = """
You are an expert in writing reviews who helps a researcher to write a review on a specific topic.
Use the tips in the context and write a scientific review on a topic provided by a user.
Base the content of the review on the excerpts from the papers.

Context: {context}
Papers: {papers}
Topic: {topic}

Provide the review in JSON format with the following fields in the order:
-"Abstract"
-"Keywords"
-"Introduction"
-"Methods"
-"Results"
-"Discussion"
-"Conclusion"
This JSON will be automatically parsed, so ensure the format is precise.


"""

prompt = ChatPromptTemplate.from_template(template)

# Alias of the chat model created earlier in the notebook.
model = llm

def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [None]:
# Initialize an empty list to store the PDF locations entered by the user
pdf_urls = []

while True:
    # Ask for a location; strip surrounding whitespace so a pasted path with a
    # trailing space, or "Finished " typed with one, is still recognised.
    user_input = input("Enter a PDF file path or 'Finished' to stop: ").strip()

    # Stop the loop on the sentinel word, in any letter case
    if user_input.lower() == "finished":
        break

    # Accept the entry only if it ends with ".pdf"; compare case-insensitively
    # so names such as "paper.PDF" are not rejected.
    if user_input.lower().endswith(".pdf"):
        pdf_urls.append(user_input)
    else:
        print("Invalid input. Please enter a URL that ends with '.pdf'.")

# Output the collected PDF locations
print("Collected PDF URLs:", pdf_urls)

In [None]:
# Build the retriever over the PDFs collected in `pdf_urls`.
# NOTE: this rebinds the name `papers_retriever` from the function defined
# earlier to the value that function returns. Once this cell has run, the
# function is no longer reachable under that name, so running this cell again
# requires re-running the cell that defines the function first.
papers_retriever = papers_retriever(pdf_urls)

In [None]:
# Using LangChain LCEL to supply the prompt and generate output.
# Each retriever receives the topic string and its documents are flattened to
# plain text by format_docs, so both branches hand the prompt readable text
# instead of a list of Document objects.
chain = (
    {
        "context": retriever | format_docs,
        "papers": papers_retriever | format_docs,
        "topic": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

topic = input("Enter a topic:")

chain.invoke(topic)