In [1]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

You should consider upgrading via the 'C:\Users\douglassaturnino\Documents\Structured_Data_PDFs\venv\Scripts\python.exe -m pip install --upgrade pip' command.
You should consider upgrading via the 'C:\Users\douglassaturnino\Documents\Structured_Data_PDFs\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [None]:
import os
import tempfile
import uuid

import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

In [None]:
# Load variables from a local .env file into the process environment, then
# expose the OpenAI key under the name the embedding cell refers to.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

## Process PDF document

### Load PDF document

In [None]:
# Read the PDF from the project's data folder; `pages` holds whatever
# loader.load() returns and is what the splitter cell consumes.
loader = PyPDFLoader(file_path="data/DOUGLASSATURNINO.pdf")
pages = loader.load()

In [None]:
# Split the loaded pages into overlapping chunks. Separators are tried in
# order (paragraph break, line break, space), and chunk size is measured in
# characters because length_function is len.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", " "],
)

# `chunks` is the input of the vector store created further down.
chunks = text_splitter.split_documents(pages)
len(chunks)

In [None]:
def get_embedding_function():
    """Build the OpenAI embedding model used throughout the notebook.

    The API key is read from the ``OPENAI_API_KEY`` environment variable at
    call time, so the function does not depend on a notebook-level global
    having been defined first.

    Returns:
        An ``OpenAIEmbeddings`` instance configured for
        ``text-embedding-ada-002``.
    """
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002",
        openai_api_key=os.getenv("OPENAI_API_KEY"),
    )
    return embeddings

# Build the embedding model once; the evaluator and vector store cells below reuse this instance.
embedding_function = get_embedding_function()    
# Embed a single word as a quick check that the embedding call works
# (test_vector is not used by any other visible cell).
test_vector = embedding_function.embed_query("cat")

In [None]:
from langchain.evaluation import load_evaluator

# The "embedding_distance" evaluator compares two strings by the distance
# between their embeddings, computed with the embedding model built above.
evaluator = load_evaluator(evaluator="embedding_distance", embeddings=embedding_function)

evaluator.evaluate_strings(prediction="Amsterdam", reference="coffeeshop")

In [None]:
# Same reference word against a second city, for comparison with the previous cell
evaluator.evaluate_strings(prediction="Paris", reference="coffeeshop")

In [None]:
def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Embed ``chunks`` and store them in a Chroma database on disk.

    Args:
        chunks: Iterable of documents exposing a ``page_content`` string.
        embedding_function: Embedding model handed to Chroma as ``embedding``.
        vectorstore_path: Directory the database is persisted to.

    Returns:
        The populated Chroma vectorstore.
    """
    # Derive a deterministic id from each chunk's text so that chunks with
    # identical content collapse into a single entry. The dict keeps
    # first-seen order, so ids and documents stay aligned position by position.
    chunk_by_id = {}
    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        chunk_by_id.setdefault(chunk_id, chunk)

    # Create a new database from the de-duplicated documents
    vectorstore = Chroma.from_documents(documents=list(chunk_by_id.values()),
                                        ids=list(chunk_by_id.keys()),
                                        embedding=embedding_function,
                                        persist_directory=vectorstore_path)

    vectorstore.persist()
    return vectorstore

In [None]:
# Create the vectorstore from the document chunks; the path given here is the
# one the "Load vectorstore" cell reads back from.
vectorstore = create_vectorstore(chunks=chunks,
                                 embedding_function=embedding_function,
                                 vectorstore_path="vectorstore_chroma")

## Query for relevant data

In [None]:
# Open the persisted Chroma store, using the same embedding model for queries
vectorstore = Chroma(
    embedding_function=embedding_function,
    persist_directory="vectorstore_chroma",
)

In [None]:
# Wrap the store in a similarity-search retriever, then fetch the chunks
# it returns for a sample question and display them.
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke(input="What is the title of the article?")
relevant_chunks

In [None]:
# Prompt template: {context} receives the retrieved text and {question} the
# user's question. The model is told to admit when it does not know.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

In [None]:
# Concatenate the text of the retrieved chunks, separated by a visible rule
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# Fill the template with the context and the question to obtain the final prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text,
                                question="What is the title of the paper?")

## Generate responses

In [None]:
# Chat model shared by this cell and the chains below. The key comes from the
# environment; the model name is a suggestion, so swap in any chat model your
# key can access.
llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=os.getenv("OPENAI_API_KEY"))
llm.invoke(prompt)

## Using LangChain Expression Language

In [None]:
def format_docs(docs):
    """Merge the text of every document into one string, with a blank line between documents."""
    texts = [document.page_content for document in docs]
    return "\n\n".join(texts)

# LCEL pipeline: the question is passed through unchanged and also sent to the
# retriever, whose documents format_docs flattens into the context string.
# Both values fill the prompt template, which is then handed to the chat model.
rag_chain = {"context": retriever | format_docs, "question": RunnablePassthrough()} | prompt_template | llm

rag_chain.invoke("What's the title of this paper?")

## Generate structured responses

In [None]:
class AnswerWithSources(BaseModel):
    """An answer to the question, with sources and reasoning."""
    answer: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")

class ExtractedInfo(BaseModel):
    """Extracted information about the research article.

    Every field is an AnswerWithSources, so each extracted value carries the
    'answer', 'sources' and 'reasoning' entries that the results table cell
    reads from it.
    """
    # Title of the paper
    paper_title: AnswerWithSources
    # Summary of the paper
    paper_summary: AnswerWithSources
    # Year of publication of the paper
    publication_year: AnswerWithSources
    # Names of the authors of the paper
    paper_authors: AnswerWithSources

In [None]:
# Same retrieval pipeline as before, but the model's reply is parsed into an
# ExtractedInfo instance instead of being returned as a free-text message.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm.with_structured_output(ExtractedInfo)
)

rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")

In [None]:
structured_response = rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")
record = structured_response.dict()
df = pd.DataFrame([record])

# Reshape into a table with one column per extracted field and three rows:
# 'answer', 'source' and 'reasoning'.

answer_row = []
source_row = []
reasoning_row = []

for value in record.values():
    if isinstance(value, dict):
        # Field carries its own answer / sources / reasoning entries
        answer_row.append(value.get('answer'))
        source_row.append(value.get('sources'))
        reasoning_row.append(value.get('reasoning'))
    else:
        # Plain scalar field (e.g. str or int): the value itself is the
        # answer and no source or reasoning is available for it.
        answer_row.append(value)
        source_row.append(None)
        reasoning_row.append(None)

# Create the new dataframe with three rows: 'answer', 'source' and 'reasoning'
structured_response_df = pd.DataFrame([answer_row, source_row, reasoning_row],
                                      columns=list(record),
                                      index=['answer', 'source', 'reasoning'])
structured_response_df