In [12]:
from langchain.prompts.prompt import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv

import os
import warnings
warnings.filterwarnings("ignore")


In [13]:
load_dotenv()

True

In [29]:
model_id = "llama3-8b-8192"
file_name = "0706.1996"
doc_ext =".pdf"
doc_path = "../data/Training_docs/"
db_path = "../data/vector_databases/AWS/"

In [30]:
# Set the model ID and parameters
llm = ChatGroq(
    model=model_id,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

In [6]:
def load_pdf_data(pdf_file_path):
    """
    this function loads text data from pdf file
    """
    loader = PyPDFLoader(file_path=pdf_file_path)
    documents = loader.load()
    return documents

In [6]:
def split_to_chunks(documents, chunk_size=800, chunk_overlap=80):
    """
    this function splits documents into chunks of given size and overlap
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents=documents)##arning ,may be recursive
    return chunks

In [7]:
def create_embedding_vector_db(chunks, file_name, target_directory=f""+db_path):
    """
    this function uses the open-source embedding model HuggingFaceEmbeddings 
    to create embeddings and store those in a vector database called FAISS, 
    which allows for efficient similarity search
    """
    # instantiate embedding model
    embedding = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-mpnet-base-v2'
    )
    # create the vector store 
    vectorstore = FAISS.from_documents(
        documents=chunks,
        embedding=embedding
    )
    # save vector database locally
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)
    vectorstore.save_local(f"{target_directory}/{file_name}_vector_db")

In [31]:
def retrieve_from_vector_db(vector_db_path):
    """
    this function splits out a retriever object from a local vector database
    """
    # instantiate embedding model
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-mpnet-base-v2'
    )
    react_vectorstore = FAISS.load_local(
        folder_path=vector_db_path,
        embeddings=embeddings,
        allow_dangerous_deserialization=True
    )
    retriever = react_vectorstore.as_retriever()
    return retriever

In [9]:
loaded_pdf =  load_pdf_data(doc_path+file_name+doc_ext)

In [10]:
doc_chunks = split_to_chunks(loaded_pdf)

In [11]:
create_embedding_vector_db(chunks=doc_chunks, file_name = file_name, target_directory=db_path)

In [32]:
doc_retriever = retrieve_from_vector_db(db_path+file_name+"_vector_db")	

In [33]:
def connect_chains(retriever):
    """
    this function connects stuff_documents_chain with retrieval_chain
    """
    stuff_documents_chain = create_stuff_documents_chain(
        llm=llm,
        prompt=hub.pull("langchain-ai/retrieval-qa-chat")
    )
    retrieval_chain = create_retrieval_chain(
        retriever=retriever,
        combine_docs_chain=stuff_documents_chain
    )
    return retrieval_chain

In [34]:
doc_retrieval_chain = connect_chains(doc_retriever)

In [35]:
def print_output(
    inquiry,
    retrieval_chain=doc_retrieval_chain
):
    result = retrieval_chain.invoke({"input": inquiry})
    print(result['answer'].strip("\n"))

In [36]:
print_output("Give me the summary the text in 3 sentences.")

The text appears to be a scientific paper or article discussing the Radulescu Planet RB, which is described as resembling a planet with peaks and valleys. The author, Radulescu, references several of his own publications on topics such as cancer therapy, retinoblastoma protein, and host defense against microbial infection. The text also mentions several peptides and their potential applications in medicine, including MCR peptides and sequences similar to nociceptin/orphanin FQ and ß-endorphin.


In [66]:
print_output("Please write only the name of the author of this document, if there is no author mentioned, write only unknown")

Radulescu


In [65]:
print_output("if there is a date of publication of the document, Please write only the date of publication, if there is no date of publication, please write only today's date.")

June 2007


In [59]:
print_output("if there is an indication that this document is peer reviewed, please write yes, otherwise unclear")


Unclear


In [60]:
print_output("Please assign a relevance between 0 and 1 of this document to longevity, and write only the number")

0.7


In [62]:
print_output("Please write just the name of the institution that published this article, if it not clear just write unclear")

Unclear


In [63]:
print_output("Please write a brief summary of the document")

The document discusses the potential roles of the retinoblastoma protein (RB) in various biological processes, including anti-cancer and anti-aging effects. The author identifies specific regions of the RB protein that may be involved in these processes, including docking sites for other molecules, calcium binding sites, and sequences that resemble other proteins involved in insulin signaling and oxygen binding. The author suggests that RB may play a role in blocking insulin receptor activation, countering neurodegeneration, and promoting cell survival, among other functions.


In [49]:
print_output("can you create a list of the name of the file, the date of publication, the author, and the relevance of this document?")

Based on the provided context, here is the list of information you requested:

1. File name: Planet RB
2. Date of publication: June 2007
3. Author: Razvan Tudor Radulescu
4. Relevance: The document appears to be a mini-review of the author's research on the retinoblastoma tumor suppressor protein (RB), highlighting various putative or proven roles of RB, including its potential to bind to other molecules, self-associate, and interact with calcium, oxygen, and other molecules. The document also mentions the author's previous research and publications on the topic.


In [None]:
import pandas as pd
cols = "File_Name","Relevance_for_longevity","Date_of_publication", "Author","Institution","Peer Reviewed","Summary"

df = pd.DataFrame(columns=cols)
print_output("can you append the info required to the datafrae df?")

I'm happy to help! However, I need to clarify that the provided context is a text, not a dataset. But I can help you extract the information and append it to a dataframe if you provide a dataframe structure.

Assuming you have a dataframe with columns for "Residue Number", "Putative/Proven Role", and "Reference", you can use the following code to append the information:

```
import pandas as pd

# Create a dataframe
df = pd.DataFrame(columns=["Residue Number", "Putative/Proven Role", "Reference"])

# Append the information
df = df.append({"Residue Number": 1-26, "Putative/Proven Role": "signal peptide", "Reference": 1}, ignore_index=True)
df = df.append({"Residue Number": 3-5, "Putative/Proven Role": "antineoplastic & anti-aging through block of insulin binding to its receptor", "Reference": 2}, ignore_index=True)
df = df.append({"Residue Number": 13-18, "Putative/Proven Role": "amyloidogenic hexa-alanine counteracting (prion-associated) neurodegeneration", "Reference": 3, 4}, ignore_i