In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline

from huggingface_hub import login



In [2]:
# Load credentials
from dotenv import load_dotenv
load_dotenv(dotenv_path='../.env', verbose=True)




True

In [3]:
# Retrieve the Hugging Face token from environment variables
hf_token = os.getenv('HF_TOKEN')

In [4]:
# Log in to Hugging Face
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
# Define the directory containing info as PDF documents
directory_path = "../data/Training_docs/"



In [None]:
# Initialize the directory loader
directory_loader = PyPDFDirectoryLoader(directory_path)



In [None]:
# Load all documents from the directory
# running time: 1.5 min
documents = directory_loader.load()



In [None]:
# Combine the content of all documents into a single string
text_data = "\n".join([doc.page_content for doc in documents])



In [6]:
# or from a sinlge PDF file
def load_pdf_data(pdf_path):
    loader = PyPDFLoader(pdf_path)
    return loader.load()

In [7]:
# Sumarizing Text

In [8]:
doc_name = "../../data/Training_docs/Episode_1.pdf"

In [9]:
#Load data
# running time only 49 sec
docs= load_pdf_data("../data/Training_docs/"+ doc_name)

In [10]:
# Split into Chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_documets(text, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    return text_splitter.split_documents(text)

In [11]:
docum_chunks = split_documets(docs)

In [12]:
#Create Embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [None]:
def create_embedding_vector_db(chunks, db_name, target_directory=f"../data/vector_databases"):
    """ This function creates a vector database from the chunks of text using HuggingFace embeddings and stores it in the specified directory. """
    # instantiate the embeddings
    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    # create the vector store
    vectorstore = FAISS.from_documents(documents=chunks, embedding=embedding)
    # save the vector store
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)
    vectorstore.save_local(f"{target_directory}/{db_name}_vector_db")
    #return vector_store

In [34]:
# fix the cell below for the locatioon of the dir

In [18]:
create_embedding_vector_db(chunks=docum_chunks, db_name="docum")

---

In [19]:
#retrieve from vector database
def retrieve_from_vector_db(vector_db_path):
    """ This function splits out a retriever object from a local db. """
    #instantiate the embeddings
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    # load the vector store
    vectorstore = FAISS.load_local(vector_db_path, embeddings, allow_dangerous_deserialization=True)
    retriever = vectorstore.as_retriever()
    return retriever

In [21]:
loc_vector_db = "../data/vector_databases/docum_vector_db"

In [22]:
vec_retriever = retrieve_from_vector_db(loc_vector_db)

In [24]:
# Generation

In [35]:
# Imports: 
# Hub: used to pull predefined prompts or configurations from the LangChain hub.
# create_stuff_documents_chain: creates a chain to format and pass a list of documents to a LLM.
# create_retrieval-chain: sets up a retrieval chain, combining a retriever and a document processing chain.
# HuggingFacePipeline: a wrapper for Hugging Face models, allowing them to be used as LLMs in LangChain.
from langchain import hub
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain.llms import HuggingFacePipeline

In [29]:
# Define the query
query = """
    Given the information {information}, please tell me the most important points in the text.
    Provide a summary of the text in 5 bullet points.
    """



In [30]:
# Define the prompt template
prompt_template = PromptTemplate(
    input_variables=["information"],
    template=query
)




In [36]:
# Load the Llama 3 model and tokenizer
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    use_auth_token=hf_token,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Wrap the model using HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=model)

Loading checkpoint shards: 100%|██████████| 4/4 [00:51<00:00, 12.82s/it]
Some parameters are on the meta device because they were offloaded to the disk and cpu.
  llm = HuggingFacePipeline(pipeline=model)


In [37]:
def connect_chains(retriever, llm):
    """Integrates a document processing chain with a retrieval chain. 
    Parameters:
     - retriever: The retriever object, responsible for fetching relevant documents.
     - llm: The language model object, used for generating responses.
    Returns:
        - retrieval_chain: A chain that combines the retriever and document processing.
    """
    stuff_documents_chain = create_stuff_documents_chain(
        llm=llm,
        prompt=hub.pull("langchain-ai/retrieval-qa-chat")
    )
    retrieval_chain = create_retrieval_chain(
        retriever=retriever,
        combine_docs_chain=stuff_documents_chain
    )
    return retrieval_chain


In [38]:
a_retrieval_chain = connect_chains(vec_retriever,llm)

In [44]:
def generate_response(query, retrieval_chain):
    """Generates a response to a given query using the retrieval chain.
    Parameters:
        - query: The input question or prompt for which a response is needed.
        - retrieval_chain: The chain that combines the retriever and document processing.
    Returns:
        - response: The generated response based on the input query and retrieved documents.
    """
    response = retrieval_chain({"query": query})
    return response["result"]

In [55]:
def print_response(inquiry, retrieval_chain=a_retrieval_chain):
    """Prints the query and its corresponding response in a formatted manner.
    Parameters:
        - query: The input question or prompt.
        - response: The generated response based on the input query.
    """
    result = retrieval_chain.invoke({"input": inquiry})
    print(result["answer"].strip("\n"))

In [54]:
print_response("Give me the summary of the text.")

KeyError: 'input'

In [39]:
# Create a text generation pipeline
#text_generation_pipeline = pipeline(
#    "text-generation",
#    model=model,
#    tokenizer=tokenizer,
#    max_length=1024,
#    temperature=0.7,
#    top_p=0.9,
#    repetition_penalty=1.1
#)



In [40]:
# Wrap the pipeline in a LangChain LLM
#llm = HuggingFacePipeline(pipeline=text_generation_pipeline)



In [41]:
# Create the LLM chain
#chain = LLMChain(llm=llm, prompt=prompt_template)



In [42]:
# Invoke the chain with the text data
#output = chain.invoke(input={"information": text_data})



In [43]:
# Print the output
#print(output)