In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings  import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA



In [17]:
def pdf_loader(data):
    """Load the PDF files matched by ``*.pdf`` in the ``data`` directory.

    A ``DirectoryLoader`` is built over ``data`` with ``PyPDFLoader`` as the
    per-file loader class, and its ``load()`` result is returned unchanged.
    No chunking or splitting is performed by this function.

    Args:
        data: Path of the directory to scan for PDF files.

    Returns:
        The value produced by ``DirectoryLoader.load()`` for the matched files.
    """
    return DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()

# Raw string literal: the backslashes of a Windows path must not be parsed as
# escape sequences. "\c" and "\d" are not valid escapes, so the plain literal
# only worked by accident and triggers a warning on recent Python versions.
# The resulting path value is unchanged.
# TODO: this is an absolute, machine-specific path; consider making it configurable.
document_text = pdf_loader(r"D:\chatbot\data")
print(f"Loaded {len(document_text)} documents.")

Loaded 26 documents.


In [18]:
# Break the loaded documents into chunks of at most 300 characters,
# with 20 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=20,
)
documents = text_splitter.split_documents(document_text)

# Report how many chunks the splitter produced.
print(f"Split into {len(documents)} chunks.")

Split into 187 chunks.


In [19]:
# Instantiate the sentence-transformers model used to embed each chunk.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
print("Embeddings model loaded.")

Embeddings model loaded.


In [22]:
from tqdm import tqdm

# NOTE(review): this only iterates over the already-split chunks and rebuilds
# the list; no per-document work happens inside the loop, so the bar finishes
# instantly. `documents` ends up as a new list with the same items.
documents = list(
    tqdm(
        documents,
        desc="Processing documents",
    )
)


Processing documents: 100%|██████████| 187/187 [00:00<00:00, 93573.71it/s]


In [23]:
from langchain.vectorstores import FAISS

# Embed every chunk with the embeddings model and build a FAISS index from them.
vectorstore = FAISS.from_documents(
    documents=documents,
    embedding=embeddings,
)
print("FAISS Vector store created.")


FAISS Vector store created.


In [42]:
import os

from dotenv import load_dotenv
from groq import Groq
# ChatGroq is used below but was never imported in this notebook, so the cell
# raised NameError on a fresh kernel. It is provided by the langchain-groq package.
from langchain_groq import ChatGroq

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    # os.getenv returns None when the variable is missing; stop here with a
    # clear message instead of passing None on to the clients below.
    raise RuntimeError(
        "GROQ_API_KEY is not set. Define it in the environment or in a .env file."
    )

# Plain Groq SDK client. It is not used by the RetrievalQA chain in this cell;
# kept so that code elsewhere relying on `groq_client` keeps working.
groq_client = Groq(api_key=groq_api_key)

# Initialize the LangChain chat model backed by Groq with the API key.
model = ChatGroq(model="llama3-8b-8192", api_key=groq_api_key)
print(model.model_name)

# Create a retriever from the vectorstore (default retriever settings).
retriever = vectorstore.as_retriever()

# Initialize the RetrievalQA chain: retrieved chunks are handed to the model
# together with the question.
qa_chain = RetrievalQA.from_chain_type(llm=model, retriever=retriever)

print("RetrievalQA chain initialized.")

# Perform a query using the qa_chain.
query = "What is this PDF about?"
response = qa_chain.run(query)
print("Response:", response)

llama3-8b-8192
RetrievalQA chain initialized.
Response: I don't know. This conversation just started, and I haven't seen the PDF. However, based on the context provided, it appears to be about machine learning and data science. The questions and answers mention topics like F1 score, cross-validation, and machine learning algorithms, which are commonly discussed in the field of data science.
