# PDF Chatbot with Llama-2

## 1. Setup and Configuration

In [None]:
import time
import torch
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceEmbeddings # Deprecated
# from langchain_huggingface import HuggingFaceEmbeddings # New


In [None]:
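# # Download the model file with a timeout and retries.
# # The model file is not committed to GitHub: either download it manually or use the code below.
# # NOTE: the code below saves "llama-2-7b.Q4_K_M.gguf" into the working directory, while the
# # model-loading cell expects "models/llama-2-7b-chat.Q4_K_M.gguf" -- adjust one of them to match.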

# import requests
# from tqdm import tqdm

# def download_with_timeout(repo_id, filename, timeout=60, retries=3):
#     url = f"https://huggingface.co/{repo_id}/resolve/main/{filename}"
    
#     for attempt in range(retries):
#         try:
#             response = requests.get(url, timeout=timeout, stream=True)
#             response.raise_for_status()
            
#             total_size = int(response.headers.get('content-length', 0))
#             with open(filename, 'wb') as f:
#                 for data in tqdm(response.iter_content(chunk_size=4096), total=total_size//4096, unit='KB'):
#                     f.write(data)
#             break  # Exit loop on success
#         except requests.exceptions.RequestException as e:
#             print(f"Attempt {attempt + 1} failed: {e}")
#             if attempt < retries - 1:
#                 time.sleep(2 ** attempt)  # Exponential backoff
#             else:
#                 print("Max retries reached. Download failed.")

# download_with_timeout("TheBloke/Llama-2-7B-GGUF", "llama-2-7b.Q4_K_M.gguf")

In [None]:
start_time = time.time()

# Select the current CUDA device when a GPU is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = f'cuda:{torch.cuda.current_device()}'
else:
    device = 'cpu'
print(f"Using device: {device}")

# The Torch version is only reported for GPU runs
if device != 'cpu':
    print('Torch version:', torch.__version__)

end_time = time.time()
elapsed = round(end_time - start_time, 4)
print(f"Time taken to execute the version tests: {elapsed} seconds")

## 2. Load and Process PDF Document

In [None]:
start_time = time.time()

# Read the source PDF into a list of documents via PyPDFLoader
pdf_path = "Harry Potter and the Sorcerers Stone.pdf"
loader = PyPDFLoader(pdf_path)
data = loader.load()

end_time = time.time()
elapsed = round(end_time - start_time, 4)
print(f"Time taken to load the PDF document: {elapsed} seconds")

In [None]:
# Sanity check: report how many documents the loader returned
num_documents = len(data)
print(f"Number of documents loaded: {num_documents}")

In [None]:
start_time = time.time()

# Break the loaded documents into chunks of at most 500 characters, with no overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
all_splits = text_splitter.split_documents(data)
num_chunks = len(all_splits)
print(f"Number of text chunks created: {num_chunks}")

end_time = time.time()
elapsed = round(end_time - start_time, 4)
print(f"Time taken to split the documents: {elapsed} seconds")

## 3. Create Embeddings and Vector Store

In [None]:
start_time = time.time()

# Sentence-transformers checkpoint used to embed the text chunks
model_name = "sentence-transformers/all-mpnet-base-v2"

print("Using device: ", device)

# The same device is passed both for loading the model and for encoding
model_kwargs = {'device': device}
encode_kwargs = {'device': device, 'batch_size': 32}
embed_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

if device != 'cpu':
    # Release cached GPU memory that is currently unoccupied; the embedding
    # model itself stays loaded because later cells still use it.
    torch.cuda.empty_cache()
    print("GPU memory cleared.")

end_time = time.time()
elapsed = round(end_time - start_time, 4)
print(f"Time taken to load the embeddings model: {elapsed} seconds")

In [None]:
start_time = time.time()

# Embed every chunk with embed_model and index the results in a Chroma store
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=embed_model,
)

end_time = time.time()
elapsed = round(end_time - start_time, 4)
print(f"Time taken to create the vector store: {elapsed} seconds")

## 4. Load the Language Model

In [None]:
start_time = time.time()

# Handler that streams generated tokens to stdout
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# LLM runtime parameters consumed by the model-loading cell
# NOTE(review): LangChain's LlamaCpp documents -1 as "offload all layers to the GPU";
# on Apple Metal a value of 1 is reported to be sufficient -- confirm for your backend.
n_gpu_layers = -1
# Should lie between 1 and n_ctx; size it to the memory available on the machine
n_batch = 512

end_time = time.time()
elapsed = round(end_time - start_time, 4)
print(f"Time taken to setup callback manager and LLM parameters: {elapsed} seconds")

In [None]:
start_time = time.time()

# Settings for the local GGUF model served through llama.cpp
llama_settings = {
    "model_path": "models/llama-2-7b-chat.Q4_K_M.gguf",
    "n_gpu_layers": n_gpu_layers,
    "n_batch": n_batch,
    "n_ctx": 2048,
    # Kept at True: the original setup notes that leaving it off leads to
    # problems after a couple of calls.
    "f16_kv": True,
    "callback_manager": callback_manager,
    "verbose": False,
}
llm = LlamaCpp(**llama_settings)

end_time = time.time()
elapsed = round(end_time - start_time, 4)
print(f"Time taken to initialize the LlamaCpp model: {elapsed} seconds")

## 5. Create and Run the RAG Pipeline

In [None]:
start_time = time.time()

# Wire the vector store's retriever and the LLM into a 'stuff'-type RetrievalQA chain
retriever = vectorstore.as_retriever()
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
)

end_time = time.time()
elapsed = round(end_time - start_time, 4)
print(f"Time taken to create the RetrievalQA pipeline: {elapsed} seconds")

In [None]:
start_time = time.time()

# Question to put to the retrieval-augmented pipeline
query = "Who is Harry Potter?"
print(f"Question: {query}")

# RetrievalQA takes its input under the "query" key and returns a plain dict
# whose "result" key holds the generated answer. The dict has no `.content`
# attribute, so the previous `result.content[0]` raised AttributeError.
result = rag_pipeline.invoke({"query": query})
print(result["result"])

end_time = time.time()
print(f"Time taken to get the answer: {round(end_time - start_time, 4)} seconds")