In [None]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import requests
import json

### Download the book "20,000 Leagues Under the Sea" by Jules Verne from Project Gutenberg and save it locally.
Download link: "https://www.gutenberg.org/cache/epub/164/pg164.txt"
<!--  -->
Load the text file and remove the introduction as well as the table of contents.

In [None]:
# The URL of the plain-text edition of the book
url = "https://www.gutenberg.org/cache/epub/164/pg164.txt"

# Send a GET request to the URL. The timeout keeps the cell from hanging
# indefinitely when the server does not answer.
response = requests.get(url, timeout=30)

# Only a 200 response carries the book. Stop here otherwise: carrying on would
# either fail on a missing file or silently reuse a stale copy written by an
# earlier run.
if response.status_code != 200:
    raise RuntimeError(f"Failed to download the file. Status code: {response.status_code}")

# Save the content to a text file
with open("TwentyThousandLeaguesUnderTheSea-JulesVerne.txt", "w", encoding="utf-8") as file:
    file.write(response.text)
print("Download successful, content saved to 'TwentyThousandLeaguesUnderTheSea-JulesVerne.txt'")

# Read the saved file back into a single string called docs
with open("TwentyThousandLeaguesUnderTheSea-JulesVerne.txt", "r", encoding="utf-8") as file:
    docs = file.read()

# Remove empty (whitespace-only) lines from the string
docs = "\n".join(line for line in docs.splitlines() if line.strip())

# Remove introduction and ToC by dropping the first 2700 characters.
# NOTE(review): this is a fixed offset applied after the empty lines were
# removed, so it depends on the exact layout of the downloaded file - confirm
# with the preview printed below that the text starts where expected.
docs = docs[2700:]

# Show the first 500 characters to confirm the text is loaded
print(docs[:500])


### Split the document(s) or text file(s) into chunks. Experiment with the chunk size as well as the chunk overlap.

In [None]:
# Set up the splitter that cuts the book into chunks:
# - `chunk_size`: upper bound on the length of a chunk (1000 characters here)
# - `chunk_overlap`: characters shared by neighbouring chunks (100 here)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)

# Split the loaded text into chunks.
# NOTE(review): this rebinds `docs` from the full text (a string) to the list
# of chunks, so running this cell a second time without re-running the loading
# cell will presumably fail - confirm before relying on re-runs.
docs = text_splitter.split_text(docs)

# Report how many chunks were produced and show the first one as a sanity check
print(f"Current number of chunks {len(docs)}.")
print(docs[0])


### Initialize the model for converting text into numerical representations (embeddings). We are using all-MiniLM-L6-v2 from Hugging Face.

In [None]:
# Name of the embedding model that turns text into vectors
# ("all-MiniLM-L6-v2", published under the sentence-transformers namespace)
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

# Build the embedding function around that model; the device is set to the CPU
embedding_function = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs={"device": "cpu"},
)


### Convert & store the text chunks in our vectorstore

In [None]:
# Create a vector store holding the embeddings of the text chunks, using the
# embedding function defined earlier. The store is persisted on disk in
# ./chroma_db_text.
#
# Every chunk gets a stable id derived from its position. Without explicit ids
# each run of this cell would insert all chunks again under fresh random ids,
# so the persisted collection would keep growing and similarity searches would
# return duplicates.
# NOTE(review): this relies on Chroma overwriting entries that share an id -
# confirm with the count printed below. Entries left behind by earlier runs
# (random ids, or a chunking that produced more chunks) are not removed;
# delete the ./chroma_db_text directory to start from a clean store.
chunk_ids = [f"chunk-{index}" for index in range(len(docs))]
vectorstore = Chroma.from_texts(
    docs,
    embedding_function,
    ids=chunk_ids,
    persist_directory="./chroma_db_text",
)

# Print the number of entries in the vector store
# (read from the underlying collection through a private attribute)
print(vectorstore._collection.count())

### Formulate a question and retrieve top k similar documents

In [None]:
# Pick the question to ask. The alternatives are kept as comments: swap the
# active line to try another one.

# question = "Who is Ned Land, and what is his expertise?"
# question = "What tactics does the ship Abraham Lincoln use to track down the sea creature?"
question = "What dangers do Professor Aronnax and the others face in the forest?"

# Wrap the vector store in a retriever that performs a similarity search and
# returns the 3 closest chunks, then run it on the question
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
retrieved_docs = retriever.invoke(question)

# Show the raw retrieved documents and how many of them came back
print(retrieved_docs)
print(f"Collected most {len(retrieved_docs)} similar documents.")

def format_docs(docs):
    """Join the text of the given documents into one string.

    Args:
        docs: Iterable of objects exposing a `page_content` attribute.

    Returns:
        The `page_content` values in their original order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Build the context string from the retrieved documents; it is inserted into
# the prompt sent to the LLM in the next step
context = format_docs(retrieved_docs)
print(context)  # Show the formatted context for verification


### Provide question and context to the LLM model

In [None]:

# Define headers for the HTTP request to indicate we are sending JSON data
headers = {"Content-Type": "application/json"}

# Combine the user query and the retrieved context into a single prompt
user_query = f"\nQuestion: {question}\nContext: {context}"

# Define the JSON payload for the POST request.
# The instructions go in a "system" message: sending them with the "assistant"
# role would present them to the model as something it said earlier in the
# conversation instead of as instructions to follow.
data = {
    "messages": [
        {"role": "system", "content": "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise."},
        {"role": "user", "content": user_query}
    ],
    "temperature": 0.7,  # Controls the randomness of the output
    "max_tokens": -1,     # Output token limit; -1 is meant as "no limit" - NOTE(review): confirm the server honours it
    "stream": False       # Ask for the complete answer in one response instead of a stream
}

# Make a POST request to the local server running at localhost:1234
response = requests.post("http://localhost:1234/v1/chat/completions", headers=headers, data=json.dumps(data))

# Check if the request was successful
if response.status_code == 200:
    # Extract the assistant's answer from the JSON response
    # (inspect response.json() instead to see the full response)
    bot_response = response.json()["choices"][0]["message"]["content"]
    print("Answer:", bot_response)
else:
    # Print an error message if the request failed
    print("Failed to get response:", response.status_code, response.text)