# Dependencies

In [None]:
# Move one directory up (intended to be the project root) so that relative
# paths used later in the notebook, e.g. "database/chromadb", resolve from there.
# NOTE: re-running this cell moves up one more level each time.
import os
os.chdir('..')

In [None]:
import os
from dotenv import load_dotenv
# Load environment variables through python-dotenv. OPENAI_API_VERSION is read
# from os.environ further down when the Azure chat model is created; presumably
# the Azure credentials are supplied the same way — verify against your .env.
load_dotenv()

In [None]:
import bs4
from langchain import hub
from langchain.document_loaders import DirectoryLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Ingest into vectordb

### Embedding model

model=thenlper/gte-large
volume=/data/llm/hf-tei-data
docker run --gpus all --env HTTPS_PROXY=$https_proxy --env HTTP_PROXY=$http_proxy -p 8188:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:turing-0.6 --model-id $model

In [None]:
from langchain_community.embeddings import HuggingFaceHubEmbeddings
# Point the embeddings client at the local embedding server: the docker command
# above publishes the container on host port 8188.
embeddings_model = HuggingFaceHubEmbeddings(model="http://localhost:8188")

### Load, chunk, store source data

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# The path is deliberately left empty as a placeholder; fill it in before
# running this cell.
loader = PyPDFLoader("") # Add the path to the .pdf document here
# `pages` is what gets chunked in the next cell.
pages = loader.load()

In [None]:
# Chunk the loaded pages (chunk_size=2000, no overlap between chunks) ...
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=2000,
)
splits = text_splitter.split_documents(pages)

# ... then embed the chunks and store them in a Chroma collection that is
# persisted under database/chromadb.
vectorstore = Chroma.from_documents(
    persist_directory="database/chromadb",
    embedding=embeddings_model,
    documents=splits,
)

In [None]:

# Re-open the Chroma collection persisted under database/chromadb. The same
# embedding model that was used at ingest time is passed in so that queries are
# embedded consistently with the stored chunks.
vecstore = Chroma(persist_directory="database/chromadb", embedding_function=embeddings_model)

# Retrieval

In [None]:
# Build a similarity retriever over the vector store created above (k=6 results
# per query), run a sample question through it and display what came back.

retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
retrieved_docs = retriever.invoke("Who are the new hires?")
retrieved_docs

In [None]:
# Number of documents returned for the sample query (the retriever was built with k=6).
len(retrieved_docs)

# Chat Model

In [None]:
from langchain_core.messages import HumanMessage
from langchain_openai import AzureChatOpenAI

# Azure-hosted chat model used for generation. The API version is taken from
# the environment (a missing OPENAI_API_VERSION raises KeyError here).
llm = AzureChatOpenAI(
    azure_deployment="llm-rag-chatgpt35",
    openai_api_version=os.environ["OPENAI_API_VERSION"],
    temperature=0.7,
    max_tokens=2048,
)

In [None]:
# Create the prompt template. It has two placeholders, {context} and
# {question}; the RAG chain below supplies both keys.
from langchain_core.prompts import PromptTemplate
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

{context}

Question: {question}

Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)

# RAG Chain

In [None]:
# Pipeline
def format_docs(docs):
    """Join the ``page_content`` of every item in *docs* into one string.

    Items are separated by a blank line; an empty iterable yields ``""``.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

# Compose the chain with the pipe operator:
#   1. the input is routed to the retriever, whose results are flattened by
#      format_docs into "context", and is also passed along as "question";
#   2. both keys are fed to the prompt template;
#   3. the prompt goes to the chat model;
#   4. the model reply is run through StrOutputParser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)


In [None]:
# Run the full chain on a sample question; the cell displays the returned answer.
rag_chain.invoke("Who are the new hires")

# Modularized

In [None]:
import os
# NOTE(review): the first cell of this notebook also calls os.chdir('..'). If
# both cells run in the same kernel session the working directory ends up two
# levels above the starting directory. Presumably this section is meant to be run from
# a fresh kernel — confirm.
os.chdir('..')
# The wildcard imports supply the helper functions called in the cells below;
# which module defines which helper is not visible from this notebook.
from rag.generate import *
from rag.ingest import *
from rag.retrieve import *
import textwrap

In [None]:
# Display the current working directory to check where relative paths resolve from.
os.getcwd()

In [None]:
# End-to-end run (ingest -> retrieve -> generate) using the helper functions
# expected to come from the wildcard rag.* imports, with the Azure chat model.

# Query
query = "Who are the new hires?"
chromadb_save_path = "database/chromadb/flex_neo"

# Ingest
print("Ingest")
embedding_model = get_embedding_model("http://localhost:8188")
documents = load_documents("") # Add the path to the .pdf document here
chunks = chunk_documents(doc=documents, chunk_size=2000, chunk_overlap=0)
save_chunks_to_chroma(chunks, embedding_model, chromadb_save_path)

# Retriever
print("\nRetrieval")
retriever = get_chroma_retriever(chromadb_save_path, embedding_model)
contexts = retrieve(retriever, query)

# Rag Chain
print("\nGeneration")
llm = get_llm_azure(deployment_name="llm-rag-chatgpt35", max_tokens=2048, temperature=0.7)
prompt_template = get_prompt_template() # Use the default template by not giving input args
rag_chain = create_rag_chain(retriever, prompt_template, llm)

# Chat
# Time a single generation. perf_counter() is a monotonic, high-resolution
# clock meant for measuring intervals; time.time() is the wall clock and can
# jump or be too coarse for short durations.
import time
t1 = time.perf_counter()
generated = rag_chain.invoke(query)
t2 = time.perf_counter() - t1  # elapsed seconds (not a second timestamp)
word_count = len(generated.split())  # whitespace-separated words
print(f"Generation took {(t2)*1000:.4f} ms. {word_count} words. {word_count / t2:.4f} words per second.\n")

print("Results:")
print(f" Query:\n\t{query}")
print(f" Rag generation:\n\t{textwrap.fill(generated, width=100)}")

In [None]:
# Streaming
# Print each chunk as soon as the chain yields it, without a newline in between.
# A top-level `async for` only works where top-level await is supported
# (such as a notebook cell); it is a syntax error in a plain script.
async for chunk in rag_chain.astream(query):
    print(chunk, end="", flush=True)

In [None]:
# stream_output is expected to come from the wildcard rag.* imports; its
# behaviour is not visible here — presumably it streams the answer and returns
# the full text; verify against its definition.
response = await stream_output(rag_chain, query)
response

In [None]:
# Ask the same question with and without retrieval to compare the answers.
query = "What does intel flex engineering do?"
print(" Response without RAG:\n")
# Calling the model directly returns an object whose text is in .content.
print(textwrap.fill(llm.invoke(query).content, width=100))
print()
print(" Response with RAG:\n")
# The chain's result is passed to textwrap.fill directly. Wrap at the same width
# as the answer above so both are laid out identically (the default is 70).
print(textwrap.fill(rag_chain.invoke(query), width=100))

# Mistral 7b

In [None]:
from rag.generate import *
from rag.ingest import *
from rag.retrieve import *
import textwrap

In [None]:
import os
# Display the current working directory to check where relative paths resolve from.
os.getcwd()

In [None]:
# End-to-end run (ingest -> retrieve -> generate) using the helper functions
# expected to come from the wildcard rag.* imports, with a local GGUF model
# loaded through get_llm_llamacpp instead of the Azure chat model.

# Query
query = "Who are the new hires?"
chromadb_save_path = "database/chromadb/flex_neo"

# Ingest
print("Ingest")
embedding_model = get_embedding_model("http://localhost:8188")
documents = load_documents("") # Add the path to the document here
chunks = chunk_documents(doc=documents, chunk_size=2000, chunk_overlap=0)
save_chunks_to_chroma(chunks, embedding_model, chromadb_save_path)

# Retriever
print("\nRetrieval")
retriever = get_chroma_retriever(chromadb_save_path, embedding_model)
contexts = retrieve(retriever, query)

# Rag Chain
print("\nGeneration")
mistral_llm = get_llm_llamacpp("/data/llm/models/mistral-7b-instruct-v0.2.FP16.gguf")
prompt_template = get_prompt_template() # Use the default template by not giving input args
rag_chain = create_rag_chain(retriever, prompt_template, mistral_llm)

# Chat
# Time a single generation. perf_counter() is a monotonic, high-resolution
# clock meant for measuring intervals; time.time() is the wall clock and can
# jump or be too coarse for short durations.
import time
t1 = time.perf_counter()
generated = rag_chain.invoke(query)
t2 = time.perf_counter() - t1  # elapsed seconds (not a second timestamp)
word_count = len(generated.split())  # whitespace-separated words
print(f"Generation took {(t2)*1000:.4f} ms. {word_count} words. {word_count / t2:.4f} words per second.\n")

print("Results:")
print(f" Query:\n\t{query}")
print(f" Rag generation:\n\t{textwrap.fill(generated, width=100)}")