In [1]:
import os
import bs4
import json
import re
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.messages import AIMessage, HumanMessage

USER_AGENT environment variable not set, consider setting it to identify your requests.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Never store the Groq API key in the notebook: anyone who can read the file
# (or its version history) can use it. Take the key from the environment and
# only prompt for it, without echoing, when it has not been set.
# NOTE: the key that used to be hard-coded on this line is exposed to everyone
# with a copy of the notebook and should be revoked and replaced.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass  # only needed for the interactive fallback

    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

In [3]:
# Pages of the nebula9.ai site that are scraped into the knowledge base.
# Each URL must be listed exactly once: a repeated entry is downloaded,
# chunked and embedded a second time, which puts identical chunks into the
# vector store. (The contact-us page used to appear twice.)
web_pages = [
    "https://nebula9.ai/services/gen-ai/",
    "https://nebula9.ai/services/cloud-services/",
    "https://nebula9.ai/services/artificial-intelligence-machine-learning/",
    "https://nebula9.ai/services/reporting-and-analytics/",
    "https://nebula9.ai/services/consulting-and-advisory/",
    "https://nebula9.ai/services/product-management/",
    "https://nebula9.ai/services/tech-engineering/",
    "https://nebula9.ai/industries/",
    "https://nebula9.ai/industries/education/",
    "https://nebula9.ai/industries/publishing/",
    "https://nebula9.ai/industries/sports-entertainment/",
    "https://nebula9.ai/industries/healthcare/",
    "https://nebula9.ai/industries/banking/",
    "https://nebula9.ai/industries/insurance/",
    "https://nebula9.ai/industries/financialservices/",
    "https://nebula9.ai/industries/retail/",
    "https://nebula9.ai/industries/travelairlines/",
    "https://nebula9.ai/industries/manufacturing/",
    "https://nebula9.ai/case-studies/",
    "https://nebula9.ai/category/blog/",
    "https://nebula9.ai/careers/",
    "https://nebula9.ai/contact-us/",
    "https://nebula9.ai/about-us/",
    "https://nebula9.ai/approach/",
    "https://nebula9.ai/engagement-model/",
]

In [4]:
# Chat model used by both the question-rewriting step and the answering step.
# The model id is kept in one named constant so it is easy to find and change.
GROQ_MODEL_NAME = "llama3-8b-8192"
llm = ChatGroq(model=GROQ_MODEL_NAME)

In [5]:
# Build a loader over every URL in `web_pages` and fetch them; the loaded
# documents are kept in `docs` for the chunking step that follows.
loader = WebBaseLoader(web_paths=web_pages)
docs = loader.load()

In [7]:
# Chunk the pages one at a time (1000-character chunks, 200 overlapping) so
# that the chunk numbering restarts at 0 for every page.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

splits = []
for page in docs:
    for position, piece in enumerate(text_splitter.split_documents([page])):
        # Remember where this chunk sits within its own page
        piece.metadata["chunk_index"] = position
        splits.append(piece)

In [8]:
def clean_text(text):
    """Return ``text`` with all whitespace normalised.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed to a
    single space, and whitespace at the start and end is removed.
    """
    # split() without a separator cuts on whitespace runs and yields no empty
    # tokens at the edges, so re-joining with one space does the whole job.
    return " ".join(text.split())

In [9]:
def _chunk_record(chunk):
    """Turn one document chunk into a plain dict describing it.

    The dict holds the source URL, the page title, the whitespace-cleaned
    text, the chunk's index, and the word count of the cleaned text.
    """
    body = clean_text(chunk.page_content)
    meta = chunk.metadata
    return {
        "page_url": meta.get("source", "Unknown"),
        "title": meta.get("title", "No Title Available"),
        "text": body,
        "chunk_index": meta.get("chunk_index", 0),
        "word_count": len(body.split()),
    }


# One record per chunk, in the same order as `splits`
scraped_data = [_chunk_record(chunk) for chunk in splits]

In [10]:
# Embedding model used to vectorise the chunks; the model id is kept in a
# named constant so it can be changed in one place.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [11]:
# 4. Initialize ChromaDB and store the embeddings
# The collection is written to the "chroma_db" directory, so it outlives the
# kernel. Without explicit ids every run of this cell adds a fresh copy of all
# chunks to that directory, and the retriever ends up returning the same
# passage several times. Give each chunk a stable id built from its source URL
# and chunk_index, and keep only the first chunk seen for each id, so a re-run
# writes to the same ids instead of piling up copies.
# NOTE(review): relies on the Chroma wrapper overwriting (upserting) rows whose
# id already exists -- confirm for the installed version.
chunks_by_id = {}
for chunk in splits:
    # Index the metadata directly: a chunk without these keys should fail
    # loudly here rather than be merged with an unrelated chunk.
    chunk_id = "{}#{}".format(chunk.metadata["source"], chunk.metadata["chunk_index"])
    chunks_by_id.setdefault(chunk_id, chunk)

vectorstore = Chroma.from_documents(
    documents=list(chunks_by_id.values()),
    embedding=embedding_model,
    ids=list(chunks_by_id),
    persist_directory="chroma_db",
)

# 5. Create a retriever
retriever = vectorstore.as_retriever()

In [12]:
# Question rewriting: before retrieval, the LLM turns the latest user message
# into a standalone question, using the chat history for context.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Message order: instructions, then the prior turns, then the new question
contextualize_messages = [
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
]
contextualize_q_prompt = ChatPromptTemplate.from_messages(contextualize_messages)

history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [13]:
# Answering step: the retrieved chunks are inserted at {context} and the LLM
# is asked to answer from them.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use five sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Message order: instructions plus context, then prior turns, then the question
qa_messages = [
    ("system", system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
]
qa_prompt = ChatPromptTemplate.from_messages(qa_messages)

question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Full pipeline: history-aware retrieval feeding the answering chain
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [14]:
# Conversation state: starts empty and grows by one human/AI message pair
# per question asked.
chat_history = []

# Turn 1
question_1 = "What are the services offered by Nebula9.ai ? "
response_1 = rag_chain.invoke({"input": question_1, "chat_history": chat_history})
answer_1 = response_1["answer"]

# Record the exchange only after the chain has answered
chat_history.append(HumanMessage(content=question_1))
chat_history.append(AIMessage(content=answer_1))

print("Q1:", question_1)
print("A1:", answer_1)

Q1: What are the services offered by Nebula9.ai ? 
A1: According to the provided context, Nebula9.ai offers the following services:

1. Generative AI
2. Artificial Intelligence & Machine Learning
3. Tech Engineering
4. Reporting and Analytics
5. Cloud Services
6. Product Management
7. Consulting and Advisory


In [15]:
# Turn 2: asked with the history accumulated so far
question_2 = "What are cloud solutions services ?"
response_2 = rag_chain.invoke({"input": question_2, "chat_history": chat_history})
answer_2 = response_2["answer"]

# Record the exchange only after the chain has answered
chat_history.append(HumanMessage(content=question_2))
chat_history.append(AIMessage(content=answer_2))

print("Q2:", question_2)
print("A2:", answer_2)

Q2: What are cloud solutions services ?
A2: According to the provided context, Nebula9.ai's cloud solutions services include:

1. Tailored Cloud Solutions: Custom-built solutions that align with specific business requirements.
2. Built for Growth: Adaptable cloud services designed for scalability, ensuring operations grow alongside the business.
3. Uncompromised Security: Implementing cutting-edge security measures to protect against breaches and unauthorized interventions.
4. Value-Driven Efficiency: Cloud strategies designed to deliver maximum ROI, optimizing costs and offering unparalleled value on every investment.
