In [210]:
import re
import os
import time

from pydantic_settings import BaseSettings, SettingsConfigDict

import requests
from bs4 import BeautifulSoup

from typing import TypedDict

from uuid import uuid4
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore, PineconeEmbeddings

from langchain.chains import RetrievalQA
from langchain.agents import AgentExecutor
from langchain_deepseek import ChatDeepSeek
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document
from langchain.tools.retriever import create_retriever_tool
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

In [160]:
class ProjectConfiguration(BaseSettings):
    """API credentials used by this notebook, loaded through pydantic-settings.

    The settings source is configured with ``env_file=".env"``. Each field is
    declared as a plain ``str`` with no default value.
    """

    model_config = SettingsConfigDict(env_file=".env")

    # One key per external service; the names double as environment variable names.
    NVIDIA_API_KEY: str
    PINECONE_API_KEY: str
    DEEPSEEK_API_KEY: str
    
class SiteLinks:
    """Pages of the University of Benin website that can be scraped.

    Every attribute is a dict with two keys: ``link`` (the page URL) and
    ``tag`` (the HTML tag names whose text is collected from that page).
    """

    about: dict = {
        "link": "https://www.uniben.edu/about-uniben.html",
        "tag": ["h5", "h3", "h2", "p", "li", "h4"],
    }
    vchancellor: dict = {
        "link": "https://www.uniben.edu/vchancellor.html",
        "tag": ["h4", "p", "li"],
    }
    office_unit: dict = {
        "link": "https://www.uniben.edu/offices-units.html",
        "tag": ["h5", "p"],
    }
    why_choose_uniben: dict = {
        "link": "https://uniben.edu/why_choose_uniben.html",
        "tag": ["h5", "h3", "p"],
    }
    admission_policy: dict = {
        "link": "https://www.uniben.edu/admission_policy.html",
        "tag": ["h5", "h3", "p"],
    }

In [161]:
# Instantiate the settings object; ProjectConfiguration is configured with env_file=".env".
project_config = ProjectConfiguration()

In [162]:
# Copy the loaded API keys into the process environment in one step
# (presumably for client libraries that look the keys up there - TODO confirm).
os.environ.update(
    {
        "NVIDIA_API_KEY": project_config.NVIDIA_API_KEY,
        "PINECONE_API_KEY": project_config.PINECONE_API_KEY,
        "DEEPSEEK_API_KEY": project_config.DEEPSEEK_API_KEY,
    }
)

In [182]:
class DataBaseClass:
    """Scrape website pages, split the text into chunks and index them in Pinecone.

    Typical use is ``DataBaseClass().GetVectorStoreRetriever(site_links)``, which
    downloads the pages, embeds the chunks and returns a similarity retriever.
    """

    # Seconds to wait for the website before giving up; without a timeout
    # ``requests.get`` can block the notebook kernel indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, chunk_size=5000, chunk_overlap=100):
        """Create the text splitter.

        Args:
            chunk_size: maximum number of characters per chunk.
            chunk_overlap: number of characters shared by consecutive chunks.
        """
        # NOTE(review): 5000-character chunks are probably longer than the
        # "BAAI/bge-small-en" embedding model can take in a single input, in
        # which case the tail of each chunk would not be embedded - TODO confirm
        # and lower chunk_size if so.
        self.splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                       chunk_overlap=chunk_overlap,
                                                       length_function=len)

    def GetInfo(self, section_name=None, **kwarg):
        """Download one page and return its text as a list of ``Document`` chunks.

        Args:
            section_name: label stored in each document's ``section`` metadata.
            **kwarg: must contain ``link`` (page URL) and ``tag`` (tag name or
                list of tag names passed to ``BeautifulSoup.find_all``).

        Returns:
            One ``Document`` per chunk, with ``source`` and ``section`` metadata.

        Raises:
            RuntimeError: if the server does not answer with HTTP 200.
        """
        link = kwarg["link"]
        response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise RuntimeError(
                f"Error getting text from {link} (HTTP status {response.status_code})"
            )

        soup = BeautifulSoup(response.content, "html.parser")
        # Collapsing every whitespace run (newlines and carriage returns
        # included) to a single space keeps each element's text on one line.
        extracted_text = " ".join(
            re.sub(r"\s+", " ", element.get_text(strip=False)).strip()
            for element in soup.find_all(kwarg["tag"])
        )

        splitted_text = self.splitter.split_text(extracted_text)

        documents = [Document(page_content=text,
                              metadata={"source": link,
                                        "section": section_name}) for text in splitted_text]

        return documents

    def GetListOfText(self, **kwarg):
        """Scrape the configured sections and return all their documents in one list.

        Args:
            **kwarg: mapping of section name to a ``{"link": ..., "tag": ...}``
                dict; it must contain the keys listed in ``info_to_extract``.
                Any other keys are ignored.
        """
        # NOTE(review): only these three sections are scraped; other entries
        # supplied by the caller are not read - confirm that is intended.
        info_to_extract = ["about", "vchancellor", "office_unit"]

        extracted_text = []
        for section in info_to_extract:
            extracted_text.extend(self.GetInfo(section, **kwarg[section]))

        return extracted_text

    def LoadDB(self, list_of_documents):
        """Embed the documents and add them to the ``customer-care-db`` Pinecone index.

        The index is created (384 dimensions, cosine metric, AWS us-east-1
        serverless) when it does not exist yet, and the call waits until the
        index reports ready.

        Args:
            list_of_documents: ``Document`` objects to embed and store.

        Returns:
            The ``PineconeVectorStore`` bound to the index.
        """
        pc = Pinecone(api_key=project_config.PINECONE_API_KEY)
        model_name = "BAAI/bge-small-en"
        model_kwargs = {"device": "cpu"}
        encode_kwargs = {"normalize_embeddings": True}
        embeddings = HuggingFaceBgeEmbeddings(
            model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
        )

        index_name = "customer-care-db"

        existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

        if index_name not in existing_indexes:
            pc.create_index(
                name=index_name,
                dimension=384,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            )
            # Poll until the freshly created index is ready to accept vectors.
            while not pc.describe_index(index_name).status["ready"]:
                time.sleep(1)

        index = pc.Index(index_name)

        vector_store = PineconeVectorStore(index=index, embedding=embeddings)

        # NOTE(review): ids are random, so every run adds the documents again
        # under new ids; re-running the notebook duplicates the stored vectors.
        uuids = [str(uuid4()) for _ in range(len(list_of_documents))]

        vector_store.add_documents(documents=list_of_documents, ids=uuids)

        return vector_store

    def SplitAndEmbeddText(self, splitted_documents):
        """Index the documents and return a top-5 similarity retriever over them."""
        vector_store = self.LoadDB(splitted_documents)
        retriever = vector_store.as_retriever(search_type="similarity",
                                              search_kwargs={"k": 5})
        return retriever

    def GetVectorStoreRetriever(self, site_links):
        """Scrape, index and return a retriever for the given site links.

        Args:
            site_links: mapping of section name to ``{"link": ..., "tag": ...}``;
                it is unpacked as keyword arguments into ``GetListOfText``.
        """
        retriever = self.SplitAndEmbeddText(self.GetListOfText(**site_links))
        return retriever

In [183]:
# Create the scraper/indexer; the constructor only sets up the text splitter.
database_class = DataBaseClass()

In [184]:
# Build the retriever with the `database_class` instance created in the previous cell.
# Only the link entries declared on SiteLinks are passed on, not the class's
# dunder attributes (__module__, __doc__, ...).
retriever = database_class.GetVectorStoreRetriever(
    {name: spec for name, spec in vars(SiteLinks).items() if not name.startswith("__")}
)

In [208]:
class BotClass:
    """Question-answering bot over the indexed University of Benin pages.

    Combines a DeepSeek chat model with a retriever in a "stuff"
    ``RetrievalQA`` chain.
    """

    def __init__(self, retriever, model_name:str="meta/llama-3.3-70b-instruct"):
        """Store the retriever and create the chat model.

        Args:
            retriever: retriever that supplies context documents for a question.
            model_name: accepted for backward compatibility but not used; the
                chat model is always ``"deepseek-chat"``.
        """
        self.retriever = retriever
        self.llm = ChatDeepSeek(model="deepseek-chat")

    def InfoRetrieverAgent(self):
        """Build the retrieval QA chain for this bot.

        The chain uses only ``self.llm`` and ``self.retriever``, so it does not
        depend on any notebook-level variable.

        Returns:
            A ``RetrievalQA`` chain that stuffs the retrieved documents into the
            prompt and does not return source documents.
        """
        custom_prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=(
                "You are a helpful assistant providing information about the University of Benin.\n"
                "Use only the context provided to answer the question, and return the text exactly as it appears.\n\n"
                "Context:\n{context}\n\nQuestion: {question}\nAnswer:"),
        )

        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=self.retriever,
            chain_type="stuff",
            chain_type_kwargs={"prompt": custom_prompt},
            return_source_documents=False,
        )

        return qa_chain

In [211]:
class BotSchema(TypedDict):
    """Typed dictionary with one user query and the bot's response to it."""

    # Text of the user's question.
    user_query: str
    # Text of the bot's answer.
    bot_response: str