<a href="https://colab.research.google.com/github/ChowchowWorks/Customer_service_rag/blob/main/Rag_Pipeline_Prototype_Version_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Section 1: Import Libraries

In [3]:
import os
from google.colab import userdata

!pip install langchain_community langchain chromadb transformers sentence-transformers
!pip install -U langchain-huggingface
!pip install wikipedia
!pip install pypdf

os.environ['LANGCHAIN_API_KEY'] = "API_KEY"
os.environ['HUGGINGFACEHUB_API_TOKEN'] = "API_KEY"
os.environ['USER_AGENT'] = 'MyColabApp/1.0 (Python/3.9; GoogleColab)'



In [4]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import WikipediaLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.llms import HuggingFaceHub
from langchain.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import HuggingFacePipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate

# Sentence-transformer model used to embed the document chunks at indexing time.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Section 2: Load Documents

(a) Load the documents into a global variable

In [5]:
# A Wikipedia article stands in for the real document set while testing.
loader = WikipediaLoader(
    lang='en',
    query="National University of Singapore",
)
docs = loader.load()

(b) Split the documents into more manageable chunks

In [6]:
# Cut the loaded documents into chunks of size 300 with an overlap of 50
# between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=300,
)
texts = splitter.split_documents(documents=docs)

# Section 3: Indexing

(a) Initialise the pipeline by indexing the documents

In [7]:
# Embed every chunk and index it in a Chroma vector store.
vectorstore = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
)

(b) Choose the number of similar documents to be retrieved from the pipeline

In [8]:
# Number of chunks the retriever returns per query; the retrieval cell
# later checks the result count against this value.
k = 3
retriever = vectorstore.as_retriever(search_kwargs=dict(k=k))

# Section 4: Implementing the Generator

(a) Defining the prompt template

In [9]:
# Prompt skeleton: the retrieved context comes first, then the user's question.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

# Wrap the raw template string so it can be filled with `context` and `question`.
prompt = ChatPromptTemplate.from_template(template=template)

(b) Hugging Face Inference Client

In [10]:
from huggingface_hub import InferenceClient
# Reuse the Hugging Face token configured in the setup cell instead of
# repeating the credential as a literal in this cell.
client = InferenceClient(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    token=os.environ['HUGGINGFACEHUB_API_TOKEN'],
)

(c) Runnable Class and Initialisation

In [11]:
from langchain_core.runnables import Runnable

class HuggingFaceChatRunnable(Runnable):
    """Runnable that fills a prompt template and sends it to a chat model.

    The formatted prompt is sent as the user message of a two-message
    conversation (a fixed system message followed by the prompt) through
    ``client.chat_completion``.
    """

    def __init__(self, client, prompt_template, temperature, max_tokens):
        """Store the client, the prompt template and the sampling settings.

        Args:
            client: Object exposing ``chat_completion(messages=..., temperature=..., max_tokens=...)``.
            prompt_template: Object exposing ``format(**kwargs)`` that returns the prompt text.
            temperature: Value forwarded as ``temperature`` on every request.
            max_tokens: Value forwarded as ``max_tokens`` on every request.
        """
        self.client = client
        self.prompt_template = prompt_template
        self.temperature = temperature
        self.max_tokens = max_tokens

    def invoke(self, inputs: dict, config=None, **kwargs) -> str:
        """Format the prompt with ``inputs`` and return the model's reply text.

        Args:
            inputs: Mapping whose keys match the template's placeholders.
            config: Accepted for compatibility with ``Runnable.invoke``; unused.
                Without this parameter the object cannot be used inside a
                chain, because chains pass ``config`` to every step.
            **kwargs: Accepted for compatibility with ``Runnable.invoke``; unused.

        Returns:
            The ``content`` of the first choice in the chat-completion response.
        """
        prompt_str = self.prompt_template.format(**inputs)

        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt_str},
        ]
        response = self.client.chat_completion(
            messages=messages,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
        )
        return response.choices[0].message["content"]

# Generator instance: moderate temperature, replies capped at 1024 tokens.
chat = HuggingFaceChatRunnable(
    client=client,
    prompt_template=prompt,
    temperature=0.4,
    max_tokens=1024,
)

# Section 5: Retrieval and Response

(a) Retrieval

In [12]:
# Start by asking the question
question = input("Ask me anything!\n")
# Retrieve the relevant documents
relevant = retriever.invoke(question)
# Sanity check: the retriever is configured to return exactly k documents.
# A real exception class is raised here. The previous `raise exception(...)`
# used `logging.exception`, a logging function that returns None, so the
# raise itself failed with a TypeError instead of reporting this message.
if len(relevant) != k:
    raise RuntimeError(f"Wrong number of relevant documents: expected {k}, got {len(relevant)}")

Ask me anything!
How many students does NUS intake each year?


(b) Response

In [13]:
# Hand the model the text of the retrieved chunks. Passing the Document list
# itself would insert its Python repr (metadata included) into the prompt.
context_text = "\n\n".join(doc.page_content for doc in relevant)
response = chat.invoke({"context": context_text, "question": question})
print(response)

 The context provided does not contain information about the number of students that NUS intakes each year.
