# Coursera AI Engineering Course Assistant Bot

This is a bot that answers the questions users ask about the machine learning course taken on Coursera. It is implemented using the RAG and LangChain concepts learned throughout the course.

In [None]:
# Install Libraries
!pip install torch
!pip install transformers langchain_community langchain_text_splitters langchain_core
!pip install sentence-transformers
!pip install chromadb
!pip install huggingface_hub
!pip install accelerate
!pip install gradio

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required so the model weights requested in
# define_LLM can be downloaded — confirm whether the checkpoint is gated.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Module Imports
import os
import torch
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import gradio as gr

We need to define the folder that contains our knowledge base, which consists of the transcripts of the Coursera course videos.
We also need to define the prompt template that we are going to use for the LangChain chain.

In [None]:
# Directory that holds the knowledge-base files; the preprocessing and
# loading functions list and read files from this folder.
folder_path: str = "data/"

# Prompt text with {question} and {context} placeholders. It is turned into a
# ChatPromptTemplate in create_langchain.
template = """You are an AI assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question concisely.
If you don't know the answer, just say that you don't know.
Keep the answer to three sentences maximum.

Question: {question}
Context: {context}
Answer:"""

# NOTE(review): not referenced anywhere else in the visible code — presumably
# intended as a location for saving the chain; confirm or remove.
chain_save_path = "rag_chain.json"

# Preprocess Stage
Here we implement the functions for the preprocessing steps.

### Preprocess the document files we are going to use as our data bank

In [None]:
def preprocess_data_files():
    """Normalise entry names in ``folder_path`` and list the loadable files.

    Every entry directly under ``folder_path`` has commas removed and spaces
    replaced by hyphens. An entry is left under its current name when the
    name is already normalised, when the normalised name would be empty, or
    when another entry already uses the normalised name (``os.rename`` would
    otherwise replace that entry on POSIX or raise on Windows).

    Returns:
        A ``(txt_list, pdf_list)`` tuple holding the names (not full paths)
        of the entries whose extension is ``.txt`` / ``.pdf``. The extension
        comparison ignores case.

    Raises:
        OSError: if ``folder_path`` cannot be listed or a rename fails.
    """
    txt_list: list[str] = []
    pdf_list: list[str] = []

    # List the directory once; the names in use are tracked in a set so that
    # collisions can be detected without re-reading the directory.
    original_names = os.listdir(folder_path)
    names_in_use = set(original_names)

    for filename in original_names:
        new_filename = filename.replace(",", "").replace(" ", "-")

        if new_filename != filename:
            if not new_filename or new_filename in names_in_use:
                # Renaming would target the folder itself or clobber another
                # entry, so keep the original name instead.
                print(f"Skipping rename of {filename!r}: target name {new_filename!r} is unavailable")
                new_filename = filename
            else:
                os.rename(
                    os.path.join(folder_path, filename),
                    os.path.join(folder_path, new_filename),
                )
                names_in_use.discard(filename)
                names_in_use.add(new_filename)

        extension = os.path.splitext(new_filename)[1].lower()
        if extension == '.txt':
            txt_list.append(new_filename)
        elif extension == '.pdf':
            pdf_list.append(new_filename)

    return txt_list, pdf_list

### Function to load txt and pdf files

In [None]:
# Plain-text file loader
def text_loader(txt_filepath: str):
    """Return what ``TextLoader(txt_filepath).load()`` produces for the file."""
    return TextLoader(txt_filepath).load()

# PDF file loader
def pdf_file_loader(pdf_filepath: str):
    """Return what ``PyPDFLoader.load()`` produces for the PDF at ``pdf_filepath``.

    The loader is created with ``extract_images=False``.
    """
    reader = PyPDFLoader(pdf_filepath, extract_images=False)
    documents = reader.load()
    return documents

### Here we define the text splitter that splits the file contents into chunks

In [None]:
def split_text_for_chunks(document, chunk_size: int = 1000, chunk_overlap: int = 200):
    """Split loaded documents into overlapping chunks.

    Args:
        document: Value handed to ``split_documents``; the callers in this
            notebook pass the result of a loader's ``load()`` call.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
            Defaults to the previously hard-coded 1000.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            the previously hard-coded 200.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(document)

# Model Creating Stage
Here we define the function that loads an LLM and creates the pipeline used for the generation step.

### Create LLM for langchain for the text generation part using the retrieved documents and the user query

In [None]:
def define_LLM():
    """Build the text-generation LLM wrapper used by the chain.

    Loads the tokenizer and the causal language model for the
    ``mistralai/Mistral-7B-Instruct-v0.2`` checkpoint (bfloat16 weights,
    ``device_map='auto'``), wraps them in a ``text-generation`` pipeline with
    sampling enabled, and returns that pipeline inside a
    ``HuggingFacePipeline``.
    """
    checkpoint: str = 'mistralai/Mistral-7B-Instruct-v0.2'

    # Generation settings forwarded verbatim to the transformers pipeline.
    generation_settings = {
        'max_new_tokens': 1024,
        'temperature': 0.7,
        'do_sample': True,
        'top_p': 0.95,
        'repetition_penalty': 1.1,
    }

    llm_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    causal_lm = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype=torch.bfloat16,
        device_map='auto',
    )
    text_generator = pipeline(
        'text-generation',
        model=causal_lm,
        tokenizer=llm_tokenizer,
        **generation_settings,
    )
    return HuggingFacePipeline(pipeline=text_generator)

# VectorDB Creating Stage
Here we define the functions that create the vector DB. We use the vector DB to store the embeddings of the knowledge base.

### Create Embedding model to generate embeddings

In [None]:
def initialize_embedding_model():
    """Create the ``HuggingFaceEmbeddings`` instance used for the vector store.

    Uses the ``BAAI/bge-small-en-v1.5`` model with ``device`` set to ``cpu``
    and ``normalize_embeddings`` enabled.
    """
    model_options = {'device': 'cpu'}
    encode_options = {'normalize_embeddings': True}
    return HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )

### Create the vector DB to save the embeddings

In [None]:
def intialize_vector_DB(embeddings_model):
    """Open the Chroma store under ``./chroma_db_hf_embeddings``, creating it if needed.

    Args:
        embeddings_model: Embedding function handed to ``Chroma``.

    Returns:
        The ``Chroma`` vector store bound to the persist directory.
    """
    vector_db_path = "./chroma_db_hf_embeddings"

    # Record whether a populated store directory existed before the
    # constructor runs, because constructing the store may create it.
    store_existed = os.path.isdir(vector_db_path) and bool(os.listdir(vector_db_path))

    # The same constructor call serves both cases. Seeding a new store via
    # from_documents(documents=[]) is unnecessary.
    # NOTE(review): an empty batch pushed through the add path may be
    # rejected depending on the installed chromadb version — confirm.
    vector_db = Chroma(
        persist_directory=vector_db_path,
        embedding_function=embeddings_model
    )

    if not store_existed:
        # Kept from the original flow: persist right after a fresh store is created.
        vector_db.persist()
    return vector_db

### Load document to vectorDB

In [None]:
def load_chunks_to_vector_db(doc_chunks, vector_db):
    """Add ``doc_chunks`` to ``vector_db`` and persist the store.

    Args:
        doc_chunks: Chunks to add; passed to ``vector_db.add_documents``.
        vector_db: Store object exposing ``add_documents`` and ``persist``.

    Nothing is added or persisted when ``doc_chunks`` is empty (for example
    when a source file produced no chunks), so a zero-length batch is never
    sent to the store.
    """
    if not doc_chunks:
        return
    vector_db.add_documents(doc_chunks)
    vector_db.persist()

### Load all documents, split them into chunks, and then save them in the vector DB

In [None]:
def read_files_and_load(txt_list: list[str], pdf_list: list[str], vector_db):
    """Load, chunk and store every listed file from ``folder_path``.

    Args:
        txt_list: Names of text files inside ``folder_path``; read with
            ``text_loader``.
        pdf_list: Names of PDF files inside ``folder_path``; read with
            ``pdf_file_loader``.
        vector_db: Store that receives the chunks through
            ``load_chunks_to_vector_db``.

    A file that fails at any step is reported on stdout together with the
    error and skipped, so one unreadable file does not stop the remaining
    files from being ingested.
    """
    # Pair each list with the loader for its format. PDFs must go through
    # pdf_file_loader; sending them through text_loader was a bug.
    batches = (
        (txt_list, text_loader),
        (pdf_list, pdf_file_loader),
    )
    for filenames, loader in batches:
        for filename in filenames:
            full_path = os.path.join(folder_path, filename)
            try:
                loaded_file = loader(full_path)
                chunks = split_text_for_chunks(loaded_file)
                load_chunks_to_vector_db(chunks, vector_db)
            except Exception as e:
                # Best-effort ingestion: report the reason and continue.
                print(f'Exception occured when reading {filename}: {e}')

# LangChain Create Stage
Here we assemble all the functions created above into the final LangChain chain, which takes the user query and answers it using knowledge retrieved from our knowledge base.

In [None]:
def _format_docs(docs):
    """Join the ``page_content`` of the retrieved documents into one text block."""
    return "\n\n".join(doc.page_content for doc in docs)


def create_langchain():
    """Assemble the retrieval-augmented generation chain.

    Steps: normalise and list the data files, create the embedding model,
    open the vector store, ingest the files, then wire
    retriever -> prompt -> LLM -> string output parser.

    Returns:
        A runnable chain; invoking it with a question string yields the
        model output as a string.
    """
    txt_list, pdf_list = preprocess_data_files()
    embedding_model = initialize_embedding_model()
    vector_db = intialize_vector_DB(embedding_model)
    # NOTE(review): the files are ingested on every call, so re-running this
    # against an already populated persistent store presumably adds the same
    # chunks again — confirm whether that is intended.
    read_files_and_load(txt_list, pdf_list, vector_db)
    retriever = vector_db.as_retriever(search_kwargs={"k": 3})
    llm = define_LLM()
    prompt = ChatPromptTemplate.from_template(template)
    rag_chain = (
        {
            # Pass the documents' text to the prompt rather than the repr of
            # a list of Document objects.
            "context": retriever | _format_docs,
            "question": RunnablePassthrough(),
        }
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain

# Execution of the QA Bot

In [None]:
# Build the RAG chain once at notebook level; process_query reads this global.
qa_rag_chain = create_langchain()

In [None]:
def process_query(user_question):
    """Answer ``user_question`` with the global ``qa_rag_chain``.

    Args:
        user_question: Question text entered by the user.

    Returns:
        A string in every case:
        - a prompt to enter a query when the input is empty or only whitespace;
        - the text after the first ``"Answer:"`` marker in the chain output;
        - the whole (stripped) chain output when the marker is absent;
        - an error message when invoking the chain raises.
    """
    print(user_question)
    if not user_question or not user_question.strip():
        return "Please enter a query to get a response."

    try:
        response = qa_rag_chain.invoke(user_question)
    except Exception as e:
        print(f"An error occurred: {e}")
        # Return the message so the UI shows something instead of None.
        return f"An error occurred: {e}"

    answer_prefix = "Answer:"
    _, marker, answer_text = response.partition(answer_prefix)
    if marker:
        return answer_text.strip()

    print("Warning: 'Answer:' prefix not found in response. Printing full output.")
    # Fall back to the full output, as the warning above announces.
    return response.strip()

# Create A User Interface to Interact with BOT

Here we create an interactive user interface using Gradio, a Python library developed specifically for building UIs for AI applications. We implement a simple UI with one text box for the user input and one output box to show the response to the user.

In [None]:
# Gradio interface: one input textbox whose content is passed to
# process_query, and one read-only textbox that shows the returned string.
# NOTE(review): the description text says "(simulated)", but process_query
# invokes qa_rag_chain — confirm whether the wording is still intended.
demo = gr.Interface(
    fn=process_query, 
    inputs=gr.Textbox(
        lines=5,
        label="Your Input Query:",
        placeholder="Type your question here..."
    ),
    outputs=gr.Textbox(
        lines=10,
        label="Output Response:",
        interactive=False
    ),
    title="Simple RAG Query Interface",
    description="Enter a query and get a response from the (simulated) RAG system."
)

# Start the interface.
demo.launch()