# RAG-Tutor

### Set the OpenAI API Key as an Environment Variable

In [1]:
# Load the dotenv IPython extension, then run its %dotenv magic.
# Presumably this reads a local .env file that defines OPENAI_API_KEY — confirm the file exists.
%load_ext dotenv
%dotenv

### Import the Libraries

In [2]:
from langchain_community.document_loaders.pdf import PyPDFLoader

from langchain_text_splitters import (MarkdownHeaderTextSplitter, 
                                      TokenTextSplitter)

from langchain_core.output_parsers.string import StrOutputParser
from langchain_core.messages import SystemMessage
from langchain_core.prompts import (PromptTemplate,
                                    HumanMessagePromptTemplate, 
                                    ChatPromptTemplate)
from langchain_core.runnables import (RunnablePassthrough, 
                                      RunnableLambda, 
                                      chain)

from langchain_openai import (ChatOpenAI, 
                              OpenAIEmbeddings)

from langchain_chroma.vectorstores import Chroma

### Load the Course Transcript

In [3]:
# Loader bound to the course transcript PDF.
loader_pdf = PyPDFLoader(
    "Introduction_to_Tableau.pdf"
)

In [4]:
# Read the PDF into a list of Document objects; the displayed output shows per-page metadata on each one.
docs_list = loader_pdf.load()

In [5]:
docs_list

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2024-10-11T11:01:10+03:00', 'author': 'python-docx', 'moddate': '2024-10-11T11:01:10+03:00', 'source': 'Introduction_to_Tableau.pdf', 'total_pages': 49, 'page': 0, 'page_label': '1'}, page_content="# Introduction to Tableau \n## Welcome to Tableau \nHi, everyone. \nI'm Ned and I'll be your instructor for this \ncourse. \nTableau is an invaluable tool. \nOne needs to learn on their journey to become a \nsuccessful business intelligence analyst or \ndata scientist. \nThe art of these professions is storytelling \nusing data to tell stories and convince top \nmanagement of the right course of action. \nBy completing this part of the program, you \nwill know how to create charts and dashboards \nin tableaux. \nThis is an essential step on your way to a data \nscientist role. \n \n## Why use Tableau: Make your data make an impact \nTableau has grown to become

In [6]:
len(docs_list)

49

In [7]:
# Glue the text of all pages into a single string (no separator is inserted between pages).
string_list_concat = "".join(page.page_content for page in docs_list)

In [8]:
string_list_concat



### Split the Course Transcript with MarkdownHeaderTextSplitter

In [9]:
# Split on "#" and "##" headers; the second item of each pair is the metadata key
# under which the matching header text is stored.
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Section Title"),
        ("##", "Lecture Title"),
    ]
)

In [10]:
# Split the concatenated transcript at its markdown headers; each resulting Document
# carries the section and lecture titles in its metadata.
docs_list_md_split = md_splitter.split_text(string_list_concat)

In [11]:
docs_list_md_split

[Document(metadata={'Section Title': 'Introduction to Tableau', 'Lecture Title': 'Welcome to Tableau'}, page_content="Hi, everyone.\nI'm Ned and I'll be your instructor for this\ncourse.\nTableau is an invaluable tool.\nOne needs to learn on their journey to become a\nsuccessful business intelligence analyst or\ndata scientist.\nThe art of these professions is storytelling\nusing data to tell stories and convince top\nmanagement of the right course of action.\nBy completing this part of the program, you\nwill know how to create charts and dashboards\nin tableaux.\nThis is an essential step on your way to a data\nscientist role."),
 Document(metadata={'Section Title': 'Introduction to Tableau', 'Lecture Title': 'Why use Tableau: Make your data make an impact'}, page_content="Tableau has grown to become one of the most\npopular business intelligence tools in the\nentire world.\nIt is A B I software that allows non technical\nusers to visualize their data and work with it\nalmost immediat

In [12]:
len(docs_list_md_split)

22

### Create a Chain to Correct the Course Transcript

In [13]:
# Plain lecture texts, in the same order as docs_list_md_split.
string_list_split = [lecture_doc.page_content for lecture_doc in docs_list_md_split]

In [14]:
string_list_split

["Hi, everyone.\nI'm Ned and I'll be your instructor for this\ncourse.\nTableau is an invaluable tool.\nOne needs to learn on their journey to become a\nsuccessful business intelligence analyst or\ndata scientist.\nThe art of these professions is storytelling\nusing data to tell stories and convince top\nmanagement of the right course of action.\nBy completing this part of the program, you\nwill know how to create charts and dashboards\nin tableaux.\nThis is an essential step on your way to a data\nscientist role.",
 "Tableau has grown to become one of the most\npopular business intelligence tools in the\nentire world.\nIt is A B I software that allows non technical\nusers to visualize their data and work with it\nalmost immediately lowering,\nknow how barriers dramatically in the past.\nBusiness analysts needed the help of it\npersonnel who could assist them in gathering\nraw data and preprocessing it.\nOnly then could business analysts start working\non the visualization of such data

In [15]:
# System prompt: instructs the model how to clean up a raw lecture transcript.
# (A stray double quote that used to trail the last bullet has been removed so it
# is no longer sent to the model as part of the instructions.)
PROMPT_FORMATTING_S = '''Improve the following Tableau lecture transcript by:
- Splitting the text into meaningful paragraphs
- Correcting any misplaced punctuation
- Fixing mistranscribed words (e.g., changing 'tableaux' to 'Tableau')
'''

# Human prompt template: {lecture_transcript} is replaced by one lecture's text.
PROMPT_TEMPLATE_FORMATTING_H = '''This is the transcript:
{lecture_transcript}
'''

In [16]:
# Fixed system instructions plus a per-lecture human message form the formatting prompt.
prompt_formatting_s = SystemMessage(content=PROMPT_FORMATTING_S)
prompt_template_formatting_h = HumanMessagePromptTemplate.from_template(
    template=PROMPT_TEMPLATE_FORMATTING_H
)
chat_prompt_template_formatting = ChatPromptTemplate.from_messages(
    [
        prompt_formatting_s,
        prompt_template_formatting_h,
    ]
)

In [17]:
# Chat model used by the transcript-formatting chain and by the Q&A chains further down.
# NOTE(review): max_tokens = 100 also limits every reformatted lecture returned by
# chain_formatting. The first reformatted lecture displayed later ends without its
# final period, which suggests the lectures are being cut off — confirm whether this
# cap is intended for the formatting step.
chat = ChatOpenAI(model_name = "gpt-3.5-turbo",
                  temperature = 0,
                  max_tokens = 100)

In [18]:
# Final step of each chain below; the chain results shown in this notebook are plain strings.
str_output_parser = StrOutputParser()

In [19]:
# Formatting pipeline: build the prompt, call the chat model, parse the reply to a string.
chain_formatting = (
    chat_prompt_template_formatting
    | chat
    | str_output_parser
)

In [20]:
# Run the formatting chain over every lecture text; the result list is paired
# positionally with docs_list_md_split in the next cell.
string_list_formatted = chain_formatting.batch(string_list_split)

In [21]:
# Overwrite the page_content of each Document in docs_list_md_split with its
# reformatted lecture text (the Document objects are modified in place).
for lecture_doc, formatted_text in zip(docs_list_md_split, string_list_formatted):
    lecture_doc.page_content = formatted_text

In [22]:
docs_list_md_split

[Document(metadata={'Section Title': 'Introduction to Tableau', 'Lecture Title': 'Welcome to Tableau'}, page_content="Hi, everyone. I'm Ned and I'll be your instructor for this course. Tableau is an invaluable tool that one needs to learn on their journey to become a successful business intelligence analyst or data scientist. The art of these professions is storytelling using data to tell stories and convince top management of the right course of action.\n\nBy completing this part of the program, you will know how to create charts and dashboards in Tableau. This is an essential step on your way to a data scientist role"),
 Document(metadata={'Section Title': 'Introduction to Tableau', 'Lecture Title': 'Why use Tableau: Make your data make an impact'}, page_content='Tableau has grown to become one of the most popular business intelligence tools in the entire world. It is ABI software that allows non-technical users to visualize their data and work with it almost immediately, lowering kn

### Split the Lectures with TokenTextSplitter

In [23]:
token_splitter = TokenTextSplitter(encoding_name = "cl100k_base",
                                   chunk_size = 500,
                                   chunk_overlap = 50)

In [24]:
# Apply the token splitter to the reformatted lecture Documents; the displayed
# chunks keep the section and lecture metadata of their lecture.
docs_list_tokens_split = token_splitter.split_documents(docs_list_md_split)

In [25]:
len(docs_list_tokens_split)

22

In [26]:
docs_list_tokens_split

[Document(metadata={'Section Title': 'Introduction to Tableau', 'Lecture Title': 'Welcome to Tableau'}, page_content="Hi, everyone. I'm Ned and I'll be your instructor for this course. Tableau is an invaluable tool that one needs to learn on their journey to become a successful business intelligence analyst or data scientist. The art of these professions is storytelling using data to tell stories and convince top management of the right course of action.\n\nBy completing this part of the program, you will know how to create charts and dashboards in Tableau. This is an essential step on your way to a data scientist role"),
 Document(metadata={'Section Title': 'Introduction to Tableau', 'Lecture Title': 'Why use Tableau: Make your data make an impact'}, page_content='Tableau has grown to become one of the most popular business intelligence tools in the entire world. It is ABI software that allows non-technical users to visualize their data and work with it almost immediately, lowering kn

### Create Embeddings, Vector Store, and Retriever

In [27]:
# Embedding model used to vectorise the chunks stored in the vector store below.
embedding = OpenAIEmbeddings(model = "text-embedding-3-small")

In [28]:
# Embed every chunk and store it in a Chroma vector store persisted under ./intro-to-tableau.
# NOTE(review): because a persist_directory is given, re-running this cell presumably
# adds the same chunks to the existing on-disk collection again — confirm, and clear
# the directory (or load the existing store) before a re-run.
vectorstore = Chroma.from_documents(documents = docs_list_tokens_split, embedding = embedding, persist_directory = "./intro-to-tableau" )

In [29]:
# Retriever over the vector store using MMR search with k = 2 and lambda_mult = 0.7.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 2, "lambda_mult": 0.7},
)

In [30]:
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x0000014FE92B7EF0>, search_type='mmr', search_kwargs={'k': 2, 'lambda_mult': 0.7})

### Create Prompts and Prompt Templates for the Q&A Chatbot Chain

In [31]:
# Plain-text layout of a student question: lecture name, question title, question body.
PROMPT_CREATING_QUESTION = '''Lecture: {question_lecture}
Title: {question_title}
Body: {question_body}'''

# System prompt for the Q&A chain: answer only from the supplied context and end
# the reply with a "Resources" list naming the section and lecture that were used.
PROMPT_RETRIEVING_S = '''You will receive a question from a student taking a Tableau course, which includes a title and a body. 
The corresponding lecture will also be provided.

Answer the question using only the provided context.

At the end of your response, include the section and lecture names where the context was drawn from, formatted as follows: 
Resources: 
Section: *Section Title*, Lecture: *Lecture Title* 
...
Replace *Section Title* and *Lecture Title* with the appropriate titles.'''

# Human prompt template with two placeholders: {question} and {context}.
PROMPT_TEMPLATE_RETRIEVING_H = '''This is the question:
{question}

This is the context:
{context}'''

# Wrap the raw strings in prompt objects.
prompt_creating_question = PromptTemplate.from_template(template = PROMPT_CREATING_QUESTION)
prompt_retrieving_s = SystemMessage(content = PROMPT_RETRIEVING_S)
prompt_template_retrieving_h = HumanMessagePromptTemplate.from_template(template = PROMPT_TEMPLATE_RETRIEVING_H)

# System + human messages combined into the chat prompt used by the Q&A chains.
chat_prompt_template_retrieving = ChatPromptTemplate.from_messages([prompt_retrieving_s, prompt_template_retrieving_h])

### Create the First Version of the Q&A Chatbot Chain

In [32]:
# Turn the output of the question prompt into a plain string via its to_string() method.
extract_question = RunnableLambda(lambda prompt_value: prompt_value.to_string())

In [33]:
# Fill the question template from the input dict, then convert the result to a string.
question_chain = prompt_creating_question | extract_question

In [34]:
# Mapping piped after question_chain: "context" comes from the retriever and
# "question" from a passthrough. The keys match the {context} and {question}
# placeholders of the retrieving prompt template.
retrieval_dict = {
    "context": retriever,
    "question": RunnablePassthrough()
}

In [35]:
# Build the question string, then feed it into the context/question mapping.
retrieval_chain = question_chain | retrieval_dict

In [36]:
# First Q&A pipeline: retrieve context, fill the prompt, call the model, parse to a string.
chain_retrieving = (
    retrieval_chain
    | chat_prompt_template_retrieving
    | chat
    | str_output_parser
)

In [37]:
# Ask a sample student question through the first version of the Q&A chain.
result = chain_retrieving.invoke(
    {
        "question_lecture": "Adding a custom calculation",
        "question_title": "Why are we using SUM here? It's unclear to me.",
        "question_body": "This question refers to calculating the GM%.",
    }
)

In [38]:
result

'To calculate the GM% (Gross Margin Percentage), we are using SUM because we are likely summing up the total revenue and total cost to calculate the gross margin. By dividing the gross margin by the total revenue and multiplying by 100, we can get the GM%. SUM is used to aggregate the values needed for this calculation.\n\nResources:\nSection: Introduction to Tableau, Lecture: Creating custom fields'

### Create a Runnable Function to Format the Context

In [39]:
from langchain_core.runnables import RunnableLambda

def format_context(dictionary):
    """Render the retrieved documents as one labelled text block.

    Args:
        dictionary: Mapping with a "context" entry (an iterable of objects that
            expose ``metadata`` and ``page_content``) and a "question" entry.

    Returns:
        A new dict with the same "question" value and a "context" string in
        which every document is numbered from 1 and listed with its section
        title, lecture title, and content. Missing titles are replaced by
        "Unknown Section" / "Unknown Lecture".
    """

    def render(position, document):
        # One numbered entry, terminated by a dashed separator line.
        section_title = document.metadata.get("Section Title", "Unknown Section")
        lecture_title = document.metadata.get("Lecture Title", "Unknown Lecture")
        return f"""
Document {position}
Section Title: {section_title}
Lecture Title: {lecture_title}
Content: {document.page_content}

-------------------
"""

    rendered_entries = [
        render(position, document)
        for position, document in enumerate(dictionary["context"], start=1)
    ]

    return {
        "context": "".join(rendered_entries),
        "question": dictionary["question"],
    }


In [40]:
# Wrap format_context so it can be used as a step in a chain.
format_context_runnable = RunnableLambda(format_context)

In [41]:
# Second Q&A pipeline: same as the first, with the retrieved documents formatted
# into labelled text before the prompt is filled.
chain_retrieving_improved = (
    retrieval_chain
    | format_context_runnable
    | chat_prompt_template_retrieving
    | chat
    | str_output_parser
)

In [42]:
# Ask the same sample question through the improved chain.
result_improved = chain_retrieving_improved.invoke(
    {
        "question_lecture": "Adding a custom calculation",
        "question_title": "Why are we using SUM here? It's unclear to me.",
        "question_body": "This question refers to calculating the GM%.",
    }
)

In [43]:
result_improved

'To calculate the GM% (Gross Margin Percentage), we are using SUM because we are likely summing up the total revenue and total cost to calculate the gross margin. The formula for GM% is (Revenue - Cost) / Revenue * 100. By using SUM, we can aggregate the revenue and cost values to calculate the GM% for the dataset.\n\nResources:\nSection: Introduction to Tableau, Lecture: Creating custom fields'

### Stream the Response

In [44]:
# Request the answer as a stream; the value displayed in the next cell is a generator.
result_streamed = chain_retrieving_improved.stream(
    {
        "question_lecture": "Adding a custom calculation",
        "question_title": "Why are we using SUM here? It's unclear to me.",
        "question_body": "This question refers to calculating the GM%.",
    }
)

In [45]:
result_streamed

<generator object RunnableSequence.stream at 0x0000014FE9419E40>

In [46]:
# Print each streamed piece as soon as it arrives, with no newline between pieces.
# result_streamed is a generator, so this loop consumes it.
for token in result_streamed:
    print(token, end="", flush=True)

To calculate the GM% (Gross Margin Percentage), we are using SUM because we are likely summing up the total revenue and total cost to calculate the gross margin. The formula for GM% is (Revenue - Cost) / Revenue * 100. By using SUM, we can aggregate the revenue and cost values to calculate the GM% accurately.

Resources:
Section: Introduction to Tableau, Lecture: Creating custom fields