In [2]:
import os

In [3]:
# Offline mode for the Hugging Face libraries.
# Keep the two lines below commented out on the very first run so the models can be
# downloaded and cached; enable them from the second run onwards (same program).
os.environ['HF_HUB_OFFLINE']="1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"

### Visual Representation

```mermaid
graph LR
    A(START) --query--> B(vectorstore)
    B --query + retrieved docs--> C(check document relevance)
    C --relevant docs + query--> D(generate answer)
    D --relevant docs + ans--> E(check for hallucinations)
    E --query + relevant docs + answer--> F(highlight doc snippet)
    F --> G(END)
```

### Indexing

In [None]:
from langchain_community.document_loaders import WebBaseLoader
import bs4
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import re

In [None]:
def clean_page_content(document):
    """Remove the timestamped block and footnote markers from a loaded page.

    A "timestamped line" is ``MM:SS: `` followed by plain text (letters, digits,
    spaces and ``.,!?``) -- presumably a video transcript embedded in the page.
    Everything from the first timestamped line up to the end of the
    second-to-last one is dropped, so only the final timestamped line survives.
    Single-digit footnote markers such as ``[1]`` and any remaining ``MM:SS: ``
    prefixes are then stripped.

    Args:
        document: Object with a string ``page_content`` attribute. It is
            modified in place.

    Returns:
        The same ``document`` object. It is returned untouched when the page
        contains no timestamped line at all.
    """
    timestamp_line = r'[0-9]{2}:[0-9]{2}: [0-9A-Za-z .,!?]*'

    page_content = document.page_content
    # Work with match positions rather than re-searching for the matched text:
    # str.find() would return the first textual occurrence, which is the wrong
    # place to cut when the same line appears more than once.
    spans = [match.span() for match in re.finditer(timestamp_line, page_content)]
    if not spans:
        return document

    # With a single timestamped line there is nothing to cut out (and no
    # "second-to-last" match to index); only the prefix is stripped below.
    if len(spans) >= 2:
        cut_start = spans[0][0]
        cut_end = spans[-2][1]
        page_content = page_content[:cut_start] + page_content[cut_end:]

    page_content = re.sub(r'\[[0-9]\]', '', page_content)  # footnote markers, e.g. [1]
    page_content = re.sub(r'[0-9]{2}:[0-9]{2}: ', '', page_content)  # leftover timestamps
    document.page_content = page_content

    return document

In [6]:
def generate_vectorstore(loader, embedding_model, chunk_size, chunk_overlap=0):
    """Load, clean, split and index documents into a new Chroma vector store.

    Args:
        loader: Document loader exposing ``lazy_load()``.
        embedding_model: Embedding function handed to Chroma.
        chunk_size: Chunk size passed to the tiktoken-based text splitter.
        chunk_overlap: Overlap between consecutive chunks (default 0).

    Returns:
        The populated Chroma vector store, configured with the cosine
        ``hnsw:space``.
    """
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    store = Chroma(
        embedding_function=embedding_model,
        collection_metadata={"hnsw:space": "cosine"},
    )

    # Handle one page at a time: clean it, split it, then add its chunks.
    for raw_doc in loader.lazy_load():
        cleaned_doc = clean_page_content(raw_doc)
        store.add_documents(splitter.split_documents([cleaned_doc]))

    return store

In [7]:
# Canvas Student Guide pages about discussions that make up the knowledge base.
loader = WebBaseLoader(
    web_paths=[
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-view-Discussions-as-a-student/ta-p/314",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-view-the-rubric-for-my-graded-discussion/ta-p/319",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-subscribe-to-a-discussion-podcast-as-a-student/ta-p/368",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-know-if-I-have-a-peer-review-discussion-to-complete/ta-p/419",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-submit-a-peer-review-to-a-discussion/ta-p/355",
        "https://community.canvaslms.com/t5/Student-Guide/Where-can-I-find-my-peers-feedback-for-peer-reviewed-discussions/ta-p/428",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-create-a-course-discussion-as-a-student/ta-p/300",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-subscribe-to-a-discussion-as-a-student/ta-p/352",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-view-and-sort-discussion-replies-as-a-student/ta-p/465",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-change-discussion-settings-to-manually-mark-discussion/ta-p/366",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-mark-discussion-replies-as-read-or-unread-as-a-student/ta-p/284",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-reply-to-a-discussion-as-a-student/ta-p/334",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-attach-a-file-to-a-discussion-reply-as-a-student/ta-p/375",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-embed-an-image-in-a-discussion-reply-as-a-student/ta-p/313",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-edit-or-delete-discussion-replies-as-a-student/ta-p/399",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-like-a-reply-in-a-course-discussion-as-a-student/ta-p/392",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-view-a-discussion-thread-as-a-student/ta-p/485668",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-mention-a-user-in-a-discussion-reply-as-a-student/ta-p/485669",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-report-a-reply-in-a-discussion/ta-p/542169",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-reply-to-a-discussion-as-a-student-in-Canvas-for/ta-p/645002",
        "https://community.canvaslms.com/t5/Student-Guide/How-do-I-translate-a-discussion-using-AI-Translations-as-a/ta-p/660442",
    ],
    # Parse only the element whose id is "content".
    bs_kwargs={"parse_only": bs4.SoupStrainer(id="content")},
    # A "\n" separator may help regex-based cleaning, but " " gave better results.
    bs_get_text_kwargs={"separator": " ", "strip": True},
)

In [8]:
# Sentence-embedding model used for both indexing and querying.
embedding_model = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-mpnet-base-v2',
    # Load from the local cache only.
    model_kwargs={"local_files_only": True},
    encode_kwargs={
        # Normalised vectors are cosine-friendly.
        "normalize_embeddings": True,
        # Number of texts embedded at once.
        "batch_size": 24,
    },
)

In [9]:
vectorstore = generate_vectorstore(loader, embedding_model, chunk_size=800)

In [10]:
vectorstore.get(include=['documents','embeddings'])

{'ids': ['bf754b3f-0938-4534-b3b7-15ffcf3a39be',
  'b910bc5c-2f28-4107-ac3a-ba1ecb5e1c24',
  '71c37bcc-4330-4708-a4ff-08ef3a80510a',
  '4bf75ee6-3fee-49bd-be20-399a151584c7',
  'd57d84b1-b530-47a4-8958-78148c813aea',
  'cc00b813-db8d-4e5e-aa01-237cba350db1',
  'e0758389-8758-44b6-889c-e35d0d0599c8',
  'ad1b8adc-74f7-4938-beea-a9bd43232b6a',
  'e7af6036-28d5-45c8-bb12-bc027341d610',
  '9d71cf42-0a7e-4121-bb6a-3f796712e07c',
  '28878af3-432d-4005-8427-e2521371b23e',
  'fd777081-1977-413e-833b-791de2552456',
  '5843330b-ad8f-4d23-8fee-bed554bcc339',
  '1080fc39-9d93-4cef-8f75-8f36f83ed3d3',
  '724534ca-9865-4b45-92d1-db750a65a2aa',
  'ba83f1d8-69f2-45f2-b186-da3f32b1d7d8',
  '19ebe3e3-e9b1-40e9-b593-e6a35c240586',
  '93aa2165-d55c-4702-af01-41820be9f1df',
  '73bf0686-0c74-409b-a308-2c8218da00d6',
  '64abcfac-79be-4c20-ab21-3011a5e5ebba',
  'c376c7f6-eb59-429d-8a17-1e3da67cfe67',
  'f69e051d-4237-4aea-b580-96fba388537c',
  'c13f0ddc-df82-45c5-ab69-492438c1aced',
  'b6d47aa8-1f43-4c3f-b687-

In [11]:
# Similarity search returning the 4 closest chunks for each query.
retriever = vectorstore.as_retriever(search_type='similarity', search_kwargs={'k': 4})

### Question

In [12]:
question = "How to permanently delete a discussion reply?"

### Retrieve Docs

In [13]:
docs = retriever.invoke(question)

In [14]:
docs

[Document(id='64abcfac-79be-4c20-ab21-3011a5e5ebba', metadata={'source': 'https://community.canvaslms.com/t5/Student-Guide/How-do-I-edit-or-delete-discussion-replies-as-a-student/ta-p/399'}, page_content="How do I edit or delete discussion replies as a student? If your instructor allows, you can edit and delete your own discussion posts. If the edit or delete option does not display, your instructor has restricted this setting in your course. Note: This setting does not affect discussions in course groups. \n\nThis guide covered how to edit or delete discussion replies as a student. Open Discussions In Course Navigation, click the Discussions link. Open Discussion Click a discussion title. Edit Discussion Reply Locate the reply you want to edit and click the Options icon . Then select the Edit option . To copy a permalink to a specific reply, select the Copy Link option . Save Edits Edit the discussion reply in the Rich Content Editor . To post your edits, click the Save button . Delet

### Check document relevance

In [15]:
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import Literal
from langchain_ollama import ChatOllama

In [16]:
# data model
# Structured-output schema for the relevance grader: a single yes/no verdict.
# NOTE(review): the class docstring and the Field description are presumably sent to the
# model as part of the schema, so their wording is deliberately left as is -- confirm before editing.
class GradeDocuments(BaseModel):
    """Binary score for relevency check on retrieved documents."""

    # Literal restricts the value to exactly 'yes' or 'no'.
    binary_score: Literal['yes','no'] = Field(description="Documents are relevant to the question, 'yes' or 'no'." )

In [17]:
# Make sure the Ollama application is open and running in the background.
# Make sure to download the model first using "ollama pull <model>".
# Model choice is very important! With the "qwen3:0.6b" model the relevance check gave poor
# results: it marked every document as non-relevant even though some were relevant.
structured_llm_grader = ChatOllama(model='gpt-oss:20b', temperature=0).with_structured_output(GradeDocuments)

In [18]:
# System prompt for the relevance grader; the leading/trailing newlines are stripped.
system = """
You are a grader assessing relevance of a retrieved document to a user question.
If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.
It does not need to be a stringent test. The goal is to filter out erroneous retrievals.
Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.
""".strip()

In [19]:
grade_prompt = ChatPromptTemplate([
    ('system', system),
    ('human', "Retrieved document:\n\n{document}\n\nUser question: {question}")
])

In [20]:
retrieval_grader = grade_prompt | structured_llm_grader

### Filter out non-relevant docs

In [21]:
docs

[Document(id='64abcfac-79be-4c20-ab21-3011a5e5ebba', metadata={'source': 'https://community.canvaslms.com/t5/Student-Guide/How-do-I-edit-or-delete-discussion-replies-as-a-student/ta-p/399'}, page_content="How do I edit or delete discussion replies as a student? If your instructor allows, you can edit and delete your own discussion posts. If the edit or delete option does not display, your instructor has restricted this setting in your course. Note: This setting does not affect discussions in course groups. \n\nThis guide covered how to edit or delete discussion replies as a student. Open Discussions In Course Navigation, click the Discussions link. Open Discussion Click a discussion title. Edit Discussion Reply Locate the reply you want to edit and click the Options icon . Then select the Edit option . To copy a permalink to a specific reply, select the Copy Link option . Save Edits Edit the discussion reply in the Rich Content Editor . To post your edits, click the Save button . Delet

In [22]:
# Grade every retrieved chunk, keep the ones judged relevant, and log each verdict.
docs_to_use = []
for doc in docs:
    res = retrieval_grader.invoke({'document': doc.page_content, 'question': question})
    if res.binary_score == 'yes':
        docs_to_use.append(doc)

    # One report per chunk: id and verdict, then the full text, framed by rules
    # and followed by a blank line.
    print('-'*50, f"{doc.id} | {res}", '-'*50, doc.page_content, '-'*50, '', sep='\n')

--------------------------------------------------
64abcfac-79be-4c20-ab21-3011a5e5ebba | binary_score='yes'
--------------------------------------------------
How do I edit or delete discussion replies as a student? If your instructor allows, you can edit and delete your own discussion posts. If the edit or delete option does not display, your instructor has restricted this setting in your course. Note: This setting does not affect discussions in course groups. 

This guide covered how to edit or delete discussion replies as a student. Open Discussions In Course Navigation, click the Discussions link. Open Discussion Click a discussion title. Edit Discussion Reply Locate the reply you want to edit and click the Options icon . Then select the Edit option . To copy a permalink to a specific reply, select the Copy Link option . Save Edits Edit the discussion reply in the Rich Content Editor . To post your edits, click the Save button . Delete Reply To delete your reply to a discussion to

In [23]:
docs_to_use

[Document(id='64abcfac-79be-4c20-ab21-3011a5e5ebba', metadata={'source': 'https://community.canvaslms.com/t5/Student-Guide/How-do-I-edit-or-delete-discussion-replies-as-a-student/ta-p/399'}, page_content="How do I edit or delete discussion replies as a student? If your instructor allows, you can edit and delete your own discussion posts. If the edit or delete option does not display, your instructor has restricted this setting in your course. Note: This setting does not affect discussions in course groups. \n\nThis guide covered how to edit or delete discussion replies as a student. Open Discussions In Course Navigation, click the Discussions link. Open Discussion Click a discussion title. Edit Discussion Reply Locate the reply you want to edit and click the Options icon . Then select the Edit option . To copy a permalink to a specific reply, select the Copy Link option . Save Edits Edit the discussion reply in the Rich Content Editor . To post your edits, click the Save button . Delet

### Generate Results

In [24]:
from langchain_core.output_parsers import StrOutputParser

In [28]:
system = """
You are an assistant for question-answering tasks. Answer the question based STRICTLY upon the retrieved documents.
If the context is insufficient for you to come to a final answer, say "I don't know".
Use three-to-five sentences maximum and keep the answer concise.
""".strip()

In [29]:
prompt = ChatPromptTemplate([
    ('system', system),
    ('human', "Retrieved documents:\n\n<docs>{documents}</docs>\n\nUser question: <question>{question}</question>")
])

In [32]:
def format_docs(docs):
    """Render documents as numbered ``<docN>`` blocks for use in a prompt.

    Each block carries the document's ``metadata['source']`` and its
    ``page_content``; numbering starts at 1 and blocks are separated by a
    blank line. An empty input yields an empty string.
    """
    blocks = []
    for i, doc in enumerate(docs, start=1):
        blocks.append(
            f"<doc{i}>:\nSource:{doc.metadata['source']}\nContent:{doc.page_content}\n</doc{i}>"
        )
    return "\n\n".join(blocks)

In [40]:
llm = ChatOllama(model='gpt-oss:20b', temperature=0, num_thread=4)

In [41]:
rag_chain = prompt | llm | StrOutputParser()

In [42]:
generation = rag_chain.invoke({'documents': format_docs(docs_to_use), 'question': question})

In [43]:
print(generation)

You can delete a reply by clicking the reply’s Options icon and choosing **Delete**, then confirming.  
If other students have replied to that post, Canvas will keep a “Deleted by [you]” notification in place, so the reply cannot be removed entirely. In short, you can delete it, but you can’t permanently erase it when it has replies attached.


### Check for Hallucinations

In [47]:
# data model
# Structured-output schema for the hallucination grader: a single yes/no verdict.
# NOTE(review): the class docstring and the Field description are presumably sent to the
# model as part of the schema, so their wording is deliberately left as is -- confirm before editing.
class GradeHallucinations(BaseModel):
    """Binary score for presence of hallucinations in 'generation' answer."""

    # Per the description below, 'yes' means the answer IS grounded in the facts
    # (i.e. no hallucination), despite the class docstring speaking of "presence of
    # hallucinations". The leading `...` marks the field as required.
    binary_score: Literal['yes','no'] = Field(
        ...,
        description="Answer is grounded in the facts, 'yes' or 'no'."
    )

In [48]:
structured_llm_grader = ChatOllama(model='gpt-oss:20b', temperature=0, num_thread=4).with_structured_output(GradeHallucinations)

In [49]:
system = """
You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. 
Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts.
""".strip()

In [50]:
hallucination_prompt = ChatPromptTemplate([
    ('system', system),
    ('human', "Set of facts:\n\n<facts>{documents}</facts>\n\nLLM generation: <generation>{generation}</generation>")
])

In [51]:
hallucination_grader = hallucination_prompt | structured_llm_grader

In [52]:
response = hallucination_grader.invoke({'documents': format_docs(docs_to_use), 'generation': generation})

In [53]:
print(response)

binary_score='yes'


### Highlight used Docs

In [60]:
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate

In [61]:
# data model
# Schema for the highlighter's reply: three lists holding the id, source and
# extracted text segment of each highlighted snippet.
# NOTE(review): the class docstring and the Field descriptions presumably end up in the
# parser's format instructions, so their wording is deliberately left as is -- confirm before editing.
class HighlightDocuments(BaseModel):
    """Return the specific part of a document used for answering the question."""

    # Identifiers of the documents that were used.
    id: list[str] = Field(description="List of id of docs used to answers the question")

    # Sources of the documents that were used.
    source: list[str] = Field(description="List of sources used to answers the question")

    # Text segments taken from the used documents.
    segment: list[str] = Field(description="List of direct segements from used documents that answers the question")

In [62]:
llm = ChatOllama(model='gpt-oss:20b', temperature=0, num_thread=4)

In [66]:
parser = PydanticOutputParser(pydantic_object=HighlightDocuments)

In [67]:
system = """
You are an advanced assistant for document search and retrieval. You are provided with the following:
1. A question.
2. A generated answer based on the question.
3. A set of documents that were referenced in generating the answer.

Your task is to identify and extract the exact inline segments from the provided documents that directly correspond to the content used to generate the given answer. 

The extracted segments must be verbatim snippets from the documents, ensuring a word-for-word match with the text in the provided documents.

Ensure that:
- (Important) Each segment is an exact match to a part of the document and is fully contained within the document text.
- The relevance of each segment to the generated answer is clear and directly supports the answer provided.
- (Important) If you didn't use the specific document don't mention it.

Used documents:
<docs>{documents}</docs>

User question: <question>{question}</question>

Generated answer: 
<answer>{generation}</answer>

<format_instruction>
{format_instructions}
</format_instruction>
""".strip()

In [69]:
prompt = PromptTemplate(
    template= system,
    input_variables=["documents", "question", "generation"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [73]:
# doc_lookup = prompt | llm | parser
# lookup_response = doc_lookup.invoke({"documents":format_docs(docs_to_use), "question": question, "generation": generation})

In [70]:
final_prompt = prompt.invoke({'documents': format_docs(docs_to_use), 'question': question, 'generation': generation})

In [72]:
print(final_prompt.text)

You are an advanced assistant for document search and retrieval. You are provided with the following:
1. A question.
2. A generated answer based on the question.
3. A set of documents that were referenced in generating the answer.

Your task is to identify and extract the exact inline segments from the provided documents that directly correspond to the content used to generate the given answer. 

The extracted segments must be verbatim snippets from the documents, ensuring a word-for-word match with the text in the provided documents.

Ensure that:
- (Important) Each segment is an exact match to a part of the document and is fully contained within the document text.
- The relevance of each segment to the generated answer is clear and directly supports the answer provided.
- (Important) If you didn't use the specific document don't mention it.

Used documents:
<docs><doc1>:
Source:https://community.canvaslms.com/t5/Student-Guide/How-do-I-edit-or-delete-discussion-replies-as-a-student/ta

In [74]:
lookup_response = llm.invoke(final_prompt)

In [75]:
lookup_response

AIMessage(content='{"id":["doc1"],"source":["https://community.canvaslms.com/t5/Student-Guide/How-do-I-edit-or-delete-discussion-replies-as-a-student/ta-p/399"],"segment":["To delete your reply to a discussion topic, click the reply Options icon , then select the Delete option . Confirm Delete Click the OK button. Verify Delete If you delete a discussion reply with other course user replies attached, Canvas shows a Deleted by notification. The notification includes your name. This entry cannot be removed."]}', additional_kwargs={}, response_metadata={'model': 'gpt-oss:20b', 'created_at': '2025-10-21T17:45:41.113504Z', 'done': True, 'done_reason': 'stop', 'total_duration': 634625268142, 'load_duration': 20819093166, 'prompt_eval_count': 1621, 'prompt_eval_duration': 186369237172, 'eval_count': 1269, 'eval_duration': 426047021620, 'model_name': 'gpt-oss:20b', 'model_provider': 'ollama'}, id='lc_run--534f02f7-748c-4580-9b7e-eb023aad48de-0', usage_metadata={'input_tokens': 1621, 'output_to

In [76]:
final_response = parser.parse(lookup_response.content)

In [77]:
final_response

HighlightDocuments(id=['doc1'], source=['https://community.canvaslms.com/t5/Student-Guide/How-do-I-edit-or-delete-discussion-replies-as-a-student/ta-p/399'], segment=['To delete your reply to a discussion topic, click the reply Options icon , then select the Delete option . Confirm Delete Click the OK button. Verify Delete If you delete a discussion reply with other course user replies attached, Canvas shows a Deleted by notification. The notification includes your name. This entry cannot be removed.'])

In [80]:
# Print each highlighted segment together with the document it was taken from.
# The loop variable is `doc_id`, not `id`: at notebook top level a loop variable is a
# global, so naming it `id` would shadow the builtin id() for every later cell.
for doc_id, source, segment in zip(final_response.id, final_response.source, final_response.segment):
    print(f"ID: {doc_id}\nSource: {source}\nText Segment: {segment}\n")
    print('-'*50)

ID: doc1
Source: https://community.canvaslms.com/t5/Student-Guide/How-do-I-edit-or-delete-discussion-replies-as-a-student/ta-p/399
Text Segment: To delete your reply to a discussion topic, click the reply Options icon , then select the Delete option . Confirm Delete Click the OK button. Verify Delete If you delete a discussion reply with other course user replies attached, Canvas shows a Deleted by notification. The notification includes your name. This entry cannot be removed.

--------------------------------------------------


![Text Segment in Original Source](./attachments/text-segment-in-source.png)