In [2]:
from llama_index.core import PromptHelper, ServiceContext, StorageContext, VectorStoreIndex
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from pinecone import Pinecone
from openai import OpenAI
from dotenv import load_dotenv
import os
load_dotenv()


True

In [3]:
pc = Pinecone()
index_name = os.getenv("PINECONE_INDEX_NAME")
index = pc.Index(index_name)
index.describe_index_stats()
vector_store = PineconeVectorStore(
    pinecone_index=index,
)

In [4]:
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_index = VectorStoreIndex.from_documents(
    [], storage_context=storage_context, embed_model=embed_model
)

In [5]:
import pandas as pd

qa_df = pd.read_csv("test_q.csv")
qa_df.head()

Unnamed: 0.1,Unnamed: 0,Question,Ideal Answer,Query Engine Answer,Retrieved Content
0,0,How do I know if a student has a scholarship?,Missionaries can see if a student has a schola...,"To determine if a student has a scholarship, y...",Outreach to All PC103 and Online Degree Studen...
1,1,How do I know if a student is registered for a...,There is not a way for Missionaries to verify ...,To verify if a student is registered for an In...,Check For Institute Enrollment For College Cre...
2,2,How do I know if a student is a returned missi...,Missionaries can see if a student is a returne...,To determine if a student is a returned missio...,How do I know if a student is a returned missi...
3,3,How do I know if student is member of the church?,Missionaries can see the Church membership sta...,To verify if a student is a member of The Chur...,Verify A Student's Church Membership Status Pa...
4,4,What information should I track for each student?,The most important things Missionaries should ...,To effectively track each student's progress a...,Monitoring Student Progress Path For Pathwayco...


In [6]:
from llama_index.core.vector_stores.types import VectorStoreQueryMode
 

retriever_k = 35
sparse_k = retriever_k * 5
query_mode = VectorStoreQueryMode.DEFAULT
 
retriever = vector_index.as_retriever(
    vector_store_query_mode=query_mode,
    similarity_top_k=retriever_k,
    sparse_top_k=sparse_k,
)

In [7]:
import time
import asyncio
import logging
from typing import Any, List, Optional, Tuple, Dict
from collections import defaultdict
from llama_index.core.chat_engine import CondensePlusContextChatEngine
from llama_index.core.indices.query.schema import QueryBundle
from llama_index.core.schema import MetadataMode, NodeWithScore
from llama_index.core.base.llms.types import ChatMessage, MessageRole
from llama_index.core.callbacks import CallbackManager, trace_method
from llama_index.core.types import Thread

import voyageai
vo = voyageai.Client()  # This will automatically use the environment variable VOYAGE_API_KEY.


from app.engine.custom_node_with_score import CustomNodeWithScore

from llama_index.core.chat_engine.types import (
    AgentChatResponse,
    BaseChatEngine,
    StreamingAgentChatResponse,
    ToolOutput,
)


logger = logging.getLogger(__name__)

class CustomCondensePlusContextChatEngine(CondensePlusContextChatEngine):
    
    def _retrieve_context(self, message: str) -> Tuple[str, List[NodeWithScore]]:
        """Build context for a message from retriever."""
        nodes = self._retriever.retrieve(message)
        for postprocessor in self._node_postprocessors:
            nodes = postprocessor.postprocess_nodes(
                nodes, query_bundle=QueryBundle(message)
            )
        
        # print(nodes)

        # **Organize nodes using the earlier functions**
        organized_nodes = self._organize_nodes(nodes, message=message)
        # print(len(organized_nodes))
        # print(organized_nodes)

        # Custom formatting of context_str
        context_str = "We have {} nodes:\n".format(len(organized_nodes))
        
        nodes_with_citation_node_id = []
        
        for i, node_with_score in enumerate(organized_nodes, start=1):
            node_text = node_with_score.node.get_text()
            context_str += f'node_id: {i}\ntext: {node_text}\n\n'

            # Create a new node with citation_node_id
            new_node = CustomNodeWithScore(node=node_with_score.node, score=node_with_score.score)

            # Assign citation_node_id to the new node
            new_node.citation_node_id = str(i)
            nodes_with_citation_node_id.append(new_node)
            
        return context_str, nodes_with_citation_node_id

    async def _aretrieve_context(self, message: str) -> Tuple[str, List[CustomNodeWithScore]]:
        """Build context for a message from retriever asynchronously."""
        nodes = await self._retriever.aretrieve(message)
        for postprocessor in self._node_postprocessors:
            nodes = postprocessor.postprocess_nodes(
                nodes, query_bundle=QueryBundle(message)
            )
        
        # print(len(nodes))
        # print(nodes)
        # **Organize nodes using the earlier functions**
        organized_nodes = self._organize_nodes(nodes, message=message)
        # print(len(organized_nodes))
        # for node in organized_nodes:
        #     print(node.text)
        #     print(node.metadata["url"])

        # Custom formatting of context_str
        context_str = "We have {} nodes:\n".format(len(organized_nodes))
        
        nodes_with_citation_node_id = []
        
        for i, node_with_score in enumerate(organized_nodes, start=1):
            node_text = node_with_score.node.get_text()
            context_str += f'node_id: {i}\ntext: {node_text}\n\n'

            # Create a new node with citation_node_id
            new_node = CustomNodeWithScore(node=node_with_score.node, score=node_with_score.score)

            # Assign citation_node_id to the new node
            new_node.citation_node_id = str(i)
            nodes_with_citation_node_id.append(new_node)

        return context_str, nodes_with_citation_node_id

    def _organize_nodes(self, nodes: List[NodeWithScore], message: str = "") -> List[NodeWithScore]:
        """Organize nodes by URL, sequence, and merge overlapping nodes."""
        rerank_model = "rerank-lite-1"
        rerank_threshold = 0.21
        rerank_k = 17
        rerank = True
        
        #retrieved_threshold filter is ignored because it's value is 0

        # CHANGED - use node context metadata for text to send to llm
        node_texts_ordered = [node.text for node in nodes]

        # CHANGED - dedup node texts
        node_texts = list(set(node_texts_ordered))

        if rerank and len(node_texts) > 1:
            success = False
            retries = 0
            while not success and retries < 3:
                try:
                    reranking = vo.rerank(message, node_texts, model=rerank_model, top_k=rerank_k)
                    node_texts = [r.document for r in reranking.results if r.relevance_score >= rerank_threshold]
                    # print(f"---\n{node_texts}\n---\n\n")
                    success = True
                except:
                    time.sleep(5) # originally 60
                    retries += 1

        nodes_ranked = []
        if len(node_texts) == 0:
            node_texts = ['qwer asdf']
        else:
            # CHANGED - get filtered and reranked nodes
            nodes_ranked = [nodes[node_texts_ordered.index(node_text)] for node_text in node_texts]
            # for node_text in node_texts:
            #     node_text_nodes.append((node_text, nodes[node_texts_ordered.index(node_text)]))

        # Step 5: Group nodes by page (URL)
        pages = defaultdict(list)
        for node_with_score in nodes_ranked:
            node = node_with_score.node
            url = node.metadata.get('url', 'unknown_url')
            pages[url].append(node_with_score)
                
        # Step 6: Order nodes on each page by sequence number
        for url, page_nodes in pages.items():
            pages[url] = sorted(page_nodes, key=lambda x: x.node.metadata.get('sequence', 0))

        # Step 7: Merge overlapping nodes with the same headers
        organized_nodes = []
        for url, page_nodes in pages.items():
            merged_nodes = self._merge_nodes_with_headers(page_nodes)
            organized_nodes.extend(merged_nodes)
        
        return organized_nodes

    def _merge_nodes_with_headers(self, nodes: List[NodeWithScore]) -> List[NodeWithScore]:
        """Merge nodes by aligning words and removing duplicates at the same positions."""
        if not nodes:
            return []

        # Use the metadata from the first node
        merged_node_with_score = nodes[0]
        total_score = 0.0

        # Collect lists of words from each node's text
        texts_words = [node_with_score.node.get_text().split() for node_with_score in nodes]
        max_length = max(len(words) for words in texts_words)

        merged_words = []
        for i in range(max_length):
            words_at_position = []
            scores_at_position = []
            for idx, words in enumerate(texts_words):
                if i < len(words):
                    words_at_position.append(words[i])
                    scores_at_position.append(nodes[idx].score)
                else:
                    words_at_position.append(None)
                    scores_at_position.append(nodes[idx].score)

            # Remove None placeholders
            words_present = [(word, score) for word, score in zip(words_at_position, scores_at_position) if word]

            # If all words at this position are the same, keep one
            unique_words = set(word for word, _ in words_present)
            if len(unique_words) == 1:
                merged_words.append(words_present[0][0])
            else:
                # Choose the word with the highest associated score
                word_scores = {}
                for word, score in words_present:
                    word_scores[word] = word_scores.get(word, 0) + score
                # Select the word with the highest cumulative score
                chosen_word = max(word_scores.items(), key=lambda x: x[1])[0]
                merged_words.append(chosen_word)

        merged_text = ' '.join(merged_words)

        # Sum up the scores for averaging
        for node_with_score in nodes:
            total_score += node_with_score.score

        # Calculate the average score
        average_score = total_score / len(nodes)

        # Update the merged node's text and score
        merged_node_with_score.node.text = merged_text
        merged_node_with_score.score = average_score

        return [merged_node_with_score]


    def _split_header_content(self, text: str) -> Tuple[str, str]:
        """Split the text into header and content."""
        lines = text.split('\n', 1)
        if len(lines) > 1:
            return lines[0] + '\n', lines[1]
        return '', text

    def _merge_content(self, existing: str, new: str) -> str:
        """Merge two content strings, removing duplicate words."""
        combined = existing + ' ' + new
        words = combined.split()
        return ' '.join(sorted(set(words), key=words.index))
    
    def reset(self) -> None:
        # Clear chat history
        self._memory.reset()
        self.chat_history.clear()
        
    def get_chat_history(self) -> List[ChatMessage]:
        return self.chat_history

In [8]:
import os
 
from app.engine.index import get_index
from app.engine.node_postprocessors import NodeCitationProcessor
from fastapi import HTTPException
 
from app.engine.custom_condense_plus_context import CustomCondensePlusContextChatEngine
def get_chat_engine(filters=None, params=None):
    system_prompt = os.getenv("SYSTEM_PROMPT")
    # citation_prompt = os.getenv("SYSTEM_CITATION_PROMPT", None)
    top_k = int(os.getenv("TOP_K", 35))
 
    node_postprocessors = []
    # if citation_prompt:
    #     node_postprocessors = [NodeCitationProcessor()]
    #     system_prompt = f"{system_prompt}\n{citation_prompt}"
   
    node_postprocessors = [NodeCitationProcessor()]
       
    index = get_index(params)
    if index is None:
        raise HTTPException(
            status_code=500,
            detail=str(
                "StorageContext is empty - call 'poetry run generate' to generate the storage first"
            ),
        )
 
    retriever = index.as_retriever(
        similarity_top_k=top_k,
        filters=filters,
    )
 
    SYSTEM_CITATION_PROMPT = """
    You are a helpful assistant who assists service missionaries with their BYU Pathway questions. You are responding with information from a knowledge base that consists of multiple nodes. Each node contains metadata such as node ID, file name, and other relevant details. To ensure accuracy and transparency, please include a citation for every fact or statement derived from the knowledge base.
 
    Use the following format for citations: [^context number], as the identifier of the data node.
 
    Example:
    We have two nodes:
      node_id: 1
      text: Information about how service missionaries support BYU Pathway students.
 
      node_id: 2
      text: Details on training for service missionaries.
 
    User question: How do service missionaries help students at BYU Pathway?
    Your answer:
    Service missionaries provide essential support by mentoring students and helping them navigate academic and spiritual challenges [^1]. They also receive specialized training to ensure they can effectively serve in this role [^2].
 
    Make sure each piece of referenced information is correctly cited. If you are unsure about a fact, provide clarification to the best of your ability.
    """
 
    CONTEXT_PROMPT = """
    Answer the question as truthfully as possible using the numbered contexts below. If the answer isn't in the text, please say "Sorry, I'm not able to answer this question. Could you rephrase it?" Please provide a detailed answer. For each sentence in your answer, include a link to the contexts the sentence came from using the format [^context number].
 
    Contexts:
    {context_str}
   
    Instruction: Based on the above documents, provide a detailed answer for the user question below. Ensure that each statement is clearly cited, e.g., "This is the answer based on the source [^1]. This is part of the answer [^2]..."
    """
   
    CONDENSE_PROMPT_TEMPLATE = """
    Based on the following follow-up question from the user,
    rephrase it to form a complete, standalone question.
   
    Follow Up Input: {question}
    Standalone question:
    """
    return CustomCondensePlusContextChatEngine.from_defaults(
        system_prompt=SYSTEM_CITATION_PROMPT,
        context_prompt=CONTEXT_PROMPT,
        condense_prompt=CONDENSE_PROMPT_TEMPLATE,
        retriever=retriever,
        node_postprocessors=node_postprocessors,
        verbose=True,
    )

In [9]:
from llama_index.core.settings import Settings
from llama_index.embeddings.openai import OpenAIEmbedding

embed_model_name = "text-embedding-3-large"
 
Settings.embed_model = OpenAIEmbedding(
    model=embed_model_name,
    embed_batch_size=100,
    max_retries=25,
    timeout=180,
    reuse_client=True,
    dimensions=3072,
)

In [10]:
messages = ''
chat_engine = get_chat_engine()

In [24]:
response = await chat_engine.achat("Can a friend of the church become a mentor?")
print(response)

Condensed question: Is it possible for a friend of the church to serve as a mentor?
Context: We have 7 nodes:
node_id: 1
text: Mentors / The power of your peers Nobody knows the BYU-Pathway experience better than our current students. They can empathize with your situation, provide you with guidance to become successful, and help you find and use resources you didn’t even know existed. Each of our nearly 500 mentors are united in their belief that as followers of Jesus Christ we are meant to support one another and encourage each other’s success, **creating a stronger community and network**. offer strategies for success, provide important reminders, connect you to support resources, and give you encouragement to help you achieve each milestone on the way to graduation.

node_id: 2
text: Can a student get a new mentor? / Audience Missionary Assigned to PathwayConnect concern with their mentor can submit a Mentor Concern Form. This submission goes to the Mentor team at BYU-Pathway and w

In [17]:
messages = ''
chat_engine = get_chat_engine()
new_data = []
for index, row in qa_df.iloc[:50].iterrows():
    question = row['Question']
    ideal_answer = row['Ideal Answer']
    response = await chat_engine.achat(question)
    new_data.append({
        'Question': question,
        'Ideal Answer': ideal_answer,
        'Query Engine Answer': response.response,
        'Retrieved Content' : "\n\n".join([node.text for node in response.source_nodes])
    })

Condensed question: How do I know if a student has a scholarship?
Context: We have 6 nodes:
node_id: 1
text: Outreach to All PC103 and Online Degree Students on Your Roster Step the (Mentor) * these questions are generally “yes”, the student is likely a student who would they benefit and the from this the the following questions: the Does this the have a C+ (77%) the higher in their PathwayConnect course or one of their Online Degree not and did they participate consistently the the semester? Please have the student send a screenshot of their the grades. * If the student wants the scholarship applied the the next term, has the student registered for next term? Please have the student send a screenshot of their registration. + If the student has registered for next term, do they have a balance? This scholarship the not eligible for refund, so if the student has already paid their tuition, the student is not eligible for the for at this time. However, you can encourage the student to get

In [19]:
messages = ''
chat_engine = get_chat_engine()
for index, row in qa_df.iloc[101:].iterrows():
    question = row['Question']
    ideal_answer = row['Ideal Answer']
    response = await chat_engine.achat(question)
    new_data.append({
        'Question': question,
        'Ideal Answer': ideal_answer,
        'Query Engine Answer': response.response,
        'Retrieved Content' : "\n\n".join([node.text for node in response.source_nodes])
    })

Condensed question: Is the Hall scholarship different from the Heber J. Grant scholarship?
Context: We have 4 nodes:
node_id: 1
text: About Scholarship - Hall Foundation / ABOUT the Hall Foundation Scholarship In 2020, the Hall Foundation Scholarship evolved into a full-tuition, online-only scholarship for returned missionaries to pursue online education through the or Ensign College. In 2024, The Hall Foundation extended scholarships to all international online students. Applicants are no longer required to be returned missionaries. **IMPORTANT ANNOUNCEMENT:** The Hall Foundation Online Scholarship Application is currently closed. Foundation extended scholarships to all international online students. Applicants are no longer a to be returned missionaries. **IMPORTANT ANNOUNCEMENT:** achieve. In 2015, the Brad and Andrea Hall Family Foundation was created to offer on-campus scholarships to international returned missionaries from the Church of Jesus Christ of Latter-day Saints.

node_i

In [20]:
len(new_data)

149

In [21]:
result_df = pd.DataFrame(new_data)
result_df.head()
result_df = result_df.dropna(subset=['Ideal Answer'])
result_df.to_csv("chatbot_answer.csv")