In [None]:
def load_file(file_path: str) -> List[Document]:
    """
    Read a PDF, TXT, or CSV file and return its text split into chunks.

    The reader is picked from the file extension (compared case-insensitively)
    and the extracted text is handed to DocumentProcessor.chunk_text.
    NOTE(review): no explicit cleaning step is visible here; any cleaning
    presumably happens inside the DocumentProcessor readers -- confirm.

    Raises:
      ValueError: if the extension is not .pdf, .txt, or .csv.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    # Extension -> name of the DocumentProcessor reader; the attribute is
    # only looked up once we know the extension is supported.
    reader_names = {
        ".pdf": "load_pdf",
        ".txt": "load_txt",
        ".csv": "load_csv",
    }
    if suffix not in reader_names:
        raise ValueError(f"Unsupported file format: {suffix}")

    raw_text = getattr(DocumentProcessor, reader_names[suffix])(file_path)

    # Split the extracted text into chunks
    return DocumentProcessor.chunk_text(raw_text)


def load_documents(file_path):
    """
    Build a new VectorStore index from a single PDF, TXT, or CSV file.

    The file is loaded and chunked via load_file, the chunks are indexed in a
    freshly constructed VectorStore, a one-line summary is printed, and the
    store is returned.
    """
    chunks = load_file(file_path)

    # A new store is created on every call; force_recreate=True is forwarded
    # to create_index (presumably replacing any existing index -- confirm
    # against VectorStore).
    store = VectorStore()  # or HybridSearcher() if you want both vector + keyword
    store.create_index(chunks, force_recreate=True)

    print(f"Processed {len(chunks)} chunks from file: {file_path}")
    return store


# Index the metadata text file when this cell runs. The resulting module-level
# `vector_store` is what the later cells pass to the generation functions.
# NOTE: the path is hard-coded to a shared directory; adjust it per environment.
vector_store = load_documents("/commons/corpra_share/k152356/ReAct_Testing/metatext.txt")


In [None]:
# ------------------------------------------------------------------------------
# DOCUMENT GENERATION WITH GUIDELINES
# ------------------------------------------------------------------------------
def generate_document_with_guidelines(
    user_query: str,
    guidelines: List[str],
    vector_store: VectorStore,
    k: int = 3,
    max_new_tokens: int = 512,
    temperature: float = 0.7
) -> str:
    """
    Produce one text section per guideline, grounded in retrieved context.

    The vector store is queried once with user_query; every hit (content,
    metadata, similarity score) is rendered into a shared context block.
    Each guideline is then sent to get_completion together with that context,
    and the stripped completions are joined under "### GUIDELINE #n" headers.
    """
    hits = vector_store.search(user_query, k=k)

    # Render every hit as a delimited block; metadata becomes "key: value" lines.
    context_blocks = []
    for hit, score in hits:
        if hit.metadata:
            metadata_text = "\n".join(
                f"{key}: {val}" for key, val in hit.metadata.items()
            )
        else:
            metadata_text = ""
        context_blocks.append(
            "---\n"
            f"Content:\n{hit.content}\n"
            f"Metadata:\n{metadata_text}\n"
            f"Similarity Score: {score:.4f}\n"
            "---"
        )
    combined_context = "\n\n".join(context_blocks)

    # One completion per guideline; numbering starts at 1.
    sections = []
    for idx, guideline in enumerate(guidelines, start=1):
        # Written as explicit pieces so the trailing spaces in the template
        # stay visible; the resulting text is the same as the original prompt.
        prompt = (
            "\n"
            "You have the following user query:\n"
            f"{user_query}\n"
            "\n"
            "Context from relevant documents (with metadata):\n"
            f"{combined_context}\n"
            "\n"
            f"Guideline #{idx}: {guideline}\n"
            "\n"
            "Based on the user query and the above context, create a concise \n"
            "section of a final document that follows this guideline. \n"
            "Use only the provided context if needed.\n"
        )
        completion = get_completion(
            prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature
        )
        sections.append(f"### GUIDELINE #{idx}: {guideline}\n{completion.strip()}\n")

    return "\n\n".join(sections)


In [None]:
def generate_document_with_guidelines(
    user_query: str,
    guidelines: list,
    vector_store: VectorStore,
    k: int = 3,
    max_new_tokens: int = 512,
    temperature: float = 0.7
) -> str:
    """
    Searches the VectorStore for relevant document chunks based on user_query,
    then iterates over each guideline to produce a separate snippet.
    Returns one combined "document" with all guideline-based responses.

    If the search returns no results, every guideline is answered with
    "Not found" without calling the model, as the generation instructions
    require when the context holds no relevant information.

    Parameters:
      user_query (str): The model validation question.
      guidelines (list): A list of guideline strings that define sections of the final document.
      vector_store (VectorStore): An instance of the VectorStore used for retrieving context.
      k (int): The number of top documents to retrieve from the vector store.
      max_new_tokens (int): Maximum tokens for each generated snippet.
      temperature (float): Sampling temperature for generation.

    Returns:
      str: The final combined document based on the generated guideline sections.
    """
    # 1) Retrieve top-k relevant documents
    search_results = vector_store.search(user_query, k=k)

    # 2) Build a combined context from the top documents
    combined_context_parts = []
    for doc, sim_score in search_results:
        meta_str = ""
        if doc.metadata:
            meta_str = "\n".join(f"{key}: {val}" for key, val in doc.metadata.items())
        combined_context_parts.append(
            f"---\nContent:\n{doc.content}\n"
            f"Metadata:\n{meta_str}\n"
            f"Similarity Score: {sim_score:.4f}\n---"
        )
    combined_context = "\n\n".join(combined_context_parts)

    # 3) Iterate over each guideline and generate the associated snippet
    final_snippets = []
    for idx, guideline in enumerate(guidelines, start=1):
        if not combined_context_parts:
            # Nothing was retrieved: the instructions mandate "Not found", so
            # skip the (expensive) model call instead of prompting with an
            # empty context.
            snippet = "Not found"
        else:
            # NOTE: the prompt previously began with a stray request aimed at a
            # coding assistant ("Using this RAG function, I want you to
            # implement a hardcoded version of the ReAct framework ...").
            # It was not an instruction for the answering model and has been
            # removed; the remaining text is unchanged.
            prompt = f"""
Your task is to assist a Quantitative Model Validator working in the Model Risk Management team of a bank, to find answers to policy questions about Model Development Document (MDD) based on the provided context.
Contents of the subsection(s) of MDD is used as the only input context to answer the Model Validation policy questions. You are a highly accurate assistant who strictly answers only based on the information in the provided context.

Strictly follow these Generation Instructions:
- Your response should be accurate, coherent, detailed, and descriptive by including all the important statistics, tables, terminologies, and definitions.
- Your response should be relevant to the question being asked.
- Your response should be honest, focused, and grounded in the provided context.
- Do not change or assume any definition, terminology, statistical data, numerical information, or table information.
- Always respond with "Not found" when you cannot find relevant information in the context.
- Always respond with "Not found" if any information asked is not explicitly mentioned.
- Use the important keywords and phrases from the context to frame your response.
- Use bullet points only when required.
- Only use the information provided under the specific product, business segment or aspect when answering questions. If the context includes details about multiple products, ensure your response is limited to the product specified in the query. Do not include information from other products, businesses, or other aspects.

You have the following user query:
{user_query}

Context from relevant documents (with metadata):
{combined_context}

Guideline #{idx}: {guideline}

Based on the user query and the above context, create a concise section of a final document that follows this guideline.
Focus on using the available context effectively.
"""
            # Generate snippet using the LLaMA model's get_completion function
            snippet = get_completion(prompt, max_new_tokens=max_new_tokens, temperature=temperature)
        formatted_snippet = f"### GUIDELINE #{idx}: {guideline}\n{snippet.strip()}\n"
        final_snippets.append(formatted_snippet)

    # 4) Combine all guideline-based snippets into one document
    final_document = "\n\n".join(final_snippets)
    return final_document


In [None]:
if __name__ == "__main__":
    # Demo run: one retrieval query plus the policy guidelines that each
    # become a section of the generated document.
    user_query = "Describe the growth of the portfolio over the past 5, 10 years."
    guidelines = [
        "Briefly describe the business portfolio to which the model applies",
        "Include the products and business segments offered by the business line",
        "Describe the products of the LOB and portfolio to which this model applies.",
        "Describe any current or planned changes in the products, channels, policies, programs, organization, or marketing practices that may impact the model under consideration.",
        "Assess how close the current customer base to the target customer profile is.",
        "Consider whether the customer base is likely to shift over the lifetime of the model.",
        "Specify the current and possible future market conditions and the impact they may have on the portfolio and the model.",
        "Describe the growth of the portfolio over the past 5, 10, X years, both in size and in significance to the balance sheet.",
        "If this is a revalidation or the model replaces an existing model, highlight the key relevant changes on the business between the previous model developments and model validations, and this validation.",
        "Include all the changes related to modeling, such as model framework and theory, variables, data sources, and programs; business changes, such as policy or strategy; environmental changes, such as competitor actions, economic changes, and political or regulatory changes; and any other changes that impact the model, its implementation, evaluation, and usage.",
        "Include a table or summary of the portfolio, product, or business metrics of the business in the most recent and past periods. These can include metrics such as balances, losses, recoveries, number of accounts, average account size, credit limits, etc."
    ]

    # `vector_store` must already exist at module level (built by an earlier cell).
    final_document = generate_document_with_guidelines(
        user_query,
        guidelines,
        vector_store,
        k=3,
        max_new_tokens=512,
        temperature=0.7
    )

    # Header line followed by the generated document.
    print("Final Document:", final_document, sep="\n")


In [None]:
def run_model_validation_policy_questions_fixed(guidelines, vector_store, max_context_tokens=900, k=3):
    """
    For each policy guideline, this function retrieves relevant context using the provided vector_store,
    truncates the context if it is too long, and then uses a system prompt (with strict instructions)
    to generate an answer based only on that context.

    If no relevant context is retrieved, it responds with "Not found" without
    calling the truncation helper or the model.

    Each guideline and its answer are also printed as they are produced.

    Args:
        guidelines (list): A list of policy guideline questions.
        vector_store: An initialized vector store with a search(query, k) method.
        max_context_tokens (int): Maximum number of tokens to include for context.
        k (int): Number of results requested from the vector store per guideline
            (defaults to 3, the previously hard-coded value).

    Returns:
        dict: A mapping from each guideline to its generated answer. Repeated
        guidelines share one key, so the last answer wins.
    """
    # Define the system prompt with the required instructions.
    system_prompt = (
        "Your task is to assist a Quantitative Model Validator working in the Model Risk Management team of a bank, "
        "to find answers to policy questions about Model Development Document (MDD) based on the provided context.\n"
        "Contents of the subsection(s) of MDD is used as the only input context to answer the Model Validation policy questions. "
        "You are a highly accurate assistant who strictly answers only based on the information in the provided context.\n\n"
        "Strictly follow these Generation Instructions:\n"
        "- Your response should be accurate, coherent, detailed, and descriptive by including all the important statistics, tables, terminologies, and definitions.\n"
        "- Your response should be relevant to the question being asked.\n"
        "- Your response should be honest, focused, and grounded in the provided context.\n"
        "- Do not change or assume any definition, terminology, statistical data, numerical information, or table information.\n"
        "- Always respond with \"Not found\" when you cannot find relevant information in the context.\n"
        "- Always respond with \"Not found\" if any information asked is not explicitly mentioned.\n"
        "- Use the important keywords and phrases from the context to frame your response.\n"
        "- Use bullet points only when required.\n"
        "- Only use the information provided under the specific product, business segment or aspect when answering questions. "
        "If the context includes details about multiple products, ensure your response is limited to the product specified in the query. "
        "Do not include information from other products, businesses, or other aspects.\n"
    )

    results = {}

    # Loop through each guideline.
    for guideline in guidelines:
        # Retrieve relevant document chunks from the vector store (top-k results).
        search_results = vector_store.search(guideline, k=k)
        # Keep only hits that have a document and a strictly positive score.
        context_chunks = [doc.content for (doc, score) in search_results if doc and score > 0]
        context = "\n\n".join(context_chunks).strip()
        # Truncate the context to avoid exceeding token limits; there is
        # nothing to truncate when no usable context was retrieved.
        if context:
            context = truncate_text(context, max_tokens=max_context_tokens)

        # If there's no relevant context, return "Not found".
        if not context:
            answer = "Not found"
        else:
            # Append "Answer:" at the end of the prompt to guide the model.
            prompt = system_prompt + "\n\nContext:\n" + context + "\n\nPolicy Question: " + guideline + "\nAnswer:"
            answer = get_completion_fixed(prompt)

        results[guideline] = answer
        print(f"Guideline: {guideline}\nAnswer: {answer}\n{'-'*60}")

    return results


# Example guidelines list: module-level policy guideline questions intended as
# the `guidelines` argument of run_model_validation_policy_questions_fixed
# (see the commented-out call below). Each entry is used as its own retrieval
# query and policy question. Note that this rebinds the same `guidelines` name
# that the `__main__` demo cell assigns.
guidelines = [
    "Briefly describe the business portfolio to which the model applies",
    "Include the products and business segments offered by the business line",
    "Describe the products of the LOB and portfolio to which this model applies.",
    "Describe any current or planned changes in the products, channels, policies, programs, organization, or marketing practices that may impact the model under consideration.",
    "Assess how close the current customer base to the target customer profile is.",
    "Consider whether the customer base is likely to shift over the lifetime of the model.",
    "Specify the current and possible future market conditions and the impact they may have on the portfolio and the model.",
    "Describe the growth of the portfolio over the past 5, 10, X years, both in size and in significance to the balance sheet.",
    "If this is a revalidation or the model replaces an existing model, highlight the key relevant changes on the business between the previous model developments and model validations, and this validation.",
    "Include all the changes related to modeling, such as model framework and theory, variables, data sources, and programs; business changes, such as policy or strategy; environmental changes, such as competitor actions, economic changes, and political or regulatory changes; and any other changes that impact the model, its implementation, evaluation, and usage.",
    "Include a table or summary of the portfolio, product, or business metrics of the business in the most recent and past periods. These can include metrics such as balances, losses, recoveries, number of accounts, average account size, credit limits, etc."
]

# To run the updated function (assuming vector_store is already instantiated and indexed):
# results = run_model_validation_policy_questions_fixed(guidelines, vector_store)
