In [169]:
!pip install rouge-score nltk absl-py evaluate datasets ragas sacrebleu

I0000 00:00:1746674134.533807 18497675 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [170]:
# Standard library

import os
import pkgutil
import time
import psutil


# Environment
from dotenv import load_dotenv
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Third-party libraries
import evaluate
import sacrebleu
from datasets import Dataset

import ragas
from ragas import evaluate as ragas_evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.tools import Tool
from langchain_core.messages import AIMessage, HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI


# Application modules
from startup_chain import get_startup_data_chain
from legal_chain import get_legal_data_chain
from startup_masterclass_chain import get_startup_masterclass_chain

from helper import (
    get_llm,
    get_embeddings_model,
    get_startup_data_vector_store_retriever,
    get_legal_data_vector_store_retriever,
    get_startup_masterclass_vector_store_retriever,
)

# List all available metrics in RAGAS
available_metrics = [modname for _, modname, _ in pkgutil.iter_modules(ragas.metrics.__path__)]
print("Available RAGAS Metrics:", available_metrics)

Available RAGAS Metrics: ['_answer_correctness', '_answer_relevance', '_answer_similarity', '_aspect_critic', '_bleu_score', '_context_entities_recall', '_context_precision', '_context_recall', '_datacompy_score', '_domain_specific_rubrics', '_factual_correctness', '_faithfulness', '_goal_accuracy', '_instance_specific_rubrics', '_multi_modal_faithfulness', '_multi_modal_relevance', '_noise_sensitivity', '_nv_metrics', '_rouge_score', '_simple_criteria', '_sql_semantic_equivalence', '_string', '_summarization', '_tool_call_accuracy', '_topic_adherence', 'base', 'utils']


In [171]:
chat_history = []
# llm = get_llm()
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", max_tokens=8192)
embedding = get_embeddings_model()

startup_data_rag_chain = get_startup_data_chain(llm, embedding)
legal_data_rag_chain = get_legal_data_chain(llm, embedding)
startup_masterclass_rag_chain=get_startup_masterclass_chain(llm, embedding)

retriever_startup_data = get_startup_data_vector_store_retriever(embedding)
retriever_legal_data = get_legal_data_vector_store_retriever(embedding)
retriever_startup_masterclass = get_startup_masterclass_vector_store_retriever( embedding)

react_docstore_prompt = hub.pull("hwchase17/react")


 [LOG] OK !! get_embeddings_model DONE 


 [LOG] OK !! get_startup_data_vector_store_retriever DONE 


 [LOG] OK !! get_legal_data_vector_store_retriever DONE 


 [LOG] OK !! get_startup_masterclass_vector_store_retriever DONE 


 [LOG] OK !! get_startup_data_vector_store_retriever DONE 


 [LOG] OK !! get_legal_data_vector_store_retriever DONE 


 [LOG] OK !! get_startup_masterclass_vector_store_retriever DONE 



In [172]:
def invoke_startup_data_rag_chain(input, **kwargs):
    ans=startup_data_rag_chain.invoke({"input": input, "chat_history": kwargs.get("chat_history", [])})
    return ans

def invoke_legal_data_rag_chain(input, **kwargs):
    ans=legal_data_rag_chain.invoke({"input": input, "chat_history": kwargs.get("chat_history", [])})
    return ans

def invoke_startup_masterclass_chain(input, **kwargs):
    ans=startup_masterclass_rag_chain.invoke({"input": input, "chat_history": kwargs.get("chat_history", [])})
    return ans

tools = [
    Tool(
        name="Startup Data Answer Question",
        func=invoke_startup_data_rag_chain,
        description="useful for when you need to answer questions about the context",
    ),
    Tool(
        name="Legal Data Answer Question",
        func=invoke_legal_data_rag_chain,
        description="useful for when you need to answer questions about legal information, regulations, or legal documents",
    ),
    Tool(
        name="StartupMasterclass",
        func=invoke_startup_masterclass_chain,
        description="useful for when you need expert guidance on startup fundamentals, product development, fundraising, growth tactics, team building, operations, and entrepreneurship strategies from comprehensive course materials",
    )
]

agent = create_react_agent(
    llm=llm,
    tools=tools,
    prompt=react_docstore_prompt,
)

agent_executor = AgentExecutor.from_agent_and_tools(
    agent=agent, tools=tools, handle_parsing_errors=True, verbose=True,
)

In [173]:
def evaluate_rag_pipeline(query, retriever):
    # Compute performance metrics
    start_time = time.time()  
    result = agent_executor.invoke({"input": query, "chat_history": chat_history})
    end_time = time.time()    
    inference_time = end_time - start_time  

    answer = result['output']
    print(f"Inference time: {inference_time:.4f} seconds")  

    # Retrieve documents and update chat history
    retrieved_docs = retriever.get_relevant_documents(query)
    chat_history.append(HumanMessage(content=query))
    chat_history.append(AIMessage(content=answer))

    # Determine reference text
    if retrieved_docs:
        reference_text = retrieved_docs[0].page_content
    else:
        reference_text = "No reference available"

    retrieved_contexts = [doc.page_content for doc in retrieved_docs]
    eval_data = Dataset.from_list([{  
        "user_input": query,
        "retrieved_contexts": retrieved_contexts,
        "response": answer,
        "reference": reference_text
    }])

    print("##########################################")
    print(eval_data)
    print("Reference:", reference_text)
    print("##########################################")

    # RAG-specific evaluation
    results = ragas_evaluate(
        eval_data,
        metrics=[faithfulness, answer_relevancy, context_precision, context_recall]
    )
    results_df = results.to_pandas()
    faithfulness_score      = results_df["faithfulness"].iloc[0]
    answer_relevancy_score  = results_df["answer_relevancy"].iloc[0]
    context_precision_score = results_df["context_precision"].iloc[0]
    context_recall_score    = results_df["context_recall"].iloc[0]

    # Load ROUGE
    rouge = evaluate.load("rouge")

    # SacreBLEU for BLEU scoring
    if retrieved_docs:
        alt_refs = [doc.page_content for doc in retrieved_docs]
    else:
        alt_refs = [""]
    hypothesis = [answer]
    sacre_references = [alt_refs]

    try:
        sacrebleu_result = sacrebleu.corpus_bleu(hypothesis, sacre_references)
        bleu_score = sacrebleu_result.score / 100.0
    except Exception as e:
        print(f"SacreBLEU error: {e}")
        bleu_score = 0.0

    concatenated_ref = " ".join(alt_refs)
    try:
        rouge_score = rouge.compute(predictions=hypothesis, references=[concatenated_ref])
    except Exception as e:
        print(f"ROUGE computation error: {e}")
        rouge_score = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}

    # Print evaluation results
    print(f"Query: {query}")
    print(f"Generated Answer: {answer}\n")
    print("--- RAGAS Evaluation ---")
    print(f"Faithfulness:      {faithfulness_score:.4f}")
    print(f"Answer Relevancy:  {answer_relevancy_score:.4f}")
    print(f"Context Precision: {context_precision_score:.4f}")
    print(f"Context Recall:    {context_recall_score:.4f}")
    print("\n--- Additional Metrics ---")
    print(f"BLEU Score (SacreBLEU): {bleu_score:.4f}")
    print(f"ROUGE Scores:           {rouge_score}")

    # Compute system metrics
    cpu_usage = psutil.cpu_percent(interval=1)  
    memory = psutil.virtual_memory()            
    print("\n--- SYSTEM METRICS ---")
    print(f"CPU Usage: {cpu_usage}%")  
    print(f"Memory Used: {memory.used / 1e6:.2f} MB / {memory.total / 1e6:.2f} MB")  

    # Return all metrics including inference time
    return {
        "faithfulness":      faithfulness_score,
        "answer_relevancy":  answer_relevancy_score,
        "context_precision": context_precision_score,
        "context_recall":    context_recall_score,
        "bleu_score":        bleu_score,
        "rouge_score":       rouge_score,
        "inference_time":    inference_time  
    }

In [174]:
test_questions = [
    "Which legal documents will be required for the a new startup's registration?",
    "What are some NLP based startups?",
    "What are some some computer vision based startups?",
    "What is the registration process for a new startup?",
    "What are some tips about sales and marketing for new companies?",
    "What are the factors found in a product that users love?"
]

In [175]:
query = test_questions[0]
evaluation_results = evaluate_rag_pipeline(query,retriever_legal_data)
print("\nFinal Evaluation Results:", evaluation_results)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: This question requires knowledge of legal requirements for startup registration.  I should use the Legal Data Answer Question tool.

Action: Legal Data Answer Question
Action Input: "What legal documents are typically required for registering a new startup, and what varies based on location (country and state/province)?"
[0m[33;1m[1;3m{'input': 'What legal documents are typically required for registering a new startup, and what varies based on location (country and state/province)?"\n', 'chat_history': [], 'context': [Document(metadata={'source': 'D:\\Data\\Circle\\BOT\\Data\\cleaned\\Companies_Act_2017_cleaned.txt'}, page_content='Section 16:\n16.  Registration of memorandum and articles.—(1) There shall be \nfiled with the registrar an application on the specified form containing the \nfollowing information and documents for incorporation of a company, namely:—  \n \n(a) a declaration on the specified form, by a

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Query: Which legal documents will be required for the a new startup's registration?
Generated Answer: The specific legal documents required for registering a new startup vary significantly depending on the location (country, state/province), the legal structure chosen (e.g., sole proprietorship, LLC, corporation, partnership), and the industry. However, some common documents include:

* **Articles of Incorporation (or similar):** This foundational document establishes the existence of the company, outlining its purpose, structure, and initial directors/officers.  The exact name and requirements vary by jurisdiction.

* **Operating Agreement (for LLCs):** This internal agreement governs the management and operation of a limited liability company.

* **Bylaws (for corporations):**  These rules govern the internal affairs of a corporation.

* **Registered Agent Designation:**  This designates an individual or entity to receive legal and official documents on behalf of the company.

* **EI

In [176]:
query = test_questions[1]
evaluation_results = evaluate_rag_pipeline(query,retriever_startup_data)
print("\nFinal Evaluation Results:", evaluation_results)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find information about startups that utilize Natural Language Processing (NLP).  A general knowledge search might be sufficient, but a specialized startup database might yield better results.  I'll try a general search first, and if that's insufficient, I'll explore other options.

Action: Startup Data Answer Question
Action Input: "List some successful startups that utilize Natural Language Processing (NLP) in their core product or service."
[0m[36;1m[1;3m{'input': 'List some successful startups that utilize Natural Language Processing (NLP) in their core product or service."\n', 'chat_history': [], 'context': [Document(metadata={'id': 2462}, page_content="company_name: NuMind \n long_description: NuMind is a tool for data scientists, data analysts, but also software engineers to create custom NLP models. For example, a recruiting company uses NuMind to find which job offers best match a given resume. E

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Query: What are some NLP based startups?
Generated Answer: Several startups utilize NLP in their core offerings.  NuMind helps data scientists create custom NLP models more efficiently. Tennr employs conversational AI to analyze stakeholder information and past deals.  ScopeAI (now acquired) focused on extracting insights from customer conversations using NLP.

--- RAGAS Evaluation ---
Faithfulness:      0.5000
Answer Relevancy:  0.9560
Context Precision: 1.0000
Context Recall:    1.0000

--- Additional Metrics ---
BLEU Score (SacreBLEU): 0.0017
ROUGE Scores:           {'rouge1': 0.09271523178807947, 'rouge2': 0.022172949002217293, 'rougeL': 0.0662251655629139, 'rougeLsum': 0.0750551876379691}

--- SYSTEM METRICS ---
CPU Usage: 16.2%
Memory Used: 19006.11 MB / 38654.71 MB

Final Evaluation Results: {'faithfulness': 0.5, 'answer_relevancy': 0.955981389849265, 'context_precision': 0.9999999999, 'context_recall': 1.0, 'bleu_score': 0.0017117570908132252, 'rouge_score': {'rouge1': 0.092715

In [177]:
query = test_questions[2]
evaluation_results = evaluate_rag_pipeline(query,retriever_startup_data)
print("\nFinal Evaluation Results:", evaluation_results)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find information about startups that utilize computer vision technology.  A general knowledge base might not be sufficient, but a database specializing in startup information would be helpful.

Action: Startup Data Answer Question
Action Input: "List some examples of startups using computer vision technology, categorized by industry if possible."
[0m[36;1m[1;3m{'input': 'List some examples of startups using computer vision technology, categorized by industry if possible."\n', 'chat_history': [], 'context': [Document(metadata={'id': 1774}, page_content="company_name: Jido Maps \n long_description: We help teams that may or may not have machine learning expertise quickly turn their data into deployed computer vision models. Example applications include security camera monitoring, automating data entry, interpreting web scraped images, validated user photo inputs, augmented reality and mobile product scanni

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Query: What are some some computer vision based startups?
Generated Answer: Jido Maps uses computer vision for applications such as security camera monitoring and augmented reality. Tenyks focuses on MLOps monitoring and validation for computer vision algorithms. Ambient.ai applies computer vision to enhance enterprise security operations.

--- RAGAS Evaluation ---
Faithfulness:      0.8000
Answer Relevancy:  0.8592
Context Precision: 1.0000
Context Recall:    1.0000

--- Additional Metrics ---
BLEU Score (SacreBLEU): 0.0001
ROUGE Scores:           {'rouge1': 0.09022556390977444, 'rouge2': 0.04528301886792453, 'rougeL': 0.07894736842105263, 'rougeLsum': 0.08270676691729324}

--- SYSTEM METRICS ---
CPU Usage: 14.8%
Memory Used: 19020.12 MB / 38654.71 MB

Final Evaluation Results: {'faithfulness': 0.8, 'answer_relevancy': 0.859202157894965, 'context_precision': 0.9999999999, 'context_recall': 1.0, 'bleu_score': 0.00013630940478937217, 'rouge_score': {'rouge1': 0.09022556390977444, 'rouge

In [178]:
query = test_questions[3]
evaluation_results = evaluate_rag_pipeline(query,retriever_legal_data)
print("\nFinal Evaluation Results:", evaluation_results)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: This question involves the legal and practical aspects of starting a business.  I need to understand the legal requirements and also the practical steps involved in setting up a business.  I'll start with Legal Data Answer Question to get the legal side, then use StartupMasterclass to get the practical steps.

Action: Legal Data Answer Question
Action Input: "What are the legal requirements for registering a new startup in the United States?"
[0m[33;1m[1;3m{'input': 'What are the legal requirements for registering a new startup in the United States?"\n', 'chat_history': [], 'context': [Document(metadata={'source': 'D:\\Data\\Circle\\BOT\\Data\\cleaned\\Companies_Act_2017_cleaned.txt'}, page_content='Section 16:\n16.  Registration of memorandum and articles.—(1) There shall be \nfiled with the registrar an application on the specified form containing the \nfollowing information and documents for incorporation of a 

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Query: What is the registration process for a new startup?
Generated Answer: To register a new startup in the United States, the most efficient approach is to incorporate online in Delaware using a service like Clerky. Delaware is recommended due to its clear and established corporate laws, and investor familiarity.  This method keeps initial legal costs low, in the hundreds rather than thousands of dollars.  A lawyer is only needed for more complex situations involving HIPAA, extensive privacy policies, or intricate fundraising terms.  This information is based on advice from a startup course, and for a complete and legally accurate guide, consulting legal professionals or official state resources is recommended.

--- RAGAS Evaluation ---
Faithfulness:      0.0000
Answer Relevancy:  0.9223
Context Precision: 1.0000
Context Recall:    1.0000

--- Additional Metrics ---
BLEU Score (SacreBLEU): 0.0004
ROUGE Scores:           {'rouge1': 0.06618240516545601, 'rouge2': 0.0032336297493936947

In [179]:
query = test_questions[4]
evaluation_results = evaluate_rag_pipeline(query,retriever_startup_masterclass)
print("\nFinal Evaluation Results:", evaluation_results)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: This question is about sales and marketing strategies for startups, which falls under the domain of entrepreneurship and growth tactics.  The StartupMasterclass tool seems best suited to answer this.

Action: StartupMasterclass
Action Input: "Sales and marketing tips for new companies"
[0m[38;5;200m[1;3m{'input': 'Sales and marketing tips for new companies"\n', 'chat_history': [], 'context': [Document(metadata={'page': 310, 'source': 'D:/Data/Circle/BOT/Data/Startup Course Text Data.pdf'}, page_content='A quick background on Clever: we build software for schools. We are an app platform for \ndevelopers that is used today by about one in five schools in America. We started it about \ntwo years ago. \n  \nSales has been key. I want to use this time to share some of the things that have worked for \nme along the way. Of course, there\'s a million ways to do this, so you\'ll find what works for \nyou. \n  \nFirst I wa

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Query: What are some tips about sales and marketing for new companies?
Generated Answer: For new companies, sales and marketing success hinges on the founder's direct involvement in initial sales efforts, prioritizing understanding customer needs over pitching features. Effective strategies include:

* **Sales:**  Focus on building a strong sales funnel, starting with prospecting through personal networks, conferences, and targeted cold emails. During sales calls, emphasize listening more than talking to understand customer challenges.

* **Marketing:**  While not explicitly detailed, implied marketing strategies include targeted content marketing, social media engagement on relevant platforms, and public relations to build brand awareness and reach the target audience.  The approach should be tailored to the specific target market and stage of the company's growth.  For example, an early-stage startup might focus on reaching innovators, while a more established company might target la

In [180]:
query = test_questions[5]
evaluation_results = evaluate_rag_pipeline(query,retriever_startup_masterclass)
print("\nFinal Evaluation Results:", evaluation_results)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: This question is about product development and user experience, which falls under the domain of startup fundamentals.  I should use the StartupMasterclass tool to get expert insights.

Action: StartupMasterclass
Action Input: "Factors contributing to user love for a product"
[0m[38;5;200m[1;3m{'input': 'Factors contributing to user love for a product"\n', 'chat_history': [], 'context': [Document(metadata={'page': 287, 'source': 'D:/Data/Circle/BOT/Data/Startup Course Text Data.pdf'}, page_content='insights. We concept them and then we start to build. This is why we go to find those lists \nof consumer problems. The principles, how do we think about approaching that? What are \nthe solutions?And then what\'s required in the product to make that happen. Sam? \n  \nSam Altman: Could, could you talk about how you balance the fact that a user would \nnever, told you they wanted to pay $200 for a wireless speaker. \n  \

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[3]: LLMDidNotFinishException(The LLM generation was not completed. Please increase the max_tokens and try again.)


Query: What are the factors found in a product that users love?
Generated Answer: Based on the StartupMasterclass materials, creating a product users love involves several key factors:  understanding user behavior and lifestyle rather than just asking what they want; prioritizing a small group of intensely loyal users over a large group of mildly interested ones;  creating "enchanting quality" that goes beyond functionality, incorporating delightful surprises and emotional connections; and mastering "first moments" by making every interaction positive and memorable.  The core concept is to build a product that solves problems users may not even know they have and to focus on creating a strong emotional connection rather than simply meeting functional needs.

--- RAGAS Evaluation ---
Faithfulness:      1.0000
Answer Relevancy:  0.9203
Context Precision: 0.5000
Context Recall:    nan

--- Additional Metrics ---
BLEU Score (SacreBLEU): 0.0007
ROUGE Scores:           {'rouge1': 0.071093226