In [12]:
import sys, os
# Put the local `src` directory (resolved against the current working directory)
# on the import path before the `pgai` imports below run.
sys.path.append(os.path.abspath('src'))

from pgai.vector_db_clients import FAISSClient#, ChromaClient
from pgai import langchain_helper

# Intro

In this example, I am using LangChain to build a simple LLM-powered assistant. I will focus on using:
1. Google Gemini
2. OpenAI gpt-4o-mini

# Problem Setting

I'll take a new report (`data/new_report_IBM.txt`), `data/metrics.csv`, and `data/train_examples.csv` and use LangChain to:
1. Score the new report against each metric
2. Extract 3 pieces of supporting evidence for each score from the new report

# Initialize LLM

First, we need to initialize an LLM. I'll use either the _Google Gemini_ or the _OpenAI 'gpt-4.1-nano'_ model. You can get an OpenAI API key from [OpenAI](https://platform.openai.com/settings/organization/api-keys).

In [13]:
# Provider name handed to `langchain_helper.get_llm`; switch provider by changing this value.
AI_Provider = "GOOGLE"
# AI_Provider = "OPENAI"

# `get_llm` returns two objects; `llm_structured` is the one piped into the scoring chain
# (presumably the same model configured for structured output - confirm in langchain_helper).
llm, llm_structured = langchain_helper.get_llm(AI_Provider)

✅ GOOGLE_API_KEY loaded successfully (not printing it for security).


# Create VectorDB from TXT

In [14]:
# Build the vector store for the new report: the client loads the text file,
# splits it into chunks and indexes them (see the summary printed by this cell).
db_client = FAISSClient()
db_client.from_txt_file(txt_path="data/new_report_IBM.txt")
# Earlier approach kept for reference: build the index via the helper and save/load it from disk.
# db = langchain_helper.create_vectordb_from_txt_file(txt_filename="new_report_IBM.txt")
# db.save_local("data/vector_dbs/faiss_index")
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # This model is small (~80MB), fast on CPU, good for English # Alternative: ultra-fast memory-light (~45MB): model_name="sentence-transformers/paraphrase-MiniLM-L3-v2" # Alternative: embeddings = OpenAIEmbeddings()
# db = FAISS.load_local("data/vector_dbs/faiss_index", embeddings)

Report loaded: 2591 characters
First 200 characters of the report: Balanced Power of IBM

IBM’s latest sustainability report opens with a bold statement: “IBM is committed to achieving net-zero GHG Scope 1 emissions by 2050 through a comprehensive, three-phase decarb...
Split into 16 chunks
FAISS index has 16 vectors


<pgai.vector_db_clients.faiss_client.FAISSClient at 0x1a2800a5810>

# Prompt

In [15]:
import pandas as pd
# Metric definitions; the `MetricID` column is read later when looping over the metrics.
metrics_filename = 'data/metrics.csv'
metrics = pd.read_csv(metrics_filename)
# print(metrics.iloc[0]["MetricDescription"])
# metrics.head()

In [16]:
# Already-scored example companies; passed to `langchain_helper.prep_prompt_inputs` later,
# and its columns define the layout of the results table.
train_examples_filename = 'data/train_examples.csv'
train_examples = pd.read_csv(train_examples_filename)
# train_examples.head(1)

In [17]:
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate

# Defining the system prompt (how the AI should act)
system_prompt = SystemMessagePromptTemplate.from_template('You are a sustainability consultant tasked to score a company against the provided metric. Score can be: 1, 2, or 3.')

# The user prompt carries every dynamic input: the metric definition, three scored
# example companies, and the retrieved chunks of the new company's report.
user_prompt = HumanMessagePromptTemplate.from_template(
    """# You need to score the company "{new_company}" against the metric and criteria (provided below) and provide 3 reasons for the score by quoting the report.
        ## The output should be a JSON object with the following fields (no other explanation or text or fields are allowed):
        - Company: the name of the company
        - MetricID: MetricID
        - Score: the score of the company
        - Reason1: first reason for the score
        - Reason2: second reason for the score
        - Reason3: third reason for the score
    
    MetricID: {metric_id}
    Metric name: {metric_name}
    Scoring criteria: {metric_description}

    # Below are examples of the scoring applied to 3 companies:
    Company 1: {Company_1}
    Score: {Score_1}
    Reason 1: {Reason1_1}
    Reason 2: {Reason2_1}
    Reason 3: {Reason3_1}

    Company 2: {Company_2}
    Score: {Score_2}
    Reason 1: {Reason1_2}
    Reason 2: {Reason2_2}
    Reason 3: {Reason3_2}

    Company 3: {Company_3}
    Score: {Score_3}
    Reason 1: {Reason1_3}
    Reason 2: {Reason2_3}
    Reason 3: {Reason3_3}
    
    # The report of the company "{new_company}" is:
    {new_company_report_chunks_summary}
    """,

    # One entry per `{placeholder}` used in the template above.
    # NOTE(review): placeholders are normally inferred from the template text - confirm
    # whether this explicit list is still needed with the installed LangChain version.
    input_variables=["metric_id", "metric_name", "metric_description", "new_company",
        "Company_1", "Score_1", "Reason1_1", "Reason2_1", "Reason3_1",
        "Company_2", "Score_2", "Reason1_2", "Reason2_2", "Reason3_2",
        "Company_3", "Score_3", "Reason1_3", "Reason2_3", "Reason3_3", "new_company_report_chunks_summary"]
)

Now we can merge the `system` and `user` prompts into a full chat prompt using the `ChatPromptTemplate`:

In [18]:
# Full chat prompt: the system message followed by the user message
prompt = ChatPromptTemplate.from_messages([system_prompt, user_prompt])

# Main Pipeline

Now chain the `prompt` template and the `llm_structured` object defined earlier to create an LLM chain for **prompt formatting > llm generation > get output**.

Let's use __LCEL__ to construct the chain: define inputs with `{"metric_id": lambda x: x["metric_id"], ...}` and use the pipe operator (`|`) to feed the output from its left into the input to its right.

In [19]:
# This chain will output (for the given metric) the new company's score and provides 3 reasons.
# Stages, left to right:
#   1. input mapping  - picks each prompt variable out of the dict passed to `chain.invoke`
#   2. prompt         - fills the chat prompt template with those variables
#   3. llm_structured - calls the model; its result is read via attributes below
#   4. output mapping - copies the result's attributes into a plain dict
chain = (
    {
        "metric_id": lambda x: x["metric_id"],
        "metric_name": lambda x: x["metric_name"],
        "metric_description": lambda x: x["metric_description"],
        "new_company": lambda x: x["new_company"],
        "Company_1": lambda x: x["Company_1"],
        "Score_1": lambda x: x["Score_1"],
        "Reason1_1": lambda x: x["Reason1_1"],
        "Reason2_1": lambda x: x["Reason2_1"],
        "Reason3_1": lambda x: x["Reason3_1"],
        "Company_2": lambda x: x["Company_2"],
        "Score_2": lambda x: x["Score_2"],
        "Reason1_2": lambda x: x["Reason1_2"],
        "Reason2_2": lambda x: x["Reason2_2"],
        "Reason3_2": lambda x: x["Reason3_2"],
        "Company_3": lambda x: x["Company_3"],
        "Score_3": lambda x: x["Score_3"],
        "Reason1_3": lambda x: x["Reason1_3"],
        "Reason2_3": lambda x: x["Reason2_3"],
        "Reason3_3": lambda x: x["Reason3_3"],
        "new_company_report_chunks_summary": lambda x: x["new_company_report_chunks_summary"]
    }
    | prompt
    | llm_structured
    # The keys below are the fields of each result row (one row per metric).
    | {
        "Company": lambda llm_output: llm_output.Company,
        "MetricID": lambda llm_output: llm_output.MetricID,
        "Score": lambda llm_output: llm_output.Score,
        "Reason1": lambda llm_output: llm_output.Reason1,
        "Reason2": lambda llm_output: llm_output.Reason2,
        "Reason3": lambda llm_output: llm_output.Reason3
        }
)

In [20]:
new_company = "IBM"  # FIXME: hard-coded company name

# Empty results table with the same columns as the training examples
new_company_scores_df = pd.DataFrame(columns=train_examples.columns)
new_company_scores_df

Unnamed: 0,Company,MetricID,Score,Reason1,Reason2,Reason3


In [21]:
# Score the new company against every metric.
# For each metric: build a similarity-search query, retrieve report chunks
# (chunks_number=3), assemble the prompt inputs and invoke the chain.
# Results are collected in a list and turned into a DataFrame once, instead of
# growing the frame with `pd.concat` per iteration; this also makes the cell
# idempotent (re-running it no longer appends duplicate rows).
score_records = []
for i in range(len(metrics)):
    metric_id = metrics.iloc[i]["MetricID"]
    # print(f"MetricID: {metric_id}")

    query = langchain_helper.prep_simularity_search_query(metrics, metric_id)
    # print(query)

    # Retrieve report data
    new_company_report_chunks_summary = langchain_helper.simularity_search(query, db_client, chunks_number=3)
    # print(f"new_company_report_chunks_summary: {new_company_report_chunks_summary}")

    # RAG: combine the metric, the training examples and the retrieved chunks
    prompt_inputs = langchain_helper.prep_prompt_inputs(new_company, metrics, metric_id, train_examples, new_company_report_chunks_summary)
    output = chain.invoke(prompt_inputs)
    # print(output)
    score_records.append(output)

# Keep the column layout the per-row concat produced: the training-example
# columns first, followed by any additional keys returned by the chain.
scores = pd.DataFrame(score_records)
extra_columns = [col for col in scores.columns if col not in train_examples.columns]
new_company_scores_df = scores.reindex(columns=[*train_examples.columns, *extra_columns])

# save new_company_scores_df to new_company_scores.csv
new_company_scores_df.to_csv("data/new_company_scores.csv", index=False)
new_company_scores_df

Unnamed: 0,Company,MetricID,Score,Reason1,Reason2,Reason3
0,IBM,1,2,IBM is committed to achieving net-zero GHG Sco...,The company lays out a transparent emissions t...,"As of 2025, IBM’s Scope 1 emissions remain at ..."
1,IBM,2,3,IBM is runnig 100m in 14.8s.,IBM can do 40 pushups.,IBM can do 60 jumps.
2,IBM,3,1,"Math is just a rough guideline, like ‘ballpark...","elite in physical culture, lax in intellectual...",Some of the educational narratives now veer in...


## <span style="color:red"> __NEXT__ </span>