# Intro

In this example, I am using LangChain to build a simple LLM-powered assistant. I will focus on using:
1. Google Gemini
2. OpenAI gpt-4.1-nano

In [1]:
# !pip install -qU \
#   langchain-core==0.3.33 \
#   langchain-openai==0.3.3 \
#   langchain-community==0.3.16

# Initialize LLM

First, we need to initialize an LLM. I'll use either _Google Gemini_ (`gemini-2.0-flash`) or the _OpenAI `gpt-4.1-nano`_ model. You can get an API key from [Google AI Studio](https://aistudio.google.com/app/apikey) or [OpenAI](https://platform.openai.com/settings/organization/api-keys).

In [2]:
# Select the LLM provider; supported values are "GOOGLE" and "OPENAI".
AI_Provider = "GOOGLE"
# AI_Provider = "OPENAI"

# Pick the chat model that belongs to the selected provider.
if AI_Provider == "GOOGLE":
    llm_model = "gemini-2.0-flash"
elif AI_Provider == "OPENAI":
    llm_model = "gpt-4.1-nano"  # less expensive than gpt-4o-mini
else:
    # Fail fast: otherwise `llm_model` stays undefined and a later cell dies with a NameError.
    raise ValueError(f"Unsupported AI_Provider {AI_Provider!r}; expected 'GOOGLE' or 'OPENAI'.")

In [3]:
from dotenv import load_dotenv
import os
from getpass import getpass


load_dotenv()  # Try to load local .env file (for local dev); silently skip if not found (for CI)

# Get the API key from the environment, falling back to an interactive prompt.
api_key_name = f"{AI_Provider}_API_KEY"
api_key = (os.getenv(api_key_name) or getpass(f"Enter {AI_Provider} API Key: ")).strip()

# Validate BEFORE exporting: once the value is written to os.environ it can never be None,
# so an empty answer at the prompt would otherwise be reported as a successfully loaded key.
if not api_key:
    raise ValueError(f"❌ {AI_Provider}_API_KEY not found. Make sure it's in your .env file or set as a GitHub Action secret.")

os.environ[api_key_name] = api_key
print(f"✅ {AI_Provider}_API_KEY loaded successfully (not printing it for security).")

✅ GOOGLE_API_KEY loaded successfully (not printing it for security).


In [4]:
# Import the chat-model class of the selected provider under one common alias
if AI_Provider == "OPENAI":
    from langchain_openai import ChatOpenAI as ChatLLM
elif AI_Provider == "GOOGLE":
    from langchain_google_genai import ChatGoogleGenerativeAI as ChatLLM

# Zero temperature: for normal accurate responses
llm = ChatLLM(model=llm_model, temperature=0.0)
# Optional smoke test of the connection:
# print("✅", llm.invoke("What LLM version are you?").content)

# Problem Setting

I'll take a new report (`new_report_IBM.txt`), `metrics.csv`, and `train_examples.csv` and use LangChain to:
1. Score the new report
2. Extract from the new_report 3 pieces of supporting evidence for this Score

# Create VectorDB from TXT

In [201]:
# Splitter settings: both values are passed to RecursiveCharacterTextSplitter when the report is chunked
chunk_size = 250  # FIXME: calibrate
chunk_overlap = 50  # FIXME: calibrate

In [5]:
# Load the full text of the IBM report into the `report` string
with open('data/new_report_IBM.txt', mode='r', encoding='utf-8') as file:
    report = file.read()

# Sanity check: show the size and the beginning of what was loaded
print(
    "Report loaded successfully!",
    f"Report length: {len(report)} characters",
    "First 200 characters of the report:",
    report[:200] + "...",
    sep="\n",
)

Report loaded successfully!
Report length: 2591 characters
First 200 characters of the report:
Balanced Power of IBM

IBM’s latest sustainability report opens with a bold statement: “IBM is committed to achieving net-zero GHG Scope 1 emissions by 2050 through a comprehensive, three-phase decarb...


In [26]:
from langchain.text_splitter import RecursiveCharacterTextSplitter  # splits the report text into chunks
from langchain_core.documents import Document

# Configure the splitter from the notebook-level chunk settings
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=chunk_overlap, chunk_size=chunk_size)

# The splitter works on Document objects, so wrap the raw report string in one
documents = [Document(page_content=report)]

# Cut the report into overlapping chunks
docs = text_splitter.split_documents(documents)
print(f'The number of chunks is {len(docs)}')

# Debugging aid: uncomment to inspect the chunks
# print(f'Test: docs[0].page_content:\n{docs[0].page_content}')
# for i, doc in enumerate(docs):
#     print(f'Test: docs[{i}].page_content:\n{doc.page_content}')

The number of chunks is 16


In [14]:
# Alternative embedding back-ends to try:
#   from langchain.embeddings.openai import OpenAIEmbeddings
#   from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

# Define how to map a string (can be a sentence or a paragraph) to a vector.
# Author's notes on the model choice:
#   - "sentence-transformers/all-MiniLM-L6-v2": small (~80MB), fast on CPU, good for English
#   - alternative, ultra-fast and memory-light (~45MB): model_name="sentence-transformers/paraphrase-MiniLM-L3-v2"
#   - alternative: embeddings = OpenAIEmbeddings()
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# # Test: embed one string
# embed_test = embeddings.embed_query("What company is the report about?")
# print(f'Test: len(embeddings)={len(embed_test)}, embeddings[:5]={embed_test[:5]}')  # Should return a 384-dim vector


  from .autonotebook import tqdm as notebook_tqdm


In [27]:
from langchain_community.vectorstores import FAISS  # Vector database (indexes); alternatives: Pinecone, Weaviate
# Embed every chunk in `docs` with `embeddings` and build a FAISS vector store from the result
db = FAISS.from_documents(docs, embeddings)
print(f'The number of vectors in the DB is {db.index.ntotal}')

The number of vectors in the DB is 16


In [17]:
def make_vectordb_from_report(report_filename: str) -> FAISS:
    """Build a FAISS vector store from a plain-text report file.

    Repeats the steps the notebook performs cell by cell: read the file, split
    the text into overlapping chunks, embed each chunk and index the vectors.
    Uses the notebook-level `chunk_size`, `chunk_overlap` and `embeddings`.

    Args:
        report_filename: Path to a UTF-8 encoded text report.

    Returns:
        A FAISS vector store holding one vector per report chunk.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the report yields no chunks (e.g. an empty file).
    """
    with open(report_filename, 'r', encoding='utf-8') as report_file:
        report_text = report_file.read()

    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    report_chunks = splitter.split_documents([Document(page_content=report_text)])
    if not report_chunks:
        # An index cannot be built from zero documents; report the cause explicitly.
        raise ValueError(f"Report {report_filename!r} produced no text chunks")

    return FAISS.from_documents(report_chunks, embeddings)


# Similarity Search

In [None]:
def prep_simularity_search_query(metrics, metric_id):
    """Build the similarity-search query text for one metric.

    Args:
        metrics: DataFrame with "MetricID", "MetricName" and "MetricDescription" columns.
        metric_id: Value of "MetricID" identifying the metric to look up.

    Returns:
        The string "<MetricName> metric:\\n<MetricDescription>" for the first matching row.

    Raises:
        IndexError: If no row of `metrics` has the given `metric_id`.
    """
    # Filter once and reuse the result for both fields.
    metric_rows = metrics[metrics["MetricID"] == metric_id]
    if metric_rows.empty:
        raise IndexError(f"MetricID {metric_id!r} not found in the metrics table")

    metric_name = metric_rows["MetricName"].values[0]
    metric_description = metric_rows["MetricDescription"].values[0]

    # Plain string formatting is enough here; no prompt-template class is needed for a two-field string.
    return f"{metric_name} metric:\n{metric_description}"

In [None]:
def simularity_search(query, db, chunks_number=4):
    """Return the text of the chunks most similar to `query`, joined by single spaces.

    Args:
        query: Text to search for.
        db: Vector store exposing `similarity_search(query, k=...)`.
        chunks_number: How many chunks to retrieve.

    Returns:
        The `page_content` of the retrieved chunks, concatenated with a space between them.
    """
    # The vector store performs the actual nearest-neighbour lookup
    matches = db.similarity_search(query, k=chunks_number)
    return " ".join(match.page_content for match in matches)

# Prompt

In [200]:
import pandas as pd
# Metric definitions; later cells read the MetricID, MetricName and MetricDescription columns
metrics_filename = 'data/metrics.csv'
metrics = pd.read_csv(metrics_filename)
# print(metrics.iloc[0]["MetricDescription"])
# metrics.head()

In [196]:
# Scored example companies used as few-shot examples in the prompt;
# later cells read the MetricID, Company, Score and Reason1..Reason3 columns
train_examples_filename = 'data/train_examples.csv'
train_examples = pd.read_csv(train_examples_filename)
# train_examples.head(1)

In [188]:
def prep_prompt_inputs(new_company, metrics, metric_id, train_examples, new_company_report_chunks_summary):
    """Collect every variable the scoring prompt needs for one metric.

    Args:
        new_company: Name of the company to be scored.
        metrics: DataFrame with "MetricID", "MetricName" and "MetricDescription" columns.
        metric_id: Value of "MetricID" identifying the metric.
        train_examples: DataFrame with "MetricID", "Company", "Score", "Reason1",
            "Reason2" and "Reason3" columns; must hold exactly 3 rows for `metric_id`.
        new_company_report_chunks_summary: Report text retrieved for this metric.

    Returns:
        Dict with keys 'new_company', 'metric_name', 'metric_description',
        'Company_<n>', 'Score_<n>', 'Reasons1_<n>', 'Reasons2_<n>', 'Reasons3_<n>'
        for n in 1..3, and 'new_company_report_chunks_summary'.

    Raises:
        IndexError: If `metric_id` is not present in `metrics`.
        AssertionError: If `train_examples` does not hold exactly 3 rows for `metric_id`.
    """
    n_examples = 3  # the prompt template has slots for exactly three example companies
    n_reasons = 3   # ...and three reasons per company

    # Filter once and reuse the result for both metric fields.
    metric_rows = metrics[metrics["MetricID"] == metric_id]
    if metric_rows.empty:
        raise IndexError(f"MetricID {metric_id!r} not found in the metrics table")

    prompt_inputs = {
        'new_company': new_company,
        'metric_name': metric_rows["MetricName"].values[0],
        'metric_description': metric_rows["MetricDescription"].values[0],
    }

    # Add inputs from the examples; reset the index so rows are addressable as 0, 1, 2.
    examples = train_examples[train_examples['MetricID'] == metric_id].reset_index(drop=True)
    assert len(examples) == n_examples, (
        f"Expected exactly {n_examples} example companies for 1 metric "
        f"(MetricID {metric_id!r} has {len(examples)})"
    )  # Safety check

    for row in range(n_examples):
        slot = row + 1  # template placeholders are numbered from 1
        prompt_inputs[f'Company_{slot}'] = examples.loc[row, 'Company']
        prompt_inputs[f'Score_{slot}'] = examples.loc[row, 'Score']
        for reason_no in range(1, n_reasons + 1):
            # Template keys are spelled "Reasons<k>_<n>" while the CSV columns are "Reason<k>"
            prompt_inputs[f'Reasons{reason_no}_{slot}'] = examples.loc[row, f'Reason{reason_no}']

    prompt_inputs['new_company_report_chunks_summary'] = new_company_report_chunks_summary

    return prompt_inputs

In [189]:
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
# System prompt: fixes the assistant's role and the allowed score values (1, 2 or 3)
system_prompt = SystemMessagePromptTemplate.from_template('You are a sustainability consultant tasked to score a company against the provided metric. Score can be: 1, 2, or 3.')

# User prompt: the metric definition, the required JSON output format, three scored example
# companies, and the report text retrieved for the company being scored.
# NOTE(review): the requested JSON fields are named "Reasons1..3" while the examples are
# labelled "Reason 1..3" - confirm this difference is intended.
user_prompt = HumanMessagePromptTemplate.from_template(
    """Metric name: {metric_name}
    Scoring criteria: {metric_description}

    # You need to score the company "{new_company}" against this metric and provide 3 reasons for the score.
        ## The output should be a JSON object with the following fields (no other explanation or text or fields are allowed):
        - Company: the name of the company
        - Score: the score of the company
        - Reasons1: first reason for the score
        - Reasons2: second reason for the score
        - Reasons3: third reason for the score

    # Below are examples of the scoring applied to 3 companies:
    Company 1: {Company_1}
    Score: {Score_1}
    Reason 1: {Reasons1_1}
    Reason 2: {Reasons2_1}
    Reason 3: {Reasons3_1}

    Company 2: {Company_2}
    Score: {Score_2}
    Reason 1: {Reasons1_2}
    Reason 2: {Reasons2_2}
    Reason 3: {Reasons3_2}

    Company 3: {Company_3}
    Score: {Score_3}
    Reason 1: {Reasons1_3}
    Reason 2: {Reasons2_3}
    Reason 3: {Reasons3_3}
    
    # The report of the company "{new_company}" is:
    {new_company_report_chunks_summary}
    """,

    input_variables=["metric_name", "metric_description", "new_company",
        "Company_1", "Score_1", "Reasons1_1", "Reasons2_1", "Reasons3_1",
        "Company_2", "Score_2", "Reasons1_2", "Reasons2_2", "Reasons3_2",
        "Company_3", "Score_3", "Reasons1_3", "Reasons2_3", "Reasons3_3", "new_company_report_chunks_summary"]
)

Now we can merge the system and user prompts into a full chat prompt using the `ChatPromptTemplate`:

In [190]:
from langchain.prompts import ChatPromptTemplate

# Full chat prompt for scoring: the system message followed by the user message
prompt = ChatPromptTemplate.from_messages([system_prompt, user_prompt])

`ChatPromptTemplate` prefixes each individual message with its role, i.e. `System:`, `Human:`, or `AI:`.

In [195]:
# prompt.messages

By default, the `ChatPromptTemplate` will read the `input_variables` from each of the prompt templates inserted and allow us to use those input variables when formatting the full chat prompt template:

In [193]:
# print(prompt.format(**prompt_inputs))

# Main Pipeline

In [None]:
new_company = "IBM"  # FIXME
for i in range(1):  #FIXME: range(len(metrics)):
    metric_id = metrics.iloc[i]["MetricID"]
    print(f"MetricID: {metric_id}")

    # Build the retrieval query from the metric's name and description
    query = prep_simularity_search_query(metrics, metric_id)
    print(query)

    # The keyword must match the function's parameter name (`chunks_number`);
    # `number_of_chunks` is not a parameter of `simularity_search` and raises a TypeError.
    new_company_report_chunks_summary = simularity_search(query, db, chunks_number=3)
    print(f"new_company_report_chunks_summary: {new_company_report_chunks_summary}")

    # The retrieved report text is the required 5th argument; omitting it raises a TypeError.
    prompt_inputs = prep_prompt_inputs(new_company, metrics, metric_id, train_examples, new_company_report_chunks_summary)
    #TODO: run the LLM to score the new company and extract 3 reasons for the score
    #TODO: record the score in the output dataframe
    break

In [None]:
# Send the prompt as role-tagged chat messages. `prompt.format(...)` would flatten the chat
# into one "System: ...\nHuman: ..." string, so the system instruction would arrive as
# ordinary user text; `format_messages` matches what the `prompt | llm` chain sends.
output = llm.invoke(prompt.format_messages(**prompt_inputs))

AIMessage(content='```json\n{\n  "Company": "IBM",\n  "Score": 3,\n  "Reasons1": "IBM has committed to net zero GHG emissions by 2030, encompassing Scope 1 emissions.",\n  "Reasons2": "IBM provides a detailed roadmap with specific targets and initiatives for reducing Scope 1 emissions year by year.",\n  "Reasons3": "IBM has already implemented several initiatives and achieved significant reductions in its Scope 1 emissions, demonstrating progress towards its 2030 goal."\n}\n```', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []}, id='run--9f10a023-373d-436b-bc47-c9bc44f405a4-0', usage_metadata={'input_tokens': 656, 'output_tokens': 114, 'total_tokens': 770, 'input_token_details': {'cache_read': 0}})

In [217]:
import json
import re

def _extract_json_payload(text: str) -> str:
    """Return the JSON text from an LLM reply, with or without a Markdown code fence.

    Handles ```json ... ```, a bare ``` ... ``` fence, extra text around the fence,
    and replies that are not fenced at all (returned stripped).
    """
    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", text, flags=re.DOTALL | re.IGNORECASE)
    return fenced.group(1) if fenced else text.strip()


# Step 1: Extract the JSON part (removes the surrounding ```json ... ``` fence, if any)
raw_output = output.content
json_str = _extract_json_payload(raw_output)

# Step 2: Convert to Python dict (raises json.JSONDecodeError if the reply is not valid JSON)
parsed_output = json.loads(json_str)

# Check the result
parsed_output

{'Company': 'IBM',
 'Score': 3,
 'Reasons1': 'IBM has committed to net zero GHG emissions by 2030, encompassing Scope 1 emissions.',
 'Reasons2': 'IBM provides a detailed roadmap with specific targets and initiatives for reducing Scope 1 emissions year by year.',
 'Reasons3': 'IBM has already implemented several initiatives and achieved significant reductions in its Scope 1 emissions, demonstrating progress towards its 2030 goal.'}

In [246]:
# One-row results table for the current metric, built from the parsed LLM reply
res_i = pd.DataFrame(parsed_output, index=[0])
res_i.insert(1, "MetricID", metric_id)  # insert MetricID as the second column
res_i

Unnamed: 0,Company,MetricID,Score,Reasons1,Reasons2,Reasons3
0,IBM,1,3,IBM has committed to net zero GHG emissions by...,IBM provides a detailed roadmap with specific ...,IBM has already implemented several initiative...


Now chain the `prompt` template and the `llm` object defined earlier to create an LLM chain for **prompt formatting > llm generation > get output**.

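Let's use __LCEL__ to construct the chain: the pipe operator (`|`) feeds the output from its left into the input to its right. An explicit input-mapping step such as `{"article": lambda x: x["article"]}` is not needed here, because `prompt_inputs` already contains exactly the variables the prompt template expects.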

In [233]:
# Scoring chain: fill the prompt -> call the LLM -> return the reply text under the "response" key
chain = prompt | llm | {"response": lambda message: message.content}

This chain scores the new company and provides 3 reasons:

In [236]:
# Run the chain on the inputs prepared for the current metric and show the raw reply text
res = chain.invoke(prompt_inputs)
res["response"]

'```json\n{\n  "Company": "IBM",\n  "Score": 3,\n  "Reasons1": "IBM has committed to net zero GHG emissions by 2030, encompassing Scope 1 emissions.",\n  "Reasons2": "IBM provides a detailed roadmap with specific initiatives and timelines for achieving its net zero target, including interim goals and progress reporting.",\n  "Reasons3": "IBM has already implemented several initiatives and demonstrated progress towards its GHG reduction targets, indicating that the first steps have been taken."\n}\n```'

## <span style="color:red"> __NEXT__ </span>

Since we output _several fields_ we'll specify for the LLM to use __structured outputs__ to make the generated fields aligned with our requirements.
For this we create a _pydantic object_ and describe the required output format - this format description is then passed to our model using the `with_structured_output` method:

In [48]:
from pydantic import BaseModel, Field

# Structured-output schema: the field descriptions below are what the model is asked to fill in.
# NOTE(review): this schema comes from a paragraph-editing example; the scoring task in this
# notebook would need Company / Score / Reasons fields instead.
class Paragraph(BaseModel):
    paragraph_original: str = Field(description="The original paragraph")
    paragraph_edited: str = Field(description="The improved edited paragraph")
    feedback: str = Field(description="Feedback on the original paragraph")

# `llm_creative` is never defined in this notebook, so the original line raised a NameError;
# use the `llm` object created in the "Initialize LLM" section instead.
llm_structured = llm.with_structured_output(Paragraph)

Let's combine all this into chain3:

In [49]:
# chain3: inputs:article / output:paragraph
# Pipeline: pick "article" from the input dict -> prompt3 -> structured LLM -> plain dict of the three fields.
# NOTE(review): `prompt3` is not defined in any cell shown above - this cell appears to be
# carried over from another notebook; confirm before running.
chain3 = (
    {"article": lambda x: x["article"]}
    | prompt3
    | llm_structured
    | {
        "paragraph_original": lambda x: x.paragraph_original,
        "paragraph_edited": lambda x: x.paragraph_edited,
        "feedback": lambda x: x.feedback
    }
)

In [50]:
# Run chain3 and display the resulting dict.
# NOTE(review): `article` is not defined in any cell shown above - confirm where it should come from.
paragraph_and_feedback = chain3.invoke({"article": article})
paragraph_and_feedback

{'paragraph_original': 'I tackled a pressing issue for a large life insurance client: their computational model for projecting financial outcomes and predicting balance sheets over multiple quarters for life insurance products was prohibitively slow and costly. The client’s leadership needed a capability to run ad-hoc what-if scenarios, but the model’s runtime prevented near-real-time risk analytics.',
 'paragraph_edited': 'I tackled a pressing issue for a large life insurance client: their computational model for projecting financial outcomes and predicting balance sheets over multiple quarters for life insurance products was prohibitively slow and costly. This inefficiency hindered their ability to respond quickly to market changes and increased the risk of inaccurate financial forecasting. The client’s leadership needed a capability to run ad-hoc what-if scenarios, but the model’s runtime prevented near-real-time risk analytics.',
 'feedback': "The original paragraph is well-written