# AIM Hackathon: Sample code
19.10.2024

In [46]:
import os
import requests
import PyPDF2
import tiktoken
import pandas as pd
import pickle
from dotenv import load_dotenv

from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings.base import OpenAIEmbeddings

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS

# Load the OpenAI key from the .env file into the process environment.
if not load_dotenv():
    raise Exception('Error loading .env file. Make sure to place a valid OPEN_API_KEY in the .env file.')
# Fail fast with a clear message: later cells read os.environ["OPEN_API_KEY"],
# which would otherwise only surface as a bare KeyError further down the notebook.
if not os.environ.get("OPEN_API_KEY"):
    raise Exception('OPEN_API_KEY is not set. Make sure to place a valid OPEN_API_KEY in the .env file.')

In [6]:
# Directory the report PDFs are downloaded to and later read back from.
REPORTS_SAVE_PATH = 'data/sample_reports'
# File the serialized FAISS vector store is written to and loaded from.
DB_PATH = "data/db/sample.db"

# Chat model name. Note: a later cell reassigns MODEL before the LLM is created.
# See https://openai.com/api/pricing/
MODEL = "gpt-4o"

In [7]:
# Load the index of available reports; the displayed frame has one row per report
# with company_name, year, dataset and pdf_url columns.
df = pd.read_json('data/reports.json')
df

Unnamed: 0,company_name,year,dataset,pdf_url
0,Walmart,2023,handcrafted,https://corporate.walmart.com/content/dam/corp...
1,Walmart,2021,handcrafted,https://corporate.walmart.com/content/dam/corp...
2,Walmart,2019,handcrafted,https://corporate.walmart.com/content/dam/corp...
3,Amazon,2023,handcrafted,https://sustainability.aboutamazon.com/content...
4,Amazon,2021,handcrafted,https://sustainability.aboutamazon.com/content...
...,...,...,...,...
141,tarkett,2020,scraped,https://www.tarkett.com/sites/default/files/20...
142,trivium-packaging,2021,scraped,https://www.triviumpackaging.com/media/13fl4q3...
143,trivium-packaging,2020,scraped,https://triviumpackaging.com/sustainability/re...
144,trust,2023,scraped,https://dezlwerqy1h00.cloudfront.net/images/co...


## Download some reports

In [8]:
# EXAMPLE: keep only the reports whose `dataset` column is 'handcrafted'
df_sample = df[df['dataset'] == 'handcrafted']

In [9]:
len(df_sample)

23

In [10]:
def download_files(df: pd.DataFrame, save_dir: str, timeout: float = 60):
    """Download every URL in ``df['pdf_url']`` into ``save_dir``.

    Args:
        df: Frame with a ``pdf_url`` column holding the report URLs.
        save_dir: Target directory; created if it does not exist.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Files are written to ``save_dir`` and a summary line is printed.

    A URL that cannot be downloaded (network error, timeout, HTTP error status)
    is reported and skipped, so an error page is never saved as a PDF.
    """
    from urllib.parse import urlparse  # local import keeps this cell self-contained

    os.makedirs(save_dir, exist_ok=True)
    failed_urls = []
    for url in df['pdf_url']:
        # Derive the name from the URL *path* so a query string or fragment
        # (even one containing '/') can never end up in the file name.
        pdf_filename = os.path.basename(urlparse(url).path)
        if not pdf_filename:
            print(f"WARNING: Cannot derive a file name from {url}")
            failed_urls.append(url)
            continue
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"WARNING: Could not download {url}: {e}")
            failed_urls.append(url)
            continue
        with open(os.path.join(save_dir, pdf_filename), 'wb') as file:
            file.write(response.content)
    if failed_urls:
        print(f"Finished with {len(failed_urls)} failed download(s).")
    else:
        print("Success.")

In [11]:
download_files(df_sample, REPORTS_SAVE_PATH)

Success.


## Create simple vector database

In [139]:
# Maps a PDF file name to the list of texts extracted from its pages
# (filled as a side effect of get_documents_from_path).
docs_pages = {}

# Load PDFs
def get_documents_from_path(files_path: str, max_files: "int | None" = 1) -> "list[Document]":
    """Read the PDFs in ``files_path`` and return one Document per readable file.

    Args:
        files_path: Directory containing the report PDFs.
        max_files: How many directory entries to process. The default of 1
            keeps the previous behavior (only the first listed file); pass
            ``None`` to process every file in the directory.

    Returns:
        A list of Documents. Each ``page_content`` is the text of all pages,
        every page followed by a newline, and ``metadata["source"]`` is the
        file name.

    Raises:
        Exception: if a processed directory entry is not a ``.pdf`` file.

    Side effects:
        Stores the per-page texts of every returned file in the module-level
        ``docs_pages`` dict, keyed by file name.
    """
    documents = []

    for file in os.listdir(files_path)[:max_files]:
        _, file_extension = os.path.splitext(file)

        if file_extension.lower() != ".pdf":
            # TODO: can add support for other file types here
            raise Exception(f"Unsupported file extension: {file_extension}")

        try:
            with open(os.path.join(files_path, file), 'rb') as f:
                reader = PyPDF2.PdfReader(f, strict=False)
                # Extract each page exactly once and reuse the result for both the
                # joined document text and the per-page list.
                page_texts = [page.extract_text() or "" for page in reader.pages]
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"WARNING: Could not read {file}: {e}")
            continue

        # Check the extracted content itself: the joined text always contains the
        # page-separating newlines, so it is non-empty even for image-only PDFs.
        if not any(page_text.strip() for page_text in page_texts):
            print(f"WARNING: No text extracted from {file}")
            continue

        text = "".join(page_text + "\n" for page_text in page_texts)
        documents.append(Document(page_content=text, metadata={"source": file}))
        docs_pages[file] = page_texts

    return documents

In [140]:
documents = get_documents_from_path(REPORTS_SAVE_PATH)

In [141]:
len(docs_pages["fy2023-walmart-esg-highlights.pdf"])

43

In [142]:
docs_pages["fy2023-walmart-esg-highlights.pdf"]

['Environmental,  \nSocial, and  \nGovernance  \nHighlights\nFY2023\n',
 'FY2023  \nESG Highlights\nContents\nESG REPORTING SOURCES\nESG Issue Briefs\nESG Reporting DataINTRODUCTION\n4 Leadership Letters\n6 Awards & Recognition\n7 Our Company\n8 Our Approach to ESG\nOPPORTUNITY\n11 Opportunity Highlights\n12  Human Capital: Good Jobs  \n& Advancement for Associates\n13 Human Capital Spotlight\n14 Equity & Inclusion at Walmart & Beyond\n15 Supplier Opportunity SUSTAINABILITY\n17 Sustainability Highlights\n18 Product Supply Chain Sustainability\n19 Climate Change\n20 Climate Change Spotlight\n21  Regeneration of Natural Resources:  \nForests, Land, Oceans\n22 Waste: Circular Economy\n23 People in Supply Chains\nCOMMUNITY\n25 Community Highlights\n26 Serving Communities\n27 Safer, Healthier Food & Other Products\n28 Disaster Preparedness & Response\n29  Disaster Preparedness & Response  \nSpotlightETHICS & INTEGRITY\n31 Ethics & Integrity Highlights\n32 Ethics & Compliance\n33 Corporate G

In [112]:
# Create database: split the loaded reports into overlapping chunks and index them with FAISS
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n"],
    chunk_size=3000,
    chunk_overlap=300,
)
texts = text_splitter.split_documents(documents)

# Embedding models: https://platform.openai.com/docs/guides/embeddings/embedding-models
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=os.environ["OPEN_API_KEY"],
)
db = FAISS.from_documents(texts, embeddings)

# Count the tokens (cl100k_base encoding) over all chunks that went into the index
tokenizer = tiktoken.get_encoding("cl100k_base")
build_token_count = sum(len(tokenizer.encode(chunk.page_content)) for chunk in texts)
print(f"Token count: {build_token_count}")

Token count: 969082


In [113]:
# Store the database
# Create the target folder first: open(..., "wb") does not create missing parent directories.
db_dir = os.path.dirname(DB_PATH)
if db_dir:
    os.makedirs(db_dir, exist_ok=True)
with open(DB_PATH, "wb") as f:
    # The serialized bytes are wrapped in a pickle; the loading cell unpickles them again.
    pickle.dump(db.serialize_to_bytes(), f)

## Create simple RAG

In [114]:
# Load the database
DB_PATH = "data/db/sample.db"

# NOTE(review): pickle.load can execute arbitrary code from the file it reads, and the FAISS
# call explicitly opts into "dangerous" deserialization; only load files this notebook wrote.
with open(DB_PATH, "rb") as f:
    db_bytes = pickle.load(f)

# Rebuild the vector store from the raw bytes, using the same embeddings object as for indexing
db = FAISS.deserialize_from_bytes(
    db_bytes,
    embeddings,
    allow_dangerous_deserialization=True,
)

In [115]:
# Overrides the MODEL value set in the configuration cell; the LLM below is created with this one.
MODEL = "gpt-4o-mini"

In [120]:
# Prompt for the RAG chain; {context} and {question} are the two template variables
# (presumably filled by RetrievalQA with the retrieved chunks and the query).
system_prompt = """
You are an expert assistant evaluating ESG reports of major companies based on the SDGs. 
Your job is to evaluate the companies achievements regarding the SDG goal given and the company name and given year. 
If the company does not mention anything say "I don't know". 
Otherwise give it a score of 1-10 and support your claim with snippets of the report where the evidence of suporting the goal is seen.
Also say the year this information is from.
Context: {context}
Question: {question}
"""

prompt_template = PromptTemplate(
    template=system_prompt,
    input_variables=["context", "question"],
)

# Load the LLM
llm = ChatOpenAI(
    model_name=MODEL,
    temperature=0,  # for deterministic outputs
    api_key=os.environ["OPEN_API_KEY"],
)

# Question-answering chain over the vector store; the source documents are returned with each answer
retrieval_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    chain_type_kwargs={"prompt": prompt_template},
    return_source_documents=True,
)

In [100]:
def ask_question(query):
    """Run ``query`` through the retrieval chain, print the answer and return the response.

    Args:
        query: Natural-language question passed to the chain under the "query" key.

    Returns:
        The chain's response mapping; the answer text is under ``'result'`` (the
        outputs in this notebook also show ``'query'`` and ``'source_documents'``).
    """
    # Use the `invoke` API; calling the chain object directly is deprecated in LangChain.
    response = retrieval_chain.invoke({"query": query})
    print(f"Question: {query}\nAnswer: {response['result']}")
    return response

In [121]:
ask_question("How does Walmart perform in the 12th SDG on sustainable consumption an production in 2019?")

Question: How does Walmart perform in the 12th SDG on sustainable consumption an production in 2019?
Answer: Walmart's performance regarding the 12th Sustainable Development Goal (SDG 12) on sustainable consumption and production in 2019 can be evaluated as follows:

**Score: 8/10**

**Supporting Evidence:**
1. **Sustainable Supply Chain Initiatives:** Walmart has committed to sourcing at least 20 key commodities more sustainably by 2025. This includes fresh produce, animal agriculture, seafood, and consumables. The report states, "We have prioritized the 20x25 commodities based upon a variety of factors, including the nature and magnitude of environmental and social improvement opportunities."

2. **Sustainability Index Participation:** As of the end of FY2019, Walmart achieved a supplier participation rate in the Sustainability Index that covers 80% of the goods sold in U.S. Walmart stores and Sam’s Club locations for categories where the Index is available. This indicates a strong c

{'query': 'How does Walmart perform in the 12th SDG on sustainable consumption an production in 2019?',
 'result': 'Walmart\'s performance regarding the 12th Sustainable Development Goal (SDG 12) on sustainable consumption and production in 2019 can be evaluated as follows:\n\n**Score: 8/10**\n\n**Supporting Evidence:**\n1. **Sustainable Supply Chain Initiatives:** Walmart has committed to sourcing at least 20 key commodities more sustainably by 2025. This includes fresh produce, animal agriculture, seafood, and consumables. The report states, "We have prioritized the 20x25 commodities based upon a variety of factors, including the nature and magnitude of environmental and social improvement opportunities."\n\n2. **Sustainability Index Participation:** As of the end of FY2019, Walmart achieved a supplier participation rate in the Sustainability Index that covers 80% of the goods sold in U.S. Walmart stores and Sam’s Club locations for categories where the Index is available. This indic

In [122]:
ask_question("How does Walmart perform in the 12th SDG on sustainable consumption an production in 2023?")

Question: How does Walmart perform in the 12th SDG on sustainable consumption an production in 2023?
Answer: I don't know.


{'query': 'How does Walmart perform in the 12th SDG on sustainable consumption an production in 2023?',
 'result': "I don't know.",
 'source_documents': [Document(metadata={'source': 'walmart-2019-esg-report.pdf'}, page_content='footprint for stores and clubs \nBaseline established\n>189 million pounds (lbs.) for Walmart U.S.\n>31 million lbs. for Sam’s Club Progress to be reported\nGoal: In 2016, Walmart and the Walmart \nFoundation committed to invest $25 \nmillion in projects to advance food safety \nin China over five yearsAmount invested in projects to date >$15 million \nGoal: In 2014, Walmart and the Walmart \nFoundation set a goal to provide nutrition \neducation to 4 million people by 2020Estimated impact of grants awarded >4 million people  \nGoal: Between 2014 and 2019, provide  \n4 billion meals to those who need them \nthrough grants from Walmart and the \nWalmart Foundation and food donations \nfrom Walmart stores, Sam’s Club locations \nand distribution centersImpact of 

In [123]:
ask_question("How does Apple perform in the 12th SDG on sustainable consumption an production in 2019?")

Question: How does Apple perform in the 12th SDG on sustainable consumption an production in 2019?
Answer: **Company:** Apple  
**Year:** 2019  
**SDG Goal:** 12 - Responsible Consumption and Production  

**Score:** 8/10  

**Evaluation:** Apple demonstrates significant achievements in promoting sustainable consumption and production, particularly through its initiatives aimed at reducing waste, increasing the use of recycled materials, and improving energy efficiency. 

**Supporting Evidence:**
1. **Zero Waste Program:** Apple has implemented a Zero Waste Program for suppliers, which aims to divert 100% of waste from landfills. As of 2018, they achieved UL’s Zero Waste to Landfill certification for all final assembly facilities across various product lines. They have engaged 90 suppliers and diverted over 1 million metric tons of waste, which is a substantial contribution to responsible production practices. (Source: 2019 Environmental Responsibility Report)

2. **Increased Use of Re

{'query': 'How does Apple perform in the 12th SDG on sustainable consumption an production in 2019?',
 'result': "**Company:** Apple  \n**Year:** 2019  \n**SDG Goal:** 12 - Responsible Consumption and Production  \n\n**Score:** 8/10  \n\n**Evaluation:** Apple demonstrates significant achievements in promoting sustainable consumption and production, particularly through its initiatives aimed at reducing waste, increasing the use of recycled materials, and improving energy efficiency. \n\n**Supporting Evidence:**\n1. **Zero Waste Program:** Apple has implemented a Zero Waste Program for suppliers, which aims to divert 100% of waste from landfills. As of 2018, they achieved UL’s Zero Waste to Landfill certification for all final assembly facilities across various product lines. They have engaged 90 suppliers and diverted over 1 million metric tons of waste, which is a substantial contribution to responsible production practices. (Source: 2019 Environmental Responsibility Report)\n\n2. **I

In [124]:
ask_question("How does Apple perform in the 5th SDG on gender equality?")

Question: How does Apple perform in the 5th SDG on gender equality?
Answer: I don't know.


{'query': 'How does Apple perform in the 5th SDG on gender equality?',
 'result': "I don't know.",
 'source_documents': [Document(metadata={'source': 'HM-Group-Sustainability-Disclosure-2021.pdf'}, page_content='—  Promoting diversity & equality . Together with \nour brands, encourage inclusion and celebrate \ndiversity and equality through products and communications. \n—  Inclusion . Advance inclusion of people in the \nsocieties we are part of through global and local \ninitiatives.\n—  Transparency . Clearly communicate our \nstrategy and progress to create accountability, \npushing us and others to do better. Our approach is informed by our Human Rights  \nPolicy , Sustainability Commitment , and other \nsocial policies .\nProgress: internal diversity  \n& equality\n—  We set specific targets to improve diversity across our business. Brands and markets have set inclusion and diversity (I&D) commitments and action plans. We’re working to publicly disclose these plans and have exten

In [125]:
ask_question("How does Apple perform in the 6th SDG?")

Question: How does Apple perform in the 6th SDG?
Answer: Based on the provided information, I would evaluate Apple's performance regarding the 6th Sustainable Development Goal (SDG 6: Clean Water and Sanitation) as follows:

**Score: 8/10**

**Supporting Evidence:**
1. **Water Stewardship**: Apple has implemented a comprehensive water stewardship strategy that addresses water availability, quality, and equity. They emphasize accountability for the water used in their corporate facilities, data centers, distribution centers, and retail stores, as well as at their suppliers' facilities. This is highlighted in the report: "We focus both on improving water use in our facilities and—through our Clean Water Program—helping our suppliers conserve water and prevent water pollution."

2. **Engagement with Communities**: Apple is actively engaging with communities where they operate to ensure shared water resources are protected and accessible. This indicates a commitment to not only managing th

{'query': 'How does Apple perform in the 6th SDG?',
 'result': 'Based on the provided information, I would evaluate Apple\'s performance regarding the 6th Sustainable Development Goal (SDG 6: Clean Water and Sanitation) as follows:\n\n**Score: 8/10**\n\n**Supporting Evidence:**\n1. **Water Stewardship**: Apple has implemented a comprehensive water stewardship strategy that addresses water availability, quality, and equity. They emphasize accountability for the water used in their corporate facilities, data centers, distribution centers, and retail stores, as well as at their suppliers\' facilities. This is highlighted in the report: "We focus both on improving water use in our facilities and—through our Clean Water Program—helping our suppliers conserve water and prevent water pollution."\n\n2. **Engagement with Communities**: Apple is actively engaging with communities where they operate to ensure shared water resources are protected and accessible. This indicates a commitment to not 

In [126]:
ask_question("How does Apple perform in the 7th SDG?")

Question: How does Apple perform in the 7th SDG?
Answer: I don't know.


{'query': 'How does Apple perform in the 7th SDG?',
 'result': "I don't know.",
 'source_documents': [Document(metadata={'source': 'Apple_Environmental_Progress_Report_2023.pdf'}, page_content='Report \nhighlights\xa0\nCarbon neutral for \ncorporate  emissions\nSince April 2020, we’ve achieved \ncarbon neutrality for our \ncorporate emissions1 by sourcing \n100 percent  renewable electricity \nfor Apple facilities, implementing \nenergy efficiency, and securing \nhigh-quality carbon offsets for \nremaining emissions.\xa0While \ncorporate emissions\xa0represent \nonly a small portion of our overall \nemissions, this is an important \nstep toward our broader goal of \nachieving carbon neutrality for all \nour products.\nRead more on page 13 .\nReduced value chain \nemissions by over \n45 percent \xa0\nWe reduced our overall emissions \nacross scopes 1, 2, and 32 by \nover 45 percent  compared \nwith our 2015 baseline year. \nWe avoided\xa0over 28 million \nmetric tons of emissions \nthro

In [128]:
texts[0]

Document(metadata={'source': 'fy2023-walmart-esg-highlights.pdf'}, page_content='Environmental,  \nSocial, and  \nGovernance  \nHighlights\nFY2023\n\nFY2023  \nESG Highlights\nContents\nESG REPORTING SOURCES\nESG Issue Briefs\nESG Reporting DataINTRODUCTION\n4 Leadership Letters\n6 Awards & Recognition\n7 Our Company\n8 Our Approach to ESG\nOPPORTUNITY\n11 Opportunity Highlights\n12  Human Capital: Good Jobs  \n& Advancement for Associates\n13 Human Capital Spotlight\n14 Equity & Inclusion at Walmart & Beyond\n15 Supplier Opportunity SUSTAINABILITY\n17 Sustainability Highlights\n18 Product Supply Chain Sustainability\n19 Climate Change\n20 Climate Change Spotlight\n21  Regeneration of Natural Resources:  \nForests, Land, Oceans\n22 Waste: Circular Economy\n23 People in Supply Chains\nCOMMUNITY\n25 Community Highlights\n26 Serving Communities\n27 Safer, Healthier Food & Other Products\n28 Disaster Preparedness & Response\n29  Disaster Preparedness & Response  \nSpotlightETHICS & INTEGRI

In [143]:
# Pick a single extracted page (list index 8) of the Walmart FY2023 highlights as sample input.
context = docs_pages["fy2023-walmart-esg-highlights.pdf"][8]

In [144]:
context

'We prioritize the ESG issues that offer the greatest potential for Walmart \nto create shared value; these are issues that rank high in relevance to our \nbusiness and stakeholders as well as Walmart’s ability to make a difference.\nFor each priority ESG issue, our disclosures aim to:\n• Articulate the relevance of the issue for societyand Walmart’s business\n• Reflect an understanding of stakeholder expectations\n• Share our aspirations, goals, and strategies to create shared value \n• Describe our progress, opportunities, and challenges \nRead more : Our ESG Priorities\nStakeholder Engagement\nOur ability to create shared value depends \non direct and frequent engagement with \nour customers, associates, and community \nleaders, as well as the people who \nsupply our products, hold our stock, and \nevaluate our performance. Stakeholder \nperspectives and feedback help improve \nthe relevance and effectiveness of the \nproducts and services we offer and the \ninitiatives we support. 

In [None]:
# Load the LLM
# NOTE(review): this cell repeats the llm / system_prompt / prompt_template definitions from the
# "Create simple RAG" section verbatim and has no execution count; it looks like a leftover
# copy. Confirm and consider removing it.
llm = ChatOpenAI(model_name=MODEL, temperature=0, api_key=os.environ["OPEN_API_KEY"])  # for deterministic outputs

system_prompt = """
You are an expert assistant evaluating ESG reports of major companies based on the SDGs. 
Your job is to evaluate the companies achievements regarding the SDG goal given and the company name and given year. 
If the company does not mention anything say "I don't know". 
Otherwise give it a score of 1-10 and support your claim with snippets of the report where the evidence of suporting the goal is seen.
Also say the year this information is from.
Context: {context}
Question: {question}
"""

prompt_template = PromptTemplate(
    input_variables=["context", "question"], 
    template=system_prompt
)

In [168]:
# Page-classification prompt; filled via str.format with {company} and {page} (see invoke_llm).
# NOTE(review): the text contains typos ("categery", "fatcs") that are sent to the model as-is.
prompt = """You are an expert assistant for extracting information from company ESG reports. 
Your task is to assign the page content into these categories:
FACTUAL - the page presented to you contains hard facts with evidence supporting what the company is doing in terms of ESG, for example "we reduced our carbon footprint by 20%", this categery has to be only of fatcs that are quantifiable and measurable
MISSION - it describes vaguely the company's aim and standards, for example "our aim is to"
VALUES - contains what the company believes in, for example "we believe in"
Use only the data in the page.
Give a ratio of the page FACTUAL:MISSION:VALUES content.
Company name: {company}
Page from report: {page}"""

In [169]:
prompt.format(company="Walmart", page=context)

'You are an expert assistant for extracting information from company ESG reports. \nYour task is to assign the page content into these categories:\nFACTUAL - the page presented to you contains hard facts with evidence supporting what the company is doing in terms of ESG, for example "we reduced our carbon footprint by 20%", this categery has to be only of fatcs that are quantifiable and measurable\nMISSION - it describes vaguely the company\'s aim and standards, for example "our aim is to"\nVALUES - contains what the company believes in, for example "we believe in"\nUse only the data in the page.\nGive a ratio of the page FACTUAL:MISSION:VALUES content.\nCompany name: Walmart\nPage from report: We prioritize the ESG issues that offer the greatest potential for Walmart \nto create shared value; these are issues that rank high in relevance to our \nbusiness and stakeholders as well as Walmart’s ability to make a difference.\nFor each priority ESG issue, our disclosures aim to:\n• Articul

In [170]:
print(context)

We prioritize the ESG issues that offer the greatest potential for Walmart 
to create shared value; these are issues that rank high in relevance to our 
business and stakeholders as well as Walmart’s ability to make a difference.
For each priority ESG issue, our disclosures aim to:
• Articulate the relevance of the issue for societyand Walmart’s business
• Reflect an understanding of stakeholder expectations
• Share our aspirations, goals, and strategies to create shared value 
• Describe our progress, opportunities, and challenges 
Read more : Our ESG Priorities
Stakeholder Engagement
Our ability to create shared value depends 
on direct and frequent engagement with 
our customers, associates, and community 
leaders, as well as the people who 
supply our products, hold our stock, and 
evaluate our performance. Stakeholder 
perspectives and feedback help improve 
the relevance and effectiveness of the 
products and services we offer and the 
initiatives we support. 
Day to day, we enga

In [177]:
def invoke_llm(company, page):
    """Fill the module-level ``prompt`` with ``company`` and ``page`` and send it to ``llm``.

    Returns whatever ``llm.invoke`` returns; the cells below read its ``.content``.
    """
    filled_prompt = prompt.format(company=company, page=page)
    return llm.invoke(filled_prompt)

In [178]:
answer = invoke_llm("Walmart", context)

In [179]:
print(answer.content)

Based on the content provided from Walmart's ESG report, the categorization is as follows:

- **FACTUAL**: The page does not provide specific quantifiable data or measurable outcomes related to Walmart's ESG initiatives. It discusses priorities and approaches but lacks hard facts or evidence supporting specific achievements or metrics.

- **MISSION**: The content includes statements about Walmart's aims and strategies, such as creating shared value, engaging with stakeholders, and prioritizing ESG issues. Phrases like "our disclosures aim to" and "our ability to create shared value depends on" reflect the company's mission and objectives.

- **VALUES**: The page contains elements that reflect Walmart's beliefs and principles, such as the importance of stakeholder engagement, equity and inclusion, and ethical standards. Statements like "we believe in" are implied through the values presented in the opportunity, sustainability, community, and ethics sections.

**Ratio of content**: 
- FA

In [180]:
docs_pages["fy2023-walmart-esg-highlights.pdf"][10]

'Opportunity Highlights\n$21.75\nU.S. associate \naverage total hourly \ncompensation1>180,000>34,000\nASSOCIATES\nenrolled in  Live Better U\n>$13 cumulative total \npurchases supporting \nAmerican jobs  \n(2021 through FY2023)BILLION\nBILLION\n28%\nof U.S. officers are \npeople of color68%\nfull-time U.S. \nhourly associates2 $47\nU.S. ASSOCIATES3 \nreceived promotions;  \n88% of roles4 above  \nentry level filled internally sourced from  ~2,400 \ndiverse suppliers5 \n to U.S. businessesFY2023  \nESG Highlights 11 Introduction Sustainability Community Ethics & Integrity\n Opportunity\n37%\nof U.S. officers \nare women'

In [181]:
answer = invoke_llm("Walmart", docs_pages["fy2023-walmart-esg-highlights.pdf"][10])

In [182]:
print(answer.content)

Based on the content provided from the Walmart ESG report page titled "Opportunity Highlights," the categorization is as follows:

**FACTUAL:**
- $21.75 U.S. associate average total hourly compensation
- >180,000 U.S. associates enrolled in Live Better U
- >$13 billion cumulative total purchases supporting American jobs (2021 through FY2023)
- 28% of U.S. officers are people of color
- 68% of full-time U.S. hourly associates received promotions
- 88% of roles above entry level filled internally
- ~2,400 diverse suppliers to U.S. businesses
- 37% of U.S. officers are women

**MISSION:**
- None present in the provided content.

**VALUES:**
- None present in the provided content.

**Ratio of content:**
FACTUAL:MISSION:VALUES = 10:0:0
