In [2]:
import os
import requests
import PyPDF2
import pandas as pd
from dotenv import load_dotenv
import json

from langchain_openai import ChatOpenAI
from langchain_core.documents import Document

# Load environment variables from a local .env file when one is present.
# load_dotenv() returning False only means nothing was loaded from a file; the key
# may already be exported in the environment, so that alone is not treated as an error.
load_dotenv()

# The OpenAI clients created later in this notebook are built without an explicit
# key and read it from OPENAI_API_KEY, so fail early and name that exact variable.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        'Missing OpenAI API key. Make sure to place a valid OPENAI_API_KEY in the .env file '
        'or export it in the environment.'
    )

In [3]:
# TODO set global variables
# Directory that the downloaded report PDFs are written to.
REPORTS_SAVE_PATH = "data/sample_reports"
# Path of the database file (its consumer is not visible in this part of the notebook).
DB_PATH = "data/db/sample.db"

# OpenAI chat model used for the extraction step.
# See https://openai.com/api/pricing/
MODEL = "gpt-3.5-turbo-0125"

# Dataset

In [3]:
# Load the report index. Later cells filter it on `company_name` and download from `pdf_url`.
df = pd.read_json('data/reports.json')
df

Unnamed: 0,company_name,year,dataset,pdf_url
0,Walmart,2023,handcrafted,https://corporate.walmart.com/content/dam/corp...
1,Walmart,2021,handcrafted,https://corporate.walmart.com/content/dam/corp...
2,Walmart,2019,handcrafted,https://corporate.walmart.com/content/dam/corp...
3,Amazon,2023,handcrafted,https://sustainability.aboutamazon.com/content...
4,Amazon,2021,handcrafted,https://sustainability.aboutamazon.com/content...
...,...,...,...,...
141,tarkett,2020,scraped,https://www.tarkett.com/sites/default/files/20...
142,trivium-packaging,2021,scraped,https://www.triviumpackaging.com/media/13fl4q3...
143,trivium-packaging,2020,scraped,https://triviumpackaging.com/sustainability/re...
144,trust,2023,scraped,https://dezlwerqy1h00.cloudfront.net/images/co...


In [4]:
# EXAMPLE: keep only the rows of the report index whose company is Apple
df_sample = df.loc[df['company_name'].eq('Apple')]

In [None]:
# Helper: download the report PDFs listed in a DataFrame to save_dir
def download_files(df: pd.DataFrame, save_dir: str, timeout: float = 30.0) -> None:
    """Download every PDF listed in ``df['pdf_url']`` into ``save_dir``.

    Each file is stored under the last path component of its URL
    (``os.path.basename(url)``); an existing file with that name is overwritten.

    Args:
        df: Frame with a ``pdf_url`` column holding the download URLs.
        save_dir: Target directory; created (including parents) if missing.
        timeout: Seconds to wait on each request before giving up.

    Raises:
        requests.RequestException: If a request times out, fails, or the server
            answers with an HTTP error status. Nothing is written for that URL
            and the remaining URLs are not attempted.
    """
    os.makedirs(save_dir, exist_ok=True)
    for url in df['pdf_url']:
        pdf_filename = os.path.basename(url)
        # Without a timeout a stalled server would block the notebook indefinitely.
        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx so an HTML error page is never saved as a ".pdf".
        response.raise_for_status()
        with open(os.path.join(save_dir, pdf_filename), 'wb') as file:
            file.write(response.content)
    print("Success.")

In [None]:
# Fetch the PDFs of the filtered sample into REPORTS_SAVE_PATH (performs network requests).
download_files(df_sample, REPORTS_SAVE_PATH)

# Create dummy vector store

In [7]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

# Dummy vector store: index a sample text file so the retrieval steps below have
# something to query.
# Decode explicitly as UTF-8: with no encoding given, TextLoader opens the file with
# the platform's default encoding, and this text contains non-ASCII punctuation.
loader = TextLoader("state_of_the_union.txt", encoding="utf-8")

documents = loader.load()
# Target chunk size of 1000 characters, with no overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
# Embed every chunk with OpenAI embeddings and store the vectors in a FAISS index.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(texts, embeddings)

In [8]:
# Expose the FAISS store as a retriever; no arguments are passed, so library defaults apply.
retriever = vectorstore.as_retriever()

In [9]:
# Sanity-check query against the indexed sample speech.
docs = retriever.invoke("what did the president say about ketanji brown jackson?")

In [10]:
# Display the retrieved documents.
docs

[Document(metadata={'source': 'state_of_the_union.txt'}, page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.'),
 Document(metadata={'source': 'state_of_the_union.txt'}, page_content='A former top litigator in private practice. A former federal 

# Simple RAG

In [14]:
# Inspect only the first retrieved document (requires `docs` from the retrieval cell).
docs[0]

Document(metadata={'source': 'state_of_the_union.txt'}, page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.')

In [12]:
# sample documents
# Hand-written passages standing in for retrieved report chunks. Every document
# carries the same placeholder source label. The first three passages state no
# year; the last three refer to 2022 and compare against 2021.
dummy_docs = [
    Document(page_content=passage, metadata={'source': 'state_of_the_union.txt'})
    for passage in (
        "The amount of recycled cobalt used in Apple's products was approximately 13%, primarily sourced from post-industrial and post-consumer scrap, including end-of-life batteries.",
        "30% of the tin used across all products came from recycled sources, mainly in solder on printed circuit boards.",
        "45% of the rare earth elements used in Apple's products came from recycled sources, primarily in magnets for products like iPhone, iPad, and Apple Watch.",
        "In 2022, 25% of the cobalt shipped in Apple's products came from recycled sources, nearly doubling from 2021.",
        "The use of recycled tin increased to 38%, primarily in flexible printed circuit boards across multiple product lines.",
        "73% of the rare earth elements used came from recycled sources, up from 45% in 2021, with some products containing 98-100% recycled rare earth elements.",
    )
]

In [13]:
def build_context_from_docs(docs):
    """Concatenate the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The contents in input order, separated by single spaces
        (an empty string when ``docs`` is empty).
    """
    passages = []
    for document in docs:
        passages.append(document.page_content)
    return ' '.join(passages)

In [14]:
# Preview the single context string built from the hand-written sample documents.
build_context_from_docs(dummy_docs)

"The amount of recycled cobalt used in Apple's products was approximately 13%, primarily sourced from post-industrial and post-consumer scrap, including end-of-life batteries. 30% of the tin used across all products came from recycled sources, mainly in solder on printed circuit boards. 45% of the rare earth elements used in Apple's products came from recycled sources, primarily in magnets for products like iPhone, iPad, and Apple Watch. In 2022, 25% of the cobalt shipped in Apple's products came from recycled sources, nearly doubling from 2021. The use of recycled tin increased to 38%, primarily in flexible printed circuit boards across multiple product lines. 73% of the rare earth elements used came from recycled sources, up from 45% in 2021, with some products containing 98-100% recycled rare earth elements."

In [15]:
# Load the LLM
# temperature=0 and a 1000-token cap on each reply; the bound response_format is
# passed along with every call made through `llm` and requests a JSON object reply.
llm = ChatOpenAI(
    model_name=MODEL,
    temperature=0,
    max_tokens=1000,
).bind(response_format={"type": "json_object"})

In [30]:
# Create prompt template
from langchain_core.prompts import ChatPromptTemplate

# Two-message chat prompt: a fixed system instruction plus a human turn whose
# {question} and {context} placeholders are filled in when the chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant. You help to retrieve information from sustainability reports and output it as a json object.",
        ),
        ("human", """
        I have the following question: {question}\n\n
        Retrieve all relevant information about the question from the following text in triple backticks:\n\n```{context}```\n\n
        The information should be output as a json object. All numeric values should be converted to numbers. One field should contain the unit of the value.
        """),
    ]
)

In [31]:
# Ask one concrete question against the hand-written sample passages.
question = "What are the reported levels of recycled cobalt, tin, and rare earth elements in Apple products for 2021 and 2022?"
context = build_context_from_docs(dummy_docs)

# Pipe the filled-in prompt into the model and run the chain once.
chain = prompt | llm
answer = chain.invoke({"question": question, "context": context})

In [32]:
# Parse the reply text as JSON; json.loads raises json.JSONDecodeError if it is not valid JSON.
answer_json = json.loads(answer.content)

In [33]:
# Pretty-print the parsed answer with two-space indentation.
print(json.dumps(answer_json, indent=2))

{
  "2021": {
    "recycled_cobalt": {
      "percentage": 13,
      "unit": "%"
    },
    "recycled_tin": {
      "percentage": 30,
      "unit": "%"
    },
    "recycled_rare_earth_elements": {
      "percentage": 45,
      "unit": "%"
    }
  },
  "2022": {
    "recycled_cobalt": {
      "percentage": 25,
      "unit": "%"
    },
    "recycled_tin": {
      "percentage": 38,
      "unit": "%"
    },
    "recycled_rare_earth_elements": {
      "percentage": 73,
      "unit": "%"
    }
  }
}
