## Importing Libraries

In [1]:
import getpass
import os
import pathlib
import textwrap

import google.generativeai as genai
import pdfplumber
import PyPDF2
import pytesseract
import streamlit as st
from dotenv import load_dotenv, find_dotenv
from IPython.display import display
from IPython.display import Markdown
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# from langchain_openai.embeddings import OpenAIEmbeddings
from pdf2image import convert_from_path
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
from PIL import Image
from PyPDF2 import PdfReader
from tqdm import tqdm


def to_markdown(text):
    """Wrap ``text`` in a Markdown block quote for rich notebook display.

    Every "•" bullet character is rewritten as an indented "*" list marker,
    then every line (blank ones included, because the predicate always returns
    True) is prefixed with "> ".

    Returns:
        An ``IPython.display.Markdown`` object.
    """
    bulleted = text.replace('•', '  *')
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)


# Search for a .env file and load the variables it defines into the environment.
# NOTE(review): presumably this is where the Google API key comes from, since no
# key is configured explicitly in the visible cells — confirm.
load_dotenv(find_dotenv())


## Setting Up Models

In [8]:
# Gemini model handle used further down to describe figures cropped out of the PDFs
# (it is called with a PIL image alongside the text prompt).
model_for_images = genai.GenerativeModel('gemini-pro-vision')

# Gemini model handle used by table_converter to turn an extracted table into text.
model_for_tables = genai.GenerativeModel('gemini-pro')

# Gemini model handle for plain text; not referenced by the other visible cells.
model_for_text = genai.GenerativeModel('gemini-pro')    

## Text Pre-Processing

In [9]:
# Create a function to extract text

def text_extraction(element):
    """Return an element's text together with the distinct formats found in it.

    Args:
        element: A layout element that exposes ``get_text()`` and can be
            iterated to yield its lines.

    Returns:
        A ``(text, formats)`` tuple. ``formats`` is a de-duplicated list mixing
        the font names and font sizes of every ``LTChar`` found inside the
        element's ``LTTextContainer`` lines; its order is not meaningful.
    """
    # Read the text first, exactly as it is stored on the element
    text = element.get_text()

    # Gather font names and sizes in a set so repeats collapse automatically
    seen_formats = set()
    for line in element:
        if not isinstance(line, LTTextContainer):
            continue
        for glyph in line:
            if isinstance(glyph, LTChar):
                seen_formats.add(glyph.fontname)
                seen_formats.add(glyph.size)

    return (text, list(seen_formats))

In [10]:
# Create a function to crop the image elements from PDFs
def crop_image(element, pageObj, output_path='cropped_image.pdf'):
    """Crop a page to an element's bounding box and save it as a one-page PDF.

    Args:
        element: Layout element exposing ``x0``, ``y0``, ``x1`` and ``y1``.
        pageObj: PyPDF2 page object to crop. Its ``mediabox`` is modified in
            place, so the page stays cropped after this call.
        output_path: Where the cropped PDF is written. Defaults to the file
            name the rest of the notebook expects.
    """
    # The layout coordinates have their origin at the bottom-left of the page,
    # so (x0, y0) is the lower-left corner and (x1, y1) the upper-right one.
    # The previous version labelled y0 as "top" and y1 as "bottom", which wrote
    # the media box with its corners inverted.
    left, bottom, right, top = element.x0, element.y0, element.x1, element.y1
    pageObj.mediabox.lower_left = (left, bottom)
    pageObj.mediabox.upper_right = (right, top)

    # Write the cropped page out as a standalone PDF
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)
    with open(output_path, 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)

# Create a function to convert the PDF to images
def convert_to_images(input_file, output_file="PDF_image.png"):
    """Render the first page of a PDF and save it as a PNG.

    Args:
        input_file: Path of the PDF to render.
        output_file: Path of the PNG to write. Defaults to the file name the
            rest of the notebook reads back.

    Only the first rendered page is kept; any further pages are ignored.
    """
    images = convert_from_path(input_file)
    first_page = images[0]
    first_page.save(output_file, "PNG")

# Create a function to read text from images
def image_to_text(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Open inside a context manager so the file handle is released once OCR is
    # done instead of lingering until garbage collection.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)

In [11]:
# Extracting tables from the page

def extract_table(pdf_path, page_num, table_num):
    """Return one table from one page of a PDF.

    Args:
        pdf_path: Path of the PDF file.
        page_num: Zero-based index of the page to inspect.
        table_num: Zero-based index of the table on that page.

    Returns:
        The ``table_num``-th entry of ``page.extract_tables()``.

    Raises:
        IndexError: If the page or the table index does not exist.
    """
    # The context manager closes the PDF even when an index is out of range;
    # the previous version left the handle open on every call.
    with pdfplumber.open(pdf_path) as pdf:
        table_page = pdf.pages[page_num]
        return table_page.extract_tables()[table_num]

# Convert table into the appropriate format
def table_converter(table):
    """Ask the table model to report the content of an extracted table as text.

    Args:
        table: The extracted table; it is passed to the model as ``str(table)``.

    Returns:
        The text of the model's response, or an empty string when the response
        has no parts.
    """
    response = model_for_tables.generate_content(
        ["report the content of the table ", str(table)],
        stream=False,
        generation_config=genai.types.GenerationConfig(temperature=0.1),
    )
    # Read the text of each part directly. Stringifying the parts container
    # (the previous approach) left repr artefacts and escaped characters in the
    # text that ends up in the index.
    return "".join(part.text for part in response.parts)

## FAISS Embeddings

In [12]:
    
# Folder with the reports to index, relative to the working directory. The same
# location is used for listing and for opening, so the cell no longer depends on
# a user-specific absolute path.
PDF_DIR = "to_process"

# Only PDF files are processed; other directory entries would break the
# "<company><year>" file-name parsing below. Sorted for a reproducible order.
pdf_list = sorted(
    entry for entry in os.listdir(PDF_DIR) if entry.lower().endswith(".pdf")
)
db = []

# Built once up front: neither depends on the file being processed, and
# `embeddings` must exist for the indexing step even if the folder is empty.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=800)

for pdf_name in tqdm(pdf_list):

    pdf_path = os.path.join(PDF_DIR, pdf_name)

    # File stems are expected to end with a four-digit year, preceded by the company name
    name = os.path.splitext(pdf_name)[0]
    year = int(name[-4:])
    company = name[:-4]

    # Both handles are closed when the block exits, including on errors. The
    # pdfplumber document is opened once per file instead of once per page.
    with open(pdf_path, 'rb') as pdfFileObj, pdfplumber.open(pdf_path) as pdf:
        pdfReader = PyPDF2.PdfReader(pdfFileObj)

        for pagenum, page in enumerate(extract_pages(pdf_path)):

            # Per-page state
            pageObj = pdfReader.pages[pagenum]
            page_text = []
            table_num = 0
            first_element = True
            # NOTE(review): this flag is never set to True in this cell, so text
            # inside tables is also kept in the text chunks — confirm this is intended.
            table_extraction_flag = False
            # Tables detected on this page
            tables = pdf.pages[pagenum].find_tables()

            # Sort the layout elements from the top of the page to the bottom
            page_elements = [(element.y1, element) for element in page._objs]
            page_elements.sort(key=lambda a: a[0], reverse=True)

            # A separate index name keeps the outer loop variable untouched
            for idx, (_top, element) in enumerate(page_elements):

                # Text elements: collect their text
                if isinstance(element, LTTextContainer):
                    if not table_extraction_flag:
                        line_text, _formats = text_extraction(element)
                        page_text.append(line_text)

                # Figures: crop, render, and let the vision model describe them
                if isinstance(element, LTFigure):
                    crop_image(element, pageObj)
                    convert_to_images('cropped_image.pdf')
                    # Text gathered so far on this page gives the model some
                    # context; previously an always-empty list was interpolated.
                    page_content = "".join(page_text)
                    with Image.open("PDF_image.png") as img_PIL:
                        image_text = model_for_images.generate_content([f"Describe the content of the image and summarize the main takeways, the context is: '{page_content}'", img_PIL ], stream=False, generation_config=genai.types.GenerationConfig(temperature=0.1))
                    db.append(Document(
                        page_content="".join(part.text for part in image_text.parts),
                        metadata = {"year" : year, "company" : company, "type" : "image" }
                    ))

                # Rectangles: the first one of a run triggers extraction of the next table
                if isinstance(element, LTRect):
                    if first_element and (table_num+1) <= len(tables):
                        table = extract_table(pdf_path, pagenum, table_num)
                        db.append(Document(
                            page_content=table_converter(table),
                            metadata = {"year":year, "company" :company, "type":"table" }
                        ))
                        first_element = False

                    if idx+1 == len(page_elements):
                        break
                    elif not isinstance(page_elements[idx+1][1], LTRect):
                        # The run of rectangles ended: move on to the next table
                        table_extraction_flag = False
                        first_element = True
                        table_num+=1

            # Chunk the page text and keep the chunks that are long enough to be useful
            txt = "".join(page_text).replace(" \n", "")
            for chunk in splitter.split_text(txt):
                if len(chunk) >= 15:
                    db.append(Document(
                        page_content=chunk,
                        metadata = {"year":year, "company" :company, "type": "text" }
                    ))

    # Remove the scratch files; they only exist if the PDF contained a figure
    for scratch_file in ('cropped_image.pdf', 'PDF_image.png'):
        if os.path.exists(scratch_file):
            os.remove(scratch_file)


if not db:
    raise ValueError(f"No documents were extracted from {PDF_DIR!r}; nothing to index.")

vectorstore = FAISS.from_documents(db, embeddings)
vectorstore.save_local("faiss_index")

  0%|          | 0/2 [00:05<?, ?it/s]


FailedPrecondition: 400 User location is not supported for the API use.

## LLM querying

In [287]:
def get_conversational_chain_subclaims(question):
    """Ask Gemini to derive five subclaims from a claim.

    The prompt lists common greenwashing techniques and then instructs the
    model to return five subclaims for the given claim.

    Args:
        question: The claim to expand; it is interpolated into the prompt.

    Returns:
        The response object returned by ``generate_content``; callers read its
        ``.text`` attribute.
    """
    prompt = f"""
    Here are common greenwashing techniques:
    
    Greenwashing is the process of conveying a false impression or misleading information about how a company’s products are environmentally sound. Greenwashing involves making an unsubstantiated claim to deceive consumers into believing that a company’s products are environmentally friendly or have a greater positive environmental impact than they actually do.
    There are different types of greenwashing: The Use of Vague or Misleading Terminology
    
    The use of unclear or deceptive language is a common greenwashing tactic. For instance, companies may market their product as "natural," "organic," or "eco-friendly" without defining these terms. Since there are no standard regulations, these labels may be misleading, and some companies may use them to falsely suggest that their product is environmentally friendly.

    Emphasising One Small Aspect of a Product
    A tactic commonly used for greenwashing is to promote one eco-friendly aspect of a product, while disregarding other aspects that aren't environmentally friendly. For instance, a product that can be recycled may be advertised as eco-friendly, even if it's not made from sustainable materials or has a high carbon footprint. This type of advertising can be deceptive, as it gives the impression that the product is environmentally friendly when it may not actually be so.

    Making False Claims
    One way companies greenwash their products is by making inaccurate claims about the product's environmental impact. A product might be advertised as "carbon neutral" even if it is not, which is prohibited in several countries. However, it is challenging to enforce such regulations, particularly in countries with lenient environmental laws.

    Using Images of Nature or the Environment
    Companies often use pictures of nature or the environment in their advertisements to suggest that their products are environmentally friendly. However, this can be misleading because a product might feature an image of a mountain stream or something similar, even though it contains harmful chemicals that can damage aquatic life.

    Featured Article: Energy Companies & Greenwashing
    Exaggerating the Environmental Benefits
    Some companies tend to overstate the environmental advantages of their products. An instance for this would be when a car maker promotes its hybrid vehicle as "100% emissions-free," though the vehicle still emits some amount of emissions. Such marketing can deceive consumers into believing that the product is more environmentally friendly than it actually is.

    Using Green Logos or Symbols
    Companies may use green logos or symbols to suggest that their products are environment-friendly, but this does not necessarily mean that their products are indeed eco-friendly. It should be noted that the use of such symbols is not regulated, and companies may use them even if their products are not environmentally friendly.

    Making Comparisons to Less Eco-Friendly Products
    One way companies greenwash is by comparing their product to less eco-friendly alternatives. They might say their product is "greener" than traditional options, even if it is still detrimental to the environment. This type of marketing can be misleading because it suggests the product is environmentally safe, even if it is not.

    Focusing on Small Changes
    Some companies give the impression of being environmentally friendly by making minor changes to their products or practices, such as using biodegradable packaging. However, they may still use harmful chemicals in their products, which means this type of marketing can be deceptive. It suggests that the company is doing more for the environment than it really is.

    Claiming to Be Carbon Neutral
    Although some companies say they are carbon neutral by offsetting their emissions, such as through planting trees or investing in renewable energy, this claim could be misleading because it might not consider the entire carbon footprint of the company. This includes emissions from producing and transporting their products.

    Using Emotional Appeals
    Some companies use emotional appeals in their marketing to create the impression that their products are environmentally friendly. They may include images of animals or children in their advertising to evoke feelings of compassion or responsibility. Although this marketing technique can be effective in persuading consumers to buy the product, it can be misleading if the product is not genuinely eco-friendly. don't include '''python ''' in the response
    ---
    
    You are a regulator that specializes in ESG reporting. Your goal is to generate subclaims starting from the provided claim using the greenwashing techniques. Return 5 subclaims in a python list format without mentioning the company name in the claim.
    The claim is: '{question}'. 
    Answer:
    """

    # A fresh model handle is created on every call; a low temperature keeps
    # the generated subclaims close to deterministic.
    subclaim_model = genai.GenerativeModel('gemini-pro')
    generation_settings = genai.types.GenerationConfig(temperature=0.1)

    return subclaim_model.generate_content(prompt, generation_config=generation_settings)

def user_input_subclaims(user_question):
    """Generate subclaims for ``user_question``.

    Thin wrapper around ``get_conversational_chain_subclaims``; the model
    response is returned unchanged.
    """
    return get_conversational_chain_subclaims(user_question)

## Simple Querying

In [288]:
def get_conversational_chain():
    """Build the "stuff" QA chain that judges a claim using the greenwashing taxonomy.

    The prompt describes the greenwashing techniques, then injects the
    retrieved report excerpts as ``{context}`` and the claim as ``{question}``.

    Note: the "No instructions response" cell further down defines a function
    with this same name; whichever of the two cells ran last is the one that
    ``user_input`` picks up.

    Returns:
        The chain produced by ``load_qa_chain``.
    """
    prompt_template = """
    
    You are a regulator that specializes in ESG reporting. You have to evaluate if a given claim is greenwashed or not.
    Greenwashing is the process of conveying a false impression or misleading information about how a company’s products are environmentally sound. Greenwashing involves making an unsubstantiated claim to deceive consumers into believing that a company’s products are environmentally friendly or have a greater positive environmental impact than they actually do.
    There are different types of greenwashing: The Use of Vague or Misleading Terminology
    ---
    The use of unclear or deceptive language is a common greenwashing tactic. For instance, companies may market their product as "natural," "organic," or "eco-friendly" without defining these terms. Since there are no standard regulations, these labels may be misleading, and some companies may use them to falsely suggest that their product is environmentally friendly.

    Emphasising One Small Aspect of a Product
    A tactic commonly used for greenwashing is to promote one eco-friendly aspect of a product, while disregarding other aspects that aren't environmentally friendly. For instance, a product that can be recycled may be advertised as eco-friendly, even if it's not made from sustainable materials or has a high carbon footprint. This type of advertising can be deceptive, as it gives the impression that the product is environmentally friendly when it may not actually be so.

    Making False Claims
    One way companies greenwash their products is by making inaccurate claims about the product's environmental impact. A product might be advertised as "carbon neutral" even if it is not, which is prohibited in several countries. However, it is challenging to enforce such regulations, particularly in countries with lenient environmental laws.

    Using Images of Nature or the Environment
    Companies often use pictures of nature or the environment in their advertisements to suggest that their products are environmentally friendly. However, this can be misleading because a product might feature an image of a mountain stream or something similar, even though it contains harmful chemicals that can damage aquatic life.

    Featured Article: Energy Companies & Greenwashing
    Exaggerating the Environmental Benefits
    Some companies tend to overstate the environmental advantages of their products. An instance for this would be when a car maker promotes its hybrid vehicle as "100% emissions-free," though the vehicle still emits some amount of emissions. Such marketing can deceive consumers into believing that the product is more environmentally friendly than it actually is.

    Using Green Logos or Symbols
    Companies may use green logos or symbols to suggest that their products are environment-friendly, but this does not necessarily mean that their products are indeed eco-friendly. It should be noted that the use of such symbols is not regulated, and companies may use them even if their products are not environmentally friendly.

    Making Comparisons to Less Eco-Friendly Products
    One way companies greenwash is by comparing their product to less eco-friendly alternatives. They might say their product is "greener" than traditional options, even if it is still detrimental to the environment. This type of marketing can be misleading because it suggests the product is environmentally safe, even if it is not.

    Focusing on Small Changes
    Some companies give the impression of being environmentally friendly by making minor changes to their products or practices, such as using biodegradable packaging. However, they may still use harmful chemicals in their products, which means this type of marketing can be deceptive. It suggests that the company is doing more for the environment than it really is.

    Claiming to Be Carbon Neutral
    Although some companies say they are carbon neutral by offsetting their emissions, such as through planting trees or investing in renewable energy, this claim could be misleading because it might not consider the entire carbon footprint of the company. This includes emissions from producing and transporting their products.

    Using Emotional Appeals
    Some companies use emotional appeals in their marketing to create the impression that their products are environmentally friendly. They may include images of animals or children in their advertising to evoke feelings of compassion or responsibility. Although this marketing technique can be effective in persuading consumers to buy the product, it can be misleading if the product is not genuinely eco-friendly.
    ---
    Context from the company's past ESG reports:\n {context}?\n
    The claim I want you to assess is: '{question}'. Provide a clear answer and in case the claim is greenwashed explain which techinque was used.
    Answer:
    """

    # Near-zero temperature so repeated assessments vary as little as possible
    llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro", client=genai, temperature=0.001)

    qa_prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"],
    )

    # "stuff" places all retrieved documents into the single {context} slot
    return load_qa_chain(llm=llm, chain_type="stuff", prompt=qa_prompt)

def user_input(user_question, docs):
    """Assess ``user_question`` against ``docs`` with the QA chain.

    Args:
        user_question: The claim to evaluate.
        docs: Documents passed to the chain as ``input_documents``.

    Returns:
        The chain's output mapping (outputs only); the answer text is read
        from its ``'output_text'`` key by the cells that call this function.
    """
    qa_chain = get_conversational_chain()
    chain_inputs = {"input_documents": docs, "question": user_question}
    return qa_chain(chain_inputs, return_only_outputs=True)

## No instructions response

In [289]:
def get_conversational_chain():
    """Build the "stuff" QA chain that judges a claim without the greenwashing taxonomy.

    This prompt only gives the model its role, the retrieved report excerpts
    (``{context}``) and the claim (``{question}``).

    Note: this reuses the name of the function defined in the "Simple
    Querying" cell above, so running this cell replaces that definition.

    Returns:
        The chain produced by ``load_qa_chain``.
    """
    prompt_template = """
    You are a regulator that specializes in ESG reporting. You have to evaluate if a given claim is greenwashed or not.

        Context from the company's past ESG reports:\n {context}?\n
    The claim I want you to assess is: '{question}'. Provide a clear answer, in case the claim is contradicted by the esg report or the numbers are exxagerated, report the snippet of text
    Answer:
    """

    # Near-zero temperature so repeated assessments vary as little as possible
    llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro", client=genai, temperature=0.001)

    qa_prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"],
    )

    # "stuff" places all retrieved documents into the single {context} slot
    return load_qa_chain(llm=llm, chain_type="stuff", prompt=qa_prompt)

def user_input(user_question, docs):
    """Run the current QA chain on a claim and its supporting documents.

    Args:
        user_question: The claim to evaluate.
        docs: Documents passed to the chain as ``input_documents``.

    Returns:
        The chain's output mapping, restricted to outputs only.
    """
    assessor = get_conversational_chain()
    return assessor(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True,
    )

## Evaluate Claims 

In [279]:
# Claim under assessment; read by the subclaim, evaluation and consistency cells below.
claim = "Oatly generates 73% less CO2e vs. milk, calculated from grower to grocer"

In [280]:
# Ask the model for subclaims of the claim; resp.text is split into lines further down.
resp = user_input_subclaims(claim)

def transform_string_to_list(input_string):
    """Split a multi-line string into a list of non-empty, trimmed lines.

    Blank lines are dropped because every returned item is later used as a
    similarity-search query, and an empty query carries no information.

    Args:
        input_string: Text with one item per line.

    Returns:
        The stripped, non-empty lines in their original order; an empty list
        when the input contains no text.
    """
    trimmed_lines = (line.strip() for line in input_string.split('\n'))
    return [line for line in trimmed_lines if line]
output_list = transform_string_to_list(resp.text)

embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001")   # type: ignore
# SECURITY: allow_dangerous_deserialization=True lets load_local deserialize the
# stored index without safety checks. Only load an index that was produced by
# this notebook, never one from an untrusted source.
new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

# Collect the nearest chunks for every subclaim. Different subclaims often hit
# the same chunk, so each chunk is kept only once to avoid stuffing duplicated
# text into the QA prompt.
docs = []
seen_chunks = set()

for subclaim in output_list:
    if not subclaim.strip():
        # Nothing to search for on a blank line
        continue
    for hit in new_db.similarity_search(subclaim, k=3):
        if hit.page_content not in seen_chunks:
            seen_chunks.add(hit.page_content)
            docs.append(hit)
to_markdown(resp.text)


> 1. Oatly's claim of 73% less CO2e emissions compared to milk may not consider the full lifecycle of its product, including emissions from production and transportation.
> 2. Oatly's comparison to milk may be misleading, as milk production practices vary widely, and some milk producers may have lower carbon footprints than Oatly's.
> 3. Oatly's use of the term "CO2e" may be unclear to consumers, and it may not be clear what specific greenhouse gases are included in this calculation.
> 4. Oatly's claim may rely on offsetting emissions through tree planting or renewable energy investments, which may not fully mitigate the actual carbon footprint of its product.
> 5. Oatly's marketing may use emotional appeals, such as images of nature or children, to create a positive impression of its environmental impact, even if the product's actual environmental performance is not as strong as suggested.

In [283]:
# Judge the original claim against the chunks retrieved for its subclaims.
response = user_input(claim, docs)
# The chain returns a mapping; the model's answer is stored under 'output_text'.
to_markdown(response['output_text'])

> The claim is contradicted by the ESG report.
> 
> The ESG report states that "A Life Cycle Assessment study found that switching from cow's milk to Oatly oat milk does not significantly reduce environmental impact."

## Assessing consistency

In [290]:
embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001")   # type: ignore
# SECURITY: allow_dangerous_deserialization=True lets load_local deserialize the
# stored index without safety checks. Only load an index that was produced by
# this notebook, never one from an untrusted source.
new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

# How many times the whole subclaim -> retrieval -> verdict pipeline is repeated
N_CONSISTENCY_RUNS = 10

for run_idx in range(N_CONSISTENCY_RUNS):

    # Subclaims are regenerated on every run, so retrieval can differ between runs
    resp = user_input_subclaims(claim)

    output_list = transform_string_to_list(resp.text)

    # Keep each retrieved chunk once, even when several subclaims match it
    docs = []
    seen_chunks = set()

    for subclaim in output_list:
        if not subclaim.strip():
            # Nothing to search for on a blank line
            continue
        for hit in new_db.similarity_search(subclaim, k=5):
            if hit.page_content not in seen_chunks:
                seen_chunks.add(hit.page_content)
                docs.append(hit)

    response = user_input(claim, docs)
    print(response['output_text'])


The claim is not contradicted by the ESG report. The report states that "Oatly generates 73% less CO2e vs. milk, calculated from grower to grocer". This is supported by the data in the table, which shows that Oatly products have a lower carbon footprint than cow's milk.
The claim is supported by the ESG report. The report states that "Oatly generates 73% less CO2e vs. milk, calculated from grower to grocer". This is based on a Life Cycle Assessment study conducted by Blonk Consultants. The study found that the production of Oatly products generates 73% less CO2e than the production of cow's milk.
The claim is not contradicted by the ESG report. The report states that "Oatly generates 73% less CO2e vs. milk, calculated from grower to grocer". This is supported by the table in the report, which shows that Oatly's CO2e emissions are 73% lower than those of milk.
The claim is not contradicted by the ESG report. The report states that "Oatly generates 73% less CO2e vs. milk, calculated from