In [2]:
"""
ESG Risk:

A Real-Time ESG Risk Scoring Framework for Company Filings 

Authored in 2025 by Aaron Walker


Change log:
- 04/09/2025: added def text_to_scored_ESG_() & def ESG_risk_score()

"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pdfplumber
import re
import urllib.request
from pdf_parser import url_to_pdf
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline # for using the models


#Use ESG-BERT (Bidirectional Encoder Representations from Transformers) for natural language processing of company SEC filings
#Input: database of SEC filings by company.
#Output: ESG Risk score and classification (Low, Med, High)

#To Do list:
# 1) ETL (Extract) pipeline using either responsibilityreports.com (voluntary ESG reports, likely affected by greenwashing) or SEC filings (best)
# 2) Transform & load: PDF sentence classification into E,S,G
# 3) Import ESG Models, return esg classifications per sentence
# 4) Import ESG sentence classifications, analyse and return sentiment
# 5) Calculate risk score and classify company
# 6+) Apply MLOps and productionise: containerise, orchestrate, monitor -> fine-tune ESG Hugging Face model -> visualisations in Streamlit server
# Write up paper and compare with ESG literature

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 1) Extract pdfs given url to report

#Extract from:
#https://www.responsibilityreports.com/ and
#https://www.sec.gov/edgar/search/

# One report URL per company (all hosted on responsibilityreports.com).
company_url_list = [
    "https://www.responsibilityreports.com/HostedData/ResponsibilityReports/PDF/NASDAQ_AMD_2024.pdf",
    "https://www.responsibilityreports.com/HostedData/ResponsibilityReports/PDF/NYSE_MCD_2023.pdf",
    "https://www.responsibilityreports.com/HostedData/ResponsibilityReports/PDF/LSE_BT_2023.pdf",
    "https://www.responsibilityreports.com/HostedData/ResponsibilityReports/PDF/LSE_VTC_2023.pdf",
]

# NOTE(review): url_to_pdf comes from the local pdf_parser module; presumably it
# downloads each URL to the data/ folder read by the next cell -- confirm.
url_to_pdf(company_url_list)

In [4]:
# 2) Transform & load: Convert from PDF to list of clean sentences

# Report to analyse; NYSE_MCD_2023.pdf is another candidate file name.
pdf_location = "data/LSE_VTC_2023.pdf"
def pdfparser(pdf_location):
    """
    Reads a PDF and returns its text as a list of clean sentences.

    Parameters
    ----------
    pdf_location : str or path-like
        Location of the PDF; converted to ``str`` and handed to ``pdfplumber.open``.

    Returns
    -------
    list of str
        Non-empty, stripped sentences in document order. The list is empty
        when no page yields any text.
    """
    def pdf_to_text(pdf_url):
        """
        Converts PDF into string
        """
        page_texts = []
        with pdfplumber.open(str(pdf_url)) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                # Pages for which extract_text() returns nothing are skipped.
                if text:
                    page_texts.append(text)
        # Join once instead of growing a string page by page.
        return " ".join(page_texts).strip()

    def get_clean_text(text):
        """
        Cleans input text
        """
        # Remove the symbols • and * BEFORE normalising whitespace: stripping a
        # bullet out of "a • b" leaves "a  b", so doing it afterwards would
        # re-introduce the double spaces the next step is meant to eliminate.
        text = re.sub(r'[•*]', '', text)
        # Collapse every whitespace run (newlines included) into a single space.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def get_clean_text_to_list(words):
        """
        Splits clean text from words into sentences, then returns as list
        """
        sentences = get_clean_text(words)
        # Split on whitespace that directly follows ., ! or ? so the
        # punctuation stays attached to its sentence.
        sentences = re.split(r'(?<=[.!?])\s+', sentences)
        return [s.strip() for s in sentences if s.strip()]

    pdf_text = pdf_to_text(pdf_location)
    return get_clean_text_to_list(pdf_text)

# Parse the report at pdf_location into a list of sentence strings.
clean_text = pdfparser(pdf_location)

In [5]:
# Sanity check of the parse: number of sentences found and the first three of them.
print(f"Length of company filing in sentences: {len(clean_text)}")
print(f"Sample sentences: {clean_text[:3]}")

Length of company filing in sentences: 1315
Sample sentences: ['Environment, Social and Governance Report 2023 2023 ESG REPORT OVERVIEW GOVERNANCE & STRATEGY ECONOMIC & INNOVATION ENVIRONMENT SOCIAL Introduction Videndum is committed to operating as a sustainable business, with a focus on reducing our environmental footprint and actively contributing to the development of the communities in which we operate.', 'Our robust governance framework is designed to ensure the continued success of our business, while minimising risks to our operations and supply chains.', 'Our approach Three years ago, we launched our first Environment, Social and Governance (“ESG”) Report, presenting a framework with targets and indicators to track our progress.']


In [6]:
# 3) Import ESG Models, return esg classifications per sentence

### Load the models (takes ca. 1 min)
def _load_text_classifier(name):
    """
    Builds a text-classification pipeline for one HuggingFace model.

    Parameters
    ----------
    name : str
        Model identifier (path to download from HuggingFace).

    Returns
    -------
    transformers pipeline
        Combines the tokenizer and the model into one callable.
    """
    # In simple words, the tokenizer prepares the text for the model and the
    # model classifies the text. Both stay local to this helper, so the objects
    # of one model can never be mistaken for those of another.
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSequenceClassification.from_pretrained(name)
    return pipeline("text-classification", model=model, tokenizer=tokenizer)


# One classifier per ESG pillar.
pipe_env = _load_text_classifier("ESGBERT/EnvironmentalBERT-environmental")
pipe_soc = _load_text_classifier("ESGBERT/SocialBERT-social")
pipe_gov = _load_text_classifier("ESGBERT/GovernanceBERT-governance")

# General-purpose sentiment model (SST-2 fine-tuned DistilBERT), PyTorch backend.
pipe_senti = pipeline("sentiment-analysis", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english", framework="pt")

Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu


In [7]:
# Example outputs: run every pipeline on the first ten sentences.
test = pipe_env(clean_text[:10])     # environmental classification
test1 = pipe_soc(clean_text[:10])    # social classification
test2 = pipe_gov(clean_text[:10])    # governance classification
test3 = pipe_senti(clean_text[:10])  # sentiment
# One result list per output line, in the order the pipelines were run.
print(test, test1, test2, test3, sep="\n")

[{'label': 'environmental', 'score': 0.9966869950294495}, {'label': 'none', 'score': 0.995571494102478}, {'label': 'environmental', 'score': 0.9963024854660034}, {'label': 'environmental', 'score': 0.9964116215705872}, {'label': 'environmental', 'score': 0.9903509020805359}, {'label': 'none', 'score': 0.994099497795105}, {'label': 'none', 'score': 0.993297278881073}, {'label': 'none', 'score': 0.9930485486984253}, {'label': 'none', 'score': 0.9956704378128052}, {'label': 'none', 'score': 0.7723640203475952}]
[{'label': 'social', 'score': 0.9995939135551453}, {'label': 'none', 'score': 0.9999203681945801}, {'label': 'social', 'score': 0.9988338351249695}, {'label': 'social', 'score': 0.9993301630020142}, {'label': 'social', 'score': 0.9904727339744568}, {'label': 'none', 'score': 0.9999352693557739}, {'label': 'none', 'score': 0.9998633861541748}, {'label': 'none', 'score': 0.999942421913147}, {'label': 'social', 'score': 0.9996960163116455}, {'label': 'none', 'score': 0.999886989593505

In [8]:
# Text to score: every sentence of the parsed report.
sample_text = clean_text
# Pipelines to apply; text_to_scored_ESG_ picks the output column names by
# checking which of these pipelines it is handed.
ESG_features = [pipe_env, pipe_soc, pipe_gov, pipe_senti]
def text_to_scored_ESG_(ESG_features, sentences=None):
    """
    Applies ESG models to text, returns df of sentence labels and scores

    Parameters
    ----------
    ESG_features : iterable of callables
        Pipelines to apply. Each one is called as
        ``pipe(sentences, padding=True, truncation=True)`` and must return one
        dict with the keys "label" and "score" per sentence.
    sentences : list of str, optional
        Text to score. When omitted, the notebook-level ``sample_text`` is
        used, which keeps the original one-argument call working.

    Returns
    -------
    pandas.DataFrame
        A "sentence" column plus a label column and a score column for every
        pipeline: Environment/Env_score for ``pipe_env``, Social/Social_score
        for ``pipe_soc``, Government/Gov_score for ``pipe_gov`` and
        Sentiment/Senti_score for any other pipeline.
    """
    if sentences is None:
        sentences = sample_text
    if isinstance(sentences, str):
        # A bare string would otherwise be scored character by character.
        sentences = [sentences]
    sentences = list(sentences)

    # Known classifiers and the columns they fill. The governance label column
    # keeps its existing name "Government" because esg_risk_score reads it.
    known_pipes = (
        (pipe_env, ("Environment", "Env_score")),
        (pipe_soc, ("Social", "Social_score")),
        (pipe_gov, ("Government", "Gov_score")),
    )
    # Anything that is not one of the three classifiers is treated as sentiment.
    fallback_columns = ("Sentiment", "Senti_score")

    report_df = pd.DataFrame({"sentence": sentences})

    for pipe in ESG_features:
        # Nothing to classify for empty input; skip the model call entirely.
        results = pipe(sentences, padding=True, truncation=True) if sentences else []

        label_col, score_col = next(
            (columns for known, columns in known_pipes if pipe is known),
            fallback_columns,
        )
        report_df[label_col] = [x["label"] for x in results]
        report_df[score_col] = [x["score"] for x in results]

    return report_df
        
# Score the sentences with every pipeline in ESG_features, then preview the first ten rows.
df = text_to_scored_ESG_(ESG_features)
df.head(10)

Unnamed: 0,sentence,Environment,Env_score,Social,Social_score,Government,Gov_score,Sentiment,Senti_score
0,"Environment, Social and Governance Report 2023...",environmental,0.996687,social,0.999594,governance,0.957807,POSITIVE,0.996997
1,Our robust governance framework is designed to...,none,0.995571,none,0.99992,governance,0.996511,POSITIVE,0.999345
2,"Our approach Three years ago, we launched our ...",environmental,0.996302,social,0.998834,governance,0.948303,POSITIVE,0.978013
3,"Now, in our third year of reporting, our progr...",environmental,0.996412,social,0.99933,governance,0.953452,POSITIVE,0.998439
4,Contents Overview 2 Governance & Strategy 6 Ec...,environmental,0.990351,social,0.990473,none,0.542238,POSITIVE,0.984067
5,software solutions to the content creation con...,none,0.994099,none,0.999935,none,0.997598,POSITIVE,0.914233
6,"TikTok, YouTube and Instagram Production Solut...",none,0.993297,none,0.999863,none,0.991725,POSITIVE,0.953703
7,Creating video and audio content to stream com...,none,0.993049,none,0.999942,none,0.997583,POSITIVE,0.833742
8,"live or pre-recorded to their employees, Viden...",none,0.99567,social,0.999696,none,0.997188,POSITIVE,0.997051
9,"Products include video fluid heads, tripods, c...",none,0.772364,none,0.999887,none,0.993477,POSITIVE,0.99022


In [9]:
#ESG Risk scoring:

def esg_risk_score(row):
    """
    Risk contribution of a single scored sentence.

    For a sentence labelled NEGATIVE, every pillar whose label is not "none"
    adds (pillar score * sentiment score) to the result. Any other sentence
    scores 0.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "Sentiment" and "Senti_score" plus the label/score pairs
        Environment/Env_score, Social/Social_score and Government/Gov_score.

    Returns
    -------
    number
        Sum of the contributions; 0 when nothing qualifies.
    """
    # Sentences that are not negative never contribute.
    if row["Sentiment"] != "NEGATIVE":
        return 0

    pillar_columns = (
        ("Environment", "Env_score"),
        ("Social", "Social_score"),
        ("Government", "Gov_score"),
    )

    total = 0
    for label_col, confidence_col in pillar_columns:
        if row[label_col] == "none":
            continue
        total += row[confidence_col] * row["Senti_score"]
    return total

# Score each row (axis=1 hands esg_risk_score one row at a time) and store it as a new column.
df["ESG_risk_score"] = df.apply(esg_risk_score, axis=1)

# Optional: report-level figure, the sum of all per-sentence scores.
total_esg_risk = df.loc[:, "ESG_risk_score"].sum()

print(df.loc[:, ["sentence", "ESG_risk_score"]])
print("Total ESG risk:", total_esg_risk)

                                               sentence  ESG_risk_score
0     Environment, Social and Governance Report 2023...             0.0
1     Our robust governance framework is designed to...             0.0
2     Our approach Three years ago, we launched our ...             0.0
3     Now, in our third year of reporting, our progr...             0.0
4     Contents Overview 2 Governance & Strategy 6 Ec...             0.0
...                                                 ...             ...
1310  The “Videndum Recycles” events, encouraging gr...             0.0
1311                        home for proper management.             0.0
1312  Our events serve as an opportunity for employe...             0.0
1313  64 Videndum plc Bridge House Heron Square Rich...             0.0
1314                        Image credit: Felix Belloin             0.0

[1315 rows x 2 columns]
Total ESG risk: 222.6031400287411


In [None]:
# Rank the sentences from highest to lowest risk score.
df_sorted = df.sort_values("ESG_risk_score", ascending=False)
df_sorted

Unnamed: 0,sentence,Environment,Env_score,Social,Social_score,Government,Gov_score,Sentiment,Senti_score,ESG_risk_score
912,"We look to rework with our top suppliers, ensu...",environmental,0.972088,social,0.999434,governance,0.869402,NEGATIVE,0.993148,2.821459
591,The questionnaire requested details of our sup...,environmental,0.997899,social,0.999428,governance,0.813987,NEGATIVE,0.973730,2.737460
268,The ESG programme is led by Marco operational ...,environmental,0.995431,social,0.999090,governance,0.895726,NEGATIVE,0.945576,2.732949
132,11 2023 ESG REPORT OVERVIEW GOVERNANCE & STRAT...,environmental,0.995464,social,0.999422,governance,0.638805,NEGATIVE,0.988345,2.602996
258,The ESG Committee met four times during 2023 t...,environmental,0.996047,social,0.999570,governance,0.985419,NEGATIVE,0.821040,2.447550
...,...,...,...,...,...,...,...,...,...,...
451,"IInn tthhee UUKK,, commerce; (2) subscription ...",none,0.995882,none,0.999812,none,0.985483,POSITIVE,0.800595,0.000000
450,wwhhiicchh wwiillll nneeeedd ttoo bbee eeqquui...,none,0.996117,none,0.999919,none,0.996392,NEGATIVE,0.991773,0.000000
449,Long-term margin improvement medium-term.,none,0.997219,none,0.999928,none,0.997541,POSITIVE,0.986571,0.000000
448,These are: (1) internet/e- capital.,none,0.995286,none,0.999945,none,0.998061,NEGATIVE,0.966352,0.000000


In [12]:
# Full text of the sentence with index label 912 (the top-ranked row in the table above).
df_sorted.loc[912, "sentence"]

'We look to rework with our top suppliers, ensuring we parts in-house before sending them to maintain strong levels of communication our suppliers, saving on shipping parts on ESG topics.'

In [None]:
# 5) Calculate risk score and classify company: save to a dataframe or database (Postgres) and add a FastAPI endpoint to look up a company and return its risk score and classification
#to do:
#def store_ESG_Risk_Scores():
#-Sentence/ column detection

In [None]:
#def Run_ESG_Risk_analysis(list_of_companies):
    

In [None]:
#def return_ESG_Risk_Score():
#use FastAPI to query and return scores from ESG_Risk_database
    

In [None]:
# 6+) Apply MLOps and productionise: containerise, orchestrate, monitor -> fine-tune ESG Hugging Face model -> visualisations in Streamlit server