In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pdfplumber
import re
import requests
import urllib.request

#Use ESG-BERT (Bidirectional Encoder Representations from Transformers) for natural language processing of company SEC filings
#Input: databbase of SEC filings by company.
#Outout: ESG Risk score and classifcation (Low, Med, High)
#To Do list
# 1) ETL pipeline using either responsibility reports.com (volentary ESG reports, likely effected by greenwashing) or SEC filings (best)
# 2) PDF sentence classification into E,S,G
# 3) Sentence sentiment positive, negative, neutral
# 4) ESG overall Score
# 5) pipeline to run on large list of comnpanies (10 to begin with)
# Visualistions

In [None]:
# Extract pdfs given url to report
#https://www.responsibilityreports.com/

company_url_list = [
"https://www.responsibilityreports.com/HostedData/ResponsibilityReports/PDF/NASDAQ_AMD_2024.pdf",
"https://www.responsibilityreports.com/HostedData/ResponsibilityReports/PDF/NYSE_MCD_2023.pdf",
"https://www.responsibilityreports.com/HostedData/ResponsibilityReports/PDF/LSE_BT_2023.pdf"
]

def url_to_pdf(company_url_list):
    for company_url in company_url_list:
        match = re.search(r'/([^/]+)\.pdf$', company_url)
        if match:
            file_name = match.group(1)
        urllib.request.urlretrieve(company_url, f"Reports\{file_name}.pdf")
        
        
url_to_pdf(company_url_list)



In [None]:
# Convert from PDF to list of clean sentences
pdf_location = "data\McDonalds_CSR_Report.pdf"

def pdf_to_text(pdf_url):
    """Converts PDF into string

    Args:
        pdf_url (string): link to pdf location

    Returns:
        all_text: string containing all pdf text
    """
    all_text = ""
    with pdfplumber.open(f"{pdf_url}") as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text: 
                all_text += text + " "
    return all_text.strip()

def get_clean_text(text):
    """Cleans input text

    Args:
        text (string): uncleaned text
        
    Returns:
        Cleaned_text: the cleaned text
    """
    text = text.replace("\n"," ") # convert newline into space
    text = re.sub(r'\s+', ' ', text) #convert multuple spaces into single space 
    text = re.sub(r'[^x00-\x7f]','',text) # remove non ascii characters
    text = re.sub(r'[•*]','', text) # removes symbols • and *
    return text.strip()

def get_clean_text_to_list(words):
    sentences = get_clean_text(words)
    return [s.strip() for s in sentences if s.strip()]
    
    

pdf_text = pdf_to_text(pdf_location)
clean_texts = get_clean_text_to_list(pdf_text) 

In [28]:
clean_texts

['2',
 '0',
 '2',
 '3',
 '2',
 '0',
 '2',
 '4',
 'O',
 'u',
 'r',
 'P',
 'u',
 'r',
 'p',
 'o',
 's',
 'e',
 'I',
 'm',
 'p',
 'a',
 'c',
 't',
 'R',
 'e',
 'p',
 'o',
 'r',
 't',
 'M',
 'c',
 'D',
 'o',
 'n',
 'a',
 'l',
 'd',
 's',
 'C',
 'o',
 'r',
 'p',
 'o',
 'r',
 'a',
 't',
 'i',
 'o',
 'n',
 'I',
 'm',
 'p',
 'a',
 'c',
 't',
 'R',
 'e',
 'p',
 'o',
 'r',
 't',
 'M',
 'c',
 'D',
 'o',
 'n',
 'a',
 'l',
 'd',
 's',
 'C',
 'o',
 'r',
 'p',
 'o',
 'r',
 'a',
 't',
 'i',
 'o',
 'n',
 'P',
 'u',
 'r',
 'p',
 'o',
 's',
 'e',
 'I',
 'm',
 'p',
 'a',
 'c',
 't',
 'R',
 'e',
 'p',
 'o',
 'r',
 't',
 '2',
 '0',
 '2',
 '3',
 '2',
 '0',
 '2',
 '4',
 'I',
 'n',
 't',
 'r',
 'o',
 'd',
 'u',
 'c',
 't',
 'i',
 'o',
 'n',
 'O',
 'u',
 'r',
 'P',
 'l',
 'a',
 'n',
 'e',
 't',
 'F',
 'o',
 'o',
 'd',
 'Q',
 'u',
 'a',
 'l',
 'i',
 't',
 'y',
 'S',
 'o',
 'u',
 'r',
 'c',
 'i',
 'n',
 'g',
 'J',
 'o',
 'b',
 's',
 'I',
 'n',
 'c',
 'l',
 'u',
 's',
 'i',
 'o',
 'n',
 'E',
 'm',
 'p',
 'o',
 'w'