In [None]:
import io
import numpy as np
import glob
import string
import re
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer 

def extract_text_by_page(pdf_path):
    """Yield the extracted text of a PDF, one string per page.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Yields:
        str: Whatever text pdfminer's TextConverter wrote for the page.
    """
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=False):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            finally:
                # Release the per-page handles *before* yielding: code placed
                # after a yield never runs if the consumer stops iterating
                # early, and would also be skipped if process_page raised.
                converter.close()
                fake_file_handle.close()
            yield text
            
def extract_keyword_text(pdf_path, gt_kwl, pr_kwl, tr_kwl, tokenizer, porterstemmer):
    """Collect the normalised text of every PDF page that mentions a keyword.

    Each page is lower-cased, tokenised, stripped of punctuation and of every
    token that is not purely alphabetic, and stemmed. A page is kept when at
    least one keyword occurs in it as a whole-word phrase. Keywords are put
    through the same normalisation as the page text, so multi-word phrases
    ("climate risk"), space-padded entries (" ghg ") and words altered by
    stemming are all able to match.

    Args:
        pdf_path: Path of the PDF file to read.
        gt_kwl: List of general-term keywords.
        pr_kwl: List of physical-risk keywords.
        tr_kwl: List of transition-risk keywords.
        tokenizer: Object exposing ``tokenize(str) -> list[str]``.
        porterstemmer: Object exposing ``stem(str) -> str``.

    Returns:
        list: ``[file_name, text]`` where ``file_name`` is the last component
        of ``pdf_path`` and ``text`` is the space-joined normalised text of
        the pages that were kept ("" when no page matched).
    """
    import os  # local import: only needed for the portable file-name lookup

    def normalise(text):
        """Lower-case, tokenise, keep alphabetic tokens only, then stem."""
        return [porterstemmer.stem(token)
                for token in tokenizer.tokenize(text.lower())
                if token not in string.punctuation and token.isalpha()]

    # Normalise the keywords once, outside the page loop. Each phrase is
    # padded with spaces so that it can only match on token boundaries.
    keyword_phrases = set()
    for keyword in gt_kwl + pr_kwl + tr_kwl:
        phrase = " ".join(normalise(keyword))
        # A keyword with no alphabetic token left (e.g. "co2") normalises to
        # "" and is skipped: such tokens are removed from the pages as well,
        # and an empty phrase would otherwise match every page.
        if phrase:
            keyword_phrases.add(" " + phrase + " ")

    used_page = []
    for page in extract_text_by_page(pdf_path):
        page_text = " ".join(normalise(page))
        padded_text = " " + page_text + " "
        if any(phrase in padded_text for phrase in keyword_phrases):
            used_page.append(page_text)

    result = " ".join(used_page)
    print("done")
    return [os.path.basename(pdf_path), result]

def extract_all_text(pdf_path, tokenizer, porterstemmer):
    """Return the normalised text of every page of a PDF.

    Each page is lower-cased, tokenised, stripped of punctuation and of every
    token that is not purely alphabetic, and stemmed; all pages are kept.

    Args:
        pdf_path: Path of the PDF file to read.
        tokenizer: Object exposing ``tokenize(str) -> list[str]``.
        porterstemmer: Object exposing ``stem(str) -> str``.

    Returns:
        list: ``[file_name, text]`` where ``file_name`` is the last component
        of ``pdf_path`` and ``text`` is the space-joined normalised text of
        all pages.
    """
    import os  # local import: only needed for the portable file-name lookup

    used_page = []
    for page in extract_text_by_page(pdf_path):
        stems = [porterstemmer.stem(token)
                 for token in tokenizer.tokenize(page.lower())
                 if token not in string.punctuation and token.isalpha()]
        used_page.append(" ".join(stems))

    result = " ".join(used_page)
    print("done")
    # os.path.basename also copes with the backslash separators that glob
    # produces on Windows, which splitting on "/" does not.
    return [os.path.basename(pdf_path), result]

# Keyword vocabularies handed to extract_keyword_text, grouped into general
# climate terms, physical-risk terms and transition-risk terms. Several
# entries are multi-word phrases, and a few (" ghg ", " esg ", " epa ") are
# written with a leading and trailing space.
general_term_key_words_list = ["climate", "climate risk", "carbon emission", " ghg ", "environment", " esg ", "physical risk",
                               "transition risk", "co2", "sustainability", "green economy", "energy efficiency", "clean energy",
                               "waste recuperation", "renewable energy", "wind", "hydro", "solar", "global warming", "hydrocarbon",
                               "methane", "oil", "coal", "gas", "greenhouse gases", "composting", "recycling", "soil pollution",
                               "air pollution", "water pollution", "natural hazard"]
physical_risk_key_words_list = ["heat wave", "cold wave", "flood", "drought", "wildfire", "storm", "change in temperature", 
                                "change in precipitation patterns", "sea level rise", "coastal hazard", "fluvial hazard"]
transition_risk_key_words_list = ["paris agreement", "kyoto protocol", " epa ", "environmental protection agency"]

# Gather the annual-report PDFs from the four report folders; np.append
# concatenates the glob results into a single array of paths.
ls_annual_report1 = np.append(glob.glob("./annual report/*.pdf"), glob.glob("./annual report 1/*.pdf"))
ls_annual_report2 = np.append(glob.glob("./annual report 2/*.pdf"), glob.glob("./annual report 3/*.pdf"))
ls_annual_report = np.append(ls_annual_report1, ls_annual_report2)
# One tokenizer and one stemmer instance, reused for every document.
tokenizer = TreebankWordTokenizer()
porterstemmer = PorterStemmer()
# One [file name, text] entry per annual report, as returned by
# extract_keyword_text (which prints "done" after each file).
ar_corpus = [extract_keyword_text(file, 
                               general_term_key_words_list, 
                               physical_risk_key_words_list, 
                               transition_risk_key_words_list, 
                               tokenizer, porterstemmer) for file in ls_annual_report]


done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done


In [None]:
# Keep only the part of each annual-report name that precedes its last
# underscore (the name becomes "" when it contains no underscore at all).
for entry in ar_corpus:
    entry[0] = entry[0].rpartition("_")[0]

In [None]:
# Load the remaining PDF collections, one folder per index:
#   * investor presentations   -> only the pages selected by extract_keyword_text
#   * sustainability reports   -> the full text, via extract_all_text


def _keyword_corpus(pdf_files):
    """Run extract_keyword_text on every file with the shared keyword lists."""
    return [extract_keyword_text(pdf_file,
                                 general_term_key_words_list,
                                 physical_risk_key_words_list,
                                 transition_risk_key_words_list,
                                 tokenizer, porterstemmer)
            for pdf_file in pdf_files]


def _full_text_corpus(pdf_files):
    """Run extract_all_text on every file."""
    return [extract_all_text(pdf_file, tokenizer, porterstemmer)
            for pdf_file in pdf_files]


# File listings (all of them are collected before any PDF is parsed).
ls_investor_SP500_pre = glob.glob("./presentation/S&P500/*.pdf")
ls_investor_RLV_pre = glob.glob("./presentation/RLV/*.pdf")
ls_investor_RUS_pre = glob.glob("./presentation/RUS/*.pdf")
ls_investor_EURO_pre = glob.glob("./presentation/EURO/*.pdf")
ls_sustainability_SP500_report = glob.glob("./sustainability/S&P500/*.pdf")
ls_sustainability_RLV_report = glob.glob("./sustainability/RLV/*.pdf")
ls_sustainability_RUS_report = glob.glob("./sustainability/RUS/*.pdf")
ls_sustainability_EURO_report = glob.glob("./sustainability/EURO/*.pdf")

# Investor presentations: keyword-selected pages only.
ip_sp500_corpus = _keyword_corpus(ls_investor_SP500_pre)
ip_rlv_corpus = _keyword_corpus(ls_investor_RLV_pre)
ip_rus_corpus = _keyword_corpus(ls_investor_RUS_pre)
ip_euro_corpus = _keyword_corpus(ls_investor_EURO_pre)

# Sustainability reports: every page.
sr_sp500_corpus = _full_text_corpus(ls_sustainability_SP500_report)
sr_rlv_corpus = _full_text_corpus(ls_sustainability_RLV_report)
sr_rus_corpus = _full_text_corpus(ls_sustainability_RUS_report)
sr_euro_corpus = _full_text_corpus(ls_sustainability_EURO_report)

done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done


In [None]:
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer 

import sys
import csv

# This part reads the data Ryan sent over (corpus4_5.csv); modify or skip it
# as needed. Column 2 of each row is used as the text and column 3 as the
# company name; the first row is skipped as a header. Rows whose normalised
# text mentions at least one keyword are collected in nry_text as
# [company_name, text].

# Raise the csv module's per-field size cap as far as the platform allows.
# Where sys.maxsize does not fit in a C long, field_size_limit raises
# OverflowError, so shrink the value until it is accepted.
field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(field_limit)
        break
    except OverflowError:
        field_limit //= 10

# newline="" lets the csv module itself handle line breaks inside quoted fields.
with open("./corpus4_5.csv", newline="") as csvfile:
    new_list = list(csv.reader(csvfile))

nry_text = []
tokenizer = TreebankWordTokenizer()
ps = PorterStemmer()


def _stemmed_alpha_tokens(text):
    """Lower-case, tokenise, keep alphabetic tokens only, then stem."""
    return [ps.stem(token)
            for token in tokenizer.tokenize(text.lower())
            if token not in string.punctuation and token.isalpha()]


# Normalise the keywords exactly like the text so that multi-word phrases,
# space-padded entries and stemmed forms can match. Each phrase is padded
# with spaces so it only matches on token boundaries; keywords that
# normalise to "" (e.g. "co2") are skipped because they would match anything.
keyword_phrases = set()
for keyword in (general_term_key_words_list
                + physical_risk_key_words_list
                + transition_risk_key_words_list):
    phrase = " ".join(_stemmed_alpha_tokens(keyword))
    if phrase:
        keyword_phrases.add(" " + phrase + " ")

for item in new_list[1:]:
    company_name = item[3]
    page_text = " ".join(_stemmed_alpha_tokens(item[2]))
    padded_text = " " + page_text + " "
    if any(phrase in padded_text for phrase in keyword_phrases):
        nry_text.append([company_name, page_text])

In [None]:
# Merge every source into one list of [name, text] entries. The entries are
# the same list objects held by the per-source lists, not copies.
corpus = [
    *ar_corpus,
    *nry_text,
    *ip_sp500_corpus,
    *ip_rlv_corpus,
    *ip_rus_corpus,
    *ip_euro_corpus,
    *sr_sp500_corpus,
    *sr_rlv_corpus,
    *sr_rus_corpus,
    *sr_euro_corpus,
]

In [None]:
# Modify the company name indices we use:
# entries after the annual-report and CSV rows still carry a file name of the
# form "<ticker parts> <suffix>.<ext>"; rewrite it as "<EXCHANGE>_<ticker parts>".

# Trailing code of the name -> exchange prefix. Anything else becomes "OTC".
exchange_by_suffix = {
    "UW": "NASDQ",
    "UQ": "NASDQ",
    "UR": "NASDQ",
    "LN": "LSE",
    "UA": "AMEX",
    "UN": "NYSE",
}

first_file_entry = len(ar_corpus) + len(nry_text)
for entry in corpus[first_file_entry:]:
    # Drop the extension (everything after the last ".").
    valid_name = " ".join(entry[0].split(".")[:-1])
    *ticker_parts, suffix = valid_name.split(" ")
    # A single lookup with a fallback guarantees that a recognised suffix is
    # never replaced by the "OTC" default.
    exchange = exchange_by_suffix.get(suffix, "OTC")
    entry[0] = "_".join([exchange] + ticker_parts)

In [None]:
# Finally build our real corpus used for NLP processing:
# In the structure of "company - text"

# Group the texts per company in one pass over corpus. A dict keeps the
# companies in order of first appearance, so the result is the same on every
# run, and each company's texts stay in corpus order.
texts_by_company = {}
for entry in corpus:
    texts_by_company.setdefault(entry[0], []).append(entry[1])

company_name_list = list(texts_by_company)
real_corpus = [[name, " ".join(texts)] for name, texts in texts_by_company.items()]

real_corpus[0]

['NYSE_BLX',
 'dear sharehold busi wa mark by signific challeng for bladex in a still uncertain global macroeconom notwithstand a posit forecast by the intern monetari fund imf for global gdp growth of gener increas while the rise in interest rate by the feder reserv fed in the unit state normal monetari polici it also led to an appreci of the dollar and a neg impact on emerg economi particularli in latin in addit the further escal of protection between the unit state and it commerci partner especi china remain a sourc of uncertainti for foreign trade and intern financi thi ha brought about a slowdown in the momentum of econom activ in latin america and the caribbean end at essenti the same pace as with estim averag growth of in gener the region face veri heterogen scenario crisi in argentina uncertainti in brazil and mexico and greater resili in the pacif trade in latin america show a posit perform with estim growth of in abov global trade growth of thi growth is mainli explain by the

In [None]:
# Save all the preprocessed data in a csv file (one "company, text" row each)

# newline="" is what the csv module expects for files it writes; without it
# an extra blank line can appear after every row on platforms that translate
# "\n" to "\r\n".
with open("real whole corpus.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(real_corpus)

In [None]:
# Modify the company name indices in BB_DISCLOSURE_SCORE.xlsx to make it match with names we use:

import pandas as pd  # the DataFrames below need it; no visible cell imports it


def _read_csv_rows(path):
    """Return every row of the CSV file at *path* as a list of string lists."""
    # newline="" lets the csv module handle line breaks inside quoted fields.
    with open(path, newline="") as csvfile:
        return list(csv.reader(csvfile))


sp500_list = _read_csv_rows("./index company/S&P.csv")
rlv_list = _read_csv_rows("./index company/RLV.csv")
euro_list = _read_csv_rows("./index company/EURO.csv")
rus_list = _read_csv_rows("./index company/RUS.csv")

# Stack all rows, keeping only the first row of the S&P file (used as the
# header), and transpose so that each row of new_list is one column:
# [header, value, value, ...].
new_list = np.array(sp500_list+rlv_list[1:]+euro_list[1:]+rus_list[1:]).T
# Only the first three columns are used; they are expected to include "Name"
# and "ENV_DISCLOSURE_SCORE", which are read below.
new_dict = {column[0]: column[1:] for column in new_list[:3]}
cr_disclosure_company = pd.DataFrame(new_dict)
# Drop the companies whose score is the "#N/A N/A" placeholder.
cr_disclosure_company = cr_disclosure_company[~cr_disclosure_company["ENV_DISCLOSURE_SCORE"].isin(["#N/A N/A"])]

# Trailing code of the name -> exchange prefix. Anything else becomes "OTC".
exchange_by_suffix = {
    "UW": "NASDQ",
    "UQ": "NASDQ",
    "UR": "NASDQ",
    "LN": "LSE",
    "UA": "AMEX",
    "UN": "NYSE",
}

new_names = list(cr_disclosure_company["Name"])
real_names = []
for valid_name in new_names:
    *ticker_parts, suffix = valid_name.split(" ")
    # A single lookup with a fallback guarantees that a recognised suffix is
    # never replaced by the "OTC" default.
    exchange = exchange_by_suffix.get(suffix, "OTC")
    real_names.append("_".join([exchange] + ticker_parts))

new_cr_disclosure_company = pd.DataFrame({"Name": real_names,
                                          "ENV_DISCLOSURE_SCORE": list(cr_disclosure_company["ENV_DISCLOSURE_SCORE"])})

new_cr_disclosure_company

Unnamed: 0,Name,ENV_DISCLOSURE_SCORE
0,NYSE_AAP,22.48062016
1,OTC_ADBE,41.86046512
2,OTC_ADP,9.302325581
3,NYSE_AIG,7.142857143
4,NYSE_AME,5.426356589
...,...,...
1500,NYSE_YEXT,1.550387597
1501,OTC_YGYI,1.550387597
1502,OTC_YORW,8.275862069
1503,OTC_YRCW,6.201550388


In [None]:
# Write and store modified company names:
# to_csv is called with its default settings, so the DataFrame index is
# written as well (as an unnamed first column).

new_cr_disclosure_company.to_csv("real BB company.csv")

In [None]:
# This part is just for my own experiment on LSA and LDA model (unsupervised learning part)
# Ryan you can re-organize this part or just neglect it.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.pipeline import Pipeline

# Number of LDA topics (hyper-parameter). It was not defined in any visible
# cell; 10 is scikit-learn's own default for n_components -- tune as needed.
n_topic = 10

vectorizer = TfidfVectorizer(stop_words='english',
                             use_idf=True,
                             smooth_idf=True)
# SVD dimensionality reduction
svd_model = TruncatedSVD(n_components=100,         # num dimensions, hyper-parameter
                         algorithm='randomized',
                         n_iter=10)

# LDA dimensionality reduction (configured here but not fitted in this cell)
lda_model = LatentDirichletAllocation(n_components=n_topic,
                                      max_iter=100,
                                      learning_method='batch',
                                      evaluate_every=200,
                                      verbose=0)

# pipeline
svd_transformer = Pipeline([('tfidf', vectorizer),
                            ('svd', svd_model)])
# The vectorizer needs plain strings, while corpus holds [name, text] lists,
# so fit on the text part of each entry.
documents = [entry[1] for entry in corpus]
svd_matrix = svd_transformer.fit_transform(documents)
svd_matrix

246