In [1]:
import os
import requests
import pandas as pd
import numpy as np
from timeit import default_timer as timer
import datetime
import pickle
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from wordcloud import WordCloud, STOPWORDS
import re
import string
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')


from utils import sentence_classification
# import logging

CACHE_DIR = 'D:/huggingface_cache/'    # cache directory for huggingface models

[nltk_data] Downloading package punkt to C:\Users\Alessandro
[nltk_data]     Bitetto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Alessandro
[nltk_data]     Bitetto\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Alessandro
[nltk_data]     Bitetto\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# set folders
CHECKPOINT_FOLDER = '.\\Checkpoints'
RESULTS_FOLDER = '.\\Results'
SENTIMENT_FOLDER = '.\\Checkpoints\\Sentiment'

if not os.path.exists(CHECKPOINT_FOLDER):
    os.makedirs(CHECKPOINT_FOLDER)
if not os.path.exists(RESULTS_FOLDER):
    os.makedirs(RESULTS_FOLDER)
if not os.path.exists(SENTIMENT_FOLDER):
    os.makedirs(SENTIMENT_FOLDER)

## Load text files, clean and do lemmatisation

ICOs with "FundRaisedUSD" will be kept anyway

In [17]:
# load text
LENGTH_TXT_CLEAN_THRSH = 4000     # threshold for maximum non-empty characters in parsed txt file

# available "FundRaisedUSD"
formatted_df=pd.read_pickle(os.path.join(CHECKPOINT_FOLDER, 'formatted_df.pkl'))
avail_fund_url=formatted_df[~formatted_df['FundRaisedUSD'].isna()]['url'].values

with open(os.path.join(CHECKPOINT_FOLDER, 'whitepaper_final.pickle'), 'rb') as handle:
    final_df = pickle.load(handle)

df_text = (final_df[(final_df['Final_Length_txt_clean'] >= LENGTH_TXT_CLEAN_THRSH) | (final_df['url'].isin(avail_fund_url))]
           [['url', 'Final_Length_txt', 'Final_Length_txt_clean', 'Final_Path_txt']])
df_text = df_text[df_text['Final_Path_txt'] != ''].reset_index(drop = True)
df_text['text'] = ''

start = timer()
for index, row in df_text.iterrows():
    
    print('Reading ' + str(index + 1) + ' / ' + str(len(df_text)), end = '\r')
    
    with open(row['Final_Path_txt']) as f:
        df_text.loc[index, 'text'] = f.read()
print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))    

Reading 1867 / 1867
Total elapsed time: 0:00:01


In [18]:
# clean and lemmatisation
# https://github.com/Briiick/NLP-disaster-tweets/blob/main/notebooks/3-heavy-cleaning-BERT.ipynb

stopwords = set(STOPWORDS)
stopwords.update(["nan"])

def heavy_text_clean(x):
    # first we lowercase everything
    x = x.lower()
    x = ' '.join([word for word in x.split(' ') if word not in stopwords])
    # remove unicode characters
    x = x.encode('ascii', 'ignore').decode()
    x = re.sub(r'https*\S+', ' ', x)
    x = re.sub(r'http*\S+', ' ', x)
    # then use regex to remove @ symbols and hashtags
    x = re.sub(r'@\S', '', x)
    x = re.sub(r'#\S+', ' ', x)
    x = re.sub(r'\'\w+', '', x)
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    x = re.sub(r'\w*\d+\w*', '', x)
    x = re.sub(r'\s{2,}', ' ', x)
    x = re.sub(r'\s[^\w\s]\s', '', x)
    # remove single letters and numbers surrounded by space
    x = re.sub(r'\s[a-z]\s|\s[0-9]\s', ' ', x)
    return x

def lemmatise(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized)

start = timer()
# apply cleaning
print('- Cleaning...')
df_text['text_clean'] = df_text['text'].apply(heavy_text_clean)

# apply lemmatisation
print('- Lemmatisation...')
df_text['text_clean'] = df_text['text_clean'].apply(lemmatise)
df_text['word_count'] = df_text['text_clean'].str.split().apply(len)

# remove text with no words
check=df_text[df_text['word_count'] == 0]
if len(check) > 0:
    print(f'\n- {len(check)} rows removed because of no word left')
    df_text=df_text[df_text['word_count'] > 0]

print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))

pkl_path=os.path.join(CHECKPOINT_FOLDER, 'df_text.pkl')
joblib.dump(df_text, pkl_path, compress=('lzma', 3))
print(f'\nData saved in {pkl_path}')

df_text.head(5)

- Cleaning...
- Lemmatisation...

- 1 rows removed because of no word left

Total elapsed time: 0:01:25

Data saved in .\Checkpoints\df_text.pkl


Unnamed: 0,url,Final_Length_txt,Final_Length_txt_clean,Final_Path_txt,text,text_clean,word_count
0,https://icomarks.com/ico/the-mill-of-blood,23152,8655,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,the mill blood mill blood about white paper ic...,975
1,https://icomarks.com/ico/moonlight,64263,53667,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,moonlight white paper important notice please ...,6166
2,https://icomarks.com/ico/digithoth,22151,5950,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nTopsoils...,topsoil northwest choice turf sod topsoil bark...,825
3,https://icomarks.com/ico/tourcom-blockchain,50350,40533,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,powerpoint peurejenteisyeon tourcom blockchain...,4759
4,https://icomarks.com/ico/migland,33630,27525,C:\Users\Alessandro Bitetto\Downloads\UniPV\IC...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,powerpoint peurejenteisyeon mig whitepaper enj...,3183


##  Perform sentiment analysis and text classification (pre-trained models)
### with HuggingFace API

https://huggingface.co/docs/api-inference/detailed_parameters

https://towardsdatascience.com/how-to-apply-transformers-to-any-length-of-text-a5601410af7f

https://towardsdatascience.com/does-bert-need-clean-data-part-2-classification-d29adf9f745a

In [3]:
df_text=joblib.load(os.path.join(CHECKPOINT_FOLDER, 'df_text.pkl'))

In [6]:
API_TOKEN = "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"
API_URL = "https://api-inference.huggingface.co/models/"
HEADERS = {"Authorization": f"Bearer {API_TOKEN}"}
ROLLING_WINDOW_PERC=0.7
QUERY_BATCH_SIZE=50   # batch size of sentences to be sent to API
SPLIT_RELOAD=True
QUERY_RELOAD=True

MODEL_ID_LIST=['yiyanghkust/finbert-tone', 'yiyanghkust/finbert-esg-9-categories', 'yiyanghkust/finbert-esg']  # 'nbroad/ESG-BERT'


query_log=sentence_classification(df_text, model_ID_list=MODEL_ID_LIST, rolling_window_perc=ROLLING_WINDOW_PERC,
                                  query_batch_size=QUERY_BATCH_SIZE, split_reload=SPLIT_RELOAD,
                                  query_reload=QUERY_RELOAD, cache_dir=CACHE_DIR, api_url=API_URL,
                                  headers=HEADERS, checkpoint_folder=CHECKPOINT_FOLDER, sentiment_folder=SENTIMENT_FOLDER)

print('\n\n')
for k, v in query_log.items():
    query_path_csv=os.path.join(RESULTS_FOLDER, '02a_Sentiment_Raw_'+k+'.csv')
    v.to_csv(query_path_csv, index=False, sep=';')
    print('- Dataset saved to', query_path_csv)
    
query_path=os.path.join(CHECKPOINT_FOLDER, 'sentiment_raw.pkl')
joblib.dump(query_log, query_path, compress=('lzma', 3))
print('\n- Query log saved to', query_path)



##################################################################################################
#                                    yiyanghkust/finbert-tone                                    #
################################################################################################## 


Prediction classes:


Unnamed: 0,Class
0,Neutral
1,Positive
2,Negative



Maximum tokens allowed: 512

Special Tokens:
[PAD]: 0 
[CLS]: 3 
[SEP]: 4 
[UNK]: 2


-- Split sentences into chunks:

Reloaded (evaluated in 0:05:23)

Total sentences: 1866
Total chunked sentences: 22852


-- Query from API:

Querying batch (50 rows) 458 / 458  last interaction: 19/02/2023 11:57:07  - total failed batch: 0 (0 rows)

Unnamed: 0,Status
OK,22852



Done in  5:19:40
Data saved in .\Checkpoints\00_sentence_query_yiyanghkust_finbert-tone.pkl


##############################################################################################################
#                                    yiyanghkust/finbert-esg-9-categories                                    #
############################################################################################################## 


Prediction classes:


Unnamed: 0,Class
0,Climate Change
1,Natural Capital
2,Pollution & Waste
3,Human Capital
4,Product Liability
5,Community Relations
6,Corporate Governance
7,Business Ethics & Values
8,Non-ESG



Maximum tokens allowed: 512

Special Tokens:
[PAD]: 0 
[CLS]: 3 
[SEP]: 4 
[UNK]: 2


-- Split sentences into chunks:

Reloaded (evaluated in 0:05:26)

Total sentences: 1866
Total chunked sentences: 22852


-- Query from API:

Querying batch (50 rows) 458 / 458  last interaction: 19/02/2023 11:59:50  - total failed batch: 0 (0 rows)

Unnamed: 0,Status
OK,22852



Done in  5:59:45
Data saved in .\Checkpoints\00_sentence_query_yiyanghkust_finbert-esg-9-categories.pkl


#################################################################################################
#                                    yiyanghkust/finbert-esg                                    #
################################################################################################# 


Prediction classes:


Unnamed: 0,Class
0,
1,Environmental
2,Social
3,Governance



Maximum tokens allowed: 512

Special Tokens:
[PAD]: 0 
[CLS]: 3 
[SEP]: 4 
[UNK]: 2


-- Split sentences into chunks:

Reloaded (evaluated in 0:05:26)

Total sentences: 1866
Total chunked sentences: 22852


-- Query from API:

Querying batch (50 rows) 458 / 458  last interaction: 19/02/2023 12:01:23  - total failed batch: 0 (0 rows)

Unnamed: 0,Status
OK,22852



Done in  5:50:00
Data saved in .\Checkpoints\00_sentence_query_yiyanghkust_finbert-esg.pkl



- Dataset saved to .\Results\02a_Sentiment_Raw_yiyanghkust_finbert-tone.csv
- Dataset saved to .\Results\02a_Sentiment_Raw_yiyanghkust_finbert-esg-9-categories.csv
- Dataset saved to .\Results\02a_Sentiment_Raw_yiyanghkust_finbert-esg.csv

- Query log saved to .\Checkpoints\sentiment_raw.pkl


### Average sentiment for each splitted sentence

In [7]:
query_log=joblib.load(os.path.join(CHECKPOINT_FOLDER, 'sentiment_raw.pkl'))
sentiment={}
for mod, df in query_log.items():
    df_avg=df.drop(columns=['Model', 'Chunk', 'Status', 'Error', 'ref_index', 'max', 'eval_time']).groupby('url').mean()
    sentiment[mod]=df_avg
    
    path_csv=os.path.join(RESULTS_FOLDER, '02b_Sentiment_Average_'+mod+'.csv')
    df_avg.to_csv(path_csv, index=False, sep=';')
    print('- Dataset saved to', path_csv)
    
path=os.path.join(CHECKPOINT_FOLDER, 'sentiment_average.pkl')
joblib.dump(sentiment, path, compress=('lzma', 3))
print('\n- Sentiment log saved to', path)

- Dataset saved to .\Results\02b_Sentiment_Average_yiyanghkust_finbert-tone.csv
- Dataset saved to .\Results\02b_Sentiment_Average_yiyanghkust_finbert-esg-9-categories.csv
- Dataset saved to .\Results\02b_Sentiment_Average_yiyanghkust_finbert-esg.csv

- Sentiment log saved to .\Checkpoints\sentiment_average.pkl
