In [1]:
import os
import requests
import pandas as pd
import numpy as np
from timeit import default_timer as timer
import datetime
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from wordcloud import WordCloud, STOPWORDS
import re
import string
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to C:\Users\Alessandro
[nltk_data]     Bitetto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Alessandro
[nltk_data]     Bitetto\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Alessandro
[nltk_data]     Bitetto\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Load text files, clean and do lemmatisation 

In [2]:
# load text

with open('./Checkpoints/whitepaper_final.pickle', 'rb') as handle:
    final_df = pickle.load(handle)

df_text = final_df[['id', 'link_white_paper', 'Final_Length_txt', 'Final_Length_txt_clean', 'Final_Path_txt']]
df_text = df_text[df_text['Final_Path_txt'] != ''].reset_index(drop = True)
df_text['text'] = ''

start = timer()
for index, row in df_text.iterrows():
    
    print('Reading ' + str(index + 1) + ' / ' + str(len(df_text)), end = '\r')
    
    with open(row['Final_Path_txt']) as f:
        df_text.loc[index, 'text'] = f.read()
print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))    

Reading 1456 / 1456
Total elapsed time: 0:00:41


In [3]:
# clean and lemmatisation
# https://github.com/Briiick/NLP-disaster-tweets/blob/main/notebooks/3-heavy-cleaning-BERT.ipynb

stopwords = set(STOPWORDS)
stopwords.update(["nan"])

def heavy_text_clean(x):
    # first we lowercase everything
    x = x.lower()
    x = ' '.join([word for word in x.split(' ') if word not in stopwords])
    # remove unicode characters
    x = x.encode('ascii', 'ignore').decode()
    x = re.sub(r'https*\S+', ' ', x)
    x = re.sub(r'http*\S+', ' ', x)
    # then use regex to remove @ symbols and hashtags
    x = re.sub(r'@\S', '', x)
    x = re.sub(r'#\S+', ' ', x)
    x = re.sub(r'\'\w+', '', x)
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    x = re.sub(r'\w*\d+\w*', '', x)
    x = re.sub(r'\s{2,}', ' ', x)
    x = re.sub(r'\s[^\w\s]\s', '', x)
    # remove single letters and numbers surrounded by space
    x = re.sub(r'\s[a-z]\s|\s[0-9]\s', ' ', x)
    return x

def lemmatise(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized)

start = timer()
# apply cleaning
print('- Cleaning...')
df_text['text_clean'] = df_text['text'].apply(heavy_text_clean)

# apple lemmatisation
print('- Lemmatisation...')
df_text['text_clean'] = df_text['text_clean'].apply(lemmatise)

print('\nTotal elapsed time:', str(datetime.timedelta(seconds=round(timer()-start))))    

- Cleaning...
- Lemmatisation...

Total elapsed time: 0:01:58


## Perform sentiment analysis and text classification (pre-trained models)

In [11]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification



CACHE_DIR = 'D:/huggingface_cache/'    # cache directory for huggingface models

In [82]:
# FINBERT-ESG 4 classes
# https://huggingface.co/yiyanghkust/finbert-esg



model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg',num_labels=4, cache_dir=CACHE_DIR)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
input_sequence = df_text['text_clean'].values.tolist()



sentence_batch = input_sequence[:4]
sentence_batch = [sentence_batch[0][:30], sentence_batch[1][:10], sentence_batch[2][:25], sentence_batch[3]]


In [13]:
# get special tokens and maximum sentence (in tokens) length

max_length = model.config.max_position_embeddings
print('Maximum tokens allowed:', max_length)

tk = tokenizer("[PAD]", truncation=False)
token_CLS = tk['input_ids'][0]
token_PAD = tk['input_ids'][1]
token_SEP = tk['input_ids'][2]
print('\nSpecial Tokens:\n', '[PAD]:', token_PAD , '\n', '[CLS]:', token_CLS, '\n', '[SEP]:', token_SEP)



Maximum tokens allowed: 512

Special Tokens:
 [PAD]: 0 
 [CLS]: 3 
 [SEP]: 4


In [None]:
BATCH_SIZE = 50

In [70]:
chunk_size = 20


tot_chunks = -(len(input_dict[list(input_dict.keys())[0]]) // -chunk_size)   # ceiling division
dict_split = [dict.fromkeys(input_dict.keys(), []) for i in range(tot_chunks)]
for k in input_dict.keys():

    k_split = input_dict[k].split(chunk_size)
    
    for ind in range(tot_chunks):
        
        dict_split[ind][k] = k_split[ind]


In [89]:
torch.cuda.empty_cache()

In [90]:
import gc
gc.collect()

63

In [86]:
del model

In [83]:
model(**dict_split[0])

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1254, -0.9042,  1.7391, -0.4823],
        [-0.0882, -1.0164,  1.6304, -0.2545],
        [ 0.6830, -1.2343,  1.2142, -0.4729],
        [ 0.8670, -0.9884,  0.9637, -0.5734],
        [ 1.0109, -0.9952,  0.5606, -0.3687],
        [ 1.9186, -1.1999,  0.2052, -0.5890],
        [ 2.0167, -1.1739,  0.3271, -0.6743],
        [ 1.1576, -0.7802,  1.1014, -0.7412],
        [ 0.0624, -0.6992,  2.1715, -0.8323],
        [ 1.3926, -1.1537,  0.4975, -0.6319],
        [-0.3855, -1.7495,  1.8229,  0.2532],
        [ 2.1907, -1.5740, -0.2191, -0.2269],
        [-0.9597, -0.5492,  2.6501, -0.7098],
        [-0.4193, -0.6558,  2.2209, -0.7495],
        [ 1.0385, -0.5515,  1.0685, -1.0325],
        [ 0.1609, -0.4891,  1.6168, -0.8897],
        [-0.5652, -1.0757,  2.0942, -0.2389],
        [-0.4642, -1.2527,  2.2331, -0.2181],
        [-0.1035, -0.7103,  2.1153, -0.8218],
        [ 1.1151, -0.8764,  0.7684, -0.9058]], grad_fn=<AddmmBackward0>), hidden_st

In [15]:
input_dict, ind_list, nchar_list = split_sentence(sentence_batch=input_sequence[:30], max_length=max_length)

In [14]:
# split tokens for long sentence  (predicted probabilities are then aggregated by average)

def split_sentence(sentence_batch=[], max_length=512):
    
    '''
    Split sequence in chunks of tokens ready to be used for prediction with model(**input_dict).
    https://towardsdatascience.com/how-to-apply-transformers-to-any-length-of-text-a5601410af7f
    
    Args:
        - sentence_batch: list of strings
        - max_length: max sentence
        
    Return:
        - input_dict: dictionary for prediction
        - ind_list: list of indices for splitted sentence (to be used for averaging predictions)
        - nchar_list: list of number of tokens in each chunck (to be used for averaging predictions)
    '''

    ind_list = []
    nchar_list = []
    input_ids = torch.tensor([])
    attention_mask = torch.tensor([])

    for ind, txt in enumerate(sentence_batch):

        # get tokens
        tokens = tokenizer.encode_plus(txt, add_special_tokens=False, return_tensors='pt')

        # define target chunksize
        chunksize = max_length - 2

        # split into chunks of 510 tokens, we also convert to list (default is tuple which is immutable)
        input_id_chunks = list(tokens['input_ids'][0].split(chunksize - 2))
        mask_chunks = list(tokens['attention_mask'][0].split(chunksize - 2))

        # append number of tokens (to be used for weighted average of predictions)
        nchar_list.extend([len(x) for x in input_id_chunks])

        # loop through each chunk
        for i in range(len(input_id_chunks)):
            # add CLS and SEP tokens to input IDs
            input_id_chunks[i] = torch.cat([
                torch.tensor([token_CLS]), input_id_chunks[i], torch.tensor([token_SEP])
            ])
            # add attention tokens to attention mask
            mask_chunks[i] = torch.cat([
                torch.tensor([1]), mask_chunks[i], torch.tensor([1])
            ])
            # get required padding length
            pad_len = chunksize - input_id_chunks[i].shape[0]
            # check if tensor length satisfies required chunk size
            if pad_len > 0:
                # if padding length is more than 0, we must add padding
                input_id_chunks[i] = torch.cat([
                    input_id_chunks[i], torch.Tensor([0] * pad_len)
                ])
                mask_chunks[i] = torch.cat([
                    mask_chunks[i], torch.Tensor([0] * pad_len)
                ])

        # append index of chunks
        ind_list.extend([ind] * len(input_id_chunks))

        # append input_ids and attention_mask
        input_ids = torch.cat([input_ids, torch.stack(input_id_chunks)])
        attention_mask = torch.cat([attention_mask, torch.stack(mask_chunks)])

    # assemble dictionary for prediction
    input_dict = {
        'input_ids': input_ids.long(),
        'attention_mask': attention_mask.int()
    }

    return input_dict, ind_list, nchar_list

In [277]:
model(**input_dict)

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.9276, -0.1568,  0.5362, -1.3970],
        [ 1.0166, -1.1014,  0.6414, -0.2772],
        [ 1.1070, -0.1969,  0.6157, -1.2979],
        [-2.1878,  2.4865,  1.4006, -1.7072],
        [ 0.5424,  0.1603,  0.4930, -1.9304]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [270]:
input_ids.size()

torch.Size([5, 20])

In [266]:
nchar_list

[4, 3, 7, 18, 7]

In [267]:
ind_list

[0, 1, 2, 3, 3]

In [264]:
tokenizer(input_sequence, truncation=False)

{'input_ids': [[3, 241, 1189, 2566, 11669, 4], [3, 13705, 22077, 5357, 4], [3, 3797, 4085, 4983, 7232, 10839, 19015, 7232, 4], [3, 27547, 2933, 4757, 3209, 14, 871, 4763, 2661, 3627, 21425, 4446, 904, 7727, 463, 17674, 14899, 5318, 894, 351, 185, 4851, 598, 871, 4763, 2661, 4]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [268]:
len(tokenizer(input_sequence, truncation=False)['input_ids'][3])

27

In [249]:
nchar_list

[18, 7]

In [246]:
[len(x) for x in input_id_chunks]

[20, 9]

In [244]:
input_id_chunks

[tensor([    3, 27547,  2933,  4757,  3209,    14,   871,  4763,  2661,  3627,
         21425,  4446,   904,  7727,   463, 17674, 14899,  5318,   894,     4]),
 tensor([   3,  351,  185, 4851,  598,  871, 4763, 2661,    4])]

In [168]:
aa = input_sequence[:4]
aa = [aa[0][:30], aa[1][:10], aa[2][:25], aa[3]]
MAX_LENGTH = 8

token_list = tokenizer(aa, truncation=False)



# split tokens for long sentence  (predicted probabilities are then aggregated by average)
token_list_split = dict.fromkeys(token_list.keys(), [])
ind_list = []
for k in token_list.keys():
    
    rep_list = []
    ind_list = []
    for ind in range(len(token_list[k])):

            long_seq = torch.as_tensor(token_list[k][ind])
            long_seq = long_seq.split(MAX_LENGTH)
            long_seq = [x.tolist() for x in long_seq]
            rep_list.extend(long_seq)
            ind_list.extend([ind] * len(long_seq))
            
    token_list_split[k] = rep_list
# token_list_split = 

In [162]:
vv = {k: torch.as_tensor(v) for k, v in token_list_split.items()}
vv

ValueError: expected sequence of length 6 at dim 1 (got 5)

In [165]:
torch.tensor(token_list_split['input_ids'])

ValueError: expected sequence of length 6 at dim 1 (got 5)

In [158]:
token_list_split

{'input_ids': [[3, 241, 1189, 2566, 11669, 4],
  [3, 13705, 22077, 5357, 4],
  [3, 3797, 4085, 4983, 7232, 10839, 19015, 7232],
  [4],
  [3, 27547, 2933, 4757, 3209, 14, 871, 4763],
  [2661, 3627, 21425, 4446, 904, 7727, 463, 17674],
  [14899, 5318, 894, 351, 185, 4851, 598, 871],
  [4763, 2661, 4]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0]],
 'attention_mask': [[1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1]]}

In [166]:
aa = model(**token_list)
aa

AttributeError: 'list' object has no attribute 'size'

In [149]:
ind_list

[0, 1, 2, 2, 3, 3, 3, 3]

In [148]:
token_list_split

{'input_ids': [[3, 241, 1189, 2566, 11669, 4],
  [3, 13705, 22077, 5357, 4],
  [3, 3797, 4085, 4983, 7232, 10839, 19015, 7232],
  [4],
  [3, 27547, 2933, 4757, 3209, 14, 871, 4763],
  [2661, 3627, 21425, 4446, 904, 7727, 463, 17674],
  [14899, 5318, 894, 351, 185, 4851, 598, 871],
  [4763, 2661, 4]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0]],
 'attention_mask': [[1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1]]}

In [122]:
token_list

{'input_ids': [[3, 241, 1189, 2566, 11669, 4], [3, 13705, 22077, 5357, 4], [3, 3797, 4085, 4983, 7232, 10839, 19015, 7232, 4], [3, 27547, 2933, 4757, 3209, 14, 871, 4763, 2661, 3627, 21425, 4446, 904, 7727, 463, 17674, 14899, 5318, 894, 351, 185, 4851, 598, 871, 4763, 2661, 4]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [123]:
token_list_split

{'input_ids': [[3, 241, 1189, 2566, 11669, 4],
  [3, 13705, 22077, 5357, 4],
  [3, 3797, 4085, 4983, 7232, 10839, 19015, 7232],
  [4],
  [3, 27547, 2933, 4757, 3209, 14, 871, 4763],
  [2661, 3627, 21425, 4446, 904, 7727, 463, 17674],
  [14899, 5318, 894, 351, 185, 4851, 598, 871],
  [4763, 2661, 4],
  [0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0],
  [1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1]],
 'token_type_ids': [[3, 241, 1189, 2566, 11669, 4],
  [3, 13705, 22077, 5357, 4],
  [3, 3797, 4085, 4983, 7232, 10839, 19015, 7232],
  [4],
  [3, 27547, 2933, 4757, 3209, 14, 871, 4763],
  [2661, 3627, 21425, 4446, 904, 7727, 463, 17674],
  [14899, 5318, 894, 351, 185, 4851, 598, 871],
  [4763, 2661, 4],
  [0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0],
  [0,

In [211]:
ss = tokenizer(input_sequence[3], truncation=False, return_tensors='pt')
ss

{'input_ids': tensor([[    3, 27547,  2933,  4757,  3209,    14,   871,  4763,  2661,  3627,
         21425,  4446,   904,  7727,   463, 17674, 14899,  5318,   894,   351,
           185,  4851,   598,   871,  4763,  2661,     4]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])}

In [215]:
ss1 = tokenizer(input_sequence[3], truncation=False, return_tensors='pt', max_length=512, padding="max_length")
ss1

{'input_ids': tensor([[    3, 27547,  2933,  4757,  3209,    14,   871,  4763,  2661,  3627,
         21425,  4446,   904,  7727,   463, 17674, 14899,  5318,   894,   351,
           185,  4851,   598,   871,  4763,  2661,     4,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [216]:
model(**ss)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.0585,  1.4490,  1.0957, -1.7358]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [217]:
model(**ss1)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.0585,  1.4490,  1.0957, -1.7358]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [143]:
a= [1,2,3]
b= [(4,5,6)]
a.extend(b)
a

[1, 2, 3, 10]

In [116]:
[len(x) for x in token_list['input_ids']]

[6, 5, 9, 27]

In [58]:
token_list.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [56]:
len(token_list['input_ids'])

100

In [None]:
aa = tokenizer(inp[0][:2000],truncation=True, return_tensors="pt", max_length=512)

In [43]:
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
results = nlp(inp[0][:2000])
print(results)

[{'label': 'Social', 'score': 0.4784192442893982}]


In [18]:
esgbert = BertForSequenceClassification.from_pretrained('nbroad/ESG-BERT')
tokenizer2 = BertTokenizer.from_pretrained('nbroad/ESG-BERT')

Downloading: 100%|███████████████████████████████████████████████████████████████████| 232k/232k [00:01<00:00, 223kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████| 112/112 [00:00<00:00, 56.1kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████| 376/376 [00:00<00:00, 94.3kB/s]


In [45]:
out = finbert(**aa)
out

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.0919, -0.8702,  1.2675, -0.7472]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [48]:
torch.nn.functional.softmax(out.logits, dim=-1)

tensor([[0.4014, 0.0564, 0.4784, 0.0638]], grad_fn=<SoftmaxBackward0>)

In [49]:
finbert.config.id2label

{0: 'None', 1: 'Environmental', 2: 'Social', 3: 'Governance'}

In [39]:
aa = tokenizer(inp[0][:2000],truncation=True, return_tensors="pt", max_length=512)
len(aa['input_ids'][0])

314

In [23]:
aa

{'input_ids': tensor([[    3,   241,  1189,  ...,  1711, 10495,     4]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [27]:
# FINBERT ESG
API_URL = "https://api-inference.huggingface.co/models/yiyanghkust/finbert-esg"

output = query({
    "inputs": inp[0][:4000], "options": {"wait_for_model": True}
})
output

{'error': 'Input is too long, try to truncate or use a paramater to handle this: The size of tensor a (627) must match the size of tensor b (512) at non-singleton dimension 1'}

In [15]:
output = query({
    "inputs": aa, "options": {"wait_for_model": True}
})
output

TypeError: Object of type BatchEncoding is not JSON serializable

In [76]:
import sys
sys.getsizeof(inp[0][:3300])

3349

In [69]:
[len(x) for x in inp]

[37059, 51673, 49344, 121, 7549, 54385, 15822, 31272, 20828, 39907]

In [11]:
df_text['text_clean'].values[:10].tolist()

['table content introduction challenge solution approach tycoon platform trader zero additional effort positioning tycoon relevant source income talk community follower handsfree experience non custodial intuitive trader selection flexibility following individual stop loss talk trader trading tool connecting follower trader api connection tycoon trading engine secure environment ecosystem token loyalty program fee entry fee profit sharing payment process revenue share referral program referral payment referral reward additional benefit referee staking tier boost staking reward web portal defi ui ux functional overview tycoon token tyc tokenomics tyc value tyc listing binance smart chain integration business model key resource key activity copy trading crypto exchange service revenue stream entry fee profit share crypto exchange fee exchange collaboration whitelabel solution mvp roadmap founder team partner disclaimer introduction a trader either new crypto space busy invest time consta

In [12]:
df_text.Final_Path_txt[3]

'C:\\Users\\Alessandro Bitetto\\Downloads\\UniPV\\ICOs\\Whitepaper\\Recovered\\ID_6.txt'