In [1]:
import re
import pandas as pd
import os
import numpy as np
import string
import nltk
import time
from keybert import KeyBERT
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForSequenceClassification

import torch

from functools import partial
from multiprocess import Process, Pool, Queue
import multiprocess as mp

from nltk.corpus import stopwords

## Initialization

In [2]:
if torch.cuda.is_available():      
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

stop_words = stopwords.words('english')
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased", output_attentions=False, output_hidden_states=True)
bert_model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

## Function

In [11]:
def Split_by_paragraph (file_path):
    
    with open(file_path,'rb') as fread:
        text = fread.read()
        
        text = text.decode('utf8').encode("ascii", "ignore")
        text = text.decode('utf8')

    fread.close()
    
    paras_cleaned = []
    paras = text.split("\n\n") # split into paragraphs
    
    # metrics for tracking. Remove after finalizing code
    #tables = []
    #page_breaks = []
    #empty_paras = []
    #short_paras = []
    
    pre = ''     # previous paragraph, fix paragraphs spanning multiple pages
    for i, para in enumerate(paras):
        #print(para)
        words = re.findall("[a-zA-Z]+", para) #cout words
        num_words = len(words)
        #print(num_p)
        
        if num_words == 0:  # empty paragraphs
            #empty_paras.append([i, para])
            continue

        non_words = re.findall(r"\b\d[\d.,%]*\b|\s+", para)  # count numbers and spaces
        num_non_words = len(non_words)
        #print(space_num)
        
        # if words are less than 1/3 of tokens, this may be a table
        if  num_words/(num_non_words + num_words) < 1/3: 
            #tables.append([i, para])
            continue     # don't take any table

        # check if a paragraph spans multiple pages
        # 1. the first word is capitalized 
        # 2. previous paragraph not end with .!?

        if (re.search(r'^[a-z]', para.strip())) \
            and (not re.search(r'[\.\!\?\:\;]$', pre)) \
            and (len(paras_cleaned)>0):
            
            #page_breaks.append([i, pre, para])
            # merge with previous paragraphs
            paras_cleaned[-1] = paras_cleaned[-1] + " " + para
            pre = paras_cleaned[-1]

        else:
            paras_cleaned.append(para)
            pre = para

    # further clean up paragraps:
    paras = []
    for i, para in enumerate(paras_cleaned):
        para = re.sub(r"\n", " ", para) # remove line break
        para = re.sub(r"\s+", ' ', para) # remove extra space
        #para = re.sub(r"\b\d[\d.,]*\b", 'dd', para) # replace numbers (at least 2 digits) by 'dd' to increase match

        if len(re.findall(r'\w\w+', para)) < 20: # minimum 20 words per paragraph. Should it be larger?
        #short_paras.append([i, para])
            continue

        paras.append(para)

    return paras


def get_id(words, sentences):

    sid = [tokenizer(i)['input_ids'] for i in sentences]
    kwid = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(i)) for i in words]  ## Initialize a list of word id list 
    kwid_n = [str(i) for i in kwid]
    word_dic = dict(zip(kwid_n, [i for i in words]))
    kwdic = {}
    for i in kwid:
        if i[0] not in kwdic:
            kwdic[i[0]] = []
        kwdic[i[0]].append(i)    ## Initialize a dictionary, with value as word id lists mentioned before, 
                                 ## and key as the first word id of the list
    idx = {}
    for i in kwid:
        idx[str(i)] = []         ## Initialize a dictionary, which would save the output
    for k in range(len(sid)):
        for i in range(min(len(sid[k]), 500)):
            if sid[k][i] in kwdic:
                for j in kwdic[sid[k][i]]:
                    try:
                        if i+len(j)<500 and all([sid[k][i+l]==j[l] for l in range(len(j))]):
                            idx[str(j)] += [(k, list(range(i, i+len(j))))]
                    except IndexError:
                        pass

    return idx, word_dic


def embedding(sentences):
    
    input_ids = []
    attention_masks = []

    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(sent,     
                        add_special_tokens = True, 
                        max_length = 500,    
                        truncation=True,
                        padding = 'max_length',
                        return_attention_mask = True, 
                        return_tensors = 'pt')
   
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    tup = ()
    for i in range(1+int((input_ids.shape[0])/5)):
        if i <= (input_ids.shape[0])/5 - 1:
            a = input_ids[5*i:5*(i+1)].to(device)
            b = attention_masks[5*i:5*(i+1)].to(device)
        else:
            a = input_ids[5*i:].to(device)
            b = attention_masks[5*i:].to(device)

        bert_model.eval()
        with torch.no_grad():
            outputs = bert_model(a, b)   
            hidden_states = outputs[2]

        token_embeddings = torch.stack(hidden_states[-4:], dim=0).cpu() 
        token_embeddings = token_embeddings.permute(1,2,0,3)
        token_embeddings = token_embeddings.mean(axis=2)
        tup += (token_embeddings, )

    token_embeddings = torch.cat(tup, dim=0)

    return token_embeddings


def get_vec(idx, tk):

    kw_vec = {}
    tp = ()
    for i in idx:
        if idx[i]:
            kw_vec[i] = []
    for i in kw_vec:
        kw_vec[i] = torch.cat(tuple(torch.cat(tuple(tk[j[0]][k].unsqueeze(dim=0) 
                            for k in j[1]), dim=0).mean(axis=0).unsqueeze(dim=0) for j in idx[i]), dim=0).mean(axis=0)
        tp += (kw_vec[i].unsqueeze(dim=0), )


    vec = torch.cat(tp, dim=0)

    return kw_vec, vec



        


## Pipeline

In [12]:
class extract(object):

    
    def __init__(self, keyword_path: str, txt_folder_path: str):
        
        self.kw_path = keyword_path
        self.txt_path = txt_folder_path
        self.kw = None
        self.txt = None
        self.found_para = None
        self.nkw = []
        
        # if type(keyword_path)!=str or type(txt_folder_path)!=str:
        #     print('Invalid Input of Parameters.')
       
    
    def fit(self):
        
        with open (self.kw_path)as f:
            k_words = f.read()
        self.kw = [i.strip().lower() for i in k_words.split(',')]
        self.kw = list(set(self.kw))
        path_list = os.listdir(self.txt_path)
        file_path = [self.txt_path+'/'+i for i in path_list]
        new = []
        name = []
        for i in range(len(file_path)):
            temp = Split_by_paragraph(file_path[i])
            new += temp
            name += [path_list[i]]*len(temp)
        self.txt = new
        self.filename = name
      
    
    def find(self):
        
        pt = re.compile('|'.join(self.kw))
        fk = [pt.search(i.lower()) for i in self.txt]
        yy = [i for i in range(len(fk)) if fk[i]]
        tt = [self.txt[i] for i in yy]
        ne = [self.filename[i] for i in yy]
        uni_para = []
        uni_name = []
        for i in range(len(tt)):
            if tt[i] not in uni_para:
                uni_para.append(tt[i])
                uni_name.append(ne[i])
        self.found_para = uni_para
        self.source = uni_name
    
    
    def expand(self):
        
        id_raw, word_raw = get_id(self.kw, self.found_para)
        
        if any(id_raw[i] for i in id_raw):
            mod = KeyBERT()
            raw_ex = mod.extract_keywords(self.found_para, keyphrase_ngram_range=(2,4), stop_words=None)
            exp_kw = []
            for i in raw_ex:
                exp_kw += list(set(i))
            exp_kw = list(map(lambda x: x[0], exp_kw))

            dic = nltk.FreqDist(exp_kw)
            new_kw = [i for i in dic if dic[i] >= 2]
            new_idx = np.array([dic[i] for i in new_kw]).argsort()[::-1][:100]  
            new_kw = [new_kw[i] for i in new_idx]
            
            if new_kw:
                id_new, word_new = get_id(new_kw, self.found_para)
                dic_raw, vec_raw = get_vec(id_raw, embedding(self.found_para))
                dic_new, vec_new = get_vec(id_new, embedding(self.found_para))

                score = []
                word_idx = []
                for i in dic_new:
                    score.append(torch.cosine_similarity(dic_new[i], vec_raw).max())
                    word_idx.append(word_new[i])
                # id_rank = np.array(score).argsort()[:-1][:3]
                id_rank = [i for i in range(len(score)) if score[i] >= 0.5]
                new_kw = [word_idx[i] for i in id_rank]
                new_kw = [i for i in new_kw if tokenizer.tokenize(i)[0] not in 
                          stop_words and tokenizer.tokenize(i)[-1] not in stop_words]
                self.nkw += [i for i in new_kw if i not in self.kw]
                self.kw += new_kw
                self.kw = list(set(self.kw))
        
    
    
    def iterate(self, iteration='converge'):
        
        # t1 = time.time()
        
        if not self.kw:
            self.fit()
        keyword = self.kw
        self.find()
        
        if self.found_para:
            self.expand()
            if set(self.kw) != set(keyword):
                if iteration == 'converge':
                    while set(self.kw) != set(keyword):
                        keyword = set(self.kw)
                        self.find()
                        self.expand()
                        # print(len(self.nkw))
                elif type(iteration) == int:
                    for i in range(iteration-1):
                        self.find()
                        self.expand()
                        # print(len(self.nkw))
        
        # t2 = time.time()
        # print(f'Iteration done! Total spent time: {t2-t1} seconds')
        
        

In [13]:
keyword_path = 'keyword_by_firm.txt'
doc_path = '10-K/'
path_list = [doc_path+i for i in os.listdir(doc_path)]
params = [extract(keyword_path,i) for i in path_list] 

In [14]:
t1 = time.time()
for i in range(len(params)):
    globals()[f'test{i}'] = params[i]
    globals()[f'test{i}'].fit()
t2 = time.time()
print(t2-t1)

9.136666536331177


In [15]:
t1 = time.time()
for i in range(len(params)):
    globals()[f'test{i}'].find()
t2 = time.time()
print(t2-t1)

16.788400173187256


In [16]:
t1 = time.time()
for i in range(len(params)):
    globals()[f'test{i}'].expand()
t2 = time.time()
print(t2-t1)

8it [00:00, 573.34it/s]
17it [00:00, 1137.05it/s]
19it [00:00, 1270.66it/s]
2it [00:00, 495.14it/s]
30it [00:00, 792.14it/s]
37it [00:00, 951.80it/s]
353it [00:00, 872.37it/s]
47it [00:00, 889.82it/s]
1it [00:00, 501.71it/s]
11it [00:00, 689.63it/s]
933it [00:00, 965.52it/s]
26it [00:00, 899.56it/s]


96.43504905700684


## Integration

In [10]:
def func_(test):
    
    df = pd.DataFrame(columns=['CIK', 'FILE_NAME', 'PATH', 'PARAGRAPH', 'FOUND_KEYWORDS'])
    test.iterate(2)
    test.find()
    
    if test.found_para:
        f_para = test.found_para
        name = test.source
        path = test.txt_path
        cik = re.findall(r'/([^/]+)', path)[0]
        pt = re.compile('('+'|'.join(test.kw)+')')

        for i in range(len(f_para)):
            file = name[i]
            para = f_para[i]
            kwd = pt.findall(f_para[i].lower())
            df.loc[i] = [cik, file, path, para, set(kwd)]
    
    return df


In [11]:
if __name__ == '__main__':
    ## For doc path input, '/' at the end shall be included
    keyword_path = input('Input the path of keyword txt file: ')
    doc_path = input('Input the parent path of the document file for each CIK (e.g., ".../10-K/"): ')

    path_list = [doc_path+i for i in os.listdir(doc_path)]
    params = [extract(keyword_path,i) for i in path_list] 

    ## run the function
    t1 = time.time()
    results = [func_(i) for i in params]
    df = pd.concat(results).reset_index(drop=True)
    t2 = time.time()
    print(f'Execution done! Total run time: {t2-t1} seconds')

    ## save the results
    save = input('Input the path to save the output: ')
    df.to_excel(save+'result1.xlsx')

Input the path of keyword txt file: keyword_by_firm.txt
Input the parent path of the document file for each CIK (e.g., ".../10-K/"): 10-K/


8it [00:00, 501.64it/s]
12it [00:00, 926.41it/s]
23it [00:00, 1098.90it/s]
5it [00:00, 836.15it/s]
28it [00:00, 802.66it/s]
39it [00:00, 910.09it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (545 > 512). Running this sequence through the model will result in indexing errors
376it [00:00, 820.13it/s]
48it [00:00, 860.01it/s]
1it [00:00, 498.02it/s]
11it [00:00, 649.27it/s]
951it [00:01, 872.03it/s]
31it [00:00, 797.53it/s]


Execution done! Total run time: 258.3349997997284 seconds


## Alternative

In [None]:
def get_word_span(sent):
    words = nltk.word_tokenize(sent)
    word_spans = []

    cur_pos = 0
    for w in words:
        start = sent[cur_pos:].index(w)
        end = start + len(w)
        word_spans.append = (w, start, end)
        cur_pos = end
    return word_spans


def get_n_gram_span(word_spans, n):
     # get n_gram embedding, n = 2, 3, 4
    ngram_spans = []

    for i, item in enumerate(word_spans):
        if i+ n-1<len(word_spans):
            words = ' '.join([word_spans[x] for x in range(i, i+n)])
            ngram_spans.append(words, #words
                           word_spans[i][1], # start position
                           word_spans[i+1][2])  # end position
        else:
            break

def get_keyword_emb(paragraph, seed_words, ngram_range, sim_thresh = 0.5, top = 5):

    sentences = nltk.sent_tokenize(paragraph)
    sent_emb, char_to_tokens = get_pretrained_wordvector(sentences, tokenizer, bert_model)

    # get ngram emb  # to do: remove stop words
    ngram_embs = []
    ngram_words = []
    found_seed_words = []
    found_seed_emb = []

    for i,sent in enumerate(sentences):
    word_spans = get_word_span(sent)


    for n in ngram_range:
        ngrams = get_n_gram_span(word_spans, n)

        for words, start, end in ngrams:
            n_gram_token_ids = list(set([char_to_tokens[i][x][0] for x in range(start, end+1)]))
            n_gram_emb = sent_emb[i][n_gram_token_ids].mean(dim = 0)      

        if words in seed_words:
            found_seed_emb.append(n_gram_emb)
            found_seed_words.append(words)

        else:
            ngram_embs.append(n_gram_emb)
            ngram_words.append(words)

    # calculate similarity 
    found_seed_emb = F.normalize(torch.cat(found_seed_emb.unsqueeze(0), dim = 0), dim = 0)
    ngram_embs = F.normalize(torch.cat(ngram_embs.unsqueeze(0), dim = 0), dim = 0)

    sim = found_seed_emb.mm(ngram_embs.T).cpu().numpy()
    top_index = sim.mean(axis = 0).argsort()[::-1][0:top]
    top_index = (sim[: top_index] >= sim_thresh).any(axis = 1)

    ngram_words = ngram_words[top_index]
    ngram_embs = ngram_embs[top_index]

    return found_seed_words, found_seed_emb, ngram_words,ngram_embs


## Multi-Process

In [None]:
# if __name__ == '__main__':
    
#     t1 = time.time()
#     pool = Pool(mp.cpu_count())
    
#     path_list = [doc_path+i for i in os.listdir(doc_path)]
#     params = [extract(keyword_path,i) for i in path_list] 
    
#     result = pool.map(func_, params)
    
#     pool.close()
#     pool.join()
    
#     t2 = time.time()
#     print(t2-t1)