In [6]:
from google.colab import drive
import sys, os

# Mount Google Drive into the Colab runtime.
drive.mount('/content/drive')

# Project folder on Drive; the relative "data/..." paths used by later cells
# resolve against it once it becomes the working directory.
cur_path = os.path.join('/content/drive/MyDrive/Colab_Notebooks','fraud')

os.chdir(cur_path)
print(os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab_Notebooks/fraud


In [7]:
import pandas as pd
import pickle
import re
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import multiprocessing as mp
import wordninja

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from scipy.optimize import linear_sum_assignment
import time

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings raised by later cells.
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Functions

In [4]:
from sentence_transformers import SentenceTransformer, util

# Pretrained sentence-embedding model shared by all similarity cells below.
bert_model = SentenceTransformer('all-mpnet-base-v2')
# Alternative checkpoint, currently disabled.
#bert_model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [5]:

# Sanity check of the embedding model on two small, index-aligned sentence lists.
sentences1 = [
    'The cat sits outside',
    'A man is playing guitar',
    'The new movie is awesome',
]

sentences2 = [
    'The dog plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
]

# Embed both lists as tensors.
embeddings1 = bert_model.encode(sentences1, convert_to_tensor=True)
embeddings2 = bert_model.encode(sentences2, convert_to_tensor=True)

# All-pairs cosine similarities: entry [i][j] compares sentences1[i] with sentences2[j].
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Report only the aligned pairs, i.e. the diagonal of the score matrix.
for i, (s1, s2) in enumerate(zip(sentences1, sentences2)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(s1, s2, cosine_scores[i][i]))

The cat sits outside 		 The dog plays in the garden 		 Score: 0.1920
A man is playing guitar 		 A woman watches TV 		 Score: -0.0478
The new movie is awesome 		 The new movie is so great 		 Score: 0.9101


In [None]:
#embeddings1[0]

In [6]:
cosine_scores

tensor([[ 0.1920,  0.0634,  0.1059],
        [-0.0101, -0.0478, -0.0692],
        [ 0.0843, -0.0812,  0.9101]], device='cuda:0')

In [18]:
# Similarity between two docs of sentence embeddings, matched one-to-one by the
# Kuhn-Munkres (Hungarian) algorithm.
def match_two_docs_old(doc1, doc2):
    """Optimally pair the rows of ``doc1`` with the rows of ``doc2`` and score the pairing.

    Args:
        doc1, doc2: embeddings accepted by ``util.cos_sim`` (``compute_sim_old``
            passes the tensors returned by ``bert_model.encode``), one row per
            paragraph.

    Returns:
        ``(score, row_ind, col_ind, sim)`` where ``sim`` is the pairwise
        similarity matrix rescaled to [0, 1] as a NumPy array, ``row_ind`` /
        ``col_ind`` are the matched row and column positions, and ``score`` is
        ``[total / min(n1, n2), total / max(n1, n2)]`` with ``total`` the summed
        similarity of the matched pairs.
    """
    # cosine similarity lies in [-1, 1]; shift it to [0, 1] and bring it to NumPy
    sim = ((util.cos_sim(doc1, doc2) + 1) / 2).cpu().numpy()

    # the solver minimises cost, so feed it 1 - similarity
    # WMD may be used here (to do)
    row_ind, col_ind = linear_sum_assignment(1 - sim)

    matched_total = sim[row_ind, col_ind].sum()

    # two normalisations: by the shorter and by the longer document
    by_shorter = matched_total / min(sim.shape)
    by_longer = matched_total / max(sim.shape)

    return [by_shorter, by_longer], row_ind, col_ind, sim


In [8]:
def match_two_docs(doc1, doc2):
    """Optimally pair the rows of ``doc1`` with the rows of ``doc2`` and score the pairing.

    Similarity is the plain dot product of the rows, so it equals cosine
    similarity only when the rows are already unit-normalized; this function
    does not normalize them.

    Args:
        doc1, doc2: 2-D NumPy arrays with one embedding per row.

    Returns:
        ``(score, row_ind, col_ind, sim)`` where ``sim`` is the pairwise
        similarity rescaled by ``(x + 1) / 2``, ``row_ind`` / ``col_ind`` are
        the matched row and column positions, and ``score`` is
        ``[total / min(n1, n2), total / max(n1, n2)]`` with ``total`` the summed
        similarity of the matched pairs.
    """
    # pairwise similarity between paragraphs, shifted from [-1, 1] to [0, 1]
    sim = (np.dot(doc1, doc2.T) + 1) / 2

    # the solver minimises cost, so feed it 1 - similarity
    # WMD may be used here (to do)
    row_ind, col_ind = linear_sum_assignment(1 - sim)

    matched_total = sim[row_ind, col_ind].sum()

    # two normalisations: by the shorter and by the longer document
    by_shorter = matched_total / min(sim.shape)
    by_longer = matched_total / max(sim.shape)

    return [by_shorter, by_longer], row_ind, col_ind, sim

In [9]:
def compute_sim_old(cik, df, bert_model, window = 1): 
    """Match paragraphs between nearby fiscal years of one firm (per-pair encoding).

    For every pair of fiscal years at most ``window`` apart, the paragraphs of
    both years are encoded with ``bert_model`` and matched by
    ``match_two_docs_old``.

    Args:
        cik: firm identifier, copied into every result row.
        df: frame with ``fyear`` and ``text`` columns.
        bert_model: object whose ``encode(texts, convert_to_tensor=True)``
            returns the paragraph embeddings.
        window: largest allowed difference between the two fiscal years.

    Returns:
        A list of rows ``[cik, year1, year2, score, sim.shape, row_ind,
        col_ind, matched_sim]``, one per compared pair of years.
    """
    years = np.sort(df.fyear.unique())
    result = []

    for i, year_a in enumerate(years[:-1]):
        for year_b in years[i + 1:]:
            if year_b - year_a <= window:

                texts_a = df[df.fyear == year_a].text.tolist()
                texts_b = df[df.fyear == year_b].text.tolist()

                emb_a = bert_model.encode(texts_a, convert_to_tensor=True)
                emb_b = bert_model.encode(texts_b, convert_to_tensor=True)

                score, row_ind, col_ind, sim = match_two_docs_old(emb_a, emb_b)

                # CIK, year 1, year 2, the two scores, sizes of both docs,
                # matched paragraph positions in each doc, and the
                # similarity of every matched pair
                result.append([cik, year_a, year_b, score, sim.shape,
                               row_ind, col_ind, sim[row_ind, col_ind]])

    return result

In [10]:
def compute_sim(cik, df, bert_model, window = 1): 
    """Match paragraphs between nearby fiscal years of one firm.

    All paragraphs of the firm are encoded once (unit-normalized NumPy
    embeddings), then every pair of fiscal years at most ``window`` apart is
    matched with ``match_two_docs``.

    Args:
        cik: firm identifier, copied into every result row.
        df: frame with ``fyear`` and ``text`` columns; any index is accepted.
        bert_model: object whose ``encode`` returns one embedding row per text.
        window: largest allowed difference between the two fiscal years.

    Returns:
        A list of rows ``[cik, year1, year2, score, sim.shape, row_ind,
        col_ind, matched_sim]``, one per compared pair of years. Empty when
        the firm has fewer than two fiscal years.
    """
    result = []

    years = np.sort(df.fyear.unique())

    # fewer than two years means no pair to compare: skip the costly encoding
    if len(years) < 2:
        return result

    # normalized embeddings make the dot product in match_two_docs a cosine similarity
    embeddings = bert_model.encode(df.text.tolist(), convert_to_tensor=False,
                                   normalize_embeddings = True,
                                   convert_to_numpy = True)

    # Row POSITIONS of each year's paragraphs. `embeddings` is ordered like the
    # rows of df, so it must be indexed by position; index labels only coincide
    # with positions when df has a default RangeIndex.
    fyear_values = df.fyear.to_numpy()
    rows_by_year = {year: np.flatnonzero(fyear_values == year) for year in years}

    for i in range(len(years)-1):
        for j in range(i+1,len(years)):
            if years[j]-years[i] <= window:

                doc1 = embeddings[rows_by_year[years[i]]]
                doc2 = embeddings[rows_by_year[years[j]]]

                score, row_ind, col_ind, sim = match_two_docs(doc1, doc2)

                # similarity of every matched pair
                matched_sim = sim[row_ind, col_ind]

                # CIK, year 1, year 2, the two scores, sizes of doc1 and doc2,
                # matched paragraph positions in doc1 and in doc2,
                # similarity of matched paragraphs
                result.append([cik, years[i], years[j], score, sim.shape,
                               row_ind, col_ind, matched_sim])

    return result

In [26]:
def get_df_for_cik(cik, para_map):
    """Flatten one firm's paragraphs into a tidy DataFrame.

    Args:
        cik: key of the firm in ``para_map``.
        para_map: mapping ``cik -> {fyear: sequence of paragraphs}``.

    Returns:
        A DataFrame with columns ``fyear``, ``pid`` (position of the paragraph
        within its year) and ``text``, sorted by ``fyear`` then ``pid``, with a
        fresh 0..n-1 index.
    """
    # one Series per fiscal year; concat keys become the outer index level
    per_year = {fyear: pd.Series(paragraphs)
                for fyear, paragraphs in para_map[cik].items()}
    stacked = pd.concat(per_year).reset_index()
    stacked.columns = ['fyear','pid','text']

    ordered = stacked.sort_values(by=['fyear','pid'])
    return ordered.reset_index(drop = True)

# Test Functions

In [12]:
# Paragraph map: cik -> {fyear: list of paragraphs}.
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle can execute arbitrary code; only load files you produced yourself.
with open("data/mda/paragraphs_1994_2016.pkl", 'rb') as pkl_file:
    para_map = pickle.load(pkl_file)
#para_map = pickle.load(open("data/mda/paragraphs_1994_2016_original.pkl", 'rb'))
len(para_map)

20893

In [13]:
# Flatten every paragraph into one corpus list (`docs`) and record, for each
# cik and fiscal year, the half-open [start, end) range it occupies in `docs`.

para_id_map = {}
docs = []
cnt = 0  # running offset into docs

for cik in para_map:

    para_id_map[cik] = {}

    for fyear in para_map[cik]:

        paragraphs = para_map[cik][fyear]
        n_para = len(paragraphs)

        para_id_map[cik][fyear] = [cnt, cnt + n_para]
        docs += paragraphs

        cnt += n_para

In [14]:
# check one: the corpus size should equal the running offset, and a firm-year's
# [start, end) range should span exactly its number of paragraphs

len(docs)
cnt
len(para_map[20][2000])
para_id_map[20][2000]

9056901

9056901

34

[106, 140]

## Test case 1

In [15]:
# for debug purpose: inspect the paragraphs of a single firm

cik = 1007021

# get_df_for_cik requires the paragraph map as its second argument
df = get_df_for_cik(cik, para_map)
df.head()
# number of paragraphs per fiscal year
df.groupby("fyear")["pid"].count()

Unnamed: 0,fyear,pid,text
0,1996,0,The following discussion and analysis of the f...
1,1996,1,The Company does not provide forecasts of the ...
2,1996,2,The software industry is highly competitive an...
3,1996,3,Revenue. The Company's revenue consists of lic...
4,1996,4,International revenue decreased dd% to $dd in ...


fyear
1996    26
1997    28
1998    36
1999    35
2000    37
2001    37
Name: pid, dtype: int64

In [16]:
df.fyear.unique()

# Match paragraphs between fiscal years at most one year apart (window = 1).
sim = compute_sim(cik, df, bert_model, window = 1)
sim

array([1996, 1997, 1998, 1999, 2000, 2001])

[[1007021,
  1996,
  1997,
  [0.9614706773024339, 0.8927942003522601],
  (26, 28),
  array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20, 21, 22, 23, 24, 25]),
  array([ 0, 25, 26, 11, 12, 13, 14,  4,  5, 17, 18, 19, 20, 21,  2,  3,  1,
         15, 16,  6,  7,  8,  9, 10, 22, 23]),
  array([0.99999994, 0.9664061 , 0.8958518 , 0.9999999 , 1.        ,
         1.        , 0.9999999 , 0.9992029 , 0.9932897 , 0.99991393,
         0.9995846 , 0.9990673 , 0.99999994, 0.9974862 , 0.9047917 ,
         0.9532611 , 0.7643708 , 0.9609227 , 0.94390476, 0.9618541 ,
         0.95012134, 0.95395386, 0.9172517 , 0.8829765 , 0.96266544,
         0.99136007], dtype=float32)],
 [1007021,
  1997,
  1998,
  [0.9525764329092843, 0.7408927811516656],
  (28, 36),
  array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]),
  array([ 0,  2, 27, 13,  4,  5,  6, 17, 18, 19, 20,  1, 11, 12,  3, 1

In [19]:
# Same firm through the per-pair encoding variant, for comparison with compute_sim.
sim = compute_sim_old(cik, df, bert_model, window = 1)
sim

[[1007021,
  1996,
  1997,
  [0.9614706773024339, 0.8927942003522601],
  (26, 28),
  array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20, 21, 22, 23, 24, 25]),
  array([ 0, 25, 26, 11, 12, 13, 14,  4,  5, 17, 18, 19, 20, 21,  2,  3,  1,
         15, 16,  6,  7,  8,  9, 10, 22, 23]),
  array([1.        , 0.9664059 , 0.89585185, 1.        , 1.        ,
         1.        , 1.        , 0.99920285, 0.9932897 , 0.999914  ,
         0.9995846 , 0.9990674 , 1.        , 0.9974861 , 0.9047918 ,
         0.95326096, 0.7643708 , 0.9609227 , 0.94390476, 0.9618542 ,
         0.9501214 , 0.953954  , 0.9172517 , 0.8829764 , 0.9626653 ,
         0.99136   ], dtype=float32)],
 [1007021,
  1997,
  1998,
  [0.9525764329092843, 0.7408927811516656],
  (28, 36),
  array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]),
  array([ 0,  2, 27, 13,  4,  5,  6, 17, 18, 19, 20,  1, 11, 12,  3, 1

## Test case 2

In [None]:
# Second firm, used to exercise compute_sim on a different filing history
cik = 1750
# get_df_for_cik requires the paragraph map as its second argument
df = get_df_for_cik(cik, para_map)
df.head()
# number of paragraphs per fiscal year
df.groupby("fyear")["pid"].count()


Unnamed: 0,fyear,pid,text
0,1994,0,The Company reports its activities in one busi...
1,1994,1,The comparison of net sales of the Company ove...
2,1994,2,Aerospace/aviation manufacturers and certain d...
3,1994,3,The difficult general economic conditions duri...
4,1994,4,The Company believes that its established mark...


fyear
1994     23
1995     23
1996     25
1997     28
1998     28
1999     25
2000     36
2001     44
2002     55
2003     50
2004     49
2005    175
2006     59
2007     60
2008     67
2009     65
2010     57
2011     76
2012     61
2013     75
2014     88
2015     89
2016     85
Name: pid, dtype: int64

In [None]:
# Match paragraphs between fiscal years at most one year apart; display is disabled.
sim = compute_sim(cik, df,bert_model, window = 1)
#sim


## Display match

In [None]:
cik = 1007021

# get_df_for_cik requires the paragraph map as its second argument
df = get_df_for_cik(cik, para_map)
df.fyear.unique()

# index labels of each year's paragraphs in df, used to look the text up again
doc1_idx = df[df.fyear == 1996].index.tolist()
doc2_idx= df[df.fyear == 1997].index.tolist()

doc1 = df[df.fyear == 1996].text.tolist()
doc2 = df[df.fyear == 1997].text.tolist()

# match_two_docs uses np.dot, so it needs unit-normalized NumPy arrays,
# not (possibly GPU-resident) tensors
doc1 = bert_model.encode(doc1, convert_to_numpy=True, normalize_embeddings=True)
doc2 = bert_model.encode(doc2, convert_to_numpy=True, normalize_embeddings=True)

array([1996, 1997, 1998, 1999, 2000, 2001])

In [None]:
df[df.fyear.isin([1996,1997])]

Unnamed: 0,fyear,pid,text
0,1996,0,The following discussion and analysis of the f...
1,1996,1,The Company does not provide forecasts of the ...
2,1996,2,The software industry is highly competitive an...
3,1996,3,Revenue. The Company's revenue consists of lic...
4,1996,4,International revenue decreased dd% to $dd in ...
5,1996,5,Software license revenue decreased dd% in fisc...
6,1996,6,Maintenance and services revenue increased dd%...
7,1996,7,Cost of Revenue and Gross Profit. The Company'...
8,1996,8,Cost of maintenance and services revenue consi...
9,1996,9,Selling and Marketing. Selling and marketing e...


In [None]:
# Optimal one-to-one matching of the two years' paragraphs;
# score holds the matched-similarity total normalised by the smaller and by the larger doc.
score, row_index, col_index, sim = match_two_docs(doc1, doc2)
score

[0.9614706773024339, 0.8927942003522601]

In [None]:
row_index
col_index
#sim

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25])

array([ 0, 25, 26, 11, 12, 13, 14,  4,  5, 17, 18, 19, 20, 21,  2,  3,  1,
       15, 16,  6,  7,  8,  9, 10, 22, 23])

In [None]:
# Print every matched pair: 1-based paragraph numbers, the df index labels in
# parentheses, the similarity, then the two paragraph texts.
for row_id, col_id in zip(row_index, col_index):
    print(row_id+1, "("+str(doc1_idx[row_id])+")", "\t",
          col_id+1, "("+str(doc2_idx[col_id])+")",
          "\t", '%.2f'%sim[row_id, col_id], "\t")
    print("-------------")
    print(row_id, df.loc[doc1_idx[row_id]].text)
    print("-------------")
    print(col_id, df.iloc[doc2_idx[col_id]].text)
    print("\n")
    

1 (0) 	 1 (26) 	 1.00 	
-------------
0 The following discussion and analysis of the financial condition and results of operations of the Company should be read in conjunction with the financial statements and related notes thereto.
-------------
0 The following discussion and analysis of the financial condition and results of operations of the Company should be read in conjunction with the financial statements and related notes thereto.


2 (1) 	 26 (51) 	 0.97 	
-------------
1 The Company does not provide forecasts of the future financial performance of the Company. However, from time to time information provided by the Company or statements made by its employees may contain "forward looking" information that involves risks and uncertainties. In particular, statements contained in this Form dd-K which are not historical facts constitute forward looking statements and are made under the safe harbor provisions of the Private Securities Litigation Reform Act of dd. The Company's actual

In [None]:


# Print every matched pair with 0-based paragraph numbers, the df index labels
# in parentheses, the similarity and a per-paragraph length for each side.
# NOTE(review): doc1_len and doc2_len are not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. Presumably they
# hold per-paragraph lengths; confirm and define them before use.
for i, row_id in enumerate(row_index):
    print(row_id, "("+str(doc1_idx[row_id])+")", "\t",
          col_index[i],"("+str(doc2_idx[col_index[i]])+")",\
          "\t", '%.2f'%sim[row_id, col_index[i]], "\t",\
          doc1_len[row_id], "\t",\
          doc2_len[col_index[i]])
    print("-------------")
    print(row_id,df.loc[doc1_idx[row_id]].text)
    print("-------------")
    print(col_index[i], df.iloc[doc2_idx[col_index[i]]].text)
    print("\n")
    

0 (0) 	 0 (26) 	 1.00 	 14 	 14
-------------
0 The following discussion and analysis of the financial condition and results of operations of the Company should be read in conjunction with the financial statements and related notes thereto.
-------------
0 The following discussion and analysis of the financial condition and results of operations of the Company should be read in conjunction with the financial statements and related notes thereto.


1 (1) 	 25 (51) 	 0.95 	 45 	 50
-------------
1 The Company does not provide forecasts of the future financial performance of the Company. However, from time to time information provided by the Company or statements made by its employees may contain "forward looking" information that involves risks and uncertainties. In particular, statements contained in this Form dd-K which are not historical facts constitute forward looking statements and are made under the safe harbor provisions of the Private Securities Litigation Reform Act of dd. The 

In [None]:
# Per-filing status table, read relative to the working directory set in the first cell.
mda_status = pd.read_csv("data/mda/disclosure_mda_status.csv")
mda_status.head()

Unnamed: 0,cik_orig,name_orig,form_orig,filing_date_orig,text_file,html_file,sic_orig,industry_orig,file_path,num_para,...,f_filing_date,f_name,f_sic,f_cik,h_filing_date,h_report_period,cik,report_period,filing_date,fyear
0,20,K TRON INTERNATIONAL INC,10-K,1996-03-28,edgar/data/20/0000893220-96-000500.txt,edgar/data/20/0000893220-96-000500-index.html,3823.0,Durable Manufacturers,20/0000893220-96-000500.txt,41.0,...,1996-03-28,K TRON INTERNATIONAL INC,"INDUSTRIAL INSTRUMENTS FOR MEASUREMENT, DISPLA...",20.0,1996-03-28,1995-12-31,20,1995-12-31,1996-03-28,1995
1,20,K TRON INTERNATIONAL INC,10-K,1997-03-19,edgar/data/20/0000893220-97-000572.txt,edgar/data/20/0000893220-97-000572-index.html,3823.0,Durable Manufacturers,20/0000893220-97-000572.txt,31.0,...,1997-03-19,K TRON INTERNATIONAL INC,"INDUSTRIAL INSTRUMENTS FOR MEASUREMENT, DISPLA...",20.0,1997-03-19,1996-12-28,20,1996-12-28,1997-03-19,1996
2,20,K TRON INTERNATIONAL INC,10-K405,1998-03-18,edgar/data/20/0000893220-98-000560.txt,edgar/data/20/0000893220-98-000560-index.html,3823.0,Durable Manufacturers,20/0000893220-98-000560.txt,34.0,...,1998-03-18,K TRON INTERNATIONAL INC,"INDUSTRIAL INSTRUMENTS FOR MEASUREMENT, DISPLA...",20.0,1998-03-18,1998-01-03,20,1998-01-03,1998-03-18,1997
3,20,K TRON INTERNATIONAL INC,10-K,1999-03-23,edgar/data/20/0000893220-99-000357.txt,edgar/data/20/0000893220-99-000357-index.html,3823.0,Durable Manufacturers,20/0000893220-99-000357.txt,49.0,...,1999-03-23,K TRON INTERNATIONAL INC,"INDUSTRIAL INSTRUMENTS FOR MEASUREMENT, DISPLA...",20.0,1999-03-23,1999-01-02,20,1999-01-02,1999-03-23,1998
4,20,K TRON INTERNATIONAL INC,10-K405,2000-03-30,edgar/data/20/0000893220-00-000394.txt,edgar/data/20/0000893220-00-000394-index.html,3823.0,Durable Manufacturers,20/0000893220-00-000394.txt,40.0,...,2000-03-30,K TRON INTERNATIONAL INC,"INDUSTRIAL INSTRUMENTS FOR MEASUREMENT, DISPLA...",20.0,2000-03-30,2000-01-01,20,2000-01-01,2000-03-30,1999


In [None]:
mda_status[mda_status.cik ==1853]

Unnamed: 0,cik_orig,name_orig,form_orig,filing_date_orig,text_file,html_file,sic_orig,industry_orig,file_path,num_para,...,f_filing_date,f_name,f_sic,f_cik,h_filing_date,h_report_period,cik,report_period,filing_date,fyear
64,1853,ABERDEEN IDAHO MINING CO,10KSB,2003-03-17,edgar/data/1853/0001052918-03-000043.txt,edgar/data/1853/0001052918-03-000043-index.html,3711.0,Durable Manufacturers,1853/0001052918-03-000043.txt,4.0,...,2003-03-17,ABERDEEN IDAHO MINING CO,"MINING, QUARRYING OF NONMETALLIC MINERALS (NO ...",1853.0,2003-03-17,2002-12-31,1853,2002-12-31,2003-03-17,2002
65,1853,ABERDEEN IDAHO MINING CO,10KSB,2004-02-20,edgar/data/1853/0001052918-04-000065.txt,edgar/data/1853/0001052918-04-000065-index.html,3711.0,Durable Manufacturers,1853/0001052918-04-000065.txt,4.0,...,2004-02-20,ABERDEEN IDAHO MINING CO,"MINING, QUARRYING OF NONMETALLIC MINERALS (NO ...",1853.0,2004-02-20,2003-12-31,1853,2003-12-31,2004-02-20,2003
66,1853,"MotivNation, Inc.",10KSB,2005-05-19,edgar/data/1853/0001077048-05-000297.txt,edgar/data/1853/0001077048-05-000297-index.html,3711.0,Durable Manufacturers,1853/0001077048-05-000297.txt,55.0,...,2005-05-19,"MotivNation, Inc.","MINING, QUARRYING OF NONMETALLIC MINERALS (NO ...",1853.0,2005-05-19,2004-12-31,1853,2004-12-31,2005-05-19,2004
67,1853,"MotivNation, Inc.",10KSB,2006-03-31,edgar/data/1853/0001077048-06-000127.txt,edgar/data/1853/0001077048-06-000127-index.html,3711.0,Durable Manufacturers,1853/0001077048-06-000127.txt,59.0,...,2006-03-31,"MotivNation, Inc.",MOTOR VEHICLES & PASSENGER CAR BODIES [3711],1853.0,2006-03-31,2005-12-31,1853,2005-12-31,2006-03-31,2005
68,1853,"MotivNation, Inc.",10KSB,2007-04-17,edgar/data/1853/0001077048-07-000192.txt,edgar/data/1853/0001077048-07-000192-index.html,3711.0,Durable Manufacturers,1853/0001077048-07-000192.txt,72.0,...,2007-04-17,"MotivNation, Inc.",MOTOR VEHICLES & PASSENGER CAR BODIES [3711],1853.0,2007-04-17,2006-12-31,1853,2006-12-31,2007-04-17,2006
69,1853,"MotivNation, Inc.",10KSB,2008-05-15,edgar/data/1853/0001266068-08-000048.txt,edgar/data/1853/0001266068-08-000048-index.html,3711.0,Durable Manufacturers,1853/0001266068-08-000048.txt,73.0,...,2008-05-15,"MotivNation, Inc.",MOTOR VEHICLES & PASSENGER CAR BODIES [3711],1853.0,2008-05-15,2007-12-31,1853,2007-12-31,2008-05-15,2007
70,1853,"MotivNation, Inc.",10-K,2009-04-13,edgar/data/1853/0001266068-09-000017.txt,edgar/data/1853/0001266068-09-000017-index.html,3711.0,Durable Manufacturers,1853/0001266068-09-000017.txt,29.0,...,2009-04-13,"MotivNation, Inc.",MOTOR VEHICLES & PASSENGER CAR BODIES [3711],1853.0,2009-04-13,2008-12-31,1853,2008-12-31,2009-04-13,2008
71,1853,"MotivNation, Inc.",10-K,2010-05-14,edgar/data/1853/0001266068-10-000020.txt,edgar/data/1853/0001266068-10-000020-index.html,3711.0,Durable Manufacturers,1853/0001266068-10-000020.txt,30.0,...,2010-05-14,"MotivNation, Inc.",MOTOR VEHICLES & PASSENGER CAR BODIES [3711],1853.0,2010-05-14,2009-12-31,1853,2009-12-31,2010-05-14,2009


In [None]:
# Try one pair with firm-specific DTM
# NOTE(review): `para` and `create_dtm` are not defined anywhere in the visible
# notebook, so this cell fails on a fresh kernel. It also overwrites the global
# `docs` list built by the corpus-flattening cell.

cik = 882692
year1 = 1998
year2 = 1999


# all paragraphs of this firm; reset_index gives positions 0..n-1 to index the DTM with
df_copy = para[(para.cik == cik)][["year","text"]].copy()
df_copy = df_copy.reset_index()
    
docs = df_copy["text"]
dtm = create_dtm(docs)
    
# rows of df_copy (and of dtm) belonging to each of the two years
doc1_idx = df_copy[df_copy.year == year1].index.tolist()
doc2_idx = df_copy[df_copy.year == year2].index.tolist()

doc1 = dtm[doc1_idx]
doc2 = dtm[doc2_idx]

print(doc1_idx)
print(doc2_idx)



[35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]


In [None]:
# Match the two years' paragraphs, then print each matched pair with its
# positions, df_copy index labels, similarity and both texts.
score, row_index, col_index, sim = match_two_docs(doc1, doc2)
score

for row_id, col_id in zip(row_index, col_index):
    print(row_id, "("+str(doc1_idx[row_id])+")", "\t",
          col_id, "("+str(doc2_idx[col_id])+")",
          "\t", '%.2f'%sim[row_id, col_id])
    print("-------------")
    print(row_id, df_copy.loc[doc1_idx[row_id]].text)
    print("-------------")
    print(col_id, df_copy.iloc[doc2_idx[col_id]].text)
    print("\n")
    

[0.5736552472880472,
 0.4897056989044305,
 0.6840647070895525,
 0.6313175489525268]

0 (35) 	 7 (7) 	 0.87
-------------
0 the company 's   revenues   include sales of software and software   licenses net of sales   returns .   cost of   revenues   includes   the   costs   of   manuals ,   software duplication ,   packaging materials , assembly , paper goods , shipping , amortization of software   development costs and royalty fees paid to licensors of third - party software bundled with the company 's products .
-------------
7 cost of revenues includes the costs of manuals , software duplication ,   packaging materials , assembly , paper goods , shipping and royalty fees paid to licensors of third - party software bundled with the company 's products .


1 (36) 	 8 (8) 	 0.80
-------------
1 research and development   expenses consist   primarily of personnel and equipment costs required to conduct the company 's development effort . software development costs are expensed as incurred .
-------------
8 research and development   expenses consist   primarily of person

In [None]:
# Show doc1 paragraphs that are either unmatched or matched with a
# similarity below `thresh` (0.3 for now; may change to a lower number).
thresh = 0.3
row_index
col_index

# position of each matched doc1 paragraph inside the assignment arrays,
# built once instead of searching row_index on every iteration
match_pos = {int(r): k for k, r in enumerate(row_index)}

for i, para_id in enumerate(doc1_idx):
    match_id = match_pos.get(i)

    if match_id is None:
        # paragraph i of doc1 was left without a partner
        print(i, "("+str(doc1_idx[i])+")", "\t", '\t', "\t", '\t','\t')
        print("-------------")
        print(df_copy.loc[para_id].text)
        print("\n")
        continue

    row_id = row_index[match_id]
    col_id = col_index[match_id]

    if sim[row_id, col_id] < thresh:
        print(row_id, "("+str(doc1_idx[row_id])+")","\t",
              col_id, "("+str(doc2_idx[col_id])+")", "\t",
              '%.2f'%sim[row_id, col_id])
        print("-------------")
        print(row_id, df_copy.loc[doc1_idx[row_id]].text)
        print("-------------")
        print(col_id, df_copy.iloc[doc2_idx[col_id]].text)
        print("\n")

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 29, 32, 33, 34, 36, 38, 39,
       40])

array([ 7,  8,  6,  9, 10, 11, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
       28, 25, 26, 29, 30, 31,  1, 32, 12,  0,  2, 13, 33, 27,  3,  5,  4,
       34])

3 (38) 	 9 (9) 	 0.27
-------------
3 during dd and dd ,   the   company 's   research and   development ,   marketing and sales efforts were focused upon the   company 's   core   technologies   in sound and music .   the company has continued to reduce its reliance on technology   licensed from others through   internal   development of superior   technologies   or through direct acquisition of desirable technologies .
-------------
9 during dd and dd ,   the   company 's   research   and   development   efforts   were focused upon the company 's   core   technologies   in sound and music .   in dd the company   substantially   increased its expenditures for product development as it began   development   of   recordlab.com   and content for the website .   the company believes that successful   execution of this strategy will result in higher gross margins for its software business .   these development   efforts may not result in timely   introduction   of   new   products ,   and 

# Calculate all firms

In [None]:
# Compute year-over-year paragraph similarity for every firm and checkpoint
# the accumulated results to disk while the run progresses.
threads = 0        # number of firms processed so far
finishes = 0
batch = 100        # write a checkpoint every `batch` firms
result_map = {}    # cik -> {(year1, year2): {"score", "size", "match"}}
output = []
results = []
#pool=mp.Pool(processes=1)
start= time.time()

result_path = "data/change/paragraph_sim_bert.pkl"

print("total ciks: ", len(para_map))

cnt = 0

ciks = list(para_map.keys())

# Firms are visited in reverse key order.
for cik in ciks[::-1]:
#for cik in para_map:

    df = get_df_for_cik(cik, para_map)

    results = compute_sim(cik, df, bert_model, 1)

    for r in results:
        # r[0] is the cik passed in above, so the loop variable is left untouched
        y1, y2, score, size = r[1:5]
        # one row per matched pair: [doc1 position, doc2 position, similarity]
        match = np.vstack([r[5], r[6], r[7]]).T

        entry = {"score": score, "size": size, "match": match}

        result_map.setdefault(cik, {})[(y1, y2)] = entry

    threads+=1

    if threads%batch == 0:
        # the context manager flushes and closes the checkpoint file
        with open(result_path, "wb") as pkl_file:
            pickle.dump(result_map, pkl_file)

    if threads%10 == 0:
        print("finish process {0} @ {1:.2f}".format(threads, time.time()-start))

with open(result_path, "wb") as pkl_file:
    pickle.dump(result_map, pkl_file)

print("\n==========Done !!!!=============")

total ciks:  20893
finish process 10 @ 4.56
finish process 20 @ 9.01
finish process 30 @ 14.23
finish process 40 @ 20.74
finish process 50 @ 26.24
finish process 60 @ 31.19
finish process 70 @ 39.92
finish process 80 @ 48.19
finish process 90 @ 55.15
finish process 100 @ 63.17
finish process 110 @ 68.61
finish process 120 @ 75.45
finish process 130 @ 80.92
finish process 140 @ 83.74
finish process 150 @ 88.64
finish process 160 @ 95.02
finish process 170 @ 99.99
finish process 180 @ 106.14
finish process 190 @ 114.16
finish process 200 @ 125.03
finish process 210 @ 135.16
finish process 220 @ 139.90
finish process 230 @ 147.06
finish process 240 @ 149.40
finish process 250 @ 159.57
finish process 260 @ 167.05
finish process 270 @ 176.38
finish process 280 @ 187.59
finish process 290 @ 193.37
finish process 300 @ 201.78
finish process 310 @ 214.01
finish process 320 @ 220.77
finish process 330 @ 229.67
finish process 340 @ 237.10
finish process 350 @ 246.51
finish process 360 @ 259.16
f

In [24]:
# Consolidate all results: fold the first result file into the second one.

with open("data/change/paragraph_sim_bert.pkl", "rb") as pkl_file:
    para_sim = pickle.load(pkl_file)
with open("data/change/paragraph_sim_bert1.pkl", "rb") as pkl_file:
    para_sim2 = pickle.load(pkl_file)

# Entries already present in para_sim2 are kept; only missing firms and
# missing year pairs are copied over from para_sim.
for cik in para_sim:
    if cik not in para_sim2:
        para_sim2[cik] = para_sim[cik]
    else:
        for year_pair, entry in para_sim[cik].items():
            para_sim2[cik].setdefault(year_pair, entry)

print(len(para_sim2))

18278


In [25]:
para_sim2[20][(1995, 1996)]

{'match': array([[ 0.        ,  0.        ,  0.98386681],
        [ 1.        ,  1.        ,  0.95965493],
        [ 5.        ,  2.        ,  0.93160373],
        [ 6.        , 20.        ,  0.86246496],
        [ 7.        ,  3.        ,  0.98884386],
        [ 8.        ,  4.        ,  0.96260548],
        [11.        ,  5.        ,  0.9747777 ],
        [12.        ,  7.        ,  0.99360722],
        [13.        ,  6.        ,  0.9221859 ],
        [14.        ,  8.        ,  0.97504497],
        [15.        ,  9.        ,  0.98967874],
        [16.        , 10.        ,  0.98671734],
        [17.        , 11.        ,  0.98505539],
        [18.        , 12.        ,  0.99050504],
        [19.        , 13.        ,  0.94404954],
        [20.        , 14.        ,  0.98394495],
        [23.        , 15.        ,  1.        ],
        [24.        , 16.        ,  0.95593768],
        [25.        , 18.        ,  0.84737688],
        [26.        , 19.        ,  0.88451612],
        [27

In [26]:
# Overwrite the main result file with the consolidated map; the context
# manager flushes and closes the file.
with open("data/change/paragraph_sim_bert.pkl", "wb") as pkl_file:
    pickle.dump(para_sim2, pkl_file)

In [None]:
#pickle.dump(result_map, open("data/metric/paragraph_sim.pkl", "wb"))

# Tokenize and clean up paragraph text

In [None]:
# Reload the raw paragraph map (cik -> {fyear: list of paragraphs}) before
# tokenizing; the context manager closes the file handle after loading.
with open("data/mda/paragraphs_1994_2016.pkl", 'rb') as pkl_file:
    para_map = pickle.load(pkl_file)
len(para_map)

20893

In [None]:
# The token pattern accepts either a word containing an apostrophe followed by
# v, n or t (e.g. "we've", "don't"), or any run of two or more word characters.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+'[vnt]\w*\b|\b\w\w+\b")
# Only the tokenizer callable is used below; the vectorizer itself is never fitted here.
tokenizer = vectorizer.build_tokenizer()

In [None]:
def tokenize_paragraph(cik, para_per_cik, min_split_len=10):
    """Tokenize every paragraph of one firm and split run-together words.

    Each paragraph is tokenized with the module-level ``tokenizer``; tokens of
    at least ``min_split_len`` characters are passed to ``wordninja.split`` in
    case they are several words concatenated.

    The input mapping is left untouched and a new mapping is returned, so the
    function can safely be called again on the same raw data.

    Args:
        cik: firm identifier, returned unchanged so asynchronous callers can
            tell results apart.
        para_per_cik: mapping ``fyear -> sequence of paragraph strings``.
        min_split_len: shortest token length that is handed to wordninja.

    Returns:
        ``(cik, tokenized)`` where ``tokenized`` maps each ``fyear`` to a list
        of token lists, one per paragraph.
    """
    tokenized = {}

    for fyear, paragraphs in para_per_cik.items():

        year_tokens = []

        for p in paragraphs:

            cleaned_tokens = []

            for w in tokenizer(p):  # detect if long tokens are words concatenated
                if len(w) >= min_split_len:
                    cleaned_tokens += wordninja.split(w)
                else:
                    cleaned_tokens.append(w)

            year_tokens.append(cleaned_tokens)

        tokenized[fyear] = year_tokens

    return (cik, tokenized)

In [None]:
# run by cik: tokenize each firm's paragraphs in a pool of worker processes

threads = 0
finishes = 0
batch = 5

#output = {}
results = []
start= time.time()

#para_map = pickle.load(open("data/mda/paragraphs_1994_2016.pkl", 'rb'))
print("total ciks: ", len(para_map))

cnt = 0

# The context manager shuts the worker pool down when the block exits, even if
# a task raises; all results are therefore collected inside the block.
with mp.Pool(processes=10) as pool:

    for cik in para_map:

        para_per_cik = para_map[cik]

        threads+=1

        results.append(pool.apply_async(tokenize_paragraph, args = (cik, para_per_cik)))

        if threads%batch == 0:
            print("send process {}".format(threads))
            # NOTE(review): this break stops submission after the first `batch`
            # firms, so the file written below mixes tokenized and raw firms.
            # It looks like a debugging limit; remove it for a full run.
            break

    for p in results:

        cik, out = p.get()

        para_map[cik] = out

        finishes += 1

        if finishes%batch == 0:

            print("finish process {} @ {}".format(finishes, time.time()-start))

            # save continously
            #pickle.dump(output, open("data/metric/paragraph_sim.pkl", "wb"))

with open("data/mda/paragraph_1994_2016_tokenized.pkl", "wb") as pkl_file:
    pickle.dump(para_map, pkl_file)

print("\n==========Done !!!!=============")

In [None]:
# Sequential variant: tokenize every paragraph of every cik in place and
# print the elapsed time after each 100 ciks.
start = time.time()
cnt = 0

for cik, years in para_map.items():

    for fyear, paragraphs in years.items():

        for i, p in enumerate(paragraphs):

            tokens = tokenizer(p)
            cleaned_tokens = []

            for w in tokens:
                # long tokens may be concatenated words -> let wordninja split them
                if len(w) < 10:
                    cleaned_tokens.append(w)
                else:
                    cleaned_tokens.extend(wordninja.split(w))

            paragraphs[i] = cleaned_tokens

    cnt += 1
    if cnt % 100 == 0:
        print("{0} in {1:.2f}".format(cnt, time.time()-start))

100 in 44.95
200 in 90.14
300 in 138.05
400 in 181.83
500 in 234.10
600 in 269.75
700 in 312.43
800 in 357.37
900 in 402.38
1000 in 442.00
1100 in 489.13
1200 in 533.83
1300 in 560.45
1400 in 591.92
1500 in 633.12
1600 in 676.68
1700 in 718.98
1800 in 755.44
1900 in 783.16
2000 in 819.35
2100 in 858.67
2200 in 887.31
2300 in 921.46
2400 in 959.06
2500 in 1002.05
2600 in 1028.67
2700 in 1058.65
2800 in 1086.27
2900 in 1111.81
3000 in 1141.03
3100 in 1166.85
3200 in 1190.46
3300 in 1219.45
3400 in 1243.65
3500 in 1279.32
3600 in 1310.75
3700 in 1345.34
3800 in 1374.34
3900 in 1400.65
4000 in 1428.96
4100 in 1453.54
4200 in 1486.08
4300 in 1518.11
4400 in 1542.23
4500 in 1569.35
4600 in 1594.66
4700 in 1628.74
4800 in 1655.96
4900 in 1680.48
5000 in 1702.00
5100 in 1737.00
5200 in 1763.70
5300 in 1795.63
5400 in 1827.53
5500 in 1868.48
5600 in 1895.67
5700 in 1927.73
5800 in 1960.62
5900 in 1996.18
6000 in 2030.93
6100 in 2064.61
6200 in 2113.59
6300 in 2141.97
6400 in 2177.11
6500 in 221

In [None]:
#pickle.dump(para_map, open("data/mda/paragraph_1994_2016_tokenized.pkl", "wb"))

# Reload the tokenized paragraph map; the context manager closes the file.
with open("data/mda/paragraph_1994_2016_tokenized.pkl", "rb") as fh:
    para_map = pickle.load(fh)

In [None]:
# Peek at the loaded data: print the first paragraph of the first fiscal year.
# Only the two inner loops break, so the loop over ciks keeps running and one
# paragraph is printed per cik. The loop variable `p` stays bound afterwards
# and is used by the next cell.
for cik in para_map:
    for y in para_map[cik]:
        for p in para_map[cik][y]:
            print(p)
            break
        break

['In', 'dd', 'the', 'Company', 'reported', 'pre', 'tax', 'loss', 'of', 'dd', 'The', 'principal', 'components', 'of', 'this', 'loss', 'were', 'second', 'quarter', 'loss', 'of', 'dd', 'recorded', 'in', 'connection', 'with', 'the', 'sale', 'of', 'Color', 'tronic', 'GmbH', 'Color', 'tronic', 'German', 'subsidiary', 'and', 'related', 'patent', 'and', 'patent', 'applications', 'and', 'third', 'and', 'fourth', 'quarter', 'losses', 'totaling', 'dd', 'resulting', 'from', 'the', 'sale', 'of', 'the', 'Company', 'Brazilian', 'and', 'Hasler', 'France', 'businesses', 'On', 'pro', 'forma', 'basis', 'assuming', 'that', 'these', 'transactions', 'and', 'the', 'discontinuance', 'of', 'the', 'Company', 'other', 'Color', 'tronic', 'brand', 'business', 'had', 'occurred', 'at', 'the', 'beginning', 'of', 'the', 'dd', 'fiscal', 'year', 'the', 'Company', 'had', 'dd', 'pre', 'tax', 'income', 'of', 'dd', 'and', 'net', 'income', 'of', 'dd']
['The', 'Company', 'reports', 'its', 'activities', 'in', 'one', 'business'

In [None]:
''.join(p)

'InddtheCompanyreportedpretaxlossofddTheprincipalcomponentsofthislossweresecondquarterlossofddrecordedinconnectionwiththesaleofColortronicGmbHColortronicGermansubsidiaryandrelatedpatentandpatentapplicationsandthirdandfourthquarterlossestotalingddresultingfromthesaleoftheCompanyBrazilianandHaslerFrancebusinessesOnproformabasisassumingthatthesetransactionsandthediscontinuanceoftheCompanyotherColortronicbrandbusinesshadoccurredatthebeginningoftheddfiscalyeartheCompanyhadddpretaxincomeofddandnetincomeofdd'

In [None]:
 #wordninja.split(''.join(p))

# Compute words per paragraph

In [None]:
def tokenize_and_clean(doc, tokenizer):
    """Tokenize ``doc`` and break up over-long tokens.

    ``tokenizer`` is a callable turning the document into a sequence of
    tokens. Tokens shorter than ten characters are kept unchanged; longer
    ones are replaced by the pieces returned by ``wordninja.split``, on the
    assumption that they may be several words concatenated.

    Returns the resulting flat list of tokens.
    """
    pieces = []

    for token in tokenizer(doc):
        if len(token) < 10:
            pieces.append(token)
            continue
        # possibly concatenated words
        pieces.extend(wordninja.split(token))

    return pieces

In [None]:
def get_words_by_cat(words, WORDS_BY_CAT):
    """Group the words of a paragraph by lexicon category.

    For every category in ``WORDS_BY_CAT`` collect the items of ``words``
    (in order, duplicates kept) that belong to that category's word
    collection. Categories without any hit are left out of the result.

    Returns a dict ``{category: [matching words]}``.
    """
    hits_by_cat = {}

    for cat in WORDS_BY_CAT:
        lexicon = WORDS_BY_CAT[cat]
        hits = list(filter(lambda word: word in lexicon, words))

        if hits:
            hits_by_cat[cat] = hits

    return hits_by_cat

In [None]:
def get_words_per_cik(cik, para_per_cik, WORDS_BY_CAT, tokenizer):
    """Replace each paragraph of one cik by its category words and length.

    Every paragraph in ``para_per_cik[fyear]`` is tokenized with
    ``tokenize_and_clean`` and overwritten in place by the tuple
    ``(words_by_category, number_of_tokens)``, where the first element comes
    from ``get_words_by_cat``.

    Returns ``(cik, para_per_cik)``.
    """
    for paragraphs in para_per_cik.values():

        for idx, text in enumerate(paragraphs):

            tokens = tokenize_and_clean(text, tokenizer)

            # (words found per category, paragraph length in tokens)
            paragraphs[idx] = (get_words_by_cat(tokens, WORDS_BY_CAT), len(tokens))

    return cik, para_per_cik

In [None]:
def run_compute_words():
    """Compute per-paragraph category words for the first 100 ciks in parallel.

    Loads the raw paragraph map and the extended lexicon from disk, submits
    one ``get_words_per_cik`` task per cik to a process pool, writes each
    result back into the map and pickles the map to
    ``data/metric/paragraph_words.pkl``. Progress is printed to stdout.

    Returns None.
    """
    vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+'[vnt]\w*\b|\b\w\w+\b")
    tokenizer = vectorizer.build_tokenizer()

    # run by cik
    with open("data/mda/paragraphs_1994_2016.pkl", 'rb') as fh:
        para_map = pickle.load(fh)
    # Only the first 100 ciks are kept for this run.
    para_map = {key: para_map[key] for key in list(para_map.keys())[0:100]}

    with open("data/extended_lexicon.pkl", "rb") as fh:
        WORDS_BY_CAT = pickle.load(fh)

    threads = 0
    finishes = 0
    batch = 100

    results = []
    pool = mp.Pool(processes=10)
    start = time.time()

    print("total ciks: ", len(para_map))

    try:
        for cik in para_map:

            para_per_cik = para_map[cik]
            args = (cik, para_per_cik, WORDS_BY_CAT, tokenizer)

            threads += 1

            results.append(pool.apply_async(get_words_per_cik, args=args))

            if threads % batch == 0:
                print("send process {}".format(threads))
                # Stop submitting once `batch` tasks are queued.
                break

        for p in results:

            cik, out = p.get()

            para_map[cik] = out

            finishes += 1

            if finishes % batch == 0:

                print("finish process {0} @ {1:.2f}".format(finishes, time.time()-start))

                # save continuously
                #pickle.dump(output, open("data/metric/paragraph_words.pkl", "wb"))
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()

    with open("data/metric/paragraph_words.pkl", "wb") as fh:
        pickle.dump(para_map, fh)

    print("\n==========Done !!!!=============")

In [None]:
run_compute_words()

total ciks:  100
send process 100
finish process 100 @ 6.79



In [None]:
# NOTE(review): run_compute_words() writes data/metric/paragraph_words.pkl,
# while this cell reads data/change/paragraph_words.pkl -- confirm the path.
with open("data/change/paragraph_words.pkl", "rb") as fh:
    para_map = pickle.load(fh)


In [None]:
para_map[1007021][1997][27]

({'WeakModal': ['could', 'may', 'may', 'could'],
  'Negative': ['adversely',
   'unanticipated',
   'shortfall',
   'adverse',
   'foregoing',
   'adversely'],
  'Compare': ['variety'],
  'Discrep': ['could', 'could'],
  'Positive': ['profitability', 'enhancements'],
  'Uncertainty': ['could',
   'anticipated',
   'may',
   'may',
   'fluctuations',
   'could']},
 107)

In [None]:
para_map[1007021][1997][26]

({'WeakModal': ['may', 'could'],
  'Negative': ['invalidation', 'adverse'],
  'Compare': ['either', 'broader', 'more', 'greater', 'greater', 'than'],
  'Discrep': ['needs', 'could'],
  'Positive': ['achieving', 'enhanced', 'benefit', 'greater', 'greater'],
  'Achieve': ['potential'],
  'Reward': ['benefit', 'greater', 'greater'],
  'Uncertainty': ['may',
   'risks',
   'uncertainties',
   'anticipation',
   'dependent',
   'could']},
 190)

In [None]:
# Load the paragraph-similarity results; the context manager closes the file.
with open("data/metric/paragraph_sim.pkl", "rb") as fh:
    para_sim = pickle.load(fh)

In [None]:
para_sim[20][(1995, 1996)]

{'score': [0.7632464241544741,
  0.5770887597265536,
  0.7463789646787461,
  0.5922951884016213],
 'size': (41, 31),
 'match': array([[ 0.        ,  0.        ,  0.93753694],
        [ 1.        ,  1.        ,  0.55408075],
        [ 3.        , 20.        ,  0.52002718],
        [ 5.        ,  2.        ,  0.65019177],
        [ 6.        , 21.        ,  0.38627732],
        [ 7.        ,  3.        ,  0.73903348],
        [ 8.        ,  4.        ,  0.73029   ],
        [11.        ,  5.        ,  0.73333307],
        [12.        ,  7.        ,  0.98088939],
        [13.        ,  6.        ,  0.54827539],
        [14.        ,  8.        ,  0.69575396],
        [15.        ,  9.        ,  0.86804056],
        [16.        , 10.        ,  0.88165404],
        [17.        , 11.        ,  0.96988378],
        [18.        , 12.        ,  0.88925919],
        [19.        , 13.        ,  0.79964875],
        [20.        , 14.        ,  0.96610245],
        [23.        , 15.        ,  1.   

# Calculate granular metrics

In [8]:
def get_word_diff(word1_by_cat, word2_by_cat, word_cats):
    """Relative change in category word counts between two paragraphs.

    For each category in ``word_cats`` the number of words listed under that
    category is taken from both mappings (0 when the category is absent).
    The change ``count2 - count1`` is divided by the larger of the two
    counts, so it lies in [-1, 1]; equal counts give 0.

    Returns a ``pd.Series`` indexed by category.
    """
    def count(by_cat, cat):
        if cat not in by_cat:
            return 0
        return len(by_cat[cat])

    changes = {}
    for cat in word_cats:
        before, after = count(word1_by_cat, cat), count(word2_by_cat, cat)

        if before == after:
            changes[cat] = 0
        else:
            # counts differ, so the larger one is at least 1
            changes[cat] = (after - before) / max(before, after)

    return pd.Series(changes)

In [9]:
def _mean_category_share(paragraph_ids, word_dict, word_cats, suffix):
    """Average per-category word share over the given paragraphs, labelled ``<cat><suffix>``."""
    shares = {}
    for p in paragraph_ids:
        p_words = word_dict[p][0]
        p_len = word_dict[p][1]

        # share of the paragraph's words that fall in each category
        shares[p] = {key: (len(p_words[key])/p_len if key in p_words else 0) for key in word_cats}

    mean_share = pd.DataFrame.from_dict(shares, orient="index").mean(axis=0)
    mean_share.index = [i + suffix for i in mean_share.index]
    return mean_share


def get_granular_metric(cik, y1, y2, match_result, word_dict1, word_dict2,  word_cats, \
                        sim_th = 0.3, change_th = 0.1):
    """Compute granular change metrics for one (cik, y1, y2) pair.

    Parameters
    ----------
    cik, y1, y2 : identifiers that are only echoed back in the returned key.
    match_result : dict with keys ``"score"`` (two values, labelled ``s_min``
        and ``s_max`` in the output), ``"size"`` (number of paragraphs in y1
        and y2) and ``"match"`` (rows of ``p1, p2, sim``).
    word_dict1, word_dict2 : per-paragraph ``(words_by_category, length)``
        tuples for y1 and y2, indexed by paragraph position.
    word_cats : list of category names to report.
    sim_th : matches with ``sim`` below this value are discarded.
    change_th : minimum absolute relative change counted as up/down.

    Returns
    -------
    ``((cik, y1, y2), result)`` where ``result`` is an index-sorted
    ``pd.Series`` holding:

    * ``s_min`` / ``s_max``: the two scores,
    * ``<cat>_up`` / ``<cat>_down``: share of retained matches whose relative
      category change (see ``get_word_diff``) is ``>= change_th`` /
      ``<= -change_th``; only present when at least one match is retained,
    * ``<cat>_add``: mean category word share over y2 paragraphs that are not
      part of a retained match,
    * ``<cat>_del``: the same for y1 paragraphs.

    The parts are combined with ``pd.concat`` because ``Series.append`` was
    removed in pandas 2.0.
    """
    # match part
    parts = [pd.Series(match_result["score"], index=["s_min","s_max"])]

    size = match_result["size"]
    df = pd.DataFrame(match_result["match"], columns=["p1","p2","sim"])
    df[["p1","p2"]] = df[["p1","p2"]].astype(int)

    # .copy() so that the column assignment below does not write into a slice
    df = df[df.sim >= sim_th].copy()

    if len(df) > 0:

        # rows are upcast to float by apply(axis=1), hence the int() casts
        df[word_cats] = df.apply(lambda row: get_word_diff(word_dict1[int(row["p1"])][0], \
                                                       word_dict2[int(row["p2"])][0], word_cats), \
                             axis = 1)

        recur_up = (df[word_cats] >= change_th).astype(int).mean(axis = 0)
        recur_down = (df[word_cats] <= -change_th).astype(int).mean(axis = 0)

        recur_up.index = [i+"_up" for i in recur_up.index]
        recur_down.index = [i+"_down" for i in recur_down.index]

        parts.append(recur_up)
        parts.append(recur_down)

    # new added: y2 paragraphs without a retained match
    added = set(range(size[1])) - set(df["p2"].values.tolist())
    if len(added) > 0:
        parts.append(_mean_category_share(added, word_dict2, word_cats, "_add"))

    # delete: y1 paragraphs without a retained match
    deleted = set(range(size[0])) - set(df["p1"].values.tolist())
    if len(deleted) > 0:
        parts.append(_mean_category_share(deleted, word_dict1, word_cats, "_del"))

    result = pd.concat(parts).sort_index()

    return (cik, y1, y2), result

In [None]:
# Load the per-paragraph category words and the BERT similarity results,
# closing each file as soon as it has been read.
with open("data/change/paragraph_words.pkl", "rb") as fh:
    para_map = pickle.load(fh)

with open("data/change/paragraph_sim_bert.pkl", "rb") as fh:
    para_sim = pickle.load(fh)

In [27]:
#para_sim = para_sim2

In [None]:
# Single-pair example: compute the granular metrics for one cik and one
# pair of consecutive fiscal years.
key = 1007021
y1, y2 = 1996, 1997

match_result = para_sim[key][(y1, y2)]
word_dict1 = para_map[key][y1]
word_dict2 = para_map[key][y2]
# Category names to report. run_granular_metric() reads this module-level
# `word_cats`, so this cell has to run before it.
word_cats = ['WeakModal','Negative', 'Compare', 'Discrep','Positive',\
        'Achieve',  'Reward', 'StrongModal','Uncertainty', 'Litigious']

# Note: `key` is rebound here from the cik to the (cik, y1, y2) tuple returned
# by get_granular_metric.
key, metrics = get_granular_metric(key, y1, y2, match_result, word_dict1, word_dict2,  word_cats)

In [None]:
metrics

In [38]:
match_result

{'match': array([[ 0.        ,  0.        ,  0.99999994],
        [ 1.        , 25.        ,  0.96640611],
        [ 2.        , 26.        ,  0.89585179],
        [ 3.        , 11.        ,  0.99999988],
        [ 4.        , 12.        ,  1.        ],
        [ 5.        , 13.        ,  1.        ],
        [ 6.        , 14.        ,  0.99999988],
        [ 7.        ,  4.        ,  0.99920291],
        [ 8.        ,  5.        ,  0.99328971],
        [ 9.        , 17.        ,  0.99991393],
        [10.        , 18.        ,  0.99958462],
        [11.        , 19.        ,  0.99906731],
        [12.        , 20.        ,  0.99999994],
        [13.        , 21.        ,  0.99748617],
        [14.        ,  2.        ,  0.90479171],
        [15.        ,  3.        ,  0.95326108],
        [16.        ,  1.        ,  0.7643708 ],
        [17.        , 15.        ,  0.96092272],
        [18.        , 16.        ,  0.94390476],
        [19.        ,  6.        ,  0.9618541 ],
        [20

In [39]:
para_map[20][1995][2]

({'Achieve': ['loss'], 'Compare': ['as'], 'Negative': ['divest', 'loss']}, 65)

In [40]:
key
metrics

(1007021, 1996, 1997)

Achieve_add         0.000000
Achieve_down        0.192308
Achieve_up          0.000000
Compare_add         0.013932
Compare_down        0.192308
Compare_up          0.230769
Discrep_add         0.009346
Discrep_down        0.000000
Discrep_up          0.076923
Litigious_add       0.000000
Litigious_down      0.038462
Litigious_up        0.000000
Negative_add        0.028037
Negative_down       0.115385
Negative_up         0.076923
Positive_add        0.009346
Positive_down       0.076923
Positive_up         0.076923
Reward_add          0.000000
Reward_down         0.038462
Reward_up           0.076923
StrongModal_add     0.009259
StrongModal_down    0.038462
StrongModal_up      0.000000
Uncertainty_add     0.037297
Uncertainty_down    0.038462
Uncertainty_up      0.038462
WeakModal_add       0.018692
WeakModal_down      0.000000
WeakModal_up        0.038462
s_max               0.892794
s_min               0.961471
dtype: float64

In [None]:
def run_granular_metric():
    """Compute granular metrics for every (cik, y1, y2) pair in parallel.

    Reads the module-level ``para_sim``, ``para_map`` and ``word_cats``; they
    must be defined before this function is called. One
    ``get_granular_metric`` task (``sim_th=0.3``, ``change_th=0.1``) is
    submitted per year pair. The results are collected into a dict keyed by
    ``(cik, y1, y2)`` and pickled to ``data/change/granular_metric_bert.pkl``.
    They are also reshaped into a wide DataFrame (one row per pair, missing
    metrics filled with 0), which is written to
    ``data/change/granular_metric_df_bert.pkl``.

    Returns
    -------
    The wide DataFrame, so the caller can keep it instead of re-reading the
    pickle.
    """
    threads = 0
    finishes = 0
    batch = 100

    output = {}
    results = []
    pool = mp.Pool(processes=10)
    start = time.time()

    # len(para_sim) would be the number of ciks; count the year pairs instead.
    n_pairs = sum(len(para_sim[cik]) for cik in para_sim)
    print("total pairs: ", n_pairs)

    try:
        for cik in para_sim:

            for (y1, y2) in para_sim[cik]:

                match_result = para_sim[cik][(y1, y2)]
                word_dict1 = para_map[cik][y1]
                word_dict2 = para_map[cik][y2]

                args = (cik, y1, y2, match_result, word_dict1, word_dict2,  word_cats, 0.3, 0.1)

                threads += 1

                results.append(pool.apply_async(get_granular_metric, args=args))

                if threads % batch == 0:
                    print("send process {}".format(threads))

        for p in results:

            key, out = p.get()

            output[key] = out

            finishes += 1

            if finishes % batch == 0:

                print("finish process {0} @ {1:.2f}".format(finishes, time.time()-start))
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()

    with open("data/change/granular_metric_bert.pkl", "wb") as fh:
        pickle.dump(output, fh)

    # Stack the per-pair Series under a (cik, y1, y2) index, then pivot the
    # metric names into columns.
    data = pd.concat(list(output.values()), axis=0, keys=list(output.keys()))
    data = data.unstack(-1)
    data = data.reset_index()
    data.columns = ["cik","y1","y2"] + data.columns.tolist()[3:]
    data = data.fillna(0)

    data.to_pickle("data/change/granular_metric_df_bert.pkl")

    print("\n==========Done !!!!=============")

    return data

In [None]:
run_granular_metric()

In [None]:
# Load the pickled granular metrics; the context manager closes the file.
with open("data/change/granular_metric_bert.pkl", "rb") as fh:
    result = pickle.load(fh)

In [None]:
# bert
len(result)
# run_granular_metric() pickles a dict keyed by (cik, y1, y2), so result[0]
# would raise KeyError on it; show the first (key, metrics) entry instead.
# A list of (key, metrics) tuples is still indexed positionally.
result[0] if isinstance(result, list) else next(iter(result.items()))

In [None]:
# Original
len(result)
result[0]

128369

((20, 1995, 1996),
 Achieve_add         0.000000
 Achieve_del         0.009593
 Achieve_down        0.266667
 Achieve_up          0.033333
 Compare_add         0.029412
 Compare_del         0.025222
 Compare_down        0.233333
 Compare_up          0.300000
 Discrep_add         0.000000
 Discrep_del         0.008154
 Discrep_down        0.133333
 Discrep_up          0.166667
 Litigious_add       0.029412
 Litigious_del       0.001581
 Litigious_down      0.166667
 Litigious_up        0.033333
 Negative_add        0.029412
 Negative_del        0.007459
 Negative_down       0.200000
 Negative_up         0.100000
 Positive_add        0.000000
 Positive_del        0.004422
 Positive_down       0.133333
 Positive_up         0.100000
 Reward_add          0.000000
 Reward_del          0.001484
 Reward_down         0.133333
 Reward_up           0.066667
 StrongModal_add     0.000000
 StrongModal_del     0.006244
 StrongModal_down    0.033333
 StrongModal_up      0.033333
 Uncertainty_add     

In [None]:
fin_ratio = pd.read_csv("data/fin_ratio_20211228.csv")
len(fin_ratio)
fin_ratio.head()

151539

Unnamed: 0,sic_orig,industry_orig,cik,report_period,filing_date,fyear,gvkey,datadate,indfmt,consol,...,asset_turnover,cfed,dsir,depi,gmi,lev,opm,rg,sg,sgee
0,3823.0,Durable Manufacturers,20,1995-12-31,1996-03-28,1995,6314.0,19951231.0,INDL,C,...,1.593079,-0.02673,0.667225,0.769961,0.973074,0.728023,-0.084189,0.703028,1.053659,0.891313
1,3823.0,Durable Manufacturers,20,1996-12-28,1997-03-19,1996,6314.0,19961231.0,INDL,C,...,1.624273,0.210732,0.962468,1.226305,0.914198,0.715334,0.044798,0.783539,0.814093,1.013528
2,3823.0,Durable Manufacturers,20,1998-01-03,1998-03-18,1997,6314.0,19971231.0,INDL,C,...,1.606518,0.025826,0.968074,1.088859,0.978102,0.755016,0.062466,0.938786,0.969746,1.012432
3,3823.0,Durable Manufacturers,20,1999-01-02,1999-03-23,1998,6314.0,19981231.0,INDL,C,...,1.574474,0.020191,1.213425,0.99178,0.991078,1.073227,0.073961,1.241132,1.022834,0.983269
4,3823.0,Durable Manufacturers,20,2000-01-01,2000-03-30,1999,6314.0,19991231.0,INDL,C,...,1.604656,0.003715,1.0924,1.025961,1.024634,0.948804,0.076906,1.07702,0.985921,0.983057


In [None]:
fin_ratio.columns

fin_cols = ['rsst_acc','ch_rec', 'ch_inv', 'soft_asset', 'ch_cs', 'ch_roa', 'issue', 'aqi',
       'asset_turnover', 'cfed', 'depi', 'gmi', 'lev', 'opm', 'rg', 'sg',
       'sgee']

Index(['sic_orig', 'industry_orig', 'cik', 'report_period', 'filing_date',
       'fyear', 'gvkey', 'datadate', 'indfmt', 'consol', 'popsrc', 'datafmt',
       'tic', 'cusip', 'conm', 'act', 'at', 'che', 'cogs', 'dlc', 'dltis',
       'dltt', 'dp', 'ib', 'invt', 'ivao', 'ivst', 'lct', 'lt', 'ni', 'ppent',
       'pstk', 'rect', 'sale', 'sstk', 'xsga', 'report_period_y', 'avg_at',
       'wc', 'nco', 'fin', 'rsst_acc', 'ch_rec', 'ch_inv', 'soft_asset', 'cs',
       'ch_cs', 'ch_roa', 'issue', 'aqi', 'asset_turnover', 'cfed', 'dsir',
       'depi', 'gmi', 'lev', 'opm', 'rg', 'sg', 'sgee'],
      dtype='object')

In [None]:
pd.isnull(fin_ratio[fin_cols]).sum(axis=0)

rsst_acc          79488
ch_rec            72144
ch_inv            72768
soft_asset        71730
ch_cs             73250
ch_roa            72732
issue             72804
aqi               74080
asset_turnover    71668
cfed              81451
depi              72308
gmi               71954
lev               72493
opm               71669
rg                72144
sg                71954
sgee              80036
dtype: int64

In [None]:
fin_ratio.fyear.value_counts().sort_index()
fin_ratio[pd.isnull(fin_ratio[fin_cols]).any(axis =1)].fyear.value_counts().sort_index()

1994    1802
1995    4066
1996    7977
1997    8388
1998    8297
1999    8797
2000    8851
2001    8379
2002    7749
2003    7398
2004    7292
2005    7167
2006    7135
2007    7110
2008    6786
2009    6456
2010    6116
2011    5790
2012    5545
2013    5460
2014    5320
2015    4999
2016    4659
Name: fyear, dtype: int64

1994     835
1995    2039
1996    4331
1997    4484
1998    4550
1999    5275
2000    5426
2001    5298
2002    4837
2003    4593
2004    4468
2005    4391
2006    4416
2007    4463
2008    4303
2009    4088
2010    3795
2011    3546
2012    3319
2013    3210
2014    3004
2015    2699
2016    2441
Name: fyear, dtype: int64

In [None]:

# Attach the financial ratios of the later year (y2) to each metric row.
# NOTE(review): `data` is not assigned by any top-level cell visible above;
# run_granular_metric() only builds it locally and writes it to
# data/change/granular_metric_df_bert.pkl -- presumably it was loaded from
# such a pickle; confirm before a fresh Run All.
data1 = data.merge(fin_ratio[["cik", "fyear"] + fin_cols], \
                  left_on = ["cik", "y2"],
                  right_on = ["cik", "fyear"])

len(data1)

128628

In [None]:

data1 = data1.dropna(axis = 0)
len(data1)
data1.head()

55262

Unnamed: 0,cik,y1,y2,Achieve_add,Achieve_del,Achieve_down,Achieve_up,Compare_add,Compare_del,Compare_down,...,aqi,asset_turnover,cfed,depi,gmi,lev,opm,rg,sg,sgee
0,20,1995,1996,0.0,0.009593,0.266667,0.033333,0.029412,0.025222,0.233333,...,0.974056,1.624273,0.210732,1.226305,0.914198,0.715334,0.044798,0.783539,0.814093,1.013528
1,20,1996,1997,0.0,0.0,0.133333,0.1,0.0,0.027778,0.1,...,1.248039,1.606518,0.025826,1.088859,0.978102,0.755016,0.062466,0.938786,0.969746,1.012432
2,20,1997,1998,0.008147,0.018731,0.321429,0.035714,0.021616,0.014858,0.392857,...,0.84054,1.574474,0.020191,0.99178,0.991078,1.073227,0.073961,1.241132,1.022834,0.983269
3,20,1998,1999,0.012213,0.011677,0.083333,0.083333,0.02159,0.024158,0.166667,...,0.837833,1.604656,0.003715,1.025961,1.024634,0.948804,0.076906,1.07702,0.985921,0.983057
4,20,1999,2000,0.0,0.007622,0.0,0.058824,0.0,0.026772,0.176471,...,1.093648,1.56028,-0.015406,1.113062,0.995699,1.221034,0.068754,0.93361,0.96615,1.000184


# Case study for the paper

In [None]:
# Reload the raw (untokenized) paragraphs for the case study; the context
# manager closes the file after reading.
with open("data/mda/paragraphs_1994_2016.pkl", 'rb') as fh:
    para_map = pickle.load(fh)

In [None]:
para_map[1007021][1997]

In [None]:
file = "data/final/modify_score_unmatch_merged.pkl"
data = pd.read_pickle(file)
data[(data.cik == 1007021) & (data.y2==1997)].T

Unnamed: 0,32408
cik,1007021.0
y1,1996.0
y2,1997.0
unmatch_WeakModal+,0.00166067
unmatch_Negative+,0.003874896
unmatch_Compare+,0.01688348
unmatch_Discrep+,0.00166067
unmatch_Positive+,0.003874896
unmatch_Achieve+,0.0008303349
unmatch_Reward+,0.00332134


In [None]:
file = "data/final/metrics_granular_merged.pkl"
data = pd.read_pickle(file)
data[(data.cik == 1007021) & (data.y2==1997)].T

Unnamed: 0,73117
cik,1007021.0
y1,1996.0
y2,1997.0
Achieve_add,0.001315789
Achieve_del,0.006493506
Achieve_down,0.1666667
Achieve_up,0.0
Compare_add,0.02149798
Compare_del,0.02929333
Compare_down,0.1666667


In [None]:
# pickle is already imported in the notebook's import cell, so it is not
# re-imported here. Read the copy of the BERT similarity results, close the
# file, then display the loaded object.
with open("data/change/paragraph_sim_bert-Copy1.pkl", "rb") as fh:
    para_sim_copy = pickle.load(fh)

para_sim_copy

{20: {(1995, 1996): {'score': [0.9562679413826235, 0.723031858118569],
   'size': (41, 31),
   'match': array([[ 0.        ,  0.        ,  0.98386681],
          [ 1.        ,  1.        ,  0.95965493],
          [ 5.        ,  2.        ,  0.93160373],
          [ 6.        , 20.        ,  0.86246496],
          [ 7.        ,  3.        ,  0.98884386],
          [ 8.        ,  4.        ,  0.96260548],
          [11.        ,  5.        ,  0.9747777 ],
          [12.        ,  7.        ,  0.99360722],
          [13.        ,  6.        ,  0.9221859 ],
          [14.        ,  8.        ,  0.97504497],
          [15.        ,  9.        ,  0.98967874],
          [16.        , 10.        ,  0.98671734],
          [17.        , 11.        ,  0.98505539],
          [18.        , 12.        ,  0.99050504],
          [19.        , 13.        ,  0.94404954],
          [20.        , 14.        ,  0.98394495],
          [23.        , 15.        ,  1.        ],
          [24.        , 16.     