# NDR Spike Modelling
##### This notebook builds the different spike-detection models for each of the 3 sections (1, 1A and 7).
##### The inputs to this notebook are the 3 pickle files generated by the prior pre-processing step.
##### Most of the models here are quick and should execute within 5-7 minutes, but the ones utilizing Word2Vec (Spike based on Word2Vec and Global Word2Vec) could take more than 4-5 hours depending on processing speed.

In [None]:
import json
import re as re
from collections import defaultdict
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis.gensim_models
import spacy
#import en_core_web_md
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel
from gensim.models import LdaMulticore, LdaModel
from gensim.models import Phrases
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.ndimage import maximum_filter1d
from scipy.signal import find_peaks
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

pyLDAvis.enable_notebook()# Visualise inside a notebook
nlp = spacy.load('en_core_web_lg')

def peaks_previousN(a, n):
    """Return the positions in `a` whose value beats the maximum of the preceding window.

    `maximum_filter1d` is run with window size `n` and `origin=(n-1)//2`, which shifts
    the window so that it ends on the current element. Position `i` is reported when
    `a[i]` is strictly greater than the filtered value at `i-1`.

    NOTE(review): the filter uses SciPy's default boundary handling, so windows that
    reach before the start of `a` are padded from the array itself - confirm this is
    the intended treatment of the first few positions.
    """
    shift = (n - 1) // 2
    trailing_max = maximum_filter1d(a, n, origin=shift)
    is_new_high = a[1:] > trailing_max[:-1]
    return np.flatnonzero(is_new_high) + 1


##### This might be redundant, since stop words were already removed during pre-processing, but we still pass a stop-word list to the NLP vectorizers below

In [None]:


# Stop words handed to every TfidfVectorizer below:
# NLTK's English list plus the corpus-specific terms in `stops_list`.
stops = stopwords.words('english')

# Fix: 'experience' was misspelled 'exerience', so the intended word was not being filtered.
# Repeated entries ('system', 'company', 'year', 'notes') are left as they were.
stops_list = [
    'table_end', 'table_start', 'president', 'vice', 'business', 'chief', 'experience', 'mhz', 'usa', 'senior', 'www', 'world',
    'january', 'february', 'march',
    'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', 'year', 'mr', 'north', 'america', 'industry',
    'payment', 'solution', 'solutions', 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'company', 'product', 'management', 'customer',
    'item', 'ability', 'access', 'annual', 'report', 'broad', 'related', 'system', 'service', 'uncertain', 'unauthorized', 'false',
    'system', 'uk', 'revenue', 'earning', 'approval', 'region', 'example', 'incident', 'company', 'financial', 'statements', 'supplementary',
    'data', 'summary', 'notes', 'consolidated', 'note', 'notes', 'due', 'adjusted', 'review', 'versus', 'herein', 'reference', 'industrial',
    'products', 'overall', 'result', 'aa', 'aaa', 'write', 'years', 'year', 'able', 'acceptable', 'acceptance',
]
stops = stops + stops_list

In [None]:
# Read the pipe-separated industry file and preview its first rows.
industry_data = pd.read_csv("industry.txt", sep="|")
industry_data.head(5)

##### The analyses below are all based on the top 75 industries by company count, but this can be adjusted as needed

In [None]:
# Discard rows lacking an ID or a sub-industry, store ID as an integer,
# then keep the 75 sub-industries that occur most often.
industry_data = industry_data.dropna(subset=['ID', 'SubInd'])
industry_data = industry_data.astype({'ID': 'int'})
subind_counts = industry_data['SubInd'].value_counts()
main_75_industry = subind_counts.head(75).index.tolist()

In [None]:
# Display the selected sub-industries (most frequent first, as returned by value_counts).
main_75_industry

##### Here we start reading individual preprocessed pickle files from the earlier notebook and start executing the models

In [None]:
# Load the pre-processed Section 1 text, keep the columns used below, prefix the
# metadata columns with 'actual_', and drop filings without a year or text.
# Built as one chain so no in-place operation runs on a column-subset of another
# frame (the pattern that triggers pandas' SettingWithCopyWarning).
# NOTE(review): read_pickle executes arbitrary code on load - only use trusted files.
final_merge_1 = (
    pd.read_pickle("item1_cleaned.pkl")
    .loc[:, ['date', 'cik', 'company', 'year', 'item1_cleaned']]
    .rename(columns={'year': 'actual_year', 'company': 'actual_company', 'date': 'actual_date'})
    .dropna(subset=['actual_year', 'item1_cleaned'])
    .reset_index(drop=True)
)
final_merge_1.head(2)

In [None]:
# Attach each filing's sub-industry by matching cik against the industry ID,
# then discard the helper ID column.
# NOTE(review): re-running this cell merges a second time - run it once per kernel.
final_merge_1 = (
    final_merge_1
    .merge(industry_data[['ID', 'SubInd']], how='left', left_on='cik', right_on='ID')
    .drop(columns=['ID'])
)


In [None]:
# Preview the merged frame; it now carries the SubInd column.
final_merge_1.head(2)

In [None]:
# vec_new_nlp = TfidfVectorizer(ngram_range = (1,3), max_features = 3000,token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+', \
#                       stop_words = stops, max_df=.75) 
# dtm_new_nlp = vec_new_nlp.fit_transform(final_merge_1['item1_cleaned'])

# dtm_tfidf_new= pd.DataFrame(dtm_new_nlp.toarray())
# d = dict(zip(list(dtm_tfidf_new.columns), list(np.array(vec_new_nlp.get_feature_names_out()))))
# dtm_tfidf_new.rename(columns=d, inplace=True)
# final_merge_sec1 = pd.concat([final_merge_1, dtm_tfidf_new], axis=1)
# final_merge_sec1['cik']=final_merge_sec1['cik'].astype('object')
# final_merge_sec1['actual_year']=final_merge_sec1['actual_year'].astype('object')

# numeric_cols = list(final_merge_sec1.select_dtypes(include='number').columns)
# numeric_cols

# final_merge_transpose = final_merge_sec1.groupby(['actual_year'])[numeric_cols].sum()

# final_merge_groupby_subind = final_merge_sec1.groupby(['SubInd','actual_year'])[numeric_cols].sum().reset_index()
# final_merge_groupby_subind.head(5) #here

### Sec 1 - Model 1: 
##### The below code would create a dictionary object that has the top words / bigrams / trigrams based on Section 1, of the SEC records belonging to the top 75 industries split by year. The top word identification is done by using a custom function that finds peaks in data based on its tfidf distribution
##### Configurations that can be modified are highlighted as comments

In [None]:
# Sec 1 - Model 1: for every industry, record the (year, term) pairs whose yearly
# TF-IDF total beats the maximum of the preceding window (see peaks_previousN).
# Result: sec1_tfidf_model_1[industry][year] -> list of spike terms, sorted by key.
sec1_tfidf_model_1 = {}

for item in main_75_industry:
    new_df = final_merge_1[final_merge_1['SubInd'] == item].reset_index(drop=True)
    if new_df.empty:
        continue

    try:
        # ngram_range, max_features, max_df, min_df and the token regex can be adjusted
        vec_new_nlp = TfidfVectorizer(ngram_range=(1, 3), max_features=3000,
                                      token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
                                      stop_words=stops, max_df=.7)
        dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item1_cleaned'])

        # Sum the TF-IDF columns per filing year. Grouping the term matrix directly
        # keeps metadata columns (row index, dates, ...) out of the term set.
        dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray(),
                                     columns=vec_new_nlp.get_feature_names_out())
        new_df_next = dtm_tfidf_new.groupby(new_df['actual_year'].astype('object')).sum()

        # Keep terms whose yearly total reaches 0.25 in at least one year (adjustable)
        new_df_next = new_df_next.loc[:, new_df_next.max(axis=0) >= .25]

        spikes_by_year = {}
        for item_2 in new_df_next.columns:
            testing_peaks = new_df_next[item_2]
            # Custom spike finder; the window of 22 can be adjusted
            peaks = peaks_previousN(testing_peaks.values, 22)
            for next_item in testing_peaks.index[peaks]:
                spikes_by_year.setdefault(next_item, []).append(item_2)
    except Exception as err:
        # Report the failing industry and carry on; a single bad industry must not
        # silently end the whole run.
        print(f"Skipping {item}: {err!r}")
        continue

    if spikes_by_year:
        sec1_tfidf_model_1[item] = dict(sorted(spikes_by_year.items()))

# The final dict object is in sec1_tfidf_model_1
sec1_tfidf_model_1 = dict(sorted(sec1_tfidf_model_1.items()))

### To visualize the spike detection, execute the two cells below. As an example, they show the 'covid-19' term that the model captured within the Biotechnology industry

In [None]:
# Rebuild the Model 1 yearly TF-IDF table for a single industry (Biotechnology)
# so that an individual term can be plotted in the next cell.
is_biotech = final_merge_1['SubInd'] == 'Biotechnology'
new_df = final_merge_1[is_biotech].reset_index().drop(['SubInd'], axis=1)

vec_new_nlp = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=3000,
    token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
    stop_words=stops,
    max_df=.7,
)
dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item1_cleaned'])

# One column per vocabulary term, labelled with the term itself
dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray())
d = dict(zip(dtm_tfidf_new.columns, vec_new_nlp.get_feature_names_out()))
dtm_tfidf_new = dtm_tfidf_new.rename(columns=d)

# Put metadata and term weights side by side; cik and year become object dtype
# so that they are not picked up as numeric columns
final_merge_sec1 = pd.concat([new_df, dtm_tfidf_new], axis=1)
for meta_col in ('cik', 'actual_year'):
    final_merge_sec1[meta_col] = final_merge_sec1[meta_col].astype('object')

numeric_cols = final_merge_sec1.select_dtypes(include='number').columns.tolist()

# Yearly totals, restricted to columns that reach 0.25 in at least one year
new_df_next = final_merge_sec1.groupby(['actual_year'])[numeric_cols].sum()
b_next = new_df_next.transpose()
b_next = b_next[b_next.max(axis=1) >= .25]
new_df_next = b_next.transpose()


In [None]:
# Plot the yearly series for 'covid-19' and mark the years that
# peaks_previousN reports as spikes.
testing_peaks = new_df_next['covid-19']
peaks = peaks_previousN(testing_peaks.values, 22)
spike_points = testing_peaks.iloc[peaks]

plt.plot(testing_peaks)
plt.plot(spike_points.index, spike_points.values, "x")
plt.title("Spikes for Covid-19 across years")

### Section 1 - Model 2: 
##### The below code would create a dictionary object that has the top words / bigrams / trigrams based on Section 1, in the SEC records belonging to the top 75 industries split by year. The top word identification is done by using a custom function that finds peaks based on how many sigmas away the peak values are from the norm.
##### Configurations that can be modified are highlighted as comments

In [None]:
# Sec 1 - Model 2: for every industry, record the (year, term) pairs whose yearly
# TF-IDF total lies more than 2 standard deviations above that term's mean.
# Result: sec1_tfidf_model_2[industry][year] -> list of spike terms, sorted by key.
sec1_tfidf_model_2 = {}

for item in main_75_industry:
    new_df = final_merge_1[final_merge_1['SubInd'] == item].reset_index(drop=True)
    if new_df.empty:
        continue

    try:
        # ngram_range, max_features, max_df, min_df and the token regex can be adjusted
        vec_new_nlp = TfidfVectorizer(ngram_range=(1, 3), max_features=3000,
                                      token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
                                      stop_words=stops, max_df=.75)
        dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item1_cleaned'])

        # Sum the TF-IDF columns per filing year. Grouping the term matrix directly
        # keeps metadata columns (row index, dates, ...) out of the term set.
        dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray(),
                                     columns=vec_new_nlp.get_feature_names_out())
        new_df_next = dtm_tfidf_new.groupby(new_df['actual_year'].astype('object')).sum()

        # Keep terms whose yearly total reaches 0.15 in at least one year (adjustable)
        new_df_next = new_df_next.loc[:, new_df_next.max(axis=0) >= .15]

        spikes_by_year = {}
        for item_2 in new_df_next.columns:
            testing_peaks = new_df_next[item_2]
            # Population z-score (ddof=0, as np.std computes); spikes are > 2 sigma
            z_scores = (testing_peaks - testing_peaks.mean()) / testing_peaks.std(ddof=0)
            peaks = np.flatnonzero(z_scores.values > 2)
            for next_item in testing_peaks.index[peaks]:
                spikes_by_year.setdefault(next_item, []).append(item_2)
    except Exception as err:
        # Report the failing industry and carry on; a single bad industry must not
        # silently end the whole run.
        print(f"Skipping {item}: {err!r}")
        continue

    if spikes_by_year:
        sec1_tfidf_model_2[item] = dict(sorted(spikes_by_year.items()))

# The dict model is available in sec1_tfidf_model_2
sec1_tfidf_model_2 = dict(sorted(sec1_tfidf_model_2.items()))

In [None]:
#dict(sorted(sec1_tfidf_model_2.items()))

### Sec 1 - Model 3: The below code would create a dictionary object that has the top words / bigrams / trigrams based on Section 1, in the SEC records belonging to the top 75 industries split by year. The top word identification is done taking the top 20 words per Industry per year
##### Configurations that can be modified are highlighted as comments

In [None]:
# Sec 1 - Model 3: the 20 highest-weighted (nonzero) terms per industry and year.
# Result: sec1_tfidf_model_3[industry][year] -> [list of up to 20 terms]
# (a one-element list wrapping the term list, as before).
sec1_tfidf_model_3 = {}

for item in main_75_industry:
    new_df = final_merge_1[final_merge_1['SubInd'] == item].reset_index(drop=True)
    if new_df.empty:
        continue

    # ngram_range, max_features, max_df, min_df and the token regex can be adjusted
    vec_new_nlp = TfidfVectorizer(ngram_range=(1, 3), max_features=1000,
                                  token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
                                  stop_words=stops, max_df=.75)
    try:
        dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item1_cleaned'])
    except ValueError as err:
        # Raised by the vectorizer, e.g. when pruning leaves no vocabulary
        print(f"Skipping {item}: {err!r}")
        continue

    # Sum the TF-IDF columns per filing year. The frame holds only term columns, so
    # a column position always maps to the right term, whatever dtype the metadata has.
    dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray(),
                                 columns=vec_new_nlp.get_feature_names_out())
    new_df_next = dtm_tfidf_new.groupby(new_df['actual_year'].astype('object')).sum()

    top_by_year = {}
    for item_2 in new_df_next.index:
        weights = new_df_next.loc[item_2].values
        # Positions by descending weight, zero-weight terms removed, first 20 kept
        order = np.argsort(weights)[::-1]
        top20 = order[weights[order] != 0][:20]
        top_20_features = list(new_df_next.columns[top20])
        top_by_year.setdefault(item_2, []).append(top_20_features)

    if top_by_year:
        sec1_tfidf_model_3[item] = dict(sorted(top_by_year.items()))

sec1_tfidf_model_3 = dict(sorted(sec1_tfidf_model_3.items()))

### Sec 1 - Model 4: The code below takes the top 20 words for each of the 75 industries across all years. It then creates a separate Word2Vec model for each year and looks up the 10 most similar words to each of them. This is a time-consuming model, but the output dictionary gives valuable insights into how words trended across years. The model could be extended to feed a different set of 'Spike' words into Word2Vec to get the corresponding similar words, and the Word2Vec models themselves can also be modified.
##### Configurations that can be modified are highlighted as comments

In [None]:
# Sec 1 - Model 4: for each industry take its 20 highest-weighted TF-IDF terms over
# all years, then, per year, look up each term's 10 nearest neighbours in a
# Word2Vec model trained on that year's Section 1 text.
# Result: iter_dict_ind_top20_1_similar[industry][year][term] -> [(word, similarity), ...]
#
# Fixes relative to the previous version:
#   * reads final_merge_1 (the name `final_merge` is not defined in this notebook)
#   * multi-word terms are checked word by word, not character by character
#   * terms missing from a year's vocabulary are skipped rather than reusing the
#     neighbours of the previous term
#   * the trigram model is applied to the bigram-transformed stream
#   * one Word2Vec model is trained per year and reused across industries, because
#     the training corpus depends only on the year
iter_dict_ind_top20_1_similar = {}
w2v_vectors_by_year = {}  # year -> trained word vectors

for item in main_75_industry:
    new_df = final_merge_1[final_merge_1['SubInd'] == item].reset_index(drop=True)
    if new_df.empty:
        continue

    # ngram_range, max_features, max_df, min_df and the token regex can be adjusted
    vec_new_nlp = TfidfVectorizer(ngram_range=(1, 3), max_features=1000,
                                  token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
                                  stop_words=stops, max_df=.75)
    dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item1_cleaned'])

    # Sum the TF-IDF columns per filing year (term columns only)
    dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray(),
                                 columns=vec_new_nlp.get_feature_names_out())
    new_df_next = dtm_tfidf_new.groupby(new_df['actual_year'].astype('object')).sum()

    # Top 20 nonzero terms of the industry across all years
    new_df_next_sum = new_df_next.sum(axis=0)
    order = np.argsort(new_df_next_sum.values)[::-1]
    top20 = order[new_df_next_sum.values[order] != 0][:20]
    top_20_features = list(new_df_next.columns[top20])

    similar_by_year = {}
    for item_2 in new_df_next.index:
        if item_2 not in w2v_vectors_by_year:
            all_sentences_inter = (
                final_merge_1[final_merge_1['actual_year'] == item_2]['item1_cleaned']
                .apply(lambda x: x.split())
            )

            # Different configurations can be tried on the phrase and Word2Vec models,
            # especially around min_count, scoring and threshold
            bigram = Phraser(Phrases(all_sentences_inter, min_count=20, delimiter=' '))
            trigram = Phraser(Phrases(bigram[all_sentences_inter], min_count=20, delimiter=' '))
            w2v_model_bi_tri = Word2Vec(trigram[bigram[all_sentences_inter]], min_count=20)
            w2v_vectors_by_year[item_2] = w2v_model_bi_tri.wv

        word_vectors = w2v_vectors_by_year[item_2]
        vocab = word_vectors.key_to_index

        for item3 in top_20_features:
            words = item3.split()
            if len(words) == 1 and item3 in vocab:
                ea = word_vectors.most_similar(item3)
            elif len(words) > 1 and all(word in vocab for word in words):
                ea = word_vectors.most_similar(positive=words)
            else:
                # Term (or one of its words) is missing from this year's vocabulary
                continue
            similar_by_year.setdefault(item_2, {})[item3] = ea[:10]

    if similar_by_year:
        iter_dict_ind_top20_1_similar[item] = dict(sorted(similar_by_year.items()))

iter_dict_ind_top20_1_similar = dict(sorted(iter_dict_ind_top20_1_similar.items()))

iter_dict_ind_top20_1_similar

## Loading Section 1A

In [None]:
# Load the pre-processed Section 1A text, prefix the metadata columns with 'actual_',
# keep the columns used below, drop filings without a year or text, and attach each
# filing's sub-industry by matching cik against the industry ID.
# Built as one chain so no in-place operation runs on a column-subset of another
# frame (the pattern that triggers pandas' SettingWithCopyWarning).
# NOTE(review): read_pickle executes arbitrary code on load - only use trusted files.
final_merge_1A = (
    pd.read_pickle("item1A_cleaned.pkl")
    .rename(columns={'year': 'actual_year', 'company': 'actual_company', 'date': 'actual_date'})
    .loc[:, ['actual_date', 'cik', 'actual_company', 'actual_year', 'item1A_cleaned']]
    .dropna(subset=['actual_year', 'item1A_cleaned'])
    .merge(industry_data[['ID', 'SubInd']], left_on='cik', right_on='ID', how='left')
    .drop(['ID'], axis=1)
)
final_merge_1A #11400

In [None]:
# vec_new_nlp_1A = TfidfVectorizer(ngram_range = (1,3), max_features = 2500,token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+', \
#                       stop_words = stops, min_df=.15, max_df=.6) 
# dtm_new_nlp_1A = vec_new_nlp_1A.fit_transform(final_merge_1A['item1A_cleaned'])

# dtm_tfidf_new_1A= pd.DataFrame(dtm_new_nlp_1A.toarray())
# d = dict(zip(list(dtm_tfidf_new_1A.columns), list(np.array(vec_new_nlp_1A.get_feature_names_out()))))
# dtm_tfidf_new_1A.rename(columns=d, inplace=True)
# final_merge_sec_1A = pd.concat([final_merge_1A, dtm_tfidf_new_1A], axis=1)
# final_merge_sec_1A['cik']=final_merge_sec_1A['cik'].astype('object')
# final_merge_sec_1A['actual_year']=final_merge_sec_1A['actual_year'].astype('object')

# numeric_cols = list(final_merge_sec_1A.select_dtypes(include='number').columns)
# final_merge_transpose_1A = final_merge_sec_1A.groupby(['actual_year'])[numeric_cols].sum()

# final_merge_groupby_subind_1A = final_merge_sec_1A.groupby(['SubInd','actual_year'])[numeric_cols].sum().reset_index()
# final_merge_groupby_subind_1A.head(5)


### Sec 1A - Model 1: 
##### The below code would create a dictionary object that has the top words / bigrams / trigrams based on Section 1A, in the SEC records belonging to the top 75 industries split by year. The top word identification is done by using a custom function that finds peaks in data based on its tfidf distribution
##### Configurations that can be modified are highlighted as comments

In [None]:
# Sec 1A - Model 1: for every industry, record the (year, term) pairs whose yearly
# TF-IDF total beats the maximum of the preceding window (see peaks_previousN).
# Result: sec1A_tfidf_model_1[industry][year] -> list of spike terms, sorted by key.
sec1A_tfidf_model_1 = {}

for item in main_75_industry:
    new_df = final_merge_1A[final_merge_1A['SubInd'] == item].reset_index(drop=True)
    if new_df.empty:
        continue

    try:
        # ngram_range, max_features, max_df, min_df and the token regex can be adjusted
        vec_new_nlp = TfidfVectorizer(ngram_range=(1, 3), max_features=3000,
                                      token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
                                      stop_words=stops, max_df=.65)
        dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item1A_cleaned'])

        # Sum the TF-IDF columns per filing year. Grouping the term matrix directly
        # keeps metadata columns (row index, dates, ...) out of the term set.
        dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray(),
                                     columns=vec_new_nlp.get_feature_names_out())
        new_df_next = dtm_tfidf_new.groupby(new_df['actual_year'].astype('object')).sum()

        # Keep terms whose yearly total reaches 0.25 in at least one year (adjustable)
        new_df_next = new_df_next.loc[:, new_df_next.max(axis=0) >= .25]

        spikes_by_year = {}
        for item_2 in new_df_next.columns:
            testing_peaks = new_df_next[item_2]
            # Custom spike finder; the window of 22 can be adjusted
            peaks = peaks_previousN(testing_peaks.values, 22)
            for next_item in testing_peaks.index[peaks]:
                spikes_by_year.setdefault(next_item, []).append(item_2)
    except Exception as err:
        # Report the failing industry and carry on; a single bad industry must not
        # silently end the whole run.
        print(f"Skipping {item}: {err!r}")
        continue

    if spikes_by_year:
        sec1A_tfidf_model_1[item] = dict(sorted(spikes_by_year.items()))

sec1A_tfidf_model_1 = dict(sorted(sec1A_tfidf_model_1.items()))

### Section 1A - Model 2: 
##### The code below creates a dictionary object that holds the top words / bigrams / trigrams based on Section 1A, in the SEC records belonging to the top 75 industries, split by year. The top words are identified by a custom function that finds peaks based on how many sigmas away the peak values are from the norm.
##### Configurations that can be modified are highlighted as comments

In [None]:
# Sec 1A - Model 2: for every industry, record the (year, term) pairs whose yearly
# TF-IDF total lies more than 1.5 standard deviations above that term's mean.
# Result: sec1A_tfidf_model_2[industry][year] -> list of spike terms, sorted by key.
sec1A_tfidf_model_2 = {}

for item in main_75_industry:
    new_df = final_merge_1A[final_merge_1A['SubInd'] == item].reset_index(drop=True)
    if new_df.empty:
        continue

    try:
        # ngram_range, max_features, max_df, min_df and the token regex can be adjusted
        vec_new_nlp = TfidfVectorizer(ngram_range=(1, 3), max_features=3000,
                                      token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
                                      stop_words=stops, max_df=.7)
        dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item1A_cleaned'])

        # Sum the TF-IDF columns per filing year. Grouping the term matrix directly
        # keeps metadata columns (row index, dates, ...) out of the term set.
        dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray(),
                                     columns=vec_new_nlp.get_feature_names_out())
        new_df_next = dtm_tfidf_new.groupby(new_df['actual_year'].astype('object')).sum()

        # Keep terms whose yearly total reaches 0.25 in at least one year (adjustable)
        new_df_next = new_df_next.loc[:, new_df_next.max(axis=0) >= .25]

        spikes_by_year = {}
        for item_2 in new_df_next.columns:
            testing_peaks = new_df_next[item_2]
            # Population z-score (ddof=0, as np.std computes); spikes are > 1.5 sigma
            z_scores = (testing_peaks - testing_peaks.mean()) / testing_peaks.std(ddof=0)
            peaks = np.flatnonzero(z_scores.values > 1.5)
            for next_item in testing_peaks.index[peaks]:
                spikes_by_year.setdefault(next_item, []).append(item_2)
    except Exception as err:
        # Report the failing industry and carry on; a single bad industry must not
        # silently end the whole run.
        print(f"Skipping {item}: {err!r}")
        continue

    if spikes_by_year:
        sec1A_tfidf_model_2[item] = dict(sorted(spikes_by_year.items()))

sec1A_tfidf_model_2 = dict(sorted(sec1A_tfidf_model_2.items()))

In [None]:
# Rebuild the yearly TF-IDF table for a single industry
# (Interactive Media & Services) so that an individual term can be plotted next.
is_interactive_media = final_merge_1A['SubInd'] == 'Interactive Media & Services'
new_df = final_merge_1A[is_interactive_media].reset_index().drop(['SubInd'], axis=1)

vec_new_nlp = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=3000,
    token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
    stop_words=stops,
    max_df=.7,
)
dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item1A_cleaned'])

# One column per vocabulary term, labelled with the term itself
dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray())
d = dict(zip(dtm_tfidf_new.columns, vec_new_nlp.get_feature_names_out()))
dtm_tfidf_new = dtm_tfidf_new.rename(columns=d)

# Put metadata and term weights side by side; cik and year become object dtype
# so that they are not picked up as numeric columns
final_merge_sec1 = pd.concat([new_df, dtm_tfidf_new], axis=1)
for meta_col in ('cik', 'actual_year'):
    final_merge_sec1[meta_col] = final_merge_sec1[meta_col].astype('object')

numeric_cols = final_merge_sec1.select_dtypes(include='number').columns.tolist()

# Yearly totals, restricted to columns that reach 0.25 in at least one year
new_df_next = final_merge_sec1.groupby(['actual_year'])[numeric_cols].sum()
b_next = new_df_next.transpose()
b_next = b_next[b_next.max(axis=1) >= .25]
new_df_next = b_next.transpose()
new_df_next

In [None]:
# Plot the yearly series for 'twitter' and mark the years lying more than
# 1.5 standard deviations above the series mean (the Model 2 rule).
# Fix: the title previously said "covid-19" although the plotted term is 'twitter'.
testing_peaks = new_df_next.loc[:, 'twitter']
peaks = np.argwhere((testing_peaks - np.mean(testing_peaks)) / np.std(testing_peaks) > 1.5).ravel()
plt.plot(testing_peaks)
plt.plot(testing_peaks.iloc[peaks].index, testing_peaks.iloc[peaks].values, "x")
plt.title("Spikes for twitter across years")

### Sec 1A - Model 3: The below code would create a dictionary object that has the top words / bigrams / trigrams based on Section 1A, on the SEC records belonging to the top 75 industries split by year. The top word identification is done taking the top 20 words per Industry per year
##### Configurations that can be modified are highlighted as comments

In [None]:
# Sec 1A - Model 3: the 20 highest-weighted (nonzero) terms per industry and year.
# Result: sec1A_tfidf_model_3[industry][year] -> [list of up to 20 terms]
# (a one-element list wrapping the term list, as before).
sec1A_tfidf_model_3 = {}

for item in main_75_industry:
    new_df = final_merge_1A[final_merge_1A['SubInd'] == item].reset_index(drop=True)
    if new_df.empty:
        continue

    # ngram_range, max_features, max_df, min_df and the token regex can be adjusted
    vec_new_nlp = TfidfVectorizer(ngram_range=(1, 3), max_features=3000,
                                  token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
                                  stop_words=stops, max_df=.5)
    try:
        dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item1A_cleaned'])
    except ValueError as err:
        # Raised by the vectorizer, e.g. when pruning leaves no vocabulary
        print(f"Skipping {item}: {err!r}")
        continue

    # Sum the TF-IDF columns per filing year. The frame holds only term columns, so
    # a column position always maps to the right term, whatever dtype the metadata has.
    dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray(),
                                 columns=vec_new_nlp.get_feature_names_out())
    new_df_next = dtm_tfidf_new.groupby(new_df['actual_year'].astype('object')).sum()

    top_by_year = {}
    for item_2 in new_df_next.index:
        weights = new_df_next.loc[item_2].values
        # Positions by descending weight, zero-weight terms removed, first 20 kept
        order = np.argsort(weights)[::-1]
        top20 = order[weights[order] != 0][:20]
        top_20_features = list(new_df_next.columns[top20])
        top_by_year.setdefault(item_2, []).append(top_20_features)

    if top_by_year:
        sec1A_tfidf_model_3[item] = dict(sorted(top_by_year.items()))

sec1A_tfidf_model_3 = dict(sorted(sec1A_tfidf_model_3.items()))

### Sec 1A - Model 4: The code below takes the top 20 words for each of the 75 industries across all years. It then creates a separate Word2Vec model for each year and finds the 10 most similar words for each of those top words. This is a time-consuming model, but the output dictionary gives valuable insights into how words trended across years.
##### Configurations that can be modified are highlighted as comments

In [None]:
# Sec 1A - Model 4
# For every industry: rank its TF-IDF terms over all years, keep the top 20, and
# for each filing year look up the 10 nearest neighbours of those terms in a
# phrase-aware Word2Vec model trained on that year's Item 1A text.
#
# Result: {industry: {year: {term: [(similar_term, similarity), ...]}}} with
# industries and years in sorted order. A term that is missing from a year's
# Word2Vec vocabulary is omitted for that year.

# Imported in this cell because the notebook otherwise first imports these names
# in the later "Global Word2Vec Model" section, so a fresh Restart & Run All
# would stop here with a NameError.
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser

iter_dict_ind_top20_1A_similar = {}

# The per-year training corpus does not depend on the industry, so each year's
# model is trained once and reused instead of being retrained per industry.
w2v_models_by_year_1A = {}

for item in main_75_industry:
    new_df = final_merge_1A[final_merge_1A['SubInd'] == item].reset_index(drop=True)
    if new_df.shape[0] == 0:
        continue
    new_df = new_df.drop(['SubInd'], axis=1)

    # ngram_range, max_features, max_df, min_df and the token regex can be adjusted
    vec_new_nlp = TfidfVectorizer(ngram_range=(1, 3), max_features=3000,
                                  token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
                                  stop_words=stops, max_df=.75)
    dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item1A_cleaned'])
    dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray(),
                                 columns=vec_new_nlp.get_feature_names_out())

    final_merge_sec1 = pd.concat([new_df, dtm_tfidf_new], axis=1)
    # Cast the identifiers to object so they are not picked up as numeric features.
    final_merge_sec1['cik'] = final_merge_sec1['cik'].astype('object')
    final_merge_sec1['actual_year'] = final_merge_sec1['actual_year'].astype('object')

    numeric_cols = list(final_merge_sec1.select_dtypes(include='number').columns)
    new_df_next = final_merge_sec1.groupby(['actual_year'])[numeric_cols].sum()
    new_df_next_sum = new_df_next.sum(axis=0)

    # Top 20 terms by total TF-IDF, highest first, ignoring terms that sum to 0.
    # Names come from the aggregated frame's own labels so that a name can never
    # be paired with another column's score.
    scores = new_df_next_sum.to_numpy()
    order = np.argsort(scores)[::-1]
    top20 = order[scores[order] != 0][:20]
    top_20_features = list(new_df_next_sum.index[top20])

    industry_result = {}
    for item_2 in new_df_next.index:
        w2v_model_bi_tri = w2v_models_by_year_1A.get(item_2)
        if w2v_model_bi_tri is None:
            # The original read this text from `final_merge`; the Section 1A
            # frame is used because it is the one holding 'item1A_cleaned'.
            # NOTE(review): confirm no separate `final_merge` frame was intended.
            all_sentences_inter = (
                final_merge_1A.loc[final_merge_1A['actual_year'] == item_2, 'item1A_cleaned']
                .apply(lambda x: x.split())
                .tolist()
            )
            bigram = Phraser(Phrases(all_sentences_inter, min_count=20, delimiter=' '))
            trigram = Phraser(Phrases(bigram[all_sentences_inter], min_count=20, delimiter=' '))
            # The trigram detector was trained on bigram-merged sentences, so it
            # has to be applied to bigram-merged sentences as well.
            w2v_model_bi_tri = Word2Vec(trigram[bigram[all_sentences_inter]], min_count=20)
            w2v_models_by_year_1A[item_2] = w2v_model_bi_tri

        vocab = w2v_model_bi_tri.wv.key_to_index
        year_result = {}
        for item3 in top_20_features:
            # Multi-word TF-IDF terms are queried as the combination of their words.
            query = item3.split() if ' ' in item3 else [item3]
            if not all(word in vocab for word in query):
                continue  # term not learned for this year; nothing meaningful to store
            year_result[item3] = w2v_model_bi_tri.wv.most_similar(positive=query)[:10]

        if year_result:
            industry_result[item_2] = year_result

    if industry_result:
        iter_dict_ind_top20_1A_similar[item] = dict(sorted(industry_result.items()))

iter_dict_ind_top20_1A_similar = dict(sorted(iter_dict_ind_top20_1A_similar.items()))

iter_dict_ind_top20_1A_similar

## Loading Section 7

In [None]:
# Load the pre-processed Section 7 text, keep the columns used below under the
# notebook's 'actual_*' names, drop filings lacking a year or text, and attach
# each company's sub-industry by matching 'cik' to industry_data['ID'].
final_merge_7 = (
    pd.read_pickle("item7_cleaned.pkl")[['date', 'cik', 'company', 'year', 'item7_cleaned']]
    .rename(columns={'year': 'actual_year', 'company': 'actual_company', 'date': 'actual_date'})
    .dropna(subset=['actual_year', 'item7_cleaned'])
    .reset_index(drop=True)
    .merge(industry_data[['ID', 'SubInd']], left_on='cik', right_on='ID', how='left')
    .drop(['ID'], axis=1)
)
final_merge_7



In [None]:
# vec_new_nlp_7 = TfidfVectorizer(ngram_range = (1,3), max_features = 2500,token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+', \
#                       stop_words = stops, max_df=.67) 
# dtm_new_nlp_7 = vec_new_nlp_7.fit_transform(final_merge_7['item7_cleaned'])

# dtm_tfidf_new_7= pd.DataFrame(dtm_new_nlp_7.toarray())
# d = dict(zip(list(dtm_tfidf_new_7.columns), list(np.array(vec_new_nlp_7.get_feature_names_out()))))
# dtm_tfidf_new_7.rename(columns=d, inplace=True)
# final_merge_sec_7 = pd.concat([final_merge_7, dtm_tfidf_new_7], axis=1)
# final_merge_sec_7['cik']=final_merge_sec_7['cik'].astype('object')
# final_merge_sec_7['actual_year']=final_merge_sec_7['actual_year'].astype('object')

# numeric_cols = list(final_merge_sec_7.select_dtypes(include='number').columns)
# final_merge_transpose_7 = final_merge_sec_7.groupby(['actual_year'])[numeric_cols].sum()

# final_merge_groupby_subind_7 = final_merge_sec_7.groupby(['SubInd','actual_year'])[numeric_cols].sum().reset_index()
# final_merge_groupby_subind_7.head(5)

### Sec 7 - Model 1: 
##### The below code would create a dictionary object that has the top words / bigrams / trigrams based on Section 7, in the SEC records belonging to the top 75 industries split by year. The top word identification is done by using a custom function that finds peaks in data based on its tfidf distribution
##### Configurations that can be modified are highlighted as comments

In [None]:
# Sec 7 - Model 1
# Builds {industry: {year: [terms that spike in that year]}} from Section 7 text.
# A term "spikes" in the years returned by peaks_previousN for its yearly
# TF-IDF totals. Industries without any spike get no entry.
sec7_tfidf_model_1 = {}

# Pre-bound so the error report below can always print them.
item = None
peaks = None

try:
    for item in main_75_industry:
        # drop=True: keeping the old row labels as an 'index' column would make
        # them look like a numeric feature and clash with a real term "index".
        new_df = final_merge_7[final_merge_7['SubInd'] == item].reset_index(drop=True)
        if new_df.shape[0] == 0:
            continue
        new_df = new_df.drop(['SubInd'], axis=1)

        # ngram_range, max_features, max_df, min_df and the token regex can be adjusted
        vec_new_nlp = TfidfVectorizer(ngram_range=(1, 3), max_features=3000,
                                      token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
                                      stop_words=stops, max_df=.6)
        dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item7_cleaned'])
        dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray(),
                                     columns=vec_new_nlp.get_feature_names_out())

        final_merge_sec1 = pd.concat([new_df, dtm_tfidf_new], axis=1)
        # Cast the identifiers to object so they are not picked up as numeric features.
        final_merge_sec1['cik'] = final_merge_sec1['cik'].astype('object')
        final_merge_sec1['actual_year'] = final_merge_sec1['actual_year'].astype('object')

        numeric_cols = list(final_merge_sec1.select_dtypes(include='number').columns)
        new_df_next = final_merge_sec1.groupby(['actual_year'])[numeric_cols].sum()

        # Keep only terms whose yearly TF-IDF total reaches 0.2 in at least one
        # year, i.e. terms that are clearly relevant at some point.
        new_df_next = new_df_next.loc[:, new_df_next.max(axis=0) >= .2]

        spikes_by_year = defaultdict(list)
        for item_2 in new_df_next.columns:
            testing_peaks = new_df_next.loc[:, item_2]

            # This is the function that generates the spike words
            # (window size 22 is a tunable setting).
            peaks = peaks_previousN(testing_peaks.values, 22)

            for next_item in testing_peaks.index[peaks]:
                spikes_by_year[next_item].append(item_2)

        if spikes_by_year:
            sec7_tfidf_model_1[item] = dict(sorted(spikes_by_year.items()))

except Exception as exc:
    # Best effort: keep the industries finished so far, but say what went wrong
    # instead of silently swallowing the error.
    print(f"Sec 7 Model 1 stopped at industry {item!r}: {type(exc).__name__}: {exc}")
    print(peaks)

sec7_tfidf_model_1 = dict(sorted(sec7_tfidf_model_1.items()))
#sec7_tfidf_model_1

In [None]:
#here

### Section 7 - Model 2: 
##### The code below creates a dictionary object that has the top words / bigrams / trigrams based on Section 7, in the SEC records belonging to the top 75 industries split by year. The top word identification is done by using a custom function that finds peaks based on how many standard deviations (sigmas) a term's yearly value lies above its mean.
##### Configurations that can be modified are highlighted as comments

In [None]:
# Sec 7 - Model 2
# sec7_tfidf_model_2: {industry: {year: [terms that spike in that year]}}, where
#   a spike is a yearly TF-IDF total more than 2.5 standard deviations above the
#   term's mean over the years.
# sec7_industry_dict: {industry: up to 20 terms that spike after 2019}.
sec7_tfidf_model_2 = {}
sec7_industry_dict = defaultdict(list)

# Pre-bound so the error report below can always print them.
item = None
peaks = None

try:
    for item in main_75_industry:
        # drop=True: keeping the old row labels as an 'index' column would make
        # them look like a numeric feature and clash with a real term "index".
        new_df = final_merge_7[final_merge_7['SubInd'] == item].reset_index(drop=True)
        if new_df.shape[0] == 0:
            continue
        new_df = new_df.drop(['SubInd'], axis=1)

        # ngram_range, max_features, max_df, min_df and the token regex can be adjusted
        vec_new_nlp = TfidfVectorizer(ngram_range=(1, 3), max_features=3000,
                                      token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
                                      stop_words=stops, max_df=.7)
        dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item7_cleaned'])
        dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray(),
                                     columns=vec_new_nlp.get_feature_names_out())

        final_merge_sec1 = pd.concat([new_df, dtm_tfidf_new], axis=1)
        # Cast the identifiers to object so they are not picked up as numeric features.
        final_merge_sec1['cik'] = final_merge_sec1['cik'].astype('object')
        final_merge_sec1['actual_year'] = final_merge_sec1['actual_year'].astype('object')

        numeric_cols = list(final_merge_sec1.select_dtypes(include='number').columns)
        new_df_next = final_merge_sec1.groupby(['actual_year'])[numeric_cols].sum()

        # Keep only terms whose yearly TF-IDF total reaches 0.2 in at least one
        # year, i.e. terms that are clearly relevant at some point.
        new_df_next = new_df_next.loc[:, new_df_next.max(axis=0) >= .2]

        spikes_by_year = defaultdict(list)
        recent_spike_terms = []

        for item_2 in new_df_next.columns:
            testing_peaks = new_df_next.loc[:, item_2]
            values = testing_peaks.to_numpy(dtype=float)

            # Here, we look at peaks that occur more than 2.5 sigmas above the
            # mean (population standard deviation, as np.std computes it).
            # NOTE: with n yearly values no z-score can exceed sqrt(n - 1), so an
            # industry needs at least 8 years of data before anything can pass.
            sigma = values.std()
            if sigma == 0:
                continue  # flat series: no spikes, and avoids a 0/0 warning
            peaks = np.flatnonzero((values - values.mean()) / sigma > 2.5)

            peak_years = list(testing_peaks.index[peaks])
            for next_item in peak_years:
                spikes_by_year[next_item].append(item_2)

            # Terms spiking after 2019 feed the per-industry "recent" list.
            # Assumes the year labels are numeric - TODO confirm.
            if any(next_item > 2019 for next_item in peak_years):
                recent_spike_terms.append(item_2)

        # The original assigned a list that was never filled here, which wiped
        # the collected terms; store the collected recent terms instead.
        sec7_industry_dict[item] = recent_spike_terms[:20]

        if spikes_by_year:
            sec7_tfidf_model_2[item] = dict(sorted(spikes_by_year.items()))

except Exception as e:
    # Best effort: keep the industries finished so far and report the failure.
    print(f"Sec 7 Model 2 stopped at industry {item!r}: {type(e).__name__}: {e}")
    print(peaks)

sec7_tfidf_model_2 = dict(sorted(sec7_tfidf_model_2.items()))
#sec7_tfidf_model_2

### Sec 7 - Model 3: The below code would create a dictionary object that has the top words / bigrams / trigrams based on Section 7, in the SEC records belonging to the top 75 industries split by year. The top word identification is done taking the top 20 words per Industry per year
##### Configurations that can be modified are highlighted as comments 

In [None]:
# Sec 7 - Model 3
# Builds {industry: {year: [[top 20 terms]]}} from Section 7 text: for each
# industry and year, the (up to) 20 terms with the highest non-zero TF-IDF total.
# The term list is wrapped in an outer one-element list, as the original
# append-based code produced, so downstream consumers see the same shape.
sec7_tfidf_model_3 = {}

for item in main_75_industry:
    new_df = final_merge_7[final_merge_7['SubInd'] == item].reset_index(drop=True)
    if new_df.shape[0] == 0:
        continue
    new_df = new_df.drop(['SubInd'], axis=1)

    # ngram_range, max_features, max_df, min_df and the token regex can be adjusted
    vec_new_nlp = TfidfVectorizer(ngram_range=(1, 1), max_features=3000,
                                  token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
                                  stop_words=stops, max_df=.7)
    dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item7_cleaned'])
    dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray(),
                                 columns=vec_new_nlp.get_feature_names_out())

    final_merge_sec1 = pd.concat([new_df, dtm_tfidf_new], axis=1)
    # Cast the identifiers to object so they are not picked up as numeric features.
    final_merge_sec1['cik'] = final_merge_sec1['cik'].astype('object')
    final_merge_sec1['actual_year'] = final_merge_sec1['actual_year'].astype('object')

    numeric_cols = list(final_merge_sec1.select_dtypes(include='number').columns)
    new_df_next = final_merge_sec1.groupby(['actual_year'])[numeric_cols].sum()

    top_terms_by_year = {}
    for item_2 in new_df_next.index:
        temp_df = new_df_next.loc[item_2]

        # Highest score first, zero-score terms removed, at most 20 kept.
        # (Replaces the deprecated np.in1d-based selection; same ordering.)
        scores = temp_df.to_numpy()
        order = np.argsort(scores)[::-1]
        top20 = order[scores[order] != 0][:20]

        # Names are read from the aggregated row's own labels so that a name can
        # never be paired with another column's score.
        top_20_features = list(temp_df.index[top20])
        top_terms_by_year[item_2] = [top_20_features]

    if top_terms_by_year:
        sec7_tfidf_model_3[item] = dict(sorted(top_terms_by_year.items()))

sec7_tfidf_model_3 = dict(sorted(sec7_tfidf_model_3.items()))


#sec7_tfidf_model_3

### Sec 7 - Model 4: The code below takes the top 20 words for each of the 75 industries across all years. It then creates a separate Word2Vec model for each year and finds the 10 most similar words for each of those top words. This is a time-consuming model, but the output dictionary gives valuable insights into how words trended across years.
##### Configurations that can be modified are highlighted as comments

In [None]:
# Sec 7 - Model 4
# For every industry: rank its TF-IDF terms over all years, keep the top 20, and
# for each filing year look up the 10 nearest neighbours of those terms in a
# phrase-aware Word2Vec model trained on that year's Item 7 text.
#
# Result: {industry: {year: {term: [(similar_term, similarity), ...]}}} with
# industries and years in sorted order. A term that is missing from a year's
# Word2Vec vocabulary is omitted for that year.

# Imported in this cell because the notebook otherwise first imports these names
# in the later "Global Word2Vec Model" section, so a fresh Restart & Run All
# would stop here with a NameError.
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser

iter_dict_ind_top20_7_similar = {}

# The per-year training corpus does not depend on the industry, so each year's
# model is trained once and reused instead of being retrained per industry.
w2v_models_by_year_7 = {}

for item in main_75_industry:
    new_df = final_merge_7[final_merge_7['SubInd'] == item].reset_index(drop=True)
    if new_df.shape[0] == 0:
        continue
    new_df = new_df.drop(['SubInd'], axis=1)

    # ngram_range, max_features, max_df, min_df and the token regex can be adjusted
    vec_new_nlp = TfidfVectorizer(ngram_range=(1, 3), max_features=3000,
                                  token_pattern=r'[0-9]*[a-zA-Z]+[a-zA-Z0-9-_]+',
                                  stop_words=stops, max_df=.75)
    dtm_new_nlp = vec_new_nlp.fit_transform(new_df['item7_cleaned'])
    dtm_tfidf_new = pd.DataFrame(dtm_new_nlp.toarray(),
                                 columns=vec_new_nlp.get_feature_names_out())

    final_merge_sec1 = pd.concat([new_df, dtm_tfidf_new], axis=1)
    # Cast the identifiers to object so they are not picked up as numeric features.
    final_merge_sec1['cik'] = final_merge_sec1['cik'].astype('object')
    final_merge_sec1['actual_year'] = final_merge_sec1['actual_year'].astype('object')

    numeric_cols = list(final_merge_sec1.select_dtypes(include='number').columns)
    new_df_next = final_merge_sec1.groupby(['actual_year'])[numeric_cols].sum()
    new_df_next_sum = new_df_next.sum(axis=0)

    # Top 20 terms by total TF-IDF, highest first, ignoring terms that sum to 0.
    # Names come from the aggregated frame's own labels so that a name can never
    # be paired with another column's score.
    scores = new_df_next_sum.to_numpy()
    order = np.argsort(scores)[::-1]
    top20 = order[scores[order] != 0][:20]
    top_20_features = list(new_df_next_sum.index[top20])

    industry_result = {}
    for item_2 in new_df_next.index:
        w2v_model_bi_tri = w2v_models_by_year_7.get(item_2)
        if w2v_model_bi_tri is None:
            # The original read this text from `final_merge`; the Section 7
            # frame is used because it is the one holding 'item7_cleaned'.
            # NOTE(review): confirm no separate `final_merge` frame was intended.
            all_sentences_inter = (
                final_merge_7.loc[final_merge_7['actual_year'] == item_2, 'item7_cleaned']
                .apply(lambda x: x.split())
                .tolist()
            )
            bigram = Phraser(Phrases(all_sentences_inter, min_count=20, delimiter=' '))
            trigram = Phraser(Phrases(bigram[all_sentences_inter], min_count=20, delimiter=' '))
            # The trigram detector was trained on bigram-merged sentences, so it
            # has to be applied to bigram-merged sentences as well.
            w2v_model_bi_tri = Word2Vec(trigram[bigram[all_sentences_inter]], min_count=20)
            w2v_models_by_year_7[item_2] = w2v_model_bi_tri

        vocab = w2v_model_bi_tri.wv.key_to_index
        year_result = {}
        for item3 in top_20_features:
            # Multi-word TF-IDF terms are queried as the combination of their words.
            query = item3.split() if ' ' in item3 else [item3]
            if not all(word in vocab for word in query):
                continue  # term not learned for this year; nothing meaningful to store
            year_result[item3] = w2v_model_bi_tri.wv.most_similar(positive=query)[:10]

        if year_result:
            industry_result[item_2] = year_result

    if industry_result:
        iter_dict_ind_top20_7_similar[item] = dict(sorted(industry_result.items()))

iter_dict_ind_top20_7_similar = dict(sorted(iter_dict_ind_top20_7_similar.items()))


#iter_dict_ind_top20_7_similar

In [None]:
# f = open(r'item1_similar.json')
 
# # returns JSON object as 
# # a dictionary
# data = json.load(f)
 
# # Iterating through the json
# # list


In [None]:
# pd.set_option('display.max_colwidth', None)
# inter=pd.concat({k: pd.DataFrame.from_dict(v, orient='index') \
#                  for k, v in dict(list(data.items())).items() if k=='Biotechnology' }\
#          )['vaccine'].reset_index()
# inter.drop(columns=['level_0'], inplace=True)
# inter.index=inter['level_1']
# inter.index.names = ['year']
# inter.drop(columns=['level_1'], inplace=True)
# inter['vaccine']=inter['vaccine'].apply(lambda x: [item[0] for item in x])
# inter

## Global Word2Vec Model

In [None]:
import gensim.downloader
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
# Load the 'word2vec-google-news-300' vectors through gensim's downloader.
# The final cell of this notebook queries the result directly
# (w2v_google.key_to_index / w2v_google.most_similar) as the "google based model".
w2v_google = gensim.downloader.load('word2vec-google-news-300') # this will take a few minutes



#### We are creating a single corpus combining data from Section 1, 1A and 7

In [None]:
# Split each section's cleaned text on whitespace, store the token lists on the
# source frame in a new 'list_<text column>' column, then stack the three token
# columns (Section 1, 1A, 7 - in that order) into one Series of documents.
token_columns = []
for frame, text_col in ((final_merge_1, 'item1_cleaned'),
                        (final_merge_1A, 'item1A_cleaned'),
                        (final_merge_7, 'item7_cleaned')):
    frame['list_' + text_col] = frame[text_col].apply(lambda text: text.split())
    token_columns.append(frame['list_' + text_col])

all_sentences = pd.concat(token_columns, ignore_index=True, axis=0)

#### Some points:
#### We need to create bigrams and trigrams. To do that we have to choose two settings: `min_count`, the minimum number of times a word or phrase must occur in the corpus to be considered, and `threshold`, the minimum score a candidate phrase must reach. The gensim documentation (https://radimrehurek.com/gensim/models/phrases.html) explains the scoring and the threshold in detail.
#### Only candidate phrases whose score is above the threshold are extracted.

In [None]:
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS






# Remove missing documents before training the phrase detectors.
all_sentences.dropna(inplace=True)

# Settings shared by the bigram pass and the trigram pass.
phrase_settings = {'min_count': 100, 'delimiter': ' ', 'scoring': 'npmi', 'threshold': .65}

# Pass 1 learns two-word phrases from the raw token lists; only this pass is
# given the English connector words.
bigram_transformer = Phrases(all_sentences, connector_words=ENGLISH_CONNECTOR_WORDS, **phrase_settings)
bigram = Phraser(bigram_transformer)

# Pass 2 learns phrases on top of the bigram-merged sentences.
trigram_txnformer = Phrases(bigram_transformer[all_sentences], **phrase_settings)


In [None]:
# Wrap the trained trigram Phrases model in a Phraser; the next cell applies
# `trigram[...]` to the corpus when building the global Word2Vec model.
trigram = Phraser(trigram_txnformer)

In [None]:
# Train the global Word2Vec model on phrase-merged text from all three sections.
# The trigram detector was trained on bigram-merged sentences, so the corpus is
# passed through `bigram` first; applying `trigram` to the raw token lists would
# never present it with the merged bigram tokens it learned to extend.
# Tokens occurring fewer than 30 times are ignored (min_count=30).
w2v_model_bi_tri = Word2Vec(trigram[bigram[all_sentences]], min_count=30)

## Finally, we can take some of the words that we extracted from the earlier Spike tests and use them to identify closely matching words in the Google model and in our global model

In [None]:
# For each probe term, print its five nearest neighbours in the pretrained
# Google model and in our own global model.
# - The Google model is only queried for single-word terms it contains.
# - Multi-word terms are looked up in the custom model as the combination of
#   their words.
# - A term missing from the custom model's vocabulary is reported instead of
#   raising a KeyError.

# Vocabulary lookups are the same for every term, so fetch them once.
w2v_vocab = w2v_google.key_to_index
custom_vocab = w2v_model_bi_tri.wv.key_to_index

for term in ['ai','management', 'layoff', 'covid-19', 'bitcoin','5g','cannabis','marijuana']:
    words = term.split() if ' ' in term else [term]

    goog = ''
    if len(words) == 1 and term in w2v_vocab: # Check if word in vocab
        goog = w2v_google.most_similar(term)
    if goog: print(f"The five terms most similar to {term} using the google based model are {', '.join([s[0] for s in goog[:5]])}.")

    if all(word in custom_vocab for word in words):
        ea = w2v_model_bi_tri.wv.most_similar(positive=words) # Word2Vec requires the extra "wv"
        print(f"The five terms most similar to {term} using the custom model are {', '.join([s[0] for s in ea[:5]])}.")
    else:
        print(f"{term} is not in the custom model's vocabulary.")

    print("\n")