In [110]:
import csv
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer

In [119]:
filings = pd.read_csv("LazyPriceData.csv", index_col=0)
filings_s = filings
filings.head(10)

Unnamed: 0,corp,MDA,RF,LEGAL,QNQ,CONTROL,OTHER
2020-03-28,AAPL,haracter0,haracter0,haracter0,haracter0,haracter0,haracter0
2019-12-28,AAPL,haracter0,haracter0,haracter0,haracter0,haracter0,haracter0
2019-09-28,AAPL,haracter0,haracter0,haracter0,haracter0,haracter0,haracter0
2019-06-29,AAPL,haracter0,haracter0,haracter0,haracter0,haracter0,haracter0
2019-03-30,AAPL,haracter0,haracter0,haracter0,haracter0,haracter0,haracter0
2018-12-29,AAPL,item 2 management s discussion and analysis of...,item 1a risk factors the following description...,item 1 legal proceedings the company is subjec...,item 3 quantitative and qualitative disclosure...,item 4 controls and procedures evaluation of d...,item 5 other information none
2018-09-29,AAPL,item 7 management s discussion and analysis of...,item 1a risk factors the following discussion ...,item 3 legal proceedings the company is subjec...,item 7a quantitative and qualitative disclosur...,item 9a controls and procedures evaluation of ...,item 9b other information none
2018-06-30,AAPL,item 2 management s discussion and analysis of...,item 1a risk factors the following description...,item 1 legal proceedings the company is subjec...,item 3 quantitative and qualitative disclosure...,item 4 controls and procedures evaluation of d...,item 5 other information none
2018-03-31,AAPL,item 2 management s discussion and analysis of...,item 1a risk factors the following description...,item 1 legal proceedings the company is subjec...,item 3 quantitative and qualitative disclosure...,item 4 controls and procedures evaluation of d...,item 5 other information none
2017-12-30,AAPL,item 2 management s discussion and analysis of...,item 1a risk factors the following description...,item 1 legal proceedings the company is subjec...,item 3 quantitative and qualitative disclosure...,item 4 controls and procedures evaluation of d...,item 5 other information none


#### Similarity Calculation in-between Periods

In [6]:
#n_periods: total number of reports for each company
get_sims = ['MDA', 'RF', 'LEGAL', 'QNQ', 'CONTROL', 'OTHER']
n_periods = 20
n_sections = len(get_sims)
get_periods = list(filings.index)[0:n_periods-4]

In [7]:
#prepare each column
def prep_fun(x):
    return(x.lower().replace("[^[:alnum:]]", " ").replace("\\s+", " "))

#takes in tokenized lists and calculate jaccard similarity
def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

#https://www.geeksforgeeks.org/python-measure-similarity-between-two-sentences-using-cosine-similarity/
def cosine_similarity(list1, list2):
    X_set = set(list1)
    Y_set = set(list2)
    l1 = []
    l2 = []
    union = X_set.union(Y_set)
    for w in union: 
        if w in X_set: l1.append(1)
        else: l1.append(0) 
        if w in Y_set: l2.append(1) 
        else: l2.append(0) 
    c = 0 
    for i in range(len(union)): 
            c += l1[i]*l2[i] 
    cosine = c / float((sum(l1)*sum(l2))**0.5) 
    return(cosine)

In [8]:
#iterate through each section
def sim_calc(this_period,last_year,d, sim_type):
    texta = prep_fun(this_period[d])
    textb = prep_fun(last_year[d])
    token_a = nltk.word_tokenize(texta)
    token_b = nltk.word_tokenize(textb)
    if(sim_type == "jaccard"):
        return(jaccard_similarity(token_a, token_b))
    elif(sim_type == "cosine"):
        return(cosine_similarity(token_a, token_b))
    else:
        print("Invalid type of similarity.")

In [9]:
#iterate through each period
def sim(filings_corp, sim_type):
    sims_mat = pd.DataFrame(np.zeros(shape=(n_periods-4, n_sections)),columns = get_sims, index = get_periods)
    for i in range(n_periods-4):
        this_period = filings.iloc[i,:]
        last_year = filings.iloc[i+4,:]
        for j in range(len(get_sims)):
            sims_mat.iloc[i, j] = sim_calc(this_period, last_year,get_sims[j], sim_type)     
    sims_mat.insert(0, "Company", [filings_corp.iloc[0,0]]*(n_periods-4))
    return(sims_mat)

In [10]:
#prints out jaccard similarity matrix
for corp in range(int(filings.shape[0]/n_periods)):
    if(corp == 0):
        jaccard_sims = sim(filings.iloc[corp:corp+n_periods, :],"jaccard")
    else:
        temp = sim(filings.iloc[corp*n_periods:corp*n_periods+n_periods, :],"jaccard")
        jaccard_sims = jaccard_sims.append(temp, ignore_index=True)
jaccard_sims.index = list(filings.index)[0:n_periods-4]*2
print(jaccard_sims)

           Company       MDA        RF     LEGAL       QNQ   CONTROL     OTHER
2020-03-28    AAPL  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000
2019-12-28    AAPL  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
2019-09-28    AAPL  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
2019-06-29    AAPL  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
2019-03-30    AAPL  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
2018-12-29    AAPL  0.087133  0.086939  0.304478  0.410526  0.297468  1.000000
2018-09-29    AAPL  0.077189  0.087151  0.305389  0.190819  0.210762  0.571429
2018-06-30    AAPL  0.084454  0.088730  0.311178  0.410526  0.295238  1.000000
2018-03-31    AAPL  0.087948  0.091393  0.311178  0.410526  0.295238  1.000000
2017-12-30    AAPL  0.095678  0.091610  0.363057  0.385417  0.303514  1.000000
2017-09-30    AAPL  0.080807  0.091809  0.363057  0.185612  0.210762  1.000000
2017-07-01    AAPL  0.089937  0.093345  0.359621  0.

In [11]:
#prints out cosine similarity matrix
for corp in range(int(filings.shape[0]/n_periods)):
    if(corp == 0):
        cos_sims = sim(filings.iloc[corp:corp+n_periods, :], "cosine")
    else:
        temp = sim(filings.iloc[corp*n_periods:corp*n_periods+n_periods, :], "cosine")
        cos_sims = cos_sims.append(temp, ignore_index=True)
cos_sims.index = list(filings.index)[0:n_periods-4]*2
print(cos_sims)

           Company       MDA        RF     LEGAL       QNQ   CONTROL     OTHER
2020-03-28    AAPL  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000
2019-12-28    AAPL  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
2019-09-28    AAPL  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
2019-06-29    AAPL  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
2019-03-30    AAPL  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
2018-12-29    AAPL  0.838851  0.948606  0.890838  0.975000  0.959234  1.000000
2018-09-29    AAPL  0.916769  0.947059  0.886957  0.984309  0.991561  0.730297
2018-06-30    AAPL  0.903303  0.949026  0.895652  0.975000  0.968750  1.000000
2018-03-31    AAPL  0.911823  0.960062  0.895652  0.975000  0.968750  1.000000
2017-12-30    AAPL  0.908703  0.962433  0.991304  0.936784  0.979381  1.000000
2017-09-30    AAPL  0.946590  0.963082  0.991304  0.965714  0.991561  1.000000
2017-07-01    AAPL  0.900709  0.970948  0.982795  0.

#### Topic Modeling on each filing

In [120]:
#returns topics modeled from LDA for each filing
def get_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    combined = []
    for topic_idx, topic in enumerate(model.components_):
        topics = " ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]])
        
        combined.append("Topic "+str(topic_idx)+": "+topics)
    return(combined)

In [121]:
filings_s['combined']=filings_s.values.tolist()

In [134]:
#change model parameters here
number_topics = 5
number_words = 10
filings_s['Topics'] = [0]*filings_s.shape[0]

In [135]:
for i in range(filings_s.shape[0]):
    count_vec = CountVectorizer(stop_words='english')
    data = count_vec.fit_transform(filings_s['combined'][i])
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(data)
    topics = get_topics(lda, count_vec, number_words)
    filings_s.iloc[i,8] = "; ".join(topics)

In [138]:
filings_s.head(6)

Unnamed: 0,corp,MDA,RF,LEGAL,QNQ,CONTROL,OTHER,combined,Topics
2020-03-28,AAPL,haracter0,haracter0,haracter0,haracter0,haracter0,haracter0,"[AAPL, haracter0, haracter0, haracter0, haract...",Topic 0: haracter0 aapl; Topic 1: haracter0 aa...
2019-12-28,AAPL,haracter0,haracter0,haracter0,haracter0,haracter0,haracter0,"[AAPL, haracter0, haracter0, haracter0, haract...",Topic 0: haracter0 aapl; Topic 1: haracter0 aa...
2019-09-28,AAPL,haracter0,haracter0,haracter0,haracter0,haracter0,haracter0,"[AAPL, haracter0, haracter0, haracter0, haract...",Topic 0: aapl haracter0; Topic 1: haracter0 aa...
2019-06-29,AAPL,haracter0,haracter0,haracter0,haracter0,haracter0,haracter0,"[AAPL, haracter0, haracter0, haracter0, haract...",Topic 0: aapl haracter0; Topic 1: haracter0 aa...
2019-03-30,AAPL,haracter0,haracter0,haracter0,haracter0,haracter0,haracter0,"[AAPL, haracter0, haracter0, haracter0, haract...",Topic 0: aapl haracter0; Topic 1: haracter0 aa...
2018-12-29,AAPL,item 2 management s discussion and analysis of...,item 1a risk factors the following description...,item 1 legal proceedings the company is subjec...,item 3 quantitative and qualitative disclosure...,item 4 controls and procedures evaluation of d...,item 5 other information none,"[AAPL, item 2 management s discussion and anal...",Topic 0: aapl item information settled factor ...


In [140]:
filings_s['Topics'][6]

'Topic 0: company foreign currency rate rates risk investment september exchange debt; Topic 1: company sales net tax billion 2018 2017 2016 services financial; Topic 2: 9b item information aapl fourth factor did settled refer infringed; Topic 3: company products services financial business adversely subject operating results product; Topic 4: company financial control internal reporting management controls reasonable provide assurance'