In [88]:
from tika import parser
import re
import timeit
import os
import pandas as pd
import string
import numpy as np
from numpy.linalg import inv


## Clean data

In [89]:
def generate_raw_data():
    
    """
    This function generates raw text data from FOMC transcripts
    
    Every regular file in ./FOMC_pdf (relative to the current working
    directory) is parsed with tika, split into interjections at each
    "MR. / MS. / CHAIRMAN / VICE CHAIRMAN" line start, and stored as one row
    per interjection.
    
    returns a DataFrame with columns
        Date:    characters 4-9 of the pdf file name
        Speaker: first whitespace-delimited word of the interjection,
                 stripped of surrounding punctuation
        content: remainder of the interjection
    The frame is indexed by Date and is also written to raw_text.xlsx.
    
    Files for which tika returns no text are reported and skipped.
    
    It will take about 4-5 minutes
    """

    base_directory = os.path.join(os.getcwd(), 'FOMC_pdf') # directory of pdfs

    # A single sorted list of regular files drives both the parsing order and
    # the dates, so a row's Date always comes from the file it was parsed from.
    filelist = sorted(
        f for f in os.listdir(base_directory)
        if os.path.isfile(os.path.join(base_directory, f))
    )

    columns = ['Date','Speaker', 'content']
    frames = [] # one DataFrame per document, concatenated once at the end

    start = timeit.default_timer()
    for i,file in enumerate(filelist):
        print('Document {} of {}'.format(i, len(filelist)))
        
        parsed = parser.from_file(os.path.join(base_directory, file)) # parse the pdf
        full_text = parsed['content']
        if full_text is None:
            # nothing to split; do not let one unreadable pdf abort the whole run
            print('No text extracted from {}; skipping'.format(file))
            continue

        # split the entire string by the names (looking for MR, MS, Chairman or Vice Chairman)
        interjections = re.split('\nMR. |\nMS. |\nCHAIRMAN |\nVICE CHAIRMAN ', full_text)
        interjections = [interjection.replace('\n',' ') for interjection in interjections] # replace \n linebreaks with spaces
        # split off the first word: sometimes (rarely) the name is not followed
        # by a period but by other punctuation or whitespace.
        # Each result is ['', first_word, rest].
        temp = [re.split(r'(^\S*)', interjection.lstrip()) for interjection in interjections]

        speaker = [parts[1].strip(string.punctuation) for parts in temp]
        content = [parts[2] for parts in temp]

        frames.append(pd.DataFrame(
            {'Date': file[4:10], 'Speaker': speaker, 'content': content},
            columns = columns,
        ))

    end = timeit.default_timer()

    if frames:
        raw_text = pd.concat(frames, ignore_index = True)
    else:
        raw_text = pd.DataFrame(columns = columns)

    raw_text.index = raw_text['Date'] # set dataframe index to the Date
    raw_text.to_excel('raw_text.xlsx') # save as raw_text.xlsx

    print("Documents processed. Time: {}".format(end - start))
    
    return raw_text


In [90]:
import pandas as pd
import topicmodels
import numpy as np
from nltk.stem import PorterStemmer
import nltk
from nltk.collocations import *

def preprocess():
    '''
    Entry point of the preprocessing pipeline.

    Reads raw_text.xlsx, runs separation() and find_collocation() on it,
    replaces the 'content' column with the output of tokenize(), and writes
    the result to FOMC_token_separated_col.xlsx. Nothing is returned.

    Columns of interest in the written file:

    Date: date of the meeting
    Section: 1 or 2, as assigned by separation()
    Speaker: speaker of the interjection
    content: the cleaned tokens of the interjection joined by single spaces
    '''
    raw_interjections = pd.read_excel('raw_text.xlsx')

    by_section = separation(raw_interjections)
    with_collocations = find_collocation(by_section)

    cleaned = tokenize(with_collocations['content'].values)
    with_collocations['content'] = cleaned

    with_collocations.to_excel('FOMC_token_separated_col.xlsx')
    
def tokenize(content):
    '''
    Clean every statement in `content` and return one space-joined string
    per statement.

    Per statement, in this order:
        1. lower-case the text and wrap it in topicmodels.RawDocs
        2. token_clean(1)  (intended to drop length-1 / non-alphabetical tokens)
        3. stop-word removal on the tokens
        4. RawDocs stemming followed by stop-word removal on the stems
        5. Porter-stem each entry of docsobj.tokens[0] and join with ' '

    NOTE(review): the returned text is built from `tokens`, not `stems`, so
    the result of step 4 does not appear to be used - confirm intent.
    '''
    porter = PorterStemmer()
    cleaned_statements = []

    for raw_statement in content:
        docsobj = topicmodels.RawDocs([raw_statement.lower()], "long")
        docsobj.token_clean(1)
        docsobj.stopword_remove("tokens")
        docsobj.stem()
        docsobj.stopword_remove("stems")

        stemmed_words = [porter.stem(token) for token in docsobj.tokens[0]]
        cleaned_statements.append(' '.join(stemmed_words))

    return cleaned_statements

def separation(raw_text):
    '''
    Split every meeting's interjections into its two sections.

    Separation.xlsx is read with pd.read_excel; for each index label `i` of
    that sheet, the rows of `raw_text` whose "Date" equals `i` are sliced
    positionally:

        Section 1: rows FOMC1_start .. FOMC1_end (end exclusive)
        Section 2: rows FOMC2_start .. FOMC2_end, or to the last row when
                   FOMC2_end is the string 'end'

    raw_text: DataFrame with at least the columns Date, Speaker, content.
              It is not modified.

    returns a DataFrame with columns Date, Speaker, content, Section (followed
    by any extra columns of `raw_text`) and a fresh 0..n-1 index. The same
    frame is written to raw_text_separated.xlsx.

    NOTE(review): the sheet's index labels are compared with raw_text["Date"];
    this presumes the index of Separation.xlsx carries meeting dates - confirm.
    '''
    separation_rule = pd.read_excel('Separation.xlsx')

    base_columns = ['Date','Speaker','content','Section']
    pieces = []
    for i in separation_rule.index:

        meeting = raw_text[raw_text["Date"] == i] # filter once per meeting

        # .assign returns a new frame, so the slice of raw_text is never written to
        section1 = meeting.iloc[
            separation_rule['FOMC1_start'][i]:separation_rule['FOMC1_end'][i]
        ].assign(Section = 1)

        fomc2_start = separation_rule['FOMC2_start'][i]
        fomc2_end = separation_rule['FOMC2_end'][i]
        if fomc2_end == 'end':
            section2 = meeting.iloc[fomc2_start:]
        else:
            section2 = meeting.iloc[fomc2_start:fomc2_end]
        section2 = section2.assign(Section = 2)

        pieces.append(section1)
        pieces.append(section2)

    if pieces:
        # DataFrame.append no longer exists in pandas 2.x; concatenate once instead
        FOMC_separation = pd.concat(pieces, ignore_index = True)
        extra_columns = [c for c in FOMC_separation.columns if c not in base_columns]
        FOMC_separation = FOMC_separation.reindex(columns = base_columns + extra_columns)
    else:
        FOMC_separation = pd.DataFrame(columns = base_columns)

    FOMC_separation.to_excel('raw_text_separated.xlsx')
    return FOMC_separation

def find_collocation(raw_text_separated):
    '''
    Fuse frequent two- and three-word phrases in the 'content' column into
    single tokens.

    Punctuation is removed from every entry, the entries are split on single
    spaces and handed to bigrams() and trigram(); each phrase returned is then
    replaced in the text by its words glued together plus the suffix 'xx'
    (bigram) or 'xxx' (trigram) via replace_collocation().

    The 'content' column of `raw_text_separated` is overwritten in place, the
    frame is written to FOMC_separated_Collocation.xlsx, and the same frame
    object is returned.
    '''
    no_punctuation = raw_text_separated['content'].apply(
        lambda text: re.sub(r'[^\w\s]','',text)
    )

    token_lists = no_punctuation.apply(lambda text: text.split(' ')).values

    two_word_phrases = bigrams(token_lists)
    three_word_phrases = trigram(token_lists)

    fused_tokens = [phrase.replace(' ', '') + 'xx' for phrase in two_word_phrases]
    fused_tokens += [phrase.replace(' ', '') + 'xxx' for phrase in three_word_phrases]

    dict_collocation = {
        phrase: fused
        for phrase, fused in zip(two_word_phrases + three_word_phrases, fused_tokens)
    }

    raw_text_separated['content'] = no_punctuation.apply(
        lambda text: replace_collocation(text, dict_collocation)
    )
    raw_text_separated.to_excel('FOMC_separated_Collocation.xlsx')
    return raw_text_separated

def bigrams(big_document):
    '''
    Return the frequent two-word phrases of `big_document` as space-joined
    strings.

    big_document: iterable of token lists (one list per document).

    A BigramCollocationFinder is built from the documents; words shorter than
    3 characters or found (lower-cased) in the ignore list - NLTK's English
    stop words plus 'percent', 'governor', 'dont' - are filtered out, then a
    frequency filter of 150 is applied. The phrases left in the finder's
    n-gram table are returned in table order.
    '''
    stop_vocabulary = set(nltk.corpus.stopwords.words('english'))
    stop_vocabulary.update(('percent', 'governor', 'dont'))

    def is_ignored(word):
        return len(word) < 3 or word.lower() in stop_vocabulary

    finder = BigramCollocationFinder.from_documents(big_document)
    finder.apply_word_filter(is_ignored)
    finder.apply_freq_filter(150)

    return [' '.join(pair) for pair in finder.ngram_fd.keys()]


def trigram(big_document):
    '''
    Return the frequent three-word phrases of `big_document` as space-joined
    strings.

    big_document: iterable of token lists (one list per document).

    A TrigramCollocationFinder is built from the documents; words shorter than
    3 characters or found (lower-cased) in the ignore list - NLTK's English
    stop words plus 'percent', 'governor', 'dont' - are filtered out, then a
    frequency filter of 100 is applied. The phrases left in the finder's
    n-gram table are returned in table order.
    '''
    stop_vocabulary = set(nltk.corpus.stopwords.words('english'))
    stop_vocabulary.update(('percent', 'governor', 'dont'))

    def is_ignored(word):
        return len(word) < 3 or word.lower() in stop_vocabulary

    finder = TrigramCollocationFinder.from_documents(big_document)
    finder.apply_word_filter(is_ignored)
    finder.apply_freq_filter(100)

    return [' '.join(triple) for triple in finder.ngram_fd.keys()]

def replace_collocation(string, dict_collocation):
    '''
    Replace every phrase that is a key of `dict_collocation` by its value.

    string:           text to rewrite
    dict_collocation: mapping phrase -> replacement token

    Two rules keep the substitutions consistent:
      * longer phrases win over the shorter phrases they contain, so a
        trigram is not pre-empted by the bigram inside it;
      * a phrase only matches on whole words, so 'rate increase' does not
        fire inside 'moderate increase'.

    returns the rewritten text; `string` is returned unchanged when the
    mapping has no usable keys.
    '''
    # longest first: regex alternation takes the first branch that matches
    phrases = sorted((key for key in dict_collocation if key), key=len, reverse=True)
    if not phrases:
        return string

    # (?<!\w) / (?!\w) rather than \b so that phrases are matched as whole
    # words even if a key happens to start or end with a non-word character
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(re.escape(phrase) for phrase in phrases) + r')(?!\w)'
    )
    return pattern.sub(lambda match: dict_collocation[match.group(0)], string)

In [91]:
import gensim
from gensim import corpora, models
import matplotlib.pyplot as plt

def generate_term_document_interjection(option = None):
    '''
    Build the interjection-level vocabulary and term-document data from
    FOMC_token_separated_col.xlsx.

    Each row's 'content' (missing values become ' ') is split on single
    spaces into one document. Terms are scored with
        (1 + log(total count)) * log(n_documents / n_documents_containing_term)
    and only the 9000 best-scoring term ids are kept in the dictionary. The
    sorted scores are plotted as a side effect.

    option = 'text'   -> list of token lists restricted to the kept terms
    option = 'matrix' -> dense term-document count matrix over the kept terms
    anything else     -> None (all the work above is still carried out)
    '''
    data = pd.read_excel('FOMC_token_separated_col.xlsx')
    texts = [line.split(' ') for line in data['content'].fillna(' ').values]

    dictionary2 = corpora.Dictionary(texts)
    corpus2 = [dictionary2.doc2bow(tokens) for tokens in texts]

    term_document2 = gensim.matutils.corpus2dense(corpus2, num_terms=len(dictionary2.keys()))

    term_totals = term_document2.sum(axis = 1)
    docs_with_term = np.count_nonzero(term_document2, axis = 1)
    TF = 1+np.log(term_totals)
    IDF = np.log(term_document2.shape[1] / docs_with_term)

    TF_IDF = pd.Series(dict(zip(dictionary2.keys(), TF*IDF)))
    ranked_scores = TF_IDF.sort_values(ascending = False)

    # use top 9000 in TF-IDF
    keys_to_use2 = ranked_scores[:9000].index.values

    ranked_scores.reset_index()[0].plot()
    plt.show()

    dictionary2.filter_tokens(good_ids = keys_to_use2)

    kept_tokens = dictionary2.token2id
    new_text = [[token for token in line if token in kept_tokens] for line in texts]

    # doc2bow is fed the unfiltered token lists, exactly as before
    new_corpus2 = [dictionary2.doc2bow(tokens) for tokens in texts]
    new_term_document2 = gensim.matutils.corpus2dense(new_corpus2, num_terms=len(dictionary2.keys()))

    if option == 'text':
        return new_text
    if option == 'matrix':
        return new_term_document2
    return None

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [92]:
def generate_term_document_meeting(option = None):
    '''
    Build the meeting-level vocabulary and term-document data from
    FOMC_token_separated_col.xlsx.

    Rows containing missing values are dropped; for every distinct 'Date'
    (in order of first appearance) the 'content' strings of that date are
    joined with ' ' and split on ' ' into one document. Terms are scored with
        (1 + log(total count)) * log(n_documents / n_documents_containing_term)
    and only the 9000 best-scoring term ids are kept in the dictionary.

    Side effects: the sorted scores are plotted and the filtered token->id
    mapping is written to dictionary_meeting.csv.

    option = 'text'   -> list of token lists restricted to the kept terms
    option = 'matrix' -> dense term-document count matrix over the kept terms
    anything else     -> None (all the work above is still carried out)
    '''
    data = pd.read_excel('FOMC_token_separated_col.xlsx').dropna()

    rows_by_date = data.groupby('Date')
    meeting_text = [
        ' '.join(rows_by_date.get_group(meeting)['content'].values).split(' ')
        for meeting in data['Date'].unique()
    ]

    dictionary = corpora.Dictionary(meeting_text)
    corpus = [dictionary.doc2bow(tokens) for tokens in meeting_text]

    term_document = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.keys()))

    term_totals = term_document.sum(axis = 1)
    docs_with_term = np.count_nonzero(term_document, axis = 1)
    TF = 1+np.log(term_totals)
    IDF = np.log(term_document.shape[1] / docs_with_term)

    TF_IDF = pd.Series(dict(zip(dictionary.keys(), TF*IDF)))
    ranked_scores = TF_IDF.sort_values(ascending = False)

    # use top 9000 in TF-IDF
    keys_to_use = ranked_scores[:9000].index.values

    ranked_scores.reset_index()[0].plot()
    plt.show()

    dictionary.filter_tokens(good_ids = keys_to_use)

    pd.Series(dictionary.token2id).to_csv('dictionary_meeting.csv')

    kept_tokens = dictionary.token2id
    new_text = [[token for token in line if token in kept_tokens] for line in meeting_text]

    # doc2bow is fed the unfiltered token lists, exactly as before
    new_corpus = [dictionary.doc2bow(tokens) for tokens in meeting_text]
    new_term_document = gensim.matutils.corpus2dense(new_corpus, num_terms=len(dictionary.keys()))

    if option == 'text':
        return new_text
    if option == 'matrix':
        return new_term_document
    return None

In [93]:
# Meeting-level token lists (one list of kept tokens per meeting date);
# this is the corpus handed to the LDA sampler below.
text = generate_term_document_meeting('text')



## LDA implementation

In [94]:
import topicmodels

In [95]:
topicmodels.LDA.LDAGibbs?

In [96]:
# Collapsed Gibbs LDA over the meeting-level corpus `text` with 40 topics
ldaobj = topicmodels.LDA.LDAGibbs(text,40)

In [109]:
# Gibbs sampling, following the strategy in HMP.
# The three arguments are 4000, 80 and 50; presumably (burn-in, thinning, samples),
# i.e. 4000 burn-in iterations followed by 50 samples taken every 80 iterations
# (another 4000 iterations). TODO confirm the argument order against
# LDAGibbs.sample - the earlier note read "sample every 50 trials", which would
# correspond to the last two arguments being swapped.
ldaobj.sample(4000,80,50)

Iteration 10 of (collapsed) Gibbs sampling
Iteration 20 of (collapsed) Gibbs sampling
Iteration 30 of (collapsed) Gibbs sampling
Iteration 40 of (collapsed) Gibbs sampling
Iteration 50 of (collapsed) Gibbs sampling
Iteration 60 of (collapsed) Gibbs sampling
Iteration 70 of (collapsed) Gibbs sampling
Iteration 80 of (collapsed) Gibbs sampling
Iteration 90 of (collapsed) Gibbs sampling
Iteration 100 of (collapsed) Gibbs sampling
Iteration 110 of (collapsed) Gibbs sampling
Iteration 120 of (collapsed) Gibbs sampling
Iteration 130 of (collapsed) Gibbs sampling
Iteration 140 of (collapsed) Gibbs sampling
Iteration 150 of (collapsed) Gibbs sampling
Iteration 160 of (collapsed) Gibbs sampling
Iteration 170 of (collapsed) Gibbs sampling
Iteration 180 of (collapsed) Gibbs sampling
Iteration 190 of (collapsed) Gibbs sampling
Iteration 200 of (collapsed) Gibbs sampling
Iteration 210 of (collapsed) Gibbs sampling
Iteration 220 of (collapsed) Gibbs sampling
Iteration 230 of (collapsed) Gibbs sampli

Iteration 1860 of (collapsed) Gibbs sampling
Iteration 1870 of (collapsed) Gibbs sampling
Iteration 1880 of (collapsed) Gibbs sampling
Iteration 1890 of (collapsed) Gibbs sampling
Iteration 1900 of (collapsed) Gibbs sampling
Iteration 1910 of (collapsed) Gibbs sampling
Iteration 1920 of (collapsed) Gibbs sampling
Iteration 1930 of (collapsed) Gibbs sampling
Iteration 1940 of (collapsed) Gibbs sampling
Iteration 1950 of (collapsed) Gibbs sampling
Iteration 1960 of (collapsed) Gibbs sampling
Iteration 1970 of (collapsed) Gibbs sampling
Iteration 1980 of (collapsed) Gibbs sampling
Iteration 1990 of (collapsed) Gibbs sampling
Iteration 2000 of (collapsed) Gibbs sampling
Iteration 2010 of (collapsed) Gibbs sampling
Iteration 2020 of (collapsed) Gibbs sampling
Iteration 2030 of (collapsed) Gibbs sampling
Iteration 2040 of (collapsed) Gibbs sampling
Iteration 2050 of (collapsed) Gibbs sampling
Iteration 2060 of (collapsed) Gibbs sampling
Iteration 2070 of (collapsed) Gibbs sampling
Iteration 

Iteration 3690 of (collapsed) Gibbs sampling
Iteration 3700 of (collapsed) Gibbs sampling
Iteration 3710 of (collapsed) Gibbs sampling
Iteration 3720 of (collapsed) Gibbs sampling
Iteration 3730 of (collapsed) Gibbs sampling
Iteration 3740 of (collapsed) Gibbs sampling
Iteration 3750 of (collapsed) Gibbs sampling
Iteration 3760 of (collapsed) Gibbs sampling
Iteration 3770 of (collapsed) Gibbs sampling
Iteration 3780 of (collapsed) Gibbs sampling
Iteration 3790 of (collapsed) Gibbs sampling
Iteration 3800 of (collapsed) Gibbs sampling
Iteration 3810 of (collapsed) Gibbs sampling
Iteration 3820 of (collapsed) Gibbs sampling
Iteration 3830 of (collapsed) Gibbs sampling
Iteration 3840 of (collapsed) Gibbs sampling
Iteration 3850 of (collapsed) Gibbs sampling
Iteration 3860 of (collapsed) Gibbs sampling
Iteration 3870 of (collapsed) Gibbs sampling
Iteration 3880 of (collapsed) Gibbs sampling
Iteration 3890 of (collapsed) Gibbs sampling
Iteration 3900 of (collapsed) Gibbs sampling
Iteration 

Iteration 5520 of (collapsed) Gibbs sampling
Iteration 5530 of (collapsed) Gibbs sampling
Iteration 5540 of (collapsed) Gibbs sampling
Iteration 5550 of (collapsed) Gibbs sampling
Iteration 5560 of (collapsed) Gibbs sampling
Iteration 5570 of (collapsed) Gibbs sampling
Iteration 5580 of (collapsed) Gibbs sampling
Iteration 5590 of (collapsed) Gibbs sampling
Iteration 5600 of (collapsed) Gibbs sampling
Iteration 5610 of (collapsed) Gibbs sampling
Iteration 5620 of (collapsed) Gibbs sampling
Iteration 5630 of (collapsed) Gibbs sampling
Iteration 5640 of (collapsed) Gibbs sampling
Iteration 5650 of (collapsed) Gibbs sampling
Iteration 5660 of (collapsed) Gibbs sampling
Iteration 5670 of (collapsed) Gibbs sampling
Iteration 5680 of (collapsed) Gibbs sampling
Iteration 5690 of (collapsed) Gibbs sampling
Iteration 5700 of (collapsed) Gibbs sampling
Iteration 5710 of (collapsed) Gibbs sampling
Iteration 5720 of (collapsed) Gibbs sampling
Iteration 5730 of (collapsed) Gibbs sampling
Iteration 

Iteration 7350 of (collapsed) Gibbs sampling
Iteration 7360 of (collapsed) Gibbs sampling
Iteration 7370 of (collapsed) Gibbs sampling
Iteration 7380 of (collapsed) Gibbs sampling
Iteration 7390 of (collapsed) Gibbs sampling
Iteration 7400 of (collapsed) Gibbs sampling
Iteration 7410 of (collapsed) Gibbs sampling
Iteration 7420 of (collapsed) Gibbs sampling
Iteration 7430 of (collapsed) Gibbs sampling
Iteration 7440 of (collapsed) Gibbs sampling
Iteration 7450 of (collapsed) Gibbs sampling
Iteration 7460 of (collapsed) Gibbs sampling
Iteration 7470 of (collapsed) Gibbs sampling
Iteration 7480 of (collapsed) Gibbs sampling
Iteration 7490 of (collapsed) Gibbs sampling
Iteration 7500 of (collapsed) Gibbs sampling
Iteration 7510 of (collapsed) Gibbs sampling
Iteration 7520 of (collapsed) Gibbs sampling
Iteration 7530 of (collapsed) Gibbs sampling
Iteration 7540 of (collapsed) Gibbs sampling
Iteration 7550 of (collapsed) Gibbs sampling
Iteration 7560 of (collapsed) Gibbs sampling
Iteration 

In [98]:
# theta matrix
ldaobj.dt_avg()

array([[0.01333935, 0.0117148 , 0.03092058, ..., 0.01120939, 0.0050361 ,
        0.01016245],
       [0.01982287, 0.00468599, 0.03644122, ..., 0.0168277 , 0.00436393,
        0.01247987],
       [0.0112966 , 0.0093517 , 0.00948136, ..., 0.01641815, 0.00734198,
        0.03171799],
       ...,
       [0.00519157, 0.0049106 , 0.02322478, ..., 0.00621328, 0.15717114,
        0.02236909],
       [0.01002377, 0.01523772, 0.02268621, ..., 0.00615689, 0.12008716,
        0.01374802],
       [0.00801971, 0.01095878, 0.01628136, ..., 0.01065412, 0.12900538,
        0.00336022]])

In [432]:
# Hyper-parameters of the fitted model. Only the value of the last expression
# in a cell is echoed, so the number shown below is ldaobj.beta.
ldaobj.alpha
ldaobj.beta

0.022222222222222223

In [99]:
# B matrix
ldaobj.tt_avg()

array([[1.39778232e-04, 1.72649650e-05, 1.96187940e-05, ...,
        4.25439884e-05, 5.20374479e-06, 1.12388728e-04],
       [8.26000682e-06, 3.41415519e-05, 1.51904635e-05, ...,
        9.36517437e-06, 1.01444563e-05, 1.42445012e-05],
       [8.26000682e-06, 6.13993021e-06, 5.35345873e-06, ...,
        9.36517437e-06, 5.20374479e-06, 7.56893378e-06],
       ...,
       [8.26000682e-06, 6.13993021e-06, 1.93984965e-05, ...,
        3.49315224e-05, 5.20374479e-06, 2.87591817e-05],
       [4.55902017e-05, 1.65056099e-04, 2.91038850e-05, ...,
        3.50625707e-05, 1.49065368e-05, 2.28653009e-05],
       [3.11079389e-05, 1.34411539e-04, 5.35345873e-06, ...,
        2.55508830e-05, 1.43049962e-05, 7.56893378e-06]])

In [112]:
ldaobj.perplexity()

array([2357.02750331, 2361.4460772 , 2355.43948735, 2360.65819513,
       2358.34571651, 2364.164352  , 2361.63085631, 2353.20327096,
       2358.31560992, 2354.65026229, 2356.09704059, 2359.96775016,
       2360.77474038, 2358.27443277, 2356.32225691, 2355.07916471,
       2352.59639058, 2354.40460389, 2350.39999534, 2358.13202002,
       2363.25173821, 2359.98474341, 2358.97932268, 2355.13868686,
       2355.72183581, 2356.77268569, 2358.78261379, 2354.02597026,
       2366.20360014, 2361.0037464 , 2358.06933294, 2360.72412391,
       2360.64946261, 2362.00162315, 2357.70234681, 2355.91105146,
       2359.7468124 , 2360.25299126, 2363.23418896, 2360.48956184,
       2355.47529154, 2361.5712743 , 2350.74629634, 2365.17396233,
       2361.40507553, 2355.54187499, 2352.32753874, 2354.49182417,
       2359.10751616, 2357.17949778, 2356.17271362, 2358.5292816 ,
       2354.59434695, 2355.49881783, 2359.04459081, 2353.24745516,
       2357.64224948, 2348.29998576, 2352.70210256, 2359.86356

In [113]:
#only keep the last 5 samples
ldaobj.samples_keep(5)

(148, 40)

## NMF with ANLS

using numba

In [1]:
import numba
@numba.jit
def NMF_ANLS_full(P,k):
    """
    Draw a random column-normalised basis and fit the loadings for it.

    P: 2-D array; its rows index the rows of the basis.
    k: number of basis columns.

    The basis has shape (P.shape[0], k), is drawn uniformly from [1, 1000)
    and rescaled so every column sums to 1. No random seed is set here.

    Returns (basis, NMF_ANLS(basis, P)).
    """
    n_rows = P.shape[0]

    random_basis = np.random.uniform(low = 1, high = 1000,size = (n_rows, k))
    column_totals = random_basis.sum(axis = 0)
    random_basis = random_basis/column_totals

    loadings = NMF_ANLS(random_basis,P)
    return random_basis, loadings

@numba.jit
def NMF_ANLS(B, P):
    """
    Fit loadings for a fixed basis, one column of P at a time.

    B: 2-D basis array.
    P: 2-D array with the same number of rows as B.

    Returns an array of shape (B.shape[1], P.shape[1]) whose column j is
    ANLS_column(B, P[:, j]).
    """
    n_components = B.shape[1]
    n_columns = P.shape[1]
    theta = np.zeros(shape = (n_components, n_columns))

    P_rows_as_columns = P.T
    for col in range(n_columns):
        theta[:,col] = ANLS_column(B, P_rows_as_columns[col])

    return theta

def _lstsq_on_support(B, y, support):
    """Least-squares fit of y on the columns of B flagged in `support`; all other coefficients are 0."""
    z = np.zeros(B.shape[1])
    if support.any():
        z[support] = np.linalg.lstsq(B[:, support], y, rcond=None)[0]
    return z


def ANLS_column(B, y):
    """
    Non-negative least squares for one column: minimise ||B g - y||_2
    subject to g >= 0, using the Lawson-Hanson active-set method.

    B: 2-D array of shape (m, n).
    y: 1-D array of length m.

    Returns g, a 1-D float array of length n with g >= 0.

    The passive set (variables allowed to be positive) is tracked as a
    boolean mask over the n variables, so positions and variable indices can
    never be confused. The function is plain NumPy on purpose: its cost is in
    the LAPACK call behind np.linalg.lstsq, so JIT-compiling the surrounding
    Python gains nothing.
    """
    B = np.asarray(B, dtype=float)
    y = np.asarray(y, dtype=float)
    m, n = B.shape

    g = np.zeros(n)
    if n == 0:
        return g

    passive = np.zeros(n, dtype=bool)
    # scale-aware threshold for "zero" (same form as MATLAB's lsqnonneg default)
    tol = 10 * np.finfo(float).eps * np.abs(B).sum(axis=0).max() * max(m, n)

    w = B.T.dot(y - B.dot(g))  # negative gradient of the objective at g

    # Bounded loop: guards against cycling on degenerate problems.
    for _ in range(3 * n):
        candidates = np.flatnonzero(~passive)
        if candidates.size == 0 or w[candidates].max() <= tol:
            break  # KKT conditions hold: no constrained variable wants to grow

        # free the constrained variable with the steepest descent direction
        t = candidates[w[candidates].argmax()]
        passive[t] = True

        z = _lstsq_on_support(B, y, passive)

        # The unconstrained fit on the passive set may leave the feasible
        # region; walk from g towards z until the first variable hits zero,
        # drop it from the passive set, and refit.
        while passive.any() and z[passive].min() <= 0:
            blocking = passive & (z <= 0)
            gap = g[blocking] - z[blocking]
            ratios = np.zeros(gap.shape)
            movable = gap > 0
            ratios[movable] = g[blocking][movable] / gap[movable]
            alpha = ratios.min()

            g = g + alpha * (z - g)

            leaving = passive & (g <= tol)
            g[leaving] = 0.0
            passive[leaving] = False

            z = _lstsq_on_support(B, y, passive)

        g = z
        w = B.T.dot(y - B.dot(g))

    return g

In [5]:
# Load the meeting-level term-document matrix.
# NOTE(review): nothing visible in this notebook writes 'Matrix_meeting_tfidf.csv',
# and read_csv is called without index_col - if the file was saved with a row
# index, that index becomes a data column here. Confirm the file layout.
td_matrix_pd = pd.read_csv('Matrix_meeting_tfidf.csv')
# Replace exact zeros by a tiny positive number (the KL loss below takes
# log(P / P_hat) and divides element-wise).
td_matrix_pd = td_matrix_pd.replace(0,1e-10)
td_matrix = td_matrix_pd.values
# Rescale every column to sum to 1.
td_matrix = td_matrix / td_matrix.sum(axis = 0)

In [None]:
# Random 40-column basis B and the non-negative loadings theta fitted to it
# (NMF_ANLS_full sets no seed, so B differs between runs).
B, theta = NMF_ANLS_full(td_matrix, 40)

In [34]:
B

array([[1.71974442e-04, 9.03911796e-05, 9.56472032e-05, ...,
        6.99692424e-05, 2.02703651e-04, 1.16438672e-04],
       [3.96215277e-05, 6.30405396e-05, 4.33657345e-06, ...,
        7.31605332e-05, 9.24224835e-05, 1.60204430e-05],
       [1.29826388e-04, 7.57184254e-05, 1.77945139e-04, ...,
        1.58156657e-04, 1.79502456e-04, 1.41359672e-04],
       ...,
       [1.30907479e-04, 7.31240679e-05, 7.65248867e-05, ...,
        1.14985118e-04, 2.00218524e-04, 1.14366160e-04],
       [1.72398956e-04, 1.14349836e-04, 6.05783768e-05, ...,
        2.38149056e-05, 1.55171657e-04, 3.52874560e-05],
       [1.24297140e-04, 5.99347261e-05, 9.76079869e-05, ...,
        2.07104685e-04, 4.37709252e-05, 1.45002747e-04]])

In [35]:
theta

array([[0.01430441, 0.        , 0.        , ..., 0.02697671, 0.06354522,
        0.        ],
       [0.03625602, 0.        , 0.        , ..., 0.11516249, 0.07665657,
        0.        ],
       [0.03090193, 0.        , 0.        , ..., 0.        , 0.02174012,
        0.        ],
       ...,
       [0.04247959, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01578778, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.02294112, 0.09574468, 0.17671297, ..., 0.        , 0.        ,
        0.04406879]])

### KL-divergence non-increasing updating

In [69]:
@numba.jit
def WKLNMF(P, k, eps, maxit):
    """
    Weighted KL-divergence NMF, P ~ B Theta, by multiplicative updates.

    P:     2-D array of shape (V, D); it is also used as the weight matrix W.
    k:     number of components.
    eps:   stop once the absolute change in the loss KL(W, P, B, Theta)
           between two consecutive iterations is below this value.
    maxit: maximum number of update rounds.

    B (V x k) and Theta (k x D) are initialised uniformly in [1e-5, 1) with
    columns rescaled to sum to 1; no random seed is set here. Each round
    updates Theta, then B. A message is printed when the stopping rule
    triggers; running out of iterations is silent.

    Returns (B_norm, Theta_norm): both factors with every column rescaled to
    sum to 1.
    """
    V, D = P.shape
    W = P

    B = np.random.uniform(1e-5,1,size = (V, k))
    B = B / B.sum(axis = 0)

    Theta = np.random.uniform(1e-5,1,size = (k, D))
    Theta = Theta/Theta.sum(axis = 0)
    
    KL_div_prev = np.inf
    # Plain range, not numba.prange: every round depends on the factors
    # produced by the previous one and the loop exits early via break, so it
    # must run strictly in order.
    for i in range(maxit):
        Theta = (Theta/np.matmul(B.T,W))*np.matmul(B.T,(W*P)/np.matmul(B,Theta))
        B = (B/np.matmul(W, Theta.T))*np.matmul((W*P)/np.matmul(B, Theta), Theta.T)
        KL_div = KL(W,P,B,Theta)
        if abs(KL_div_prev - KL_div) < eps:
            print("converged after {}".format(str(i)))
            break
        KL_div_prev = KL_div
    
    B_norm = B/B.sum(axis = 0)
    Theta_norm = Theta/Theta.sum(axis = 0)
    return B_norm, Theta_norm

@numba.jit
def KL(W,P,B,Theta):
    """
    Weighted generalised KL divergence between P and its reconstruction.

    With P_hat = B Theta, returns
        sum( W * (P * log(P / P_hat) - P + P_hat) )
    taken over all entries. W and P must have the shape of P_hat.
    """
    reconstruction = np.matmul(B, Theta)
    per_entry = P*np.log(P/reconstruction) - P + reconstruction
    return np.sum(W*per_entry)



In [57]:
# Two independent fits (40 components, eps = 1e-5, at most 1000 rounds).
# WKLNMF draws a fresh random initialisation on every call, so the two
# results can differ.
B1, theta1 = WKLNMF(td_matrix, 40, 1e-5,1000)
B2, theta2 = WKLNMF(td_matrix, 40, 1e-5,1000)

converged after 198
converged after 199


Which functional should be reported?

The Herfindahl index of the topic composition of each meeting, i.e. how concentrated the topics are within each meeting.

In [74]:
def band(P, k, eps, maxit, M):
    """
    Refit WKLNMF M times and collect the Herfindahl index of every column.

    P, k, eps, maxit: passed unchanged to WKLNMF.
    M: number of independent fits.

    Returns an array of shape (M, P.shape[1]); entry [m, j] is the sum of the
    squared entries of column j of the normalised Theta from fit m. A progress
    line is printed for every fit.
    """
    n_columns = P.shape[1]
    record = np.zeros((M, n_columns))

    for trial in range(M):
        print('Trial number {}'.format(str(trial)))

        _, shares = WKLNMF(P, k, eps,maxit)
        record[trial] = np.square(shares).sum(axis = 0)

    return record
        

In [76]:
# 120 independent WKLNMF fits; row m holds the per-column Herfindahl index of fit m.
Herfindahl = band(td_matrix,40,1e-5,1000,120)

Trial number 0
converged after 187
Trial number 1
converged after 198
Trial number 2
converged after 193
Trial number 3
converged after 198
Trial number 4
converged after 191
Trial number 5
converged after 188
Trial number 6
converged after 191
Trial number 7
converged after 181
Trial number 8
converged after 183
Trial number 9
converged after 189
Trial number 10
converged after 197
Trial number 11
converged after 186
Trial number 12
converged after 195
Trial number 13
converged after 194
Trial number 14
converged after 183
Trial number 15
converged after 194
Trial number 16
converged after 192
Trial number 17
converged after 192
Trial number 18
converged after 193
Trial number 19
converged after 203
Trial number 20
converged after 194
Trial number 21
converged after 192
Trial number 22
converged after 207
Trial number 23
converged after 194
Trial number 24
converged after 187
Trial number 25
converged after 198
Trial number 26
converged after 197
Trial number 27
converged after 183
Tr

In [128]:
import matplotlib.pyplot as plt
%matplotlib notebook

# Envelope of the NMF results: largest and smallest Herfindahl index across
# the fits, for every column of the term-document matrix.
plt.plot(Herfindahl.max(axis = 0), c = 'grey', label = 'NMF max')
plt.plot(Herfindahl.min(axis = 0), label = 'NMF min')

# LDA counterpart: squared entries of dt_avg() summed along axis 1.
# NOTE(review): this assumes dt_avg() is laid out documents x topics, i.e. the
# transpose of the NMF Theta - confirm.
theta_LDA = ldaobj.dt_avg()
plt.plot((theta_LDA**2).sum(axis = 1),c = 'r', label = 'LDA')
plt.title('Herfindahl measure of topic concentration in each meeting')
plt.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x2b6bfc2d128>