In [1]:
import os

import pickle as pkl
import re

import numpy as np
import pandas as pd

from gensim.models.phrases import Phrases, Phraser

import textacy
from textacy import preprocess_text, Doc, Corpus
from textacy.vsm import Vectorizer, GroupVectorizer
from textacy.tm import TopicModel
en = textacy.load_spacy("en_core_web_sm", disable='parser')

data_directory = '/'.join(os.getcwd().split("/")[:-1]) + '/data/'

test_set = [173,  74,  20, 101,  83,   1,  38,  39,  72,  50,  21, 164,  57,
       169,   8,  63, 102,  34,  80, 192, 139,  88, 112, 116,  61,  46,
        51, 165, 135,  89, 108,   7,  25,  15, 125,  93, 130,  71]

In [2]:

orig_data = pd.read_csv(data_directory + 'qaData.csv', parse_dates=['Date'])
orig_data['Year'] = orig_data['Date'].dt.year
orig_data['Month'] = orig_data['Date'].dt.month
orig_data['Quarter'] = orig_data['Month'].apply(lambda x: 1 if x < 4 else 2 if x < 7 else 3 if x < 9 else 4)
orig_data['Company'] = orig_data['Company'].str.title().str.replace(" ", "")
orig_data['EventType'] = orig_data['EventType'].str.title().str.replace(" ", "")
orig_data['Participants'] = orig_data['Participants'].str.title().str.replace(" ", "")
orig_data['AnalystName'] = orig_data['AnalystName'].str.title().str.replace(" ", "")
orig_data['AnalystCompany'] = orig_data['AnalystCompany'].str.title().str.replace(" ", "")
orig_data['Tag'] = orig_data['EarningTag2'].str.title().str.replace(" ", "")

orig_data = orig_data.loc[~orig_data['AnalystName'].isna()].copy()

groups = []
for i, (name, group) in enumerate(orig_data.groupby(['Company', 'Participants', 'Month', 'Year', 'Quarter', 'EventType', 'Date'])):
    g2 = group.copy()
    g2['EventNumber'] = i
    groups.append(g2)
    
indexed_data = pd.concat(groups)
#train_data = indexed_data.loc[~indexed_data['EventNumber'].isin(test_set)].copy().reset_index(drop=True)

q_data = indexed_data[['Date', 'EventNumber', 'Year', 'Quarter', 'Company', 'AnalystName', 'EventType', 'Tag', 'Question']].copy()


The history saving thread hit an unexpected error (OperationalError('disk I/O error',)).History will not be written to the database.


In [3]:
docs = Corpus(lang=en, docs=q_data.apply(lambda x: Doc(content=' '.join(
                                                        [token for token in preprocess_text(text=x['Question'], lowercase=True, no_punct=True, no_contractions=True, no_accents=True, no_currency_symbols=True, no_numbers=True).split(' ') if len(token)>2]),
                                                    lang=en, metadata={'Year':x['Year'],
                                                                       'Quarter':x['Quarter'],
                                                                       'Company':x['Company'],
                                                                       'Analyst':x["AnalystName"],
                                                                       'Tag':x['Tag'],
                                                                       'EventType':x['EventType'],
                                                                       'EventNumber':x['EventNumber']}),axis=1).tolist())
tokenized_docs = [list(doc.to_terms_list(ngrams=(1), as_strings=True, normalize='lemma', drop_determiners=True)) for doc in docs]

bigram_phraser = Phraser(Phrases(tokenized_docs, min_count=10, threshold=20, delimiter=b' '))
bigram_docs = [bigram_phraser[doc] for doc in tokenized_docs] 

trigram_phraser = Phraser(Phrases(bigram_docs, min_count=5, threshold=10, delimiter=b' '))
trigram_docs = [trigram_phraser[doc] for doc in bigram_docs]

q_list = [{'Year':docs[i].metadata['Year'], 
            'Quarter':"Q{}".format(docs[i].metadata['Quarter']), 
            'Company':docs[i].metadata['Company'], 
            'Analyst':docs[i].metadata['Analyst'], 
            'Tag':docs[i].metadata['Tag'],
            'EventType':docs[i].metadata['EventType'], 
            'EventNumber':docs[i].metadata['EventNumber'],
            'Question':trigram_docs[i]} for i in range(len(trigram_docs))]

#meta_docs = [doc + ["{}".format(q_list[i]['Year']), q_list[i]['Quarter'], q_list[i]['Company']] for i, doc in enumerate(trigram_docs)]


In [33]:
a_topic_list = []

analysts = q_data['AnalystName'].unique().tolist()

NUM_TOPICS = 5
topic_cols = ["t{}".format(i) for i in range(NUM_TOPICS)]

for i in q_data['EventNumber'].unique():
    if i == 0:
        continue
    
    a_dict = {a:[] for a in analysts}

    for doc in q_list:
        if doc['EventNumber'] < i:
            a_dict[doc['Analyst']] += doc['Question'] 
        
    a_docs = [v for k,v in a_dict.items()]
    a_list = [{'Analyst':k, 'Words':v} for k,v in a_dict.items()]

    vec = Vectorizer(tf_type='bm25', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='linear').fit(a_docs)
    doc_term_matrix = vec.transform(a_docs)

    model = TopicModel('nmf', n_topics=NUM_TOPICS)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)

    a_topic_dict = {}

    for doc, topic in model.top_doc_topics(doc_topic_matrix, docs=-1, top_n=-1, weights=True):
        a_topic_dict[a_list[doc]['Analyst']] = {"t{}".format(k):v for k,v in topic}

    a_t = pd.DataFrame.from_dict(a_topic_dict, orient='index').reset_index().fillna(0)
    a_t = a_t[['index'] + topic_cols]
    a_t.columns = ['AnalystName'] + topic_cols
    a_t['EventNumber'] = i
    a_topic_list.append(a_t)

a_doc_term_matrix = doc_topic_matrix.copy()

a_topic_df = pd.concat(a_topic_list)
a_topic_df['tMax'] = a_topic_df[topic_cols].idxmax(axis=1)

In [14]:
a_topic_df.to_csv(data_directory+"analystTopic.csv", index=False)

In [34]:
t_topic_list = []

tags = q_data['Tag'].unique().tolist()

NUM_TOPICS = 5
topic_cols = ["t{}".format(i) for i in range(NUM_TOPICS)]

for i in q_data['EventNumber'].unique():
    if i == 0:
        continue
    
    t_dict = {t:[] for t in tags}

    for doc in q_list:
        if doc['EventNumber'] < i:
            t_dict[doc['Tag']] += doc['Question'] 
        
    t_docs = [v for k,v in t_dict.items()]
    t_list = [{'Tag':k, 'Words':v} for k,v in t_dict.items()]

    vec = Vectorizer(tf_type='bm25', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='linear').fit(t_docs)
    doc_term_matrix = vec.transform(t_docs)

    model = TopicModel('nmf', n_topics=NUM_TOPICS)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)

    t_topic_dict = {}

    for doc, topic in model.top_doc_topics(doc_topic_matrix, docs=-1, top_n=-1, weights=True):
        t_topic_dict[t_list[doc]['Tag']] = {"t{}".format(k):v for k,v in topic}

    t_t = pd.DataFrame.from_dict(t_topic_dict, orient='index').reset_index().fillna(0)
    t_t = t_t[['index'] + topic_cols]
    t_t.columns = ['Tag'] + topic_cols
    t_t['EventNumber'] = i
    t_topic_list.append(t_t)
    
t_doc_term_matrix = doc_topic_matrix.copy()

t_topic_df = pd.concat(t_topic_list)
t_topic_df['tMax'] = t_topic_df[topic_cols].idxmax(axis=1)

In [35]:
a_doc_term_matrix.T(t_doc_term_matrix)

11

In [38]:
a_doc_term_matrix.dot(t_doc_term_matrix.T)

array([[ 6.5740467 ,  6.39871547, 10.86417935, ..., 17.25449029,
         6.51700495,  3.96572495],
       [ 2.76958947,  0.        ,  0.        , ...,  4.29012093,
         0.        ,  2.14244999],
       [ 4.65533809,  3.84143165,  4.30523789, ..., 10.02205078,
         2.69667457,  3.19657974],
       ...,
       [ 0.44639927,  0.        ,  0.92406501, ...,  0.9784097 ,
         0.52581223,  0.27669521],
       [ 2.75606472,  0.84471951,  5.6238114 , ...,  7.47807263,
         3.13157921,  1.51703465],
       [ 0.2960852 ,  0.12903673,  0.47813936, ...,  0.69597067,
         0.29836503,  0.19353779]])

In [30]:
for topic_idx, top_terms in model.top_topic_terms(vec.id_to_term, topics=-1):
    print('topic', topic_idx, ':', '   '.join(top_terms))
    
    for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, topics=topic_idx, top_n=4, weights=True):
        for doc, weight in top_docs:
            print("\t-{}: {:.4}".format(t_list[doc]['Tag'], weight))

topic 0 : weak   fourth quarter   see   think   number   rate   market   fee   maybe   quarter
	-Cib: 7.709
	-Revenue: 5.738
	-Awm: 5.717
	-Expenses: 5.7
topic 1 : scb   basel   propose   dividend payout ratio   stress test   minimis   capital ratio   stress capital buffer   gsib buffer   tier number
	-Capital: 11.43
	-RegulatoryTopics: 10.27
	-BalanceSheet: 1.637
	-AccountingAndTaxes: 0.9872
topic 2 : reserve release   redetermination   share national   credit exam   oil price   non accrual   non investment grade   unfunded   charge off   oil gas
	-CreditCosts: 13.08
	-MacroeconomicUpdate: 1.886
	-BalanceSheet: 1.021
	-Cb: 1.019
topic 3 : promotional   new account   mortgage banking   promotional balance   zelle   origination   promotion   airline   credit card   retail partner
	-Ccb: 14.49
	-BalanceSheet: 2.563
	-Awm: 0.7186
	-Cb: 0.4639
topic 4 : blockchain   buy stock   stakeholder   investment spending   selling   proxy   tangible book   digitalization   stake   culture
	-OtherTop