In [1]:
import os

import pickle as pkl
import re

import scipy as sp
import numpy as np
import pandas as pd

import nimfa 

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.metrics import roc_auc_score, accuracy_score, make_scorer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import normalize
from sklearn.model_selection import GridSearchCV

from gensim.models.phrases import Phrases, Phraser

data_directory = '/'.join(os.getcwd().split("/")[:-2]) + '/data/'

import sys
sys.path.insert(3, '/'.join(os.getcwd().split("/")[:-2]) + '/textacy')

import textacy
from textacy import preprocess_text, Doc, Corpus
from textacy.vsm import Vectorizer, GroupVectorizer
from textacy.tm import TopicModel
en = textacy.load_spacy("en_core_web_sm", disable='parser')

test_set = [173,  74,  20, 101,  83,   1,  38,  39,  72,  50,  21, 164,  57,
       169, 8,  63, 102,  34,  80, 192, 139,  88, 112, 116,  61,  46,
        51, 165, 135,  89, 108,   7,  25,  15, 125,  93, 130,  71]

In [2]:
with open(data_directory+"tokenized_docs.p", "rb") as f:
        docs = pkl.load(f)
        
with open(data_directory+"cleaned_data.p", "rb") as f:
        orig_data = pkl.load(f)

In [3]:
bigram_phraser = Phraser(Phrases([doc[0] for doc in docs], min_count=10, threshold=20, delimiter=b' '))
bigram_docs = [bigram_phraser[doc[0]] for doc in docs] 

trigram_phraser = Phraser(Phrases(bigram_docs, min_count=5, threshold=10, delimiter=b' '))
trigram_docs = [trigram_phraser[doc] for doc in bigram_docs] 

analysts = [d[1]['AnalystName'] for d in docs]
tags = [d[1]['Tag'] for d in docs]
companies = [d[1]['Company'] for d in docs]
quarters = [d[1]['Quarter'] for d in docs]

In [149]:
A_RANK = 4

a_vec = GroupVectorizer(tf_type='bm25', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='linear').fit(trigram_docs, analysts)
a_doc_term_matrix = a_vec.transform(trigram_docs, analysts)

a_mod = nimfa.Lsnmf(V=a_doc_term_matrix, max_iter=1000, rank=A_RANK, n_run=10)
a_mod_fit = a_mod()

a_df = pd.SparseDataFrame(normalize(a_mod_fit.basis()), columns = ['aTopic'+str(i) for i in range(A_RANK)], index=a_vec.grps_list).fillna(0)
a_df.index.name = 'AnalystName'
#a_df.join(a_df.idxmax(axis=1).rename('aTopicMax')).reset_index().to_csv(data_directory+"analystTopic.csv", index=False)
a_df.reset_index().to_csv(data_directory+"analystTopic.csv", index=False)

In [119]:
T_RANK = 6

t_vec = GroupVectorizer(tf_type='bm25', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='linear').fit(trigram_docs, tags)
t_doc_term_matrix = t_vec.transform(trigram_docs, tags)

t_mod = nimfa.Lsnmf(V=t_doc_term_matrix, max_iter=1000, rank=T_RANK, n_run=10)
t_mod_fit = t_mod()

t_df = pd.SparseDataFrame(normalize(t_mod_fit.basis()), columns = ['tTopic'+str(i) for i in range(T_RANK)], index=t_vec.grps_list).fillna(0)
t_df.index.name = 'Tag'
#t_df.join(t_df.idxmax(axis=1).rename('tTopicMax')).reset_index().to_csv(data_directory+"tagTopic.csv", index=False)
t_df.reset_index().to_csv(data_directory+"tagTopic.csv", index=False)

In [120]:
C_RANK = 4

c_vec = GroupVectorizer(tf_type='bm25', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='linear').fit(trigram_docs, companies)
c_doc_term_matrix = c_vec.transform(trigram_docs, companies)

c_mod = nimfa.Lsnmf(V=c_doc_term_matrix, max_iter=1000, rank=C_RANK, n_run=10)
c_mod_fit = c_mod()

c_df = pd.SparseDataFrame(normalize(c_mod_fit.basis()), columns = ['cTopic'+str(i) for i in range(C_RANK)], index=c_vec.grps_list).fillna(0)
c_df.index.name = 'Company'
#c_df.join(c_df.idxmax(axis=1).rename('cTopicMax')).reset_index().to_csv(data_directory+"companyTopic.csv", index=False)
c_df.reset_index().to_csv(data_directory+"companyTopic.csv", index=False)

In [121]:
Q_RANK = 4

q_vec = GroupVectorizer(tf_type='bm25', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='linear').fit(trigram_docs, quarters)
q_doc_term_matrix = q_vec.transform(trigram_docs, quarters)

q_mod = nimfa.Lsnmf(V=q_doc_term_matrix, max_iter=1000, rank=Q_RANK, n_run=10)
q_mod_fit = q_mod()

q_df = pd.SparseDataFrame(normalize(q_mod_fit.basis()), columns = ['qTopic'+str(i) for i in range(Q_RANK)], index=q_vec.grps_list).fillna(0)
q_df.index.name = 'Quarter'
#c_df.join(c_df.idxmax(axis=1).rename('cTopicMax')).reset_index().to_csv(data_directory+"companyTopic.csv", index=False)
q_df.reset_index().to_csv(data_directory+"quarterTopic.csv", index=False)

In [4]:
cols_save = ['EventNumber', 'Company', 'AnalystName', 'Month', 'Year', 'Quarter', 'EventType', 'Date'] 

pivot_data = pd.pivot_table(orig_data, index=cols_save, columns=['Tag'], aggfunc='size', fill_value=0)
pivot_data.reset_index(inplace=True)
pivot_data.columns = pivot_data.columns.get_level_values(0)
pivot_sum_data = pivot_data.groupby(cols_save).sum(axis=1).reset_index()

melt_data = pd.melt(pivot_sum_data, id_vars=cols_save, var_name=['Tag'], value_name='NumQ')
melt_data['NumQ'] = melt_data['NumQ'].astype(bool).astype(int)
melt_data['Tag'] = melt_data['Tag'].str.split("_").str[-1]

analyst_data = pd.read_csv(data_directory+"analystTopic.csv")
tag_data = pd.read_csv(data_directory+"tagTopic.csv")
company_data = pd.read_csv(data_directory+"companyTopic.csv")
quarter_data = pd.read_csv(data_directory+"quarterTopic.csv")

a_topic_cols = analyst_data.drop(['AnalystName'], axis=1).columns.tolist()
t_topic_cols = tag_data.drop(['Tag'], axis=1).columns.tolist()
c_topic_cols = company_data.drop(['Company'], axis=1).columns.tolist()
q_topic_cols = quarter_data.drop(['Quarter'], axis=1).columns.tolist()

merge_data = melt_data.merge(analyst_data, on=['AnalystName'])
merge_data = merge_data.merge(tag_data, on=['Tag'])
merge_data = merge_data.merge(company_data, on=['Company'])
merge_data = merge_data.merge(quarter_data, on=['Quarter'])

merge_data = pd.concat([merge_data,
                      pd.get_dummies(merge_data[["EventType"]])], axis=1).reset_index(drop=True)

#features_data = merge_data.drop(['Company', 'AnalystName', 'QuestionNumber' ,'EventType', 'Date', 'Tag', 'Month', 'Year'], axis=1).drop_duplicates().copy()
features_data = merge_data.drop(['Company', 'AnalystName' ,'EventType', 'Date', 'Tag', 'Month', 'Year'], axis=1).copy()
features_data[a_topic_cols + t_topic_cols + c_topic_cols + q_topic_cols] = features_data[a_topic_cols + t_topic_cols + c_topic_cols + q_topic_cols]**2
train, test = features_data.loc[~features_data['EventNumber'].isin(test_set)].copy().reset_index(drop=True), \
                features_data.loc[features_data['EventNumber'].isin(test_set)].copy().reset_index(drop=True)

X_train, y_train = train.drop(['NumQ','EventNumber'], axis=1), train['NumQ']
X_test, y_test = test.drop(['NumQ', 'EventNumber'], axis=1), test['NumQ']

cols_list = train.drop(['NumQ','EventNumber'], axis=1).columns.values

In [5]:
estimator = GradientBoostingClassifier(warm_start=True, n_estimators=1000, n_iter_no_change=5).fit(X_train, y_train)
preds = estimator.predict_proba(X_test)[:,1]
print("ROC AUC:", roc_auc_score(y_test, preds))
print("Accuracy:", accuracy_score(y_test, np.round(preds)))

ROC AUC: 0.7751641003840418
Accuracy: 0.8576371725160653


In [6]:
estimator = RandomForestClassifier(warm_start=True, n_estimators=1000).fit(X_train, y_train)
preds = estimator.predict_proba(X_test)[:,1]
print("ROC AUC:", roc_auc_score(y_test, preds))
print("Accuracy:", accuracy_score(y_test, np.round(preds)))

importances = estimator.feature_importances_
std = np.std([tree.feature_importances_ for tree in estimator.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

for f in range(indices.shape[0]):
    
    print("{}. {}: {:.4} +/- {:.4}".format(f, cols_list[indices[f]], importances[indices[f]], std[indices[f]]))
    

ROC AUC: 0.7398531562754437
Accuracy: 0.8386060306475531
0. aTopic1: 0.1108 +/- 0.01477
1. aTopic0: 0.1103 +/- 0.01476
2. aTopic2: 0.1094 +/- 0.01486
3. aTopic3: 0.1078 +/- 0.01426
4. cTopic0: 0.04995 +/- 0.0114
5. cTopic3: 0.04688 +/- 0.01006
6. cTopic1: 0.0431 +/- 0.009839
7. tTopic4: 0.04289 +/- 0.01691
8. tTopic0: 0.03897 +/- 0.01648
9. tTopic1: 0.0387 +/- 0.01416
10. EventType_Conference: 0.03677 +/- 0.01477
11. tTopic5: 0.03363 +/- 0.01317
12. tTopic2: 0.03314 +/- 0.01547
13. cTopic2: 0.03279 +/- 0.007383
14. tTopic3: 0.0295 +/- 0.01254
15. qTopic3: 0.02187 +/- 0.006052
16. qTopic0: 0.02171 +/- 0.005962
17. Quarter: 0.02159 +/- 0.005959
18. qTopic2: 0.02158 +/- 0.006047
19. EventType_EarningsCall: 0.0195 +/- 0.009614
20. qTopic1: 0.01774 +/- 0.004891
21. EventType_FixedIncomeCall: 0.00589 +/- 0.00311
22. EventType_InvestorDay: 0.003888 +/- 0.002107
23. EventType_Other: 0.001598 +/- 0.001115


In [133]:
estimator = GradientBoostingClassifier(warm_start=True, 
                                       n_estimators=1000,
                                       learning_rate=1.0,
                                       max_depth=25,
                                       min_samples_leaf=25,
                                       n_iter_no_change=5)

param_grid = {'min_samples_split':[2,3,5,10,15,25]}


grid = GridSearchCV(estimator, 
                    param_grid,
                    make_scorer(roc_auc_score, greater_is_better=True),
                    cv = 5,
                    return_train_score=False)
grid.fit(X_train, y_train)

grid.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=25,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=25, min_samples_split=25,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              n_iter_no_change=5, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=True)

In [7]:
estimator = GradientBoostingClassifier(warm_start=True, 
                                       n_estimators=1000,
                                       learning_rate=0.01,
                                       max_depth=25,
                                       max_features='auto',
                                       min_samples_leaf=25,
                                       min_samples_split=10, 
                                       n_iter_no_change=5).fit(X_train, y_train)

preds = estimator.predict_proba(X_test)[:,1]
print("ROC AUC:", roc_auc_score(y_test, preds))
print("Accuracy:", accuracy_score(y_test, np.round(preds)))

ROC AUC: 0.7847418466186795
Accuracy: 0.8608502224419179


In [145]:
calibrated = CalibratedClassifierCV(estimator, method='sigmoid', cv=5).fit(X_train, y_train)

preds = calibrated.predict_proba(X_test)[:,1]
print("ROC AUC:", roc_auc_score(y_test, preds))
print("Accuracy:", accuracy_score(y_test, np.round(preds)))

ROC AUC: 0.7884823506524385
Accuracy: 0.856401384083045


In [146]:
X = features_data.drop(['NumQ', 'EventNumber'], axis=1).copy()
y = features_data['NumQ'].copy()

estimator = GradientBoostingClassifier(warm_start=True, n_estimators=1000).fit(X, y)

In [147]:
q_ind_data = pd.read_csv(data_directory + 'qaData.csv', parse_dates=['Date'])

q_ind_data['EarningTag2'] = q_ind_data['EarningTag2'].str.strip()
q_ind_data['Year'] = q_ind_data['Date'].dt.year
q_ind_data['Month'] = q_ind_data['Date'].dt.month
q_ind_data['Quarter'] = q_ind_data['Month'].apply(lambda x: 1 if x < 4 else 2 if x < 7 else 3 if x < 9 else 4)
q_ind_data['Company'] = q_ind_data['Company'].str.title().str.replace(" ", "")
q_ind_data['EventType'] = q_ind_data['EventType'].str.title().str.replace(" ", "")
q_ind_data['AnalystName'] = q_ind_data['AnalystName'].str.title().str.replace(" ", "")
q_ind_data['Tag'] = q_ind_data['EarningTag2'].str.title().str.replace(" ", "")

q_ind_data = q_ind_data.loc[~q_ind_data['AnalystName'].isna()].copy()

groups = []
for i, (name, group) in enumerate(q_ind_data.groupby(['Company', 'Month', 'Year', 'Quarter', 'EventType', 'Date'])):
    g2 = group.copy()
    g2['EventNumber'] = i
    g2.reset_index(drop=True, inplace=True)
    g2.index.name = "QuestionNumber"
    g2.reset_index(inplace=True)
    groups.append(g2)

q_ind_data = pd.concat(groups)[['EventNumber', 'QuestionNumber', 'Company', 'Month', 'Year', 'Quarter', 'EventType', 'Date', 'AnalystName', "Tag", "Question"]]

In [150]:
q_vec = Vectorizer(tf_type='bm25', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='linear').fit(trigram_docs)
q_doc_term_matrix = q_vec.transform(trigram_docs)

t_affinity_mat = 1- sp.spatial.distance.cdist(q_doc_term_matrix.toarray(), t_doc_term_matrix.toarray(), 'cosine')
t_affinity = pd.SparseDataFrame(t_affinity_mat, columns=t_vec.grps_list)

a_affinity_mat = 1- sp.spatial.distance.cdist(q_doc_term_matrix.toarray(), a_doc_term_matrix.toarray(), 'cosine')
a_affinity = pd.SparseDataFrame(a_affinity_mat, columns=a_vec.grps_list)


In [151]:
event = features_data.loc[features_data['EventNumber']==155].reset_index(drop=True).copy()
X_event = event.drop(['NumQ', 'EventNumber'], axis=1).copy()
y_event = event['NumQ'].copy()
predictions = pd.Series(estimator.predict_proba(X_event)[:,1]).rename('Prediction')

event = pd.concat([event, predictions], axis=1)
event = pd.merge(event, analyst_data)
event['tTopicMax'] = event[['tTopicMax_tTopic'+str(i) for i in range(6)]].idxmax(1)
event['tTopicMax'] = event['tTopicMax'].str.split("_").str[-1]
event = pd.merge(event, tag_data[['tTopicMax', 'Tag']]).drop_duplicates()

top3 = event.groupby(['AnalystName']).apply(lambda x: x[['Tag', 'Prediction']].nlargest(2, 'Prediction')).reset_index()[['AnalystName', 'Tag', 'Prediction']]

KeyError: "['tTopicMax_tTopic0' 'tTopicMax_tTopic1' 'tTopicMax_tTopic2'\n 'tTopicMax_tTopic3' 'tTopicMax_tTopic4' 'tTopicMax_tTopic5'] not in index"

In [None]:
pred_q = {}

for ind in top3[['AnalystName', 'Tag']].values:
    a, t = ind
    if a not in pred_q:
        pred_q[a] = {}
    
    pred_q[a][t] = []
    affinities = (a_affinity[a] + t_affinity[t])/2
    q_ind = affinities.nlargest(1).reset_index()['index'].values

    for val in q_ind:
        #' '.join(docs[val][0])
        question = q_ind_data.loc[(q_ind_data['EventNumber']==docs[val][1]['EventNumber']) & 
                       (q_ind_data['QuestionNumber']==docs[val][1]['QuestionNumber']),'Question'].item()
        pred_q[a][t].append((question, affinities[val]))

for a, t_dict in pred_q.items():
    print("Analyst: {}".format(a))
    for t, values in pred_q[a].items():
        prob = top3.loc[(top3['AnalystName']==a) & (top3['Tag']==t),'Prediction'].item()
        print("\tTag: {} ({:.4})".format(t, prob))
        for v0, v1 in values:
            print("\t\t({:.4}) - {}".format(v1, v0))
    print("*********************************************************************")