In [4]:
import os
import sys

from itertools import cycle

import numpy as np
import pandas as pd
from scipy import interp

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import NMF
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, make_scorer, accuracy_score

import matplotlib.pyplot as plt
plt.style.use('dark_background')

%matplotlib inline

In [5]:
#Set for your computer
data_directory = '/'.join(os.getcwd().split("/")[:-1]) + '/data/'

In [8]:
test_set = [173,  74,  20, 101,  83,   1,  38,  39,  72,  50,  21, 164,  57,
       169,   8,  63, 102,  34,  80, 192, 139,  88, 112, 116,  61,  46,
        51, 165, 135,  89, 108,   7,  25,  15, 125,  93, 130,  71]

In [6]:
#Read in data
orig_data = pd.read_csv(data_directory + 'qaData.csv', parse_dates=['Date'])
orig_data['EarningTag2'] = orig_data['EarningTag2'].str.strip()

#Add Year and Month, Quarter from Data
orig_data['Year'] = orig_data['Date'].dt.year
orig_data['Month'] = orig_data['Date'].dt.month
orig_data['Quarter'] = orig_data['Month'].apply(lambda x: 1 if x < 4 else 2 if x < 7 else 3 if x < 9 else 4)
orig_data['Company'] = orig_data['Company'].str.title().str.replace(" ", "")
orig_data['EventType'] = orig_data['EventType'].str.title().str.replace(" ", "")
orig_data['Participants'] = orig_data['Participants'].str.title().str.replace(" ", "")
orig_data['AnalystName'] = orig_data['AnalystName'].str.title().str.replace(" ", "")
orig_data['AnalystCompany'] = orig_data['AnalystCompany'].str.title().str.replace(" ", "")
orig_data['EarningTag2'] = orig_data['EarningTag2'].str.title().str.replace(" ", "")

#Pivot tag
pivot_data = (pd.pivot_table(orig_data, index=['Company', 'Participants', 'AnalystName', 'AnalystCompany', 'Month', 'Year', 'Quarter', 'EventType'], columns='EarningTag2', aggfunc='size', fill_value=0)).reset_index()

#Melt data
pivot_melt_data = pd.melt(pivot_data, id_vars=['Company', 'Participants', 'AnalystName', 'AnalystCompany', 'Month', 'Year', 'Quarter', 'EventType'], var_name='Tag', value_name='NumQ')
#One-hot encode
pivot_melt_data = pd.concat([pivot_melt_data, 
                             pd.get_dummies(pivot_melt_data['Company'], prefix='C', prefix_sep="_"),
                             pd.get_dummies(pivot_melt_data['Participants'], prefix='P', prefix_sep="_"),
                             pd.get_dummies(pivot_melt_data['AnalystName'], prefix='A', prefix_sep="_"),
                             pd.get_dummies(pivot_melt_data['AnalystCompany'], prefix='AC', prefix_sep="_"),
                             pd.get_dummies(pivot_melt_data['EventType'], prefix='ET', prefix_sep="_"),
                             pd.get_dummies(pivot_melt_data['Tag'], prefix='T', prefix_sep="_")], axis=1)
pivot_melt_data = pivot_melt_data.reset_index(drop=True)

#Analysts Present Data
event_analyst_data = orig_data[['Company', 'Participants', 'AnalystName', 'AnalystCompany', 'Month', 'Year', 'Quarter', 'EventType']].drop_duplicates().reset_index(drop=True)
event_analyst_data = pd.concat([event_analyst_data, 
                                pd.get_dummies(event_analyst_data['AnalystName'], prefix='AP', prefix_sep="_"),
                                pd.get_dummies(event_analyst_data['AnalystCompany'], prefix='ACP', prefix_sep="_")], axis=1).drop(['AnalystName', 'AnalystCompany'], axis=1)
event_analyst_data = event_analyst_data.groupby(['Company', 'Participants', 'Year', 'Month', 'Quarter', 'EventType']).sum().reset_index()

all_features_data = pd.merge(pivot_melt_data, event_analyst_data, on=['Company', 'Participants', 'Month', 'Year', 'Quarter', 'EventType'])

#Index Data
groups = []
for i, (name, group) in enumerate(all_features_data.groupby(['Company', 'Participants', 'Month', 'Year', 'Quarter', 'EventType'])):
    g2 = group.copy()
    g2['EventNumber'] = i
    groups.append(g2)
    
indexed_data = pd.concat(groups)

#Merge
indexed_data = indexed_data.drop(['Company', 'AnalystName', 'AnalystCompany', 'Participants', 'Tag', 'EventType'], axis=1)
indexed_data = indexed_data.reset_index(drop=True)
indexed_data['NumQ'] = indexed_data['NumQ'].astype(bool).astype(int)

In [5]:
cntModel_data = pd.read_csv("data/tagCntModel.csv")
merged_data = pd.merge(indexed_data, cntModel_data, on=['EventNumber']+pd.get_dummies(pivot_melt_data['Tag'], prefix='T', prefix_sep="_").columns.tolist())

train, test = merged_data.loc[~merged_data['EventNumber'].isin(test_set)].copy().reset_index(drop=True), \
                merged_data.loc[merged_data['EventNumber'].isin(test_set)].copy().reset_index(drop=True)

X_train, y_train = train.drop(['NumQ','EventNumber'], axis=1), train['NumQ'].values
X_test, y_test = test.drop(['NumQ', 'EventNumber'], axis=1), test['NumQ'].values

scores = np.zeros(50)
scores_gbc = np.zeros(50)
scores_rf = np.zeros(50)
scores_mlp = np.zeros(50)

estimator = LogisticRegression().fit(X_train, y_train)
preds = estimator.predict_proba(X_test)[:,1]
scores[0] = roc_auc_score(y_test, preds)
    
estimator_gbc = GradientBoostingClassifier(warm_start=True).fit(X_train, y_train)
preds_gbc = estimator_gbc.predict_proba(X_test)[:,1]
scores_gbc[0] = roc_auc_score(y_test, preds_gbc)
    
estimator_rf = RandomForestClassifier(warm_start=True).fit(X_train, y_train)
preds_rf = estimator_rf.predict_proba(X_test)[:,1]
scores_rf[0] = roc_auc_score(y_test, preds_rf)

estimator_mlp = MLPClassifier(warm_start=True).fit(X_train, y_train)
preds_mlp = estimator_mlp.predict_proba(X_test)[:,1]
scores_mlp[0] = roc_auc_score(y_test, preds_mlp)

for comp in range(1, 50):
    model = NMF(n_components=comp)
    X_train_W = model.fit_transform(X_train)
    X_test_W = model.transform(X_test)
    
    estimator = LogisticRegression().fit(X_train_W, y_train)
    preds = estimator.predict_proba(X_test_W)[:,1]
    scores[comp] = roc_auc_score(y_test, preds)
    
    estimator_gbc = GradientBoostingClassifier(warm_start=True).fit(X_train_W, y_train)
    preds_gbc = estimator_gbc.predict_proba(X_test_W)[:,1]
    scores_gbc[comp] = roc_auc_score(y_test, preds_gbc)
    
    estimator_rf = RandomForestClassifier(warm_start=True).fit(X_train_W, y_train)
    preds_rf = estimator_rf.predict_proba(X_test_W)[:,1]
    scores_rf[comp] = roc_auc_score(y_test, preds_rf)
    
    estimator_mlp = MLPClassifier(warm_start=True).fit(X_train_W, y_train)
    preds_mlp = estimator_mlp.predict_proba(X_test_W)[:,1]
    scores_mlp[comp] = roc_auc_score(y_test, preds_mlp)

print(scores.max(), scores.argmax())
print(scores_gbc.max(), scores_gbc.argmax())
print(scores_rf.max(), scores_rf.argmax())
print(scores_mlp.max(), scores_mlp.argmax())

0.7204057794973152 0
0.775150047830823 0
0.7154414962877131 0
0.7204980209543571 0


In [10]:
indexed_data = pd.concat(groups)

#Merge
indexed_data = indexed_data.drop(['Company', 'AnalystName', 'AnalystCompany', 'Participants', 'Tag', 'EventType'], axis=1)
indexed_data = indexed_data.reset_index(drop=True)
indexed_data['NumQ'] = indexed_data['NumQ'].astype(bool).astype(int)

train, test = indexed_data.loc[~indexed_data['EventNumber'].isin(test_set)].copy().reset_index(drop=True), \
                indexed_data.loc[indexed_data['EventNumber'].isin(test_set)].copy().reset_index(drop=True)

X_train, y_train = train.drop(['NumQ','EventNumber'], axis=1), train['NumQ'].values
X_test, y_test = test.drop(['NumQ', 'EventNumber'], axis=1), test['NumQ'].values

scores = np.zeros(50)
acc = np.zeros(50)
scores_gbc = np.zeros(50)
acc_gbc = np.zeros(50)
scores_rf = np.zeros(50)
acc_rf = np.zeros(50)
scores_mlp = np.zeros(50)
acc_mlp = np.zeros(50)

estimator = LogisticRegression().fit(X_train, y_train)
preds = estimator.predict_proba(X_test)[:,1]
scores[0] = roc_auc_score(y_test, preds)
acc[0] = accuracy_score(y_test, preds.round())
    
estimator_gbc = GradientBoostingClassifier(warm_start=True).fit(X_train, y_train)
preds_gbc = estimator_gbc.predict_proba(X_test)[:,1]
scores_gbc[0] = roc_auc_score(y_test, preds_gbc)
acc_gbc[0] = accuracy_score(y_test, preds_gbc.round())
    
estimator_rf = RandomForestClassifier(warm_start=True).fit(X_train, y_train)
preds_rf = estimator_rf.predict_proba(X_test)[:,1]
scores_rf[0] = roc_auc_score(y_test, preds_rf)
acc_rf[0] = accuracy_score(y_test, preds_rf.round())

estimator_mlp = MLPClassifier(warm_start=True).fit(X_train, y_train)
preds_mlp = estimator_mlp.predict_proba(X_test)[:,1]
scores_mlp[0] = roc_auc_score(y_test, preds_mlp)
acc_mlp[0] = accuracy_score(y_test, preds_mlp.round())

for comp in range(1, 50):
    model = NMF(n_components=comp)
    X_train_W = model.fit_transform(X_train)
    X_test_W = model.transform(X_test)
    
    estimator = LogisticRegression().fit(X_train_W, y_train)
    preds = estimator.predict_proba(X_test_W)[:,1]
    scores[comp] = roc_auc_score(y_test, preds)
    acc[comp] = accuracy_score(y_test, preds.round())
    
    estimator_gbc = GradientBoostingClassifier(warm_start=True).fit(X_train_W, y_train)
    preds_gbc = estimator_gbc.predict_proba(X_test_W)[:,1]
    scores_gbc[comp] = roc_auc_score(y_test, preds_gbc)
    acc_gbc[comp] = accuracy_score(y_test, preds_gbc.round())
    
    estimator_rf = RandomForestClassifier(warm_start=True).fit(X_train_W, y_train)
    preds_rf = estimator_rf.predict_proba(X_test_W)[:,1]
    scores_rf[comp] = roc_auc_score(y_test, preds_rf)
    acc_rf[comp] = accuracy_score(y_test, preds_rf.round())
    
    estimator_mlp = MLPClassifier(warm_start=True).fit(X_train_W, y_train)
    preds_mlp = estimator_mlp.predict_proba(X_test_W)[:,1]
    scores_mlp[comp] = roc_auc_score(y_test, preds_mlp)
    acc_mlp[comp] = accuracy_score(y_test, preds_mlp.round())


print('logit ROC:', scores.max(), scores.argmax())
print('logit ACC', acc.max(), acc.argmax())
print('GBC ROC', scores_gbc.max(), scores_gbc.argmax())
print('GBC ACC', acc_gbc.max(), acc_gbc.argmax())
print('RF ROC', scores_rf.max(), scores_rf.argmax())
print('RF ACC', acc_rf.max(), acc_rf.argmax())
print('MLP ROC', scores_mlp.max(), scores_mlp.argmax())
print('MLP ACC', acc_mlp.max(), acc_mlp.argmax())

logit ROC: 0.7221445482362363 0
logit ACC 0.8636179684338324 0
GBC ROC 0.7743632316569917 0
GBC ACC 0.8668555240793201 0
RF ROC 0.7293723746163947 0
RF ACC 0.8634156212059895 8
MLP ROC 0.7187374873772576 0
MLP ACC 0.8638203156616754 0


In [13]:
model = NMF(n_components=49).fit(X_train)
X_train_W = model.transform(X_train)

param_grid = {'learning_rate': 10.0**np.arange(-3,0,1),
              'min_samples_split': np.arange(2, 10, 2, dtype=int),
              'max_features': ['auto', 'sqrt', 'log2'],
              'max_depth': np.arange(1, 5, 1, dtype=int),
              'min_samples_leaf': np.arange(1, 10, 1, dtype=int)}

grid = GridSearchCV(GradientBoostingClassifier(warm_start=True), cv=5, param_grid=param_grid, return_train_score=False, scoring=make_scorer(roc_auc_score))
grid.fit(X_train_W, y_train)

KeyboardInterrupt: 

In [None]:
grid.best_params_

In [82]:
model = NMF(n_components=49).fit(X_train)
X_train_W = model.transform(X_train)
X_test_W = model.transform(X_test)

estimator = GradientBoostingClassifier().fit(X_train_W, y_train)
roc_auc_score(y_test, estimator.predict_proba(X_test_W)[:,1])

0.683920174843479