In [45]:
from itertools import cycle

import numpy as np
import pandas as pd
from scipy import interp

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.decomposition import NMF
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge, LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, auc, make_scorer, accuracy_score, mean_squared_error

import matplotlib.pyplot as plt
plt.style.use('dark_background')

%matplotlib inline

In [26]:
data = pd.read_csv('data/qaData.csv', parse_dates=['Date'])
data['EarningTag2'] = data['EarningTag2'].str.strip()

#Add Lagged Column
data['Lag1'] = data.groupby(["Company", "Participants", "Date", "EventName", "EventType"])['EarningTag2'].shift(1)

#Add Year and Month from Data
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month

#Drop non-earning calls
nn_data = data.loc[data['EventType']=="Earnings call", ['Company', 'Participants', 'Month', 'Year', 'AnalystName',	'AnalystCompany', 'EventName', 'Lag1', 'EarningTag2']].copy()
#Add quarter
nn_data['Quarter'] = nn_data['EventName'].str.split("Q").str[0]
#Drop bad features
nn_data = nn_data[['Company', "Participants", "AnalystName", "AnalystCompany", "Month", "Year", "Quarter", "Lag1", "EarningTag2"]].copy()

nn_data['NewIndex'] =   nn_data['AnalystName'].str.replace(" ", "") +  "_Y" + \
                        nn_data['Year'].astype(str) + "_M" + nn_data['Month'].astype(str) + "_Q" + \
                        nn_data['Quarter'].astype(str)
pct_data = (nn_data.groupby(['NewIndex', "EarningTag2"]).size().reset_index()).pivot(index='NewIndex', columns='EarningTag2', values=0).fillna(0)

#pct_div_data = pct_data.div(pct_data.sum(axis=1), axis=0)
pct_div_data = pct_data
pct_div_data = pd.concat([pct_div_data.reset_index(drop=True), pct_div_data.reset_index()['NewIndex'].str.split("_", expand=True)[[0, 1, 2, 3]]], axis=1, ignore_index=True)
pct_div_data.columns = pct_data.columns.tolist() + ['AnalystName', 'Year', 'Month', 'Quarter']
pct_div_data = pct_div_data[['AnalystName', 'Year', 'Month', 'Quarter'] + pct_data.columns.tolist()]

pct_melt_data = pd.melt(pct_div_data, id_vars=['AnalystName', 'Year', 'Month', 'Quarter'], var_name='Tag', value_name='NumQ')
pct_melt_data = pd.concat([pct_melt_data, 
                             pd.get_dummies(pct_melt_data['AnalystName'], prefix='A', prefix_sep=""),
                             pd.get_dummies(pct_melt_data['Month']),
                             pd.get_dummies(pct_melt_data['Quarter']),
                             pd.get_dummies(pct_melt_data['Year']),
                             pd.get_dummies(pct_melt_data['Tag'], prefix='T', prefix_sep="")], axis=1)
pct_melt_data = pct_melt_data.drop(['AnalystName', 'Year', 'Month', 'Quarter', 'Tag'], axis=1)
pct_melt_data = pct_melt_data.reset_index(drop=True)
pct_melt_data['NumQ'] = pct_melt_data['NumQ'].astype(bool).astype(int)

train, test = pct_melt_data.loc[pct_melt_data['Y2018']!=1].copy().reset_index(drop=True), \
                pct_melt_data.loc[pct_melt_data['Y2018']==1].copy().reset_index(drop=True)

X_train, y_train = train.drop(['NumQ'], axis=1), train['NumQ'].values
X_test, y_test = test.drop(['NumQ'], axis=1), test['NumQ'].values

In [66]:
scores = np.zeros(50)
scores_gbc = np.zeros(50)
scores_rf = np.zeros(50)

for comp in range(1, 50):
    model = NMF(n_components=comp)
    X_train_W = model.fit_transform(X_train)
    X_test_W = model.transform(X_test)
    
    estimator = LogisticRegression().fit(X_train_W, y_train)
    preds = estimator.predict_proba(X_test_W)[:,1]
    scores[comp] = roc_auc_score(y_test, preds)
    
    estimator_gbc = GradientBoostingClassifier(warm_start=True).fit(X_train_W, y_train)
    preds_gbc = estimator_gbc.predict_proba(X_test_W)[:,1]
    scores_gbc[comp] = roc_auc_score(y_test, preds_gbc)
    
    estimator_rf = RandomForestClassifier(warm_start=True).fit(X_train_W, y_train)
    preds_rf = estimator_rf.predict_proba(X_test_W)[:,1]
    scores_rf[comp] = roc_auc_score(y_test, preds_rf)


In [67]:
print(scores[1:].max())
print(scores_gbc[1:].max())
print(scores_rf[1:].max())

0.7611043685046985
0.7789302809280225
0.7566335118232973


In [75]:
scores_rf[1:].argmax()

42

In [76]:
model = NMF(n_components=42).fit(X_train)
X_train_W = model.transform(X_train)

param_grid = {'min_samples_split': np.arange(2, 20,2, dtype=int),
              'max_depth': np.arange(1, 10, 1, dtype=int),
              'min_samples_leaf': np.arange(1, 20, 1, dtype=int)}

grid = GridSearchCV(RandomForestClassifier(criterion='entropy'), cv=5, param_grid=param_grid, return_train_score=False, scoring=make_scorer(roc_auc_score))
grid.fit(X_train_W, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=make_scorer(roc_auc_score), verbose=0)

In [79]:
grid.best_params_

{'max_depth': 1, 'min_samples_leaf': 11, 'min_samples_split': 18}

In [82]:
model = NMF(n_components=42).fit(X_train)
X_train_W = model.transform(X_train)
X_test_W = model.transform(X_test)

estimator = RandomForestClassifier(criterion='entropy', max_depth=2, min_samples_leaf=11, min_samples_split=18).fit(X_train_W, y_train)
roc_auc_score(y_test, estimator.predict_proba(X_test_W)[:,1])

0.683920174843479

In [77]:
print("Great Success!")

Great Success!
