In [299]:
from itertools import cycle

import numpy as np
import pandas as pd
from scipy import interp

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.decomposition import NMF
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.metrics import roc_auc_score, roc_curve, auc, make_scorer, accuracy_score, mean_squared_error

import matplotlib.pyplot as plt
plt.style.use('dark_background')

%matplotlib inline

# Bernoulli NB

In [5]:
data = pd.read_csv('data/qaData.csv', parse_dates=['Date'])
data['EarningTag2'] = data['EarningTag2'].str.strip()

#Add Lagged Column
data['Lag1'] = data.groupby(["Company", "Participants", "Date", "EventName", "EventType"])['EarningTag2'].shift(1)

#Add Year and Month from Data
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month

#Drop non-earning calls
nn_data = data.loc[data['EventType']=="Earnings call", ['Company', 'Participants', 'Month', 'Year', 'AnalystName',	'AnalystCompany', 'EventName', 'Lag1', 'EarningTag2']].copy()
#Add quarter
nn_data['Quarter'] = nn_data['EventName'].str.split("Q").str[0]
#Drop bad features
nn_data = nn_data[['Company', "Participants", "AnalystName", "AnalystCompany", "Month", "Year", "Quarter", "Lag1", "EarningTag2"]].copy()

#One-hot-encode categorical columns
nn_data_encoded = pd.concat([nn_data, 
                             pd.get_dummies(nn_data['Company'], prefix='C', prefix_sep="_"),
                             pd.get_dummies(nn_data['Participants'], prefix='P', prefix_sep="_"),
                             pd.get_dummies(nn_data['AnalystName'], prefix='AN', prefix_sep="_"),
                             pd.get_dummies(nn_data['AnalystCompany'], prefix='AC', prefix_sep="_"),
                             pd.get_dummies(nn_data['Month'], prefix='M', prefix_sep="_"),
                             pd.get_dummies(nn_data['Quarter'], prefix='Q', prefix_sep="_")], axis=1)

new_cols = pd.get_dummies(nn_data['Company'], prefix='C', prefix_sep="_").columns.tolist() + \
             pd.get_dummies(nn_data['Participants'], prefix='P', prefix_sep="_").columns.tolist() + \
             pd.get_dummies(nn_data['AnalystName'], prefix='AN', prefix_sep="_").columns.tolist() + \
             pd.get_dummies(nn_data['AnalystCompany'], prefix='AC', prefix_sep="_").columns.tolist() + \
             pd.get_dummies(nn_data['Month'], prefix='M', prefix_sep="_").columns.tolist() + \
             pd.get_dummies(nn_data['Quarter'], prefix='Q', prefix_sep="_").columns.tolist()

nn_data_encoded = nn_data_encoded[["Year", "Lag1", "EarningTag2"] + new_cols].copy()
new_cols = [col.replace(" ", "") for col in new_cols]
nn_data_encoded.columns = ["Year", "Lag1", "EarningTag2"] + new_cols

In [6]:
nn_data_encoded.dropna(inplace=True)
nn_data_encoded.reset_index(inplace=True, drop=True)

binarizer = LabelBinarizer().fit(nn_data_encoded['Lag1'])
lag = pd.DataFrame(binarizer.transform(nn_data_encoded['Lag1']), 
                   columns=["lag_{}".format(c) for c in binarizer.classes_])
y = pd.DataFrame(binarizer.transform(nn_data_encoded['EarningTag2']), 
                   columns=["y_{}".format(c) for c in binarizer.classes_])
nn_data_encoded = pd.concat([nn_data_encoded, lag, y], axis=1)

train = nn_data_encoded.loc[nn_data_encoded['Year']!=2018]
test = nn_data_encoded.loc[nn_data_encoded['Year']==2018]

X_train = train.drop(["Year", "Lag1", "EarningTag2"]+y.columns.tolist(), axis=1)
X_test = test.drop(["Year", "Lag1", "EarningTag2"]+y.columns.tolist(), axis=1)
y_train = train[y.columns].values
y_test = test[y.columns].values



In [17]:
comps = np.zeros(50)

for pos, comp in enumerate(range(1, 50)):
    model = NMF(n_components=comp)
    X_train_W = model.fit_transform(X_train)
    X_test_W = model.transform(X_test)

    classifier = OneVsRestClassifier(RandomForestClassifier())
    y_score = classifier.fit(X_train_W, y_train).predict_proba(X_test_W)

    n_classes = y_score.shape[1]

    lw=2
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at this points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    
    comps[pos] = roc_auc['macro']

In [18]:
comps.argmax()

44

In [19]:
comps.max()

0.6255890897219024

In [21]:
model = NMF(n_components=43)
X_train_W = model.fit_transform(X_train)
X_test_W = model.transform(X_test)
classifier = OneVsRestClassifier(GradientBoostingClassifier())
y_score = classifier.fit(X_train_W, y_train).predict_proba(X_test_W)

In [240]:
nn_data['NewIndex'] = nn_data['Company'].str.replace(" ", "") +  "_Y" + nn_data['Year'].astype(str) + "_M" + nn_data['Month'].astype(str) + "_Q" + nn_data['Quarter'].astype(str)
pct_data = (nn_data.groupby(['NewIndex', "EarningTag2"]).size().reset_index()).pivot(index='NewIndex', columns='EarningTag2', values=0).fillna(0)

#pct_div_data = pct_data.div(pct_data.sum(axis=1), axis=0)
pct_div_data = pct_data
pct_div_data = pd.concat([pct_div_data.reset_index(drop=True), pct_div_data.reset_index()['NewIndex'].str.split("_", expand=True)[[0, 1, 2, 3]]], axis=1, ignore_index=True)
pct_div_data.columns = pct_data.columns.tolist() + ['Company', 'Year', 'Month', 'Quarter']
pct_div_data = pct_div_data[['Company', 'Year', 'Month', 'Quarter'] + pct_data.columns.tolist()]

pct_melt_data = pd.melt(pct_div_data, id_vars=['Company', 'Year', 'Month', 'Quarter'], var_name='Tag', value_name='NumQ')
pct_melt_data = pd.concat([pct_melt_data, 
                             pd.get_dummies(pct_melt_data['Company'], prefix='C', prefix_sep=""),
                             pd.get_dummies(pct_melt_data['Month']),
                             pd.get_dummies(pct_melt_data['Quarter']),
                             pd.get_dummies(pct_melt_data['Year']),
                             pd.get_dummies(pct_melt_data['Tag'], prefix='T', prefix_sep="")], axis=1)
pct_melt_data = pct_melt_data.drop(['Company', 'Year', 'Month', 'Quarter', 'Tag'], axis=1)
pct_melt_data = pct_melt_data.reset_index(drop=True)

In [311]:
train, test = pct_melt_data.loc[pct_melt_data['Y2018']!=1].copy().reset_index(drop=True), \
                pct_melt_data.loc[pct_melt_data['Y2018']==1].copy().reset_index(drop=True)

X_train, y_train = train.drop(['NumQ'], axis=1), train['NumQ'].values
X_test, y_test = test.drop(['NumQ'], axis=1), test['NumQ'].values

scores_gbc = np.zeros(50)
scores_rf = np.zeros(50)

for comp in range(1, 50):
    model = NMF(n_components=comp)
    X_train_W = model.fit_transform(X_train)
    X_test_W = model.transform(X_test)
    
    estimator_gbc = GradientBoostingRegressor(warm_start=True).fit(X_train_W, y_train)
    scores_gbc[comp] = mean_squared_error(y_test, estimator_gbc.predict(X_test_W).round())
    
    estimator_rf = RandomForestRegressor(warm_start=True).fit(X_train_W, y_train)
    scores_rf[comp] = mean_squared_error(y_test, estimator_rf.predict(X_test_W).round())


In [313]:
print('l')

l


In [312]:
print(scores[1:].min())
print(scores_gbc[1:].min())
print(scores_rf[1:].min())

5.011428571428572
3.9085714285714284
4.1571428571428575


In [294]:
X_test_test = X_test.loc[(X_test['CBankofAmerica']==1)&(X_test['M1']==1)&(X_test['Q4']==1)].copy()
y_test_test = y_test[X_test_test.index]


model = NMF(n_components=26)
X_train_W = model.fit_transform(X_train)
X_test_test_W = model.transform(X_test_test)

estimator_rf = GradientBoostingRegressor().fit(X_train_W, y_train)
preds = estimator_rf.predict(X_test_test_W)
mean_squared_error(y_test_test, preds.round())

3.2857142857142856

In [295]:
preds.round()

array([2., 1., 4., 1., 2., 2., 2., 2., 2., 1., 1., 2., 1., 1.])

# Multinomial NB

In [157]:
#All columns to string
nn_data_str = nn_data.copy()
nn_data_str['Year'] = nn_data_str['Year'].apply(str)
nn_data_str['Quarter'] = nn_data_str['Quarter'].apply(str)
nn_data_str['Month'] = nn_data_str['Month'].apply(str)

#Remove spaces
nn_data_str = nn_data_str.apply(lambda x: x.str.replace(" ", ""), axis=1)

#Train-Test Split
train = nn_data_str.loc[nn_data_str['Year']!="2018"]
test = nn_data_str.loc[nn_data_str['Year']=="2018"]

#X-y split
X_train = train.drop('EarningTag2', axis=1).values
X_train_str = [' '.join(x) for x in X_train]
y_train = train['EarningTag2'].values

X_test = test.drop("EarningTag2", axis=1).values
X_test_str = [' '.join(x) for x in X_test]
y_test = test['EarningTag2'].values

tfidf_vec = TfidfVectorizer(lowercase=False).fit(X_train_str)
X_train_tfidf = tfidf_vec.transform(X_train_str)
X_test_tfidf = tfidf_vec.transform(X_test_str)

#Encode test
encoder = LabelEncoder().fit(y_train)
y_train_str = encoder.transform(y_train)
y_test_str = encoder.transform(y_test)

model = MultinomialNB().fit(X_train_tfidf, y_train_str)  
preds = model.predict_proba(X_test_tfidf)
accuracy_score(y_test_str, np.argmax(preds, axis=1))

0.27314814814814814