In [8]:
from itertools import cycle

import numpy as np
import pandas as pd
from scipy import interp

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.decomposition import NMF
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.metrics import roc_auc_score, roc_curve, auc, make_scorer, accuracy_score, mean_squared_error

import matplotlib.pyplot as plt
plt.style.use('dark_background')

%matplotlib inline

In [3]:
data = pd.read_csv('data/qaData.csv', parse_dates=['Date'])
data['EarningTag2'] = data['EarningTag2'].str.strip()

#Add Lagged Column
data['Lag1'] = data.groupby(["Company", "Participants", "Date", "EventName", "EventType"])['EarningTag2'].shift(1)

#Add Year and Month from Data
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month

#Drop non-earning calls
nn_data = data.loc[data['EventType']=="Earnings call", ['Company', 'Participants', 'Month', 'Year', 'AnalystName',	'AnalystCompany', 'EventName', 'Lag1', 'EarningTag2']].copy()
#Add quarter
nn_data['Quarter'] = nn_data['EventName'].str.split("Q").str[0]
#Drop bad features
nn_data = nn_data[['Company', "Participants", "AnalystName", "AnalystCompany", "Month", "Year", "Quarter", "Lag1", "EarningTag2"]].copy()

nn_data['NewIndex'] = nn_data['Company'].str.replace(" ", "") +  "_Y" + nn_data['Year'].astype(str) + "_M" + nn_data['Month'].astype(str) + "_Q" + nn_data['Quarter'].astype(str)
pct_data = (nn_data.groupby(['NewIndex', "EarningTag2"]).size().reset_index()).pivot(index='NewIndex', columns='EarningTag2', values=0).fillna(0)

#pct_div_data = pct_data.div(pct_data.sum(axis=1), axis=0)
pct_div_data = pct_data
pct_div_data = pd.concat([pct_div_data.reset_index(drop=True), pct_div_data.reset_index()['NewIndex'].str.split("_", expand=True)[[0, 1, 2, 3]]], axis=1, ignore_index=True)
pct_div_data.columns = pct_data.columns.tolist() + ['Company', 'Year', 'Month', 'Quarter']
pct_div_data = pct_div_data[['Company', 'Year', 'Month', 'Quarter'] + pct_data.columns.tolist()]

pct_melt_data = pd.melt(pct_div_data, id_vars=['Company', 'Year', 'Month', 'Quarter'], var_name='Tag', value_name='NumQ')
pct_melt_data = pd.concat([pct_melt_data, 
                             pd.get_dummies(pct_melt_data['Company'], prefix='C', prefix_sep=""),
                             pd.get_dummies(pct_melt_data['Month']),
                             pd.get_dummies(pct_melt_data['Quarter']),
                             pd.get_dummies(pct_melt_data['Year']),
                             pd.get_dummies(pct_melt_data['Tag'], prefix='T', prefix_sep="")], axis=1)
pct_melt_data = pct_melt_data.drop(['Company', 'Year', 'Month', 'Quarter', 'Tag'], axis=1)
pct_melt_data = pct_melt_data.reset_index(drop=True)

train, test = pct_melt_data.loc[pct_melt_data['Y2018']!=1].copy().reset_index(drop=True), \
                pct_melt_data.loc[pct_melt_data['Y2018']==1].copy().reset_index(drop=True)

X_train, y_train = train.drop(['NumQ'], axis=1), train['NumQ'].values
X_test, y_test = test.drop(['NumQ'], axis=1), test['NumQ'].values

In [49]:
scores_gbc = np.zeros(50)
scores_rf = np.zeros(50)

for comp in range(1, 50):
    model = NMF(n_components=comp)
    X_train_W = model.fit_transform(X_train)
    X_test_W = model.transform(X_test)
    
    estimator_gbc = GradientBoostingRegressor(warm_start=True).fit(X_train_W, y_train)
    scores_gbc[comp] = mean_squared_error(y_test, estimator_gbc.predict(X_test_W).round())
    
    estimator_rf = RandomForestRegressor(warm_start=True).fit(X_train_W, y_train)
    scores_rf[comp] = mean_squared_error(y_test, estimator_rf.predict(X_test_W).round())


In [50]:
print(scores_gbc[1:].min())
print(scores_rf[1:].min())

3.7
3.8685714285714288


In [52]:
scores_gbc[1:].argmin()

30

In [55]:
pipe = Pipeline([('dim_red', NMF()),
                     ('regressor', GradientBoostingRegressor())])

param_grid = {'dim_red__n_components': np.arange(2, 50, dtype=int),
              'regressor__loss':['ls', 'huber'],
              'regressor__learning_rate':10.0**np.arange(-5,1,1),
              'regressor__min_samples_split':np.arange(2,10,2)}

grid = GridSearchCV(pipe, cv=5, param_grid=param_grid, return_train_score=False, scoring= make_scorer(mean_squared_error, greater_is_better=False))
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('dim_red', NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=None, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)), ('regressor', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning...s=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'dim_red__n_components': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]), 'regressor__loss': ['ls', 'huber'], 'regressor__learning_rate': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00]), 'regressor__min_samples_split': array([2, 4, 6, 8])},
       pre_dispatch='2*n_jobs', 

In [56]:
grid.best_score_

-7.062518220068336

In [58]:
grid.best_params_

{'dim_red__n_components': 9,
 'regressor__learning_rate': 0.1,
 'regressor__loss': 'huber',
 'regressor__min_samples_split': 8}

In [75]:
model = NMF(n_components=31).fit(X_train)
X_train_W = model.transform(X_train)
X_test_W = model.transform(X_test)


estimator = GradientBoostingRegressor(learning_rate=0.1, loss='huber', min_samples_split=2).fit(X_train_W, y_train)
mean_squared_error(y_test, estimator.predict(X_test_W).round())

5.774285714285714