In [8]:
from itertools import cycle

import numpy as np
import pandas as pd
from scipy import interp

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.decomposition import NMF
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.metrics import roc_auc_score, roc_curve, auc, make_scorer, accuracy_score, mean_squared_error

import matplotlib.pyplot as plt
plt.style.use('dark_background')

%matplotlib inline

In [3]:
data = pd.read_csv('data/qaData.csv', parse_dates=['Date'])
data['EarningTag2'] = data['EarningTag2'].str.strip()

#Add Lagged Column
data['Lag1'] = data.groupby(["Company", "Participants", "Date", "EventName", "EventType"])['EarningTag2'].shift(1)

#Add Year and Month from Data
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month

#Drop non-earning calls
nn_data = data.loc[data['EventType']=="Earnings call", ['Company', 'Participants', 'Month', 'Year', 'AnalystName',	'AnalystCompany', 'EventName', 'Lag1', 'EarningTag2']].copy()
#Add quarter
nn_data['Quarter'] = nn_data['EventName'].str.split("Q").str[0]
#Drop bad features
nn_data = nn_data[['Company', "Participants", "AnalystName", "AnalystCompany", "Month", "Year", "Quarter", "Lag1", "EarningTag2"]].copy()

nn_data['NewIndex'] = nn_data['Company'].str.replace(" ", "") +  "_Y" + nn_data['Year'].astype(str) + "_M" + nn_data['Month'].astype(str) + "_Q" + nn_data['Quarter'].astype(str)
pct_data = (nn_data.groupby(['NewIndex', "EarningTag2"]).size().reset_index()).pivot(index='NewIndex', columns='EarningTag2', values=0).fillna(0)

#pct_div_data = pct_data.div(pct_data.sum(axis=1), axis=0)
pct_div_data = pct_data
pct_div_data = pd.concat([pct_div_data.reset_index(drop=True), pct_div_data.reset_index()['NewIndex'].str.split("_", expand=True)[[0, 1, 2, 3]]], axis=1, ignore_index=True)
pct_div_data.columns = pct_data.columns.tolist() + ['Company', 'Year', 'Month', 'Quarter']
pct_div_data = pct_div_data[['Company', 'Year', 'Month', 'Quarter'] + pct_data.columns.tolist()]

pct_melt_data = pd.melt(pct_div_data, id_vars=['Company', 'Year', 'Month', 'Quarter'], var_name='Tag', value_name='NumQ')
pct_melt_data = pd.concat([pct_melt_data, 
                             pd.get_dummies(pct_melt_data['Company'], prefix='C', prefix_sep=""),
                             pd.get_dummies(pct_melt_data['Month']),
                             pd.get_dummies(pct_melt_data['Quarter']),
                             pd.get_dummies(pct_melt_data['Year']),
                             pd.get_dummies(pct_melt_data['Tag'], prefix='T', prefix_sep="")], axis=1)
pct_melt_data = pct_melt_data.drop(['Company', 'Year', 'Month', 'Quarter', 'Tag'], axis=1)
pct_melt_data = pct_melt_data.reset_index(drop=True)

train, test = pct_melt_data.loc[pct_melt_data['Y2018']!=1].copy().reset_index(drop=True), \
                pct_melt_data.loc[pct_melt_data['Y2018']==1].copy().reset_index(drop=True)

X_train, y_train = train.drop(['NumQ'], axis=1), train['NumQ'].values
X_test, y_test = test.drop(['NumQ'], axis=1), test['NumQ'].values

In [49]:
scores_gbc = np.zeros(50)
scores_rf = np.zeros(50)

for comp in range(1, 50):
    model = NMF(n_components=comp)
    X_train_W = model.fit_transform(X_train)
    X_test_W = model.transform(X_test)
    
    estimator_gbc = GradientBoostingRegressor(warm_start=True).fit(X_train_W, y_train)
    scores_gbc[comp] = mean_squared_error(y_test, estimator_gbc.predict(X_test_W).round())
    
    estimator_rf = RandomForestRegressor(warm_start=True).fit(X_train_W, y_train)
    scores_rf[comp] = mean_squared_error(y_test, estimator_rf.predict(X_test_W).round())


In [50]:
print(scores_gbc[1:].min())
print(scores_rf[1:].min())

3.7
3.8685714285714288


In [52]:
scores_gbc[1:].argmin()

30

In [77]:
model = NMF(n_components=31).fit(X_train)
X_train_W = model.transform(X_train)

param_grid = {'loss':['ls', 'huber'],
              'learning_rate':10.0**np.arange(-5,1,1),
              'min_samples_split':np.arange(2,10,2)}

grid = GridSearchCV(GradientBoostingRegressor(), cv=5, param_grid=param_grid, return_train_score=False, scoring= make_scorer(mean_squared_error, greater_is_better=False))
grid.fit(X_train_W, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'loss': ['ls', 'huber'], 'learning_rate': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00]), 'min_samples_split': array([2, 4, 6, 8])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=make_scorer(mean_squared_error, greater_is_better=False),
       verbose=0)

In [78]:
grid.best_score_

-8.118898481023532

In [83]:
grid.cv_results_['mean']

{'mean_fit_time': array([0.06854925, 0.06635704, 0.06742826, 0.06312447, 0.15935092,
        0.15809288, 0.16487207, 0.16290479, 0.06814685, 0.06504178,
        0.07482562, 0.07388802, 0.15933461, 0.16892729, 0.1587574 ,
        0.19888043, 0.0667943 , 0.07079711, 0.06817837, 0.07440066,
        0.17502322, 0.17844605, 0.1756484 , 0.16384964, 0.06877875,
        0.06798625, 0.06516013, 0.07165804, 0.18323846, 0.18040566,
        0.18172388, 0.17720847, 0.07401361, 0.06580906, 0.07055964,
        0.06518369, 0.16257505, 0.16556177, 0.16731377, 0.17478604,
        0.07654963, 0.07580476, 0.07309804, 0.07903833, 0.19459691,
        0.17522388, 0.15677433, 0.15924549]),
 'std_fit_time': array([0.0056204 , 0.00495804, 0.00613989, 0.00328961, 0.01049357,
        0.01356116, 0.00883399, 0.00737444, 0.00234341, 0.00166524,
        0.01239389, 0.01014548, 0.01144181, 0.01003218, 0.00664474,
        0.01509385, 0.00653781, 0.00479117, 0.0046758 , 0.01029326,
        0.00923011, 0.01047624, 0.017

In [None]:
 {'learning_rate': 0.001, 'loss': 'ls', 'min_samples_split': 4},

In [84]:
grid.best_params_

{'learning_rate': 0.01, 'loss': 'huber', 'min_samples_split': 2}

In [75]:
model = NMF(n_components=31).fit(X_train)
X_train_W = model.transform(X_train)
X_test_W = model.transform(X_test)


estimator = GradientBoostingRegressor(learning_rate=0.1, loss='huber', min_samples_split=2).fit(X_train_W, y_train)
mean_squared_error(y_test, estimator.predict(X_test_W).round())

5.774285714285714