In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings("ignore")
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score,matthews_corrcoef,accuracy_score, confusion_matrix, recall_score, roc_auc_score, precision_score, f1_score
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
import random
import math
from datetime import timedelta
import pickle

In [2]:
# Load the pickled mapping of per-ticker DataFrames used by every later cell.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('updated_spx_raw_tickers.dictionary', 'rb') as pickle_fh:
    combined = pickle.load(pickle_fh)

In [3]:
# Display the loaded object: per the output below it is a dict keyed by ticker
# whose values are DataFrames with a price column (named after the ticker),
# `rep_date` and `volume`.
combined

{'lyb':               lyb  rep_date    volume
 date                                 
 2014-01-02  78.78       NaN       NaN
 2014-01-03  78.71       NaN       NaN
 2014-01-06  77.97       NaN  1.000000
 2014-01-07  78.94       NaN  0.756996
 2014-01-08  79.73       NaN  1.000000
 ...           ...       ...       ...
 2020-09-28  71.21       NaN  0.143262
 2020-09-29  70.28       NaN  0.076910
 2020-09-30  70.49       NaN  0.152140
 2020-10-01  68.14       NaN  0.139042
 2020-10-02  72.69       NaN  0.166154
 
 [1701 rows x 3 columns],
 'axp':                  axp  rep_date    volume
 2000-01-03   45.8828       NaN       NaN
 2000-01-04   44.1504       NaN       NaN
 2000-01-05   42.9650       NaN  0.000000
 2000-01-06   43.8404       NaN  1.000000
 2000-01-07   44.4786       NaN  0.000000
 ...              ...       ...       ...
 2020-09-28   98.8200       NaN  0.033593
 2020-09-29   97.7500       NaN  0.029631
 2020-09-30  100.2500       NaN  0.044227
 2020-10-01  101.3300       NaN

In [4]:
# Window length in rows; nasdaq_features() reads this module-level value for the
# 'b4rep_window' entry of its default params.
b4rep_window = 10
# Not referenced by any other visible cell — presumably a probability threshold; TODO confirm.
thr = 0.65


In [5]:
def compress(series, sides=0.01):
    """Cap both tails of ``series`` at its ``sides`` and ``1 - sides`` quantiles.

    Values above the upper quantile are replaced by that quantile; afterwards
    values below the lower quantile (computed on the already capped data) are
    replaced by the lower quantile. NaN entries stay NaN because both
    comparisons evaluate to False for them.

    Parameters
    ----------
    series : pandas.Series
        Numeric data to cap.
    sides : float, default 0.01
        Tail mass trimmed on each side.

    Returns
    -------
    pandas.Series
        Capped values carrying the index of ``series``. The previous
        implementation returned a bare ndarray here (dropping the index),
        unlike ``compress_right_tail``; assigning the result back to a column
        of the same frame behaves the same either way.
    """
    upper = series.quantile(q=(1 - sides))
    capped = pd.Series(np.where(series > upper, upper, series), index=series.index)

    # The lower bound is taken from the upper-capped data, matching the original order.
    lower = capped.quantile(q=sides)
    return pd.Series(np.where(capped < lower, lower, capped), index=series.index)

def compress_right_tail(series, sides=0.01):
    """Cap only the upper tail of ``series`` at its ``1 - sides`` quantile.

    Every value greater than the quantile is replaced by the quantile; all
    other values (including NaN) pass through. Returns a new pandas.Series
    with the same index as the input.
    """
    ceiling = series.quantile(q=(1-sides))
    capped_values = np.where(series > ceiling, ceiling, series)
    return pd.Series(capped_values, index=series.index)

def create_features(dtf, tick, lead=5, ret_lag=[1,2,5,10,20,44,261], tr=0.025, vol_lag=[1,2,5,10,20,44,261], vol_window=100, sides=0.01):
    """Add volume, return and target columns to ``dtf`` and list the feature names.

    ``dtf`` is modified in place (and also returned); other cells of this
    notebook read the added columns straight from the original frame.

    Columns written:
      * ``avg_vol_<vol_window>``: log of the rolling mean of ``volume``.
      * ``vol<lag>`` for each ``vol_lag``: log rolling-mean volume divided by the
        column above, passed through ``compress`` and rescaled with the
        running (cummin/cummax) min and max.
      * ``ret<lag>`` for each ``ret_lag``: ``dtf[tick] / dtf[tick].shift(lag) - 1``,
        compressed and rescaled the same way.
      * ``tick``: the ticker name repeated on every row.
      * ``fwd``: value of ``dtf[tick]`` ``lead + 1`` rows ahead, divided by the
        current value, minus 1.
      * ``ycol``: 1 where ``fwd >= tr`` else 0 (NaN ``fwd`` gives 0).

    Parameters
    ----------
    dtf : pandas.DataFrame
        Must contain a ``volume`` column and a column named ``tick``.
    tick : str
        Name of the column the returns are computed from.
    lead, tr : int, float
        Forward horizon (rows) and threshold for the binary target.
    ret_lag, vol_lag : list of int
        Look-back lengths for the return and volume features (not mutated).
    vol_window : int
        Rolling window of the baseline average volume.
    sides : float
        Tail fraction forwarded to ``compress``.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The same ``dtf`` object and the names ``vol*``, ``ret*``, ``ycol`` in
        creation order.
    """

    def _running_rescale(col):
        # Scale by the minimum / maximum observed so far in the column.
        low = col.cummin()
        high = col.cummax()
        return (col - low) / (high - low)

    feat_cols = []

    # Baseline: log of the long-window average volume.
    avg_vol_name = 'avg_vol_' + str(vol_window)
    dtf[avg_vol_name] = np.log(dtf['volume'].rolling(window=vol_window).mean())

    # Relative log-volume features.
    for window in vol_lag:
        vol_col = 'vol'+str(window)
        dtf[vol_col] = np.log(dtf['volume'].rolling(window=window).mean()) / dtf[avg_vol_name]
        dtf[vol_col] = compress(dtf[vol_col], sides=sides)
        dtf[vol_col] = _running_rescale(dtf[vol_col])
        feat_cols.append(vol_col)

    # Trailing simple-return features.
    for window in ret_lag:
        ret_col = 'ret'+str(window)
        dtf[ret_col] = dtf[tick]/dtf[tick].shift(window)-1
        dtf[ret_col] = compress(dtf[ret_col], sides=sides)
        dtf[ret_col] = _running_rescale(dtf[ret_col])
        feat_cols.append(ret_col)

    dtf['tick'] = tick

    # Forward change over lead+1 rows and its thresholded binary label.
    dtf['fwd'] = dtf[tick].shift(-lead-1) / dtf[tick]-1
    dtf['ycol'] = np.where(dtf['fwd'] >= tr, 1, 0)
    feat_cols.append('ycol')

    return dtf, feat_cols

In [6]:
def ticker_features(combined_dtfs, ticker, growth_tr=0.02, back_window=28, b4rep_window=10,
                   ret_lag=[1,2,5,10,20,44,261], vol_lag=[1,2,5,10,20,44,261], vol_window=100, sides=0.01):
    """Build the feature rows for one ticker.

    The frame ``combined_dtfs[ticker]`` is enriched IN PLACE with helper and
    feature columns (later cells read those columns back from the same
    mapping), then filtered to the rows where ``idate == b4rep_window``.

    Parameters
    ----------
    combined_dtfs : mapping of str -> pandas.DataFrame
        Each frame needs a column named after the ticker plus ``rep_date`` and
        ``volume``; ``rep_date == 1`` marks the rows treated as report rows.
    ticker : str
        Key into ``combined_dtfs`` and name of the column used for returns.
    growth_tr : float
        Threshold forwarded to ``create_features`` as ``tr``.
    back_window : int
        Number of calendar days subtracted from ``idate`` at report rows.
    b4rep_window : int
        Row offset before a report row; also forwarded as ``lead``.
    ret_lag, vol_lag, vol_window, sides
        Forwarded unchanged to ``create_features``.

    Returns
    -------
    pandas.DataFrame or None
        The feature columns plus ``fwd`` and ``tick`` with NaN rows dropped,
        or ``None`` when the ticker's frame has no rows.
    """
    idf = combined_dtfs[ticker]
    # Check the frame that was actually passed in. The previous code looked at the
    # notebook-global `combined`, which breaks (or silently inspects other data)
    # whenever a different mapping is supplied.
    if idf.shape[0] == 0:
        return None

    # Previous row's index value, and the index value b4rep_window+1 rows back.
    idf['idate'] = idf.index
    idf['idate'] = idf['idate'].shift(1)
    idf['b4rep_window'] = idf.index
    idf['b4rep_window'] = idf['b4rep_window'].shift(b4rep_window+1)

    # At report rows: idate minus `back_window` days; back-filled so every row
    # carries the value belonging to the next report row.
    idf['back_window'] = pd.to_datetime(np.where(idf.rep_date == 1, idf['idate'] - \
                                                 pd.Timedelta(days=back_window), pd.NaT))
    idf['back_window'] = idf['back_window'].bfill()

    # Same pattern for the shifted index value taken at report rows.
    idf['b4rep_window'] = pd.to_datetime(np.where(idf.rep_date == 1, idf['b4rep_window'], pd.NaT))
    idf['b4rep_window'] = idf['b4rep_window'].bfill()

    idf['b4rep'] = np.where( idf.idate >= idf.back_window , 1, 0 )
    idf['next_rep'] = pd.to_datetime(np.where( idf.rep_date == 1, idf.idate, pd.NaT))
    idf['next_rep'] = idf['next_rep'].bfill()

    # Look up the ticker column at the two reference dates and take the relative change.
    idf['price_atrep'] = idf['next_rep'].map(idf[ticker])
    idf['price_b4rep'] = idf['b4rep_window'].map(idf[ticker])
    idf['b4rep_chng'] = idf['price_atrep']/idf['price_b4rep']-1

    idf, feat_cols = create_features(idf, ticker, lead=b4rep_window,ret_lag=ret_lag, vol_lag=vol_lag, tr=growth_tr,
                        vol_window=vol_window, sides=sides)
    feat_cols.append('fwd')
    feat_cols.append('tick')

    # Keep only the rows whose previous-row date equals the back-filled pre-report date.
    idf = idf[idf.idate == idf.b4rep_window]

    return idf[feat_cols].dropna()

In [7]:
def nasdaq_features(combined, params=None):
    """Stack the per-ticker feature rows of every key in ``combined``.

    Parameters
    ----------
    combined : mapping of str -> pandas.DataFrame
        Passed through to ``ticker_features`` together with each key.
    params : dict or None
        Keyword arguments for ``ticker_features``. When ``None`` a default set
        is used whose ``'b4rep_window'`` entry is read from the module-level
        ``b4rep_window`` variable at call time.

    Returns
    -------
    pandas.DataFrame
        All per-ticker results concatenated and sorted by index; an empty
        DataFrame when no ticker produced a result.
    """
    if params is None:
        params = {
            'b4rep_window': b4rep_window,
            'growth_tr': 0.02,
            'back_window': 28,
            'ret_lag': [1,2,5,10,22,44,51,66,118,132,246,261,375,480,520],
            'vol_lag': [1,2,5,10,22,44,51,66,118,132],

            'vol_window': 100,
            'sides': 0.01
        }

    # Collect the pieces and concatenate once: DataFrame.append in a loop copies the
    # accumulated frame on every iteration and no longer exists in pandas >= 2.0.
    frames = []
    for ticker in combined:
        ticker_rows = ticker_features(combined, ticker, **params)
        if ticker_rows is not None:  # ticker_features returns None for empty frames
            frames.append(ticker_rows)

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames).sort_index()

In [8]:
%%time
df = nasdaq_features(combined, params=None)
#### ADDED COLUMN FOR TESTING RETURNS
#df['case_id'] = np.linspace(1, df.shape[0], df.shape[0])
df.index = df.index
df['date:tick'] = df.index.astype(str)+':'+df['tick']

#### ALSO AMENDED my_tss function to account for new column
df.tail()

CPU times: user 44.4 s, sys: 809 ms, total: 45.2 s
Wall time: 45.2 s


Unnamed: 0,vol1,vol2,vol5,vol10,vol22,vol44,vol51,vol66,vol118,vol132,...,ret132,ret246,ret261,ret375,ret480,ret520,ycol,fwd,tick,date:tick
2020-08-31,0.572634,0.516585,0.592231,0.5749,0.372985,0.282722,0.303976,0.349852,0.346603,0.288642,...,0.800615,0.621432,0.756742,0.607848,0.306617,0.284277,1,0.138555,fdx,2020-08-31:fdx
2020-09-08,0.339846,0.380349,0.346768,0.39468,0.41493,0.476595,0.471174,0.435927,0.463528,0.385984,...,0.342498,0.179531,0.210342,0.2421,0.323408,0.312697,0,-0.068131,azo,2020-09-08:azo
2020-09-08,0.527466,0.569587,0.52845,0.668057,0.618831,0.631074,0.528114,0.431714,0.233559,0.0,...,0.732229,0.623917,0.695646,0.517817,0.588209,0.465765,1,0.127661,nke,2020-09-08:nke
2020-09-09,0.280558,0.2806,0.401968,0.434631,0.599852,0.5515,0.481793,0.364341,0.387111,0.031626,...,0.710255,0.542456,0.609398,0.712105,0.9001,0.761539,0,-0.02134,gis,2020-09-09:gis
2020-09-09,0.499114,0.475922,0.521696,0.671788,0.816631,0.708158,0.730354,0.77494,0.188285,0.013395,...,0.687215,0.720762,0.634522,0.864013,0.873807,0.657998,0,-0.022367,ctas,2020-09-09:ctas


In [9]:
# File holding the pickled estimator used for inference below.
filename = 'inference_before_spx.sav'

# Load the model from disk. The `with` block closes the file handle, which the
# previous bare `open(...)` call left open.
# NOTE(review): pickle.load can execute arbitrary code; only load model files you trust.
with open(filename, 'rb') as model_file:
    estimatorCopy = pickle.load(model_file)

In [10]:
# Feature columns passed to the estimator (the vol*/ret* columns produced by
# nasdaq_features' default lags).
cols=['vol1', 'vol2', 'vol5', 'vol10', 'vol22', 'vol44', 'vol51', 'vol66',
       'vol118', 'vol132', 'ret1', 'ret2', 'ret5', 'ret10', 'ret22', 'ret44',
       'ret51', 'ret66', 'ret118', 'ret132', 'ret246', 'ret261', 'ret375',
       'ret480', 'ret520']
num_companies=500
# Ticker labels of the last int(num_companies*1.3) rows of the feature table;
# a ticker can appear more than once in this slice.
next_reports = df['tick'][-int(num_companies*1.3):]
ticker_proba = {}
existing_reports = []
for ticker in next_reports:
    try:
        # Latest row of the ticker's frame; the feature columns exist because
        # ticker_features() wrote them into `combined[ticker]` in place.
        pred_features=combined[ticker][-1:][cols]
        # NOTE(review): this cell takes probability column 2 while the earnings-date
        # cell takes column 1 — confirm which class index is intended.
        ticker_proba[ticker] = estimatorCopy.predict_proba(pred_features)[:,2]
        existing_reports.append(ticker)
    except Exception as exc:
        # Best-effort: skip tickers that cannot be scored, but say why, and no longer
        # swallow KeyboardInterrupt/SystemExit as the bare `except:` did.
        print('except:')
        print(ticker)
        print(repr(exc))

In [11]:
# Show the scored tickers ordered by probability, highest first.
dict(sorted(ticker_proba.items(), key=lambda pair: pair[1], reverse=True))

{'oxy': array([0.74511069]),
 'm': array([0.7385571]),
 'mo': array([0.71776557]),
 'fls': array([0.71684185]),
 'nclh': array([0.7147635]),
 'ba': array([0.70424431]),
 'gis': array([0.6953655]),
 'nbl': array([0.67583214]),
 'expe': array([0.67418379]),
 'bk': array([0.660129]),
 'hal': array([0.64798535]),
 'tap': array([0.64145565]),
 'kss': array([0.63405001]),
 'cvx': array([0.63232203]),
 'cop': array([0.61825928]),
 'jwn': array([0.61386367]),
 'gd': array([0.61114031]),
 'stt': array([0.60183946]),
 'ua': array([0.60082816]),
 'pru': array([0.59429049]),
 'yum': array([0.59414716]),
 'mar': array([0.59108935]),
 'ual': array([0.59049212]),
 'cma': array([0.58987896]),
 'syy': array([0.58775283]),
 'wfc': array([0.58687689]),
 'cdns': array([0.57501194]),
 'dfs': array([0.57024208]),
 'alk': array([0.56808409]),
 'l': array([0.56708075]),
 'o': array([0.56000159]),
 'udr': array([0.55315337]),
 'nov': array([0.5528667]),
 'viac': array([0.5513139]),
 'well': array([0.54788183])

In [12]:
# `import datetime` was removed: the name was rebound by the line below straight away.
from datetime import datetime
from yahoo_earnings_calendar import YahooEarningsCalendar
import math

my_custom_delay_s = 0.01

yec = YahooEarningsCalendar(my_custom_delay_s)

# Returns the next earnings date of BOX in Unix timestamp
#print(yec.get_next_earnings_date('box'))
# 1508716800

ticker_date = {}
sml_tickers = []
# Tickers whose next earnings date falls strictly between 14 and 18 days from now
# are collected in `sml_tickers` (`timedelta` comes from the notebook's import cell).
today=datetime.today()
start=today+timedelta(days=14)
#end=today+timedelta(days=16)
end=today+timedelta(days=18)

for ticker in set(next_reports):
    pred_features=combined[ticker][-1:][cols]
    # A single input row is scored, so take the scalar probability directly instead
    # of relying on math.floor() converting a 1-element array (deprecated in NumPy).
    prediction = estimatorCopy.predict_proba(pred_features)[0, 1]
    # Key of the form "<ticker>:<probability in whole percent, floored>".
    index = str(ticker)+':'+str(math.floor(100*prediction))

    try:
        date = datetime.fromtimestamp(yec.get_next_earnings_date(ticker))
    except Exception:
        # Lookup failed: park the ticker at a far-future sentinel date so it sorts last.
        ticker_date[index] = datetime(3000, 1, 1, 1, 1)
    else:
        ticker_date[index] = date
        if start < date < end:
            sml_tickers.append(ticker)

# Display the lookup results ordered by earnings date, earliest first.
{k: v for k, v in sorted(ticker_date.items(), key=lambda item: item[1])}


{'ua:29': datetime.datetime(2018, 4, 25, 6, 0),
 'flir:32': datetime.datetime(2020, 10, 30, 6, 0),
 'disck:35': datetime.datetime(2020, 11, 5, 6, 0),
 'ual:34': datetime.datetime(2021, 1, 20, 6, 0),
 'bxp:33': datetime.datetime(2021, 1, 26, 6, 0),
 'nee:38': datetime.datetime(2021, 1, 26, 6, 0),
 'msft:39': datetime.datetime(2021, 1, 26, 6, 0),
 'jnj:42': datetime.datetime(2021, 1, 26, 6, 0),
 'amd:37': datetime.datetime(2021, 1, 26, 6, 0),
 'sbux:34': datetime.datetime(2021, 1, 26, 6, 0),
 'adm:39': datetime.datetime(2021, 1, 26, 6, 0),
 'rtx:29': datetime.datetime(2021, 1, 26, 6, 0),
 'pld:38': datetime.datetime(2021, 1, 26, 6, 0),
 'rok:34': datetime.datetime(2021, 1, 26, 6, 0),
 'cof:41': datetime.datetime(2021, 1, 26, 6, 0),
 'axp:42': datetime.datetime(2021, 1, 26, 6, 0),
 'chrw:38': datetime.datetime(2021, 1, 26, 6, 0),
 'mxim:35': datetime.datetime(2021, 1, 26, 6, 0),
 'ffiv:59': datetime.datetime(2021, 1, 26, 6, 0),
 'xlnx:33': datetime.datetime(2021, 1, 26, 6, 0),
 'ge:47': d

In [13]:
# One-column frame of the shortlisted tickers.
# NOTE: this rebinds `df` (previously the feature table); the cells below read this new frame.
df = pd.DataFrame(sml_tickers)
# The context manager saves and closes the workbook on exit; ExcelWriter.save()
# is deprecated and absent from pandas >= 2.0.
with pd.ExcelWriter('spx_tickers.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='welcome', index=False)

In [14]:
# Number of shortlisted tickers (`df` is the frame built from `sml_tickers` in the export cell).
len(df)

28

In [15]:
# Display the shortlisted tickers that were written to the spreadsheet.
df

Unnamed: 0,0
0,clx
1,el
2,ess
3,pru
4,nlok
5,mrk
6,bll
7,ph
8,tt
9,xyl
