In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings("ignore")
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score,matthews_corrcoef,accuracy_score, confusion_matrix, recall_score, roc_auc_score, precision_score, f1_score
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
import random
import math
from datetime import timedelta
import pickle

In [2]:
# Load the pickled per-ticker data. Later cells index `combined` by ticker
# symbol and treat each value as a pandas DataFrame.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('updated_mid_raw_tickers.dictionary', 'rb') as config_dictionary_file:
    # Unpickle the whole object in one go; the handle is closed by the `with`.
    combined = pickle.load(config_dictionary_file)

In [3]:
# Default 'b4rep_window' entry of the parameter dict built in nasdaq_features.
b4rep_window = 10
# NOTE(review): `thr` is not referenced in the visible cells; presumably a
# probability threshold for the classifier output -- confirm before relying on it.
thr = 0.65


In [4]:
def compress(series, sides=0.01):
    """Winsorize both tails of ``series``.

    Values above the ``1 - sides`` quantile are replaced by that quantile,
    then values below the ``sides`` quantile are replaced by that quantile.
    NaN entries compare False against both bounds and are left as NaN.

    Parameters
    ----------
    series : pandas.Series
        Numeric data to clip.
    sides : float, default 0.01
        Tail probability clipped on each side.

    Returns
    -------
    pandas.Series
        Clipped values carrying the index of ``series``. Earlier versions
        returned a bare ndarray, which silently dropped the index and was
        inconsistent with ``compress_right_tail``; the numbers are unchanged.
    """
    upper = series.quantile(q=(1 - sides))
    capped = pd.Series(np.where(series > upper, upper, series), index=series.index)

    # The lower bound is deliberately taken from the already-capped data so
    # results stay numerically identical to the previous implementation.
    lower = capped.quantile(q=sides)
    return pd.Series(np.where(capped < lower, lower, capped), index=series.index)

def compress_right_tail(series, sides=0.01):
    """Cap the upper tail of ``series`` at its ``1 - sides`` quantile.

    Parameters
    ----------
    series : pandas.Series
        Numeric data to cap.
    sides : float, default 0.01
        Tail probability clipped on the right-hand side only.

    Returns
    -------
    pandas.Series
        New unnamed Series with the index of ``series``; entries above the
        quantile are replaced by it, everything else (NaN included) is kept.
    """
    ceiling = series.quantile(q=(1 - sides))
    above_ceiling = series > ceiling
    capped_values = np.where(above_ceiling, ceiling, series)
    return pd.Series(capped_values, index=series.index)

def create_features(dtf, tick, lead=5, ret_lag=[1,2,5,10,20,44,261], tr=0.025, vol_lag=[1,2,5,10,20,44,261], vol_window=100, sides=0.01):
    """Add volume, return and target columns to ``dtf`` in place.

    Columns written to ``dtf`` (in this order):

    * ``avg_vol_<vol_window>``: log of the rolling mean of ``volume``.
    * ``vol<lag>`` for each ``lag`` in ``vol_lag``: log rolling-mean volume
      divided by the baseline above, clipped by ``compress`` and rescaled
      with the running (cummin/cummax) minimum and maximum.
    * ``ret<lag>`` for each ``lag`` in ``ret_lag``: simple return of the
      ``tick`` column over ``lag`` rows, clipped and rescaled the same way.
    * ``tick``: the ticker symbol repeated on every row.
    * ``fwd``: return of the ``tick`` column ``lead + 1`` rows ahead.
    * ``ycol``: 1 where ``fwd >= tr``, otherwise 0 (NaN ``fwd`` gives 0).

    Parameters
    ----------
    dtf : pandas.DataFrame
        Must contain a ``volume`` column and a price column named ``tick``.
        It is mutated; other cells in this notebook read the feature columns
        back from the same frame objects.
    tick : str
        Name of the price column; also stored in the ``tick`` column.
    lead : int
        Forward horizon; the forward return looks ``lead + 1`` rows ahead.
    ret_lag, vol_lag : list of int
        Look-back lengths (rows) for the return and volume features.
    tr : float
        Threshold on ``fwd`` for the positive class.
    vol_window : int
        Window (rows) of the baseline rolling mean volume.
    sides : float
        Tail probability forwarded to ``compress``.

    Returns
    -------
    tuple
        ``(dtf, feat_cols)`` where ``feat_cols`` lists the ``vol*`` columns,
        then the ``ret*`` columns, then ``'ycol'``.
    """
    feat_cols = []

    # Baseline against which every volume feature is expressed.
    avg_vol_name = 'avg_vol_' + str(vol_window)
    dtf[avg_vol_name] = np.log(dtf['volume'].rolling(window=vol_window).mean())

    for lag in vol_lag:
        vol_col = 'vol' + str(lag)
        log_mean_volume = np.log(dtf['volume'].rolling(window=lag).mean())
        dtf[vol_col] = compress(log_mean_volume / dtf[avg_vol_name], sides=sides)
        # Running min/max scaling: each row is scaled only by values seen so far.
        running_low = dtf[vol_col].cummin()
        running_high = dtf[vol_col].cummax()
        dtf[vol_col] = (dtf[vol_col] - running_low) / (running_high - running_low)
        feat_cols.append(vol_col)

    for lag in ret_lag:
        ret_col = 'ret' + str(lag)
        simple_return = dtf[tick] / dtf[tick].shift(lag) - 1
        dtf[ret_col] = compress(simple_return, sides=sides)
        running_low = dtf[ret_col].cummin()
        running_high = dtf[ret_col].cummax()
        dtf[ret_col] = (dtf[ret_col] - running_low) / (running_high - running_low)
        feat_cols.append(ret_col)

    dtf['tick'] = tick

    # Earlier experiments used shift(-lead + 1) and shift(-lead); the version
    # kept here looks lead + 1 rows ahead.
    dtf['fwd'] = dtf[tick].shift(-lead - 1) / dtf[tick] - 1
    dtf['ycol'] = np.where(dtf['fwd'] >= tr, 1, 0)
    feat_cols.append('ycol')

    return dtf, feat_cols

def ticker_features(combined_dtfs, ticker, growth_tr=0.02, back_window=28, b4rep_window=10,
                   ret_lag=[1,2,5,10,20,44,261], vol_lag=[1,2,5,10,20,44,261], vol_window=100, sides=0.01):
    """Build the feature rows for one ticker, one row per upcoming report.

    The frame ``combined_dtfs[ticker]`` is annotated in place with helper
    columns (``idate``, ``b4rep_window``, ``back_window``, ``b4rep``,
    ``next_rep``, ``price_atrep``, ``price_b4rep``, ``b4rep_chng``) and with
    everything ``create_features`` adds. Only the rows whose previous index
    label equals the backfilled ``b4rep_window`` label are returned; with
    unique index labels that is the row ``b4rep_window`` rows before each
    report row.

    Parameters
    ----------
    combined_dtfs : mapping of str to pandas.DataFrame
        Per-ticker frames. Each frame needs a ``rep_date`` column (1 marks a
        report row), a ``volume`` column and a price column named after the
        ticker. The index must support subtracting a ``pd.Timedelta``.
    ticker : str
        Key into ``combined_dtfs`` and name of the price column.
    growth_tr : float
        Forwarded to ``create_features`` as ``tr``.
    back_window : int
        Calendar days before a report used for the ``b4rep`` flag.
    b4rep_window : int
        Rows before the report at which features are sampled; also the
        ``lead`` passed to ``create_features``.
    ret_lag, vol_lag, vol_window, sides
        Forwarded unchanged to ``create_features``.

    Returns
    -------
    pandas.DataFrame or None
        Feature columns plus ``fwd`` and ``tick`` with NaN rows dropped, or
        ``None`` when the ticker's frame has no rows.
    """
    idf = combined_dtfs[ticker]
    # Check the frame that was passed in. The previous version read the
    # notebook-level global `combined` here, ignoring the argument.
    if idf.shape[0] == 0:
        return None

    is_report = idf.rep_date == 1

    # Index label of the previous row.
    idf['idate'] = idf.index
    idf['idate'] = idf['idate'].shift(1)
    # Index label b4rep_window + 1 rows back.
    idf['b4rep_window'] = idf.index
    idf['b4rep_window'] = idf['b4rep_window'].shift(b4rep_window+1)

    # Keep each value only on report rows, then backfill so that every row
    # carries the value belonging to the next report at or after it.
    idf['back_window'] = (idf['idate'] - pd.Timedelta(days=back_window)).where(is_report).bfill()
    idf['b4rep_window'] = idf['b4rep_window'].where(is_report).bfill()

    idf['b4rep'] = np.where(idf.idate >= idf.back_window, 1, 0)
    idf['next_rep'] = idf['idate'].where(is_report).bfill()
    # Look prices up by index label.
    idf['price_atrep'] = idf['next_rep'].map(idf[ticker])
    idf['price_b4rep'] = idf['b4rep_window'].map(idf[ticker])
    idf['b4rep_chng'] = idf['price_atrep']/idf['price_b4rep']-1

    idf, feat_cols = create_features(idf, ticker, lead=b4rep_window, ret_lag=ret_lag, vol_lag=vol_lag,
                                     tr=growth_tr, vol_window=vol_window, sides=sides)
    feat_cols.append('fwd')
    feat_cols.append('tick')

    # One sample per report: the row whose previous label is the window start.
    idf = idf[idf.idate == idf.b4rep_window]

    return idf[feat_cols].dropna()
def nasdaq_features(combined, params=None):
    """Stack the per-ticker feature rows of every ticker into one frame.

    Parameters
    ----------
    combined : mapping of str to pandas.DataFrame
        Per-ticker frames; every key is passed to ``ticker_features``.
    params : dict, optional
        Keyword arguments for ``ticker_features``. When ``None`` a default
        set is used, whose ``b4rep_window`` comes from the notebook-level
        ``b4rep_window`` variable.

    Returns
    -------
    pandas.DataFrame
        All feature rows sorted by index; an empty frame when no ticker
        produced any result.
    """
    if params is None:
        params = {
            'b4rep_window': b4rep_window,
            'growth_tr': 0.02,
            'back_window': 28,
            'ret_lag': [1,2,5,10,22,44,51,66,118,132,246,261,375,480,520],
            'vol_lag': [1,2,5,10,22,44,51,66,118,132],
            'vol_window': 100,
            'sides': 0.01
        }

    # Collect first and concatenate once: DataFrame.append inside a loop
    # copies the accumulated frame every iteration and no longer exists in
    # pandas 2.x.
    frames = []
    for ticker in combined:
        features = ticker_features(combined, ticker, **params)
        if features is not None:  # tickers with no rows yield None
            frames.append(features)

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames).sort_index()


In [5]:
%%time
df = nasdaq_features(combined, params=None)
#### ADDED COLUMN FOR TESTING RETURNS
#df['case_id'] = np.linspace(1, df.shape[0], df.shape[0])
df.index = df.index
df['date:tick'] = df.index.astype(str)+':'+df['tick']

#### ALSO AMENDED my_tss function to account for new column
df.head()

CPU times: user 33.7 s, sys: 184 ms, total: 33.8 s
Wall time: 33.9 s


Unnamed: 0,vol1,vol2,vol5,vol10,vol22,vol44,vol51,vol66,vol118,vol132,...,ret132,ret246,ret261,ret375,ret480,ret520,ycol,fwd,tick,date:tick
2002-02-01,0.520735,0.553674,0.6278,0.805963,0.847464,0.887232,0.964082,0.386751,0.938787,0.964084,...,0.308752,0.221616,0.136592,0.245488,0.396372,1.0,1,0.028706,sxt,2002-02-01:sxt
2002-02-05,0.431302,0.479171,0.0,0.0,0.039793,0.109718,0.130907,0.227047,0.455813,0.659656,...,0.447544,0.332483,0.371083,0.863058,0.0,0.766544,0,-0.040737,dci,2002-02-05:dci
2002-02-05,0.480296,0.470399,0.538486,0.141694,0.240743,0.003635,0.0,0.045914,0.6759,0.837344,...,0.303974,0.079071,0.281811,0.111068,0.82808,1.0,0,-0.000746,ndsn,2002-02-05:ndsn
2002-02-05,0.15804,0.301677,0.277521,0.300602,0.289138,0.334693,0.372043,0.472216,0.612167,0.499342,...,0.517301,0.394961,0.399169,0.374585,0.0,0.0,0,-0.056228,arw,2002-02-05:arw
2002-02-11,0.509447,0.616043,0.454173,0.517911,0.508214,0.657546,0.522989,0.452308,0.766881,0.80494,...,0.688424,0.578354,0.586273,0.910341,0.155651,0.334553,0,-0.00457,pzza,2002-02-11:pzza


In [6]:
# Load the previously trained classifier from disk (nothing is saved here).
filename = 'inference_before_mid.sav'

# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
# The `with` block closes the file handle, which the bare open() call leaked.
with open(filename, 'rb') as model_file:
    estimatorCopy = pickle.load(model_file)
#result = estimatorCopy.score(validx, validy)
print(estimatorCopy)

RandomForestClassifier(bootstrap=False, criterion='entropy',
                       max_features=0.7428567749319119, min_samples_leaf=4,
                       n_estimators=299, n_jobs=-1, random_state=1,
                       verbose=False)


In [7]:
# Feature columns passed to the model, in this order.
cols=['vol1', 'vol2', 'vol5', 'vol10', 'vol22', 'vol44', 'vol51', 'vol66',
       'vol118', 'vol132', 'ret1', 'ret2', 'ret5', 'ret10', 'ret22', 'ret44',
       'ret51', 'ret66', 'ret118', 'ret132', 'ret246', 'ret261', 'ret375',
       'ret480', 'ret520']
num_companies=400
# Tickers of the last int(num_companies*1.3) rows of the feature table.
next_reports = df['tick'][-int(num_companies*1.3):]
ticker_proba = {}
existing_reports = []
for ticker in next_reports:
    try:
        # Score the most recent row of the ticker's (feature-augmented) frame.
        pred_features = combined[ticker][-1:][cols]
        # NOTE(review): column 2 is used here while the earnings-calendar cell
        # further down uses column 1 -- confirm which class is intended.
        ticker_proba[ticker] = estimatorCopy.predict_proba(pred_features)[:,2]
        existing_reports.append(ticker)
    except Exception:
        # Best effort: report the ticker and carry on. A bare `except:` would
        # also swallow KeyboardInterrupt/SystemExit and make the loop
        # impossible to interrupt.
        print('except:')
        print(ticker)

except:
tdy


In [8]:
# Tickers ordered from highest to lowest predicted probability.
dict(sorted(ticker_proba.items(), key=lambda pair: pair[1], reverse=True))

{'trmk': array([0.75070075]),
 'cfr': array([0.69554865]),
 'dhc': array([0.66651537]),
 'enr': array([0.65672082]),
 'tho': array([0.65206243]),
 'alex': array([0.65179965]),
 'oge': array([0.65134576]),
 'safm': array([0.64566014]),
 'fult': array([0.64473642]),
 'ale': array([0.61901577]),
 'iboc': array([0.6173276]),
 'blkb': array([0.60685619]),
 'fnb': array([0.60114668]),
 'chdn': array([0.58150979]),
 'svc': array([0.57821309]),
 'rig': array([0.57748049]),
 'pacw': array([0.57249562]),
 'y': array([0.57060041]),
 'vly': array([0.56523332]),
 'boh': array([0.56260551]),
 'wwd': array([0.56137124]),
 'cxw': array([0.55784361]),
 'pbf': array([0.55680045]),
 'msm': array([0.55458672]),
 'mdp': array([0.54968944]),
 'krc': array([0.54577162]),
 'knx': array([0.54438605]),
 'mur': array([0.54138398]),
 'sabr': array([0.51944577]),
 'ghc': array([0.51917503]),
 'livn': array([0.51881669]),
 'ter': array([0.5173276]),
 'ugi': array([0.51383978]),
 'hain': array([0.51196847]),
 'amcx'

In [9]:
# import datetime
from datetime import datetime
from yahoo_earnings_calendar import YahooEarningsCalendar
import math

# Delay handed to the YahooEarningsCalendar client.
my_custom_delay_s = 0.01

yec = YahooEarningsCalendar(my_custom_delay_s)

# yec.get_next_earnings_date('box') returns the next earnings date as a Unix
# timestamp, e.g. 1508716800.

ticker_date = {}
sml_tickers = []
today = datetime.today()
# Keep tickers whose next report falls strictly between 14 and 18 days from
# now (an upper bound of 16 days was used before).
start = today + timedelta(days=14)
end = today + timedelta(days=18)


for ticker in set(existing_reports):
    pred_features = combined[ticker][-1:][cols]
    prediction = estimatorCopy.predict_proba(pred_features)[:,1]
    # `prediction` holds one value (one input row). Index it explicitly
    # instead of relying on implicit array-to-scalar conversion, which recent
    # NumPy versions deprecate.
    index = str(ticker)+':'+str(math.floor(100*prediction[0]))

    try:
        date = datetime.fromtimestamp(yec.get_next_earnings_date(ticker))
        ticker_date[index] = date
        if start < date < end:
            sml_tickers.append(ticker)
    except Exception:
        # Lookup failed: store a sentinel date so the ticker sorts first.
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        ticker_date[index] = datetime(1000, 1, 1, 1, 1)


{k: v for k, v in sorted(ticker_date.items(), key=lambda item: item[1])}


{'vvv:31': datetime.datetime(1000, 1, 1, 1, 1),
 'jw a:21': datetime.datetime(1000, 1, 1, 1, 1),
 'nfg:30': datetime.datetime(1000, 1, 1, 1, 1),
 'casy:53': datetime.datetime(1000, 1, 1, 1, 1),
 'alex:22': datetime.datetime(2020, 7, 30, 6, 0),
 'stl:43': datetime.datetime(2021, 1, 20, 6, 0),
 'tcf:42': datetime.datetime(2021, 1, 22, 6, 0),
 'cr:26': datetime.datetime(2021, 1, 25, 6, 0),
 'cohr:28': datetime.datetime(2021, 1, 26, 6, 0),
 'snv:25': datetime.datetime(2021, 1, 26, 6, 0),
 'pii:19': datetime.datetime(2021, 1, 26, 6, 0),
 'umbf:41': datetime.datetime(2021, 1, 26, 6, 0),
 'trmk:8': datetime.datetime(2021, 1, 26, 6, 0),
 'slgn:18': datetime.datetime(2021, 1, 26, 6, 0),
 'rnr:39': datetime.datetime(2021, 1, 26, 6, 0),
 'ehc:47': datetime.datetime(2021, 1, 26, 6, 0),
 'caty:26': datetime.datetime(2021, 1, 27, 6, 0),
 'pb:37': datetime.datetime(2021, 1, 27, 6, 0),
 'seic:40': datetime.datetime(2021, 1, 27, 6, 0),
 'cree:24': datetime.datetime(2021, 1, 27, 6, 0),
 'nycb:25': datet

In [10]:
# NOTE: this rebinds `df` (previously the feature table) to the shortlist;
# the name is kept because the cells below display `df`.
df = pd.DataFrame(sml_tickers)
# The context manager saves and closes the workbook on exit;
# ExcelWriter.save() is deprecated and no longer exists in pandas 2.x.
with pd.ExcelWriter('mid_tickers.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='welcome', index=False)

In [11]:
# Number of rows in `df`, which at this point holds the shortlisted tickers.
len(df)

20

In [12]:
# Display the shortlisted tickers (a single unnamed column, 0).
df

Unnamed: 0,0
0,cno
1,ncr
2,thc
3,jll
4,hiw
5,rga
6,oi
7,avnt
8,blkb
9,enr
