In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook
import warnings
warnings.filterwarnings("ignore")
import random
import math
import pickle
from datetime import datetime, timedelta
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

from skopt import gp_minimize
from skopt.space import Integer, Real, Categorical
from skopt.utils import use_named_args
from scipy.stats.mstats import gmean
from sklearn.metrics import matthews_corrcoef as matt

In [2]:
close = pd.read_parquet('rus_close_201120.parquet').iloc[:,:1000]
volume = pd.read_parquet('rus_volume_201120.parquet').iloc[:,:1000]

market = pd.read_excel("market.xlsx", index_col='date')

In [3]:
from backtester import Portfolio, Calc_invest

In [4]:
def year_start(date):
    """
    Makes the model aware of current date
    """
    if date.year % 4 == 0:
        base = 366
    else:
        base = 365
        
    ystart = pd.to_datetime(str(date.year) + '-01-01')
    return (date - ystart).days / base


def get_tick_df(x, split=pd.to_datetime('2017-01-01')):
    """
    Combines information for a ticker from 'close' and 'volume' dataframes into one,
    Calculates 'valtr' column - 'value traded',
    Splits data into master 'train' and 'test' parts
    
    x - column index for the respective column date
    split - date to split into 'test' and 'train' parts
    """
    tick_df = close.iloc[:,x:x+2]
    tick = tick_df.columns[1]
    tick_df['volume'] = volume[tick]
    tick_df.columns = ['date', tick, 'volume']
    tick_df = tick_df.set_index('date').dropna()
    tick_df['valtr'] = tick_df[tick] * tick_df['volume'] / 1000000
    train = tick_df.loc[:split]
    test = tick_df.copy()
    
    return train, test

market['dt'] = market.index
market['dt'] = market['dt'].apply(year_start)
market['spyval'] = market['spy'] * market['spyv'] / 1000000

In [5]:
x = 0

train_df = {}
test_df = {}
split = pd.to_datetime('2017-01-01')

for _ in tqdm_notebook(range(int(close.shape[1]/2))):
    
    tick = close.iloc[:,x+1].name
    if close[tick].dropna().shape[0] == 0:
        # some are empty, these are ignored
        x+=2
        continue
    else:
        train_df[tick], test_df[tick] = get_tick_df(x, split=split)
        x+=2

train_market = market.loc[:split]
test_market = market.copy()

train_market.index[-1], test_market.index[0]

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




(Timestamp('2016-12-30 00:00:00'), Timestamp('2002-01-02 00:00:00'))

In [6]:
def check_cv_error(cv_periods, verbose=True):
    """
    Checks for look-ahead biases and other errors in cv periods,
    reports data if required so or if silent - raises exceptions
    """
    sub_start = cv_periods['sub_start']
    sub_end = cv_periods['sub_end']
    val_start = cv_periods['val_start']
    val_end = cv_periods['val_end']
#     test_start = cv_periods['test_start']
#     test_end = cv_periods['test_end']
    
    if verbose:
        if sub_start > sub_end:
            print("Training sample dates error")

        if sub_end > val_start:
            print("Validation starts before training ends!")

        if val_start > val_end:
            print("Validation sample dates error")
        print("Training starts at", sub_start, "Training sample size is", (sub_end - sub_start).days, 
              "\nValidation starts at", val_start, "Validation sample size is", (val_end - val_start).days,
              "\nNumber of days purged between training and validation is", (val_start - sub_end).days)
    
    else:
        if sub_start > sub_end:
            raise Exception("Training sample dates error")

        if sub_end > val_start:
            raise Exception("Validation starts before training ends!")

        if val_start > val_end:
            raise Exception("Validation sample dates error")
        
    pass

def shift_cv(cv_periods, shift=12, drag=False):
    """
    Shifts cross-validation periods by a set number of months
    """ 
    
    new_periods = {}
    for k, v in cv_periods.items():
        if (k == 'sub_start') and (drag == False):
            new_periods[k] = v
        else:
            new_periods[k] = (v + np.timedelta64(shift, 'M')).round('d')
    
    return new_periods

In [7]:
def zero_to_one(series, log=True):
    """
    Scales data from 0 to 1 while not looking ahead because of cummin/cummax methods.
    If log=True, first takes np.log of the series
    """
    if log:
        return (np.log(series) - np.log(series).cummin()) / (np.log(series).cummax() - np.log(series).cummin() )
    else:
        return (series - series.cummin()) / (series.cummax() - series.cummin() )
    
def prod_return(valdf, compress=0.5, sharpe=False, com=0.002, penalize=0.1):
    
    if 'predy' not in valdf.columns:
        print("No 'predy' column found!")
        return 0
    elif 'fwd' not in valdf.columns:
        print("No 'fwd' column found!")
        return 0
    elif (compress != None) and ( compress <= 0):
        print("'compress' input must be >0!")
        return 0
    
    else:
        if compress != None:
            valdf['fwd'] = np.where( valdf['fwd'] > 0.5, 0.5, valdf['fwd'] )
        
        valdf['pred_return'] = 1 + valdf['predy'] * (valdf['fwd'] - com)
        
        if valdf.predy.sum() >= 1:
            prod_return = valdf['pred_return'].prod()**(1.0/len(valdf[valdf.predy == 1])) - 1
        else:
            return -valdf.testy.mean()
        
        if penalize:
            if 'testy' not in valdf.columns:
                print("No 'testy' column found!")
                return 0
            
            ratio = valdf['predy'].mean()/valdf['testy'].mean()
            if ratio < penalize:
                prod_return = prod_return * ratio / penalize
        
        if sharpe:
            
            return (prod_return / valdf[valdf.predy == 1]['fwd'].std())
            
        else:
            return prod_return

def preproc_mkt(mkt_df, price_lags=[1,5,20,250], vol_lags=[2,10]):
    
    """
    Prepares data for market dataframe. 
    Creates delta features for every price_lags unit for spy price, 10year rate, 3month rate and oil price.
    Creates rolling average features for 1 and every vol_lags unit for vix, dollar index and spy volume. 
    
    returns clean market dataframe and columns of the dataframe
    Attention! returing dataframe contains a non-feature column 'spyv', used for further relative feature creation 
    """
    
    for lag in price_lags:
        ret_name = 'spy' + str(lag)
        mkt_df[ret_name] = mkt_df['spy']/mkt_df['spy'].shift(lag)-1
        mkt_df[ret_name] = zero_to_one(mkt_df[ret_name], log=False)

        ret_name = '10y' + str(lag)
        mkt_df[ret_name] = mkt_df['rate10y']/mkt_df['rate10y'].shift(lag)-1
        mkt_df[ret_name] = zero_to_one(mkt_df[ret_name], log=False)

        ret_name = '3m' + str(lag)
        mkt_df[ret_name] = mkt_df['rate3m']/mkt_df['rate3m'].shift(lag)-1
        mkt_df[ret_name] = zero_to_one(mkt_df[ret_name], log=False)

        ret_name = 'oil' + str(lag)
        mkt_df[ret_name] = mkt_df['oil']/mkt_df['oil'].shift(lag)-1
        mkt_df[ret_name] = zero_to_one(mkt_df[ret_name], log=False)


    mkt_df['spyvol'] = zero_to_one(mkt_df['spyv'], log=True)
    
    mkt_df['dxy'] = zero_to_one(mkt_df['dxy'], log=False)
    
    mkt_df['s_vix'] = zero_to_one(mkt_df['vix'], log=True)

    for lag in vol_lags:
        mkt_df['spyvol' + str(lag)] = mkt_df['spyvol'].rolling(lag).mean()
        mkt_df['dxy' + str(lag)] = mkt_df['dxy'].rolling(lag).mean()
        mkt_df['s_vix' + str(lag)] = mkt_df['s_vix'].rolling(lag).mean()

    junk_cols = ['vix', 'spy', 'spyv', 'rate10y', 'rate3m', 'oil']

    mkt_df = mkt_df.drop(columns=junk_cols)
    return mkt_df, mkt_df.columns

def preproc_tick(tick, fulldf, market_df, market_cols, fwd=5, 
                 cut_first=150, price_lags=[1,2,5,10,20,50,100,250], vol_lags=[2,5,10,20]):
    """
    Prepares data for a ticker dataframe. 
    Creates delta features for every price_lags unit for close price.
    Creates rolling average features for 1 and every vol_lags unit for volatility and value traded. 
    Cuts first 150 days, because they are too noisy usually due to cummax/cummin methods
    
    Parameters:
        tick - ticker used
        fulldf - dictionary with all tickers that is limited by cv_periods
        market_df - output of preproc_mkt function
        market_cols - output of preproc_mkt function
        fwd - forecasting period in days
    
    returns clean ticker dataframe 
    """
    
    idf = fulldf[tick]
    idf['fwd'] = idf[tick].shift(-fwd) / idf[tick] - 1
    
    fwd_tr = 0.141986 * np.sqrt(fwd/252)
    
    idf['ycol'] = np.where( idf['fwd'] >= fwd_tr, 1, 0 )
    
    idf[market_cols] = market_df
    
    for lag in price_lags:
        ret_name = 'ret' + str(lag)
        idf[ret_name] = idf[tick]/idf[tick].shift(lag)-1
        idf[ret_name] = zero_to_one(idf[ret_name], log=False)

    idf['relvol'] = np.log(idf['valtr']) / np.log(idf['spyval'])
    idf['vol'] = zero_to_one(idf['volume'], log=True)
    idf['val'] = zero_to_one(idf['valtr'], log=True)

    for lag in vol_lags:
        idf['vol' + str(lag)] = idf['vol'].rolling(lag).mean()
        idf['val' + str(lag)] = idf['val'].rolling(lag).mean()
        idf['rel_val' + str(lag)] = idf['relvol'].rolling(lag).mean()
    
    idf['tick'] = tick
    idf['vlt'] = idf['ret1'].std() #Volatility
    
    junk_cols = [tick, 'volume', 'valtr', 'spyval']
    return idf.drop(columns=junk_cols).iloc[cut_first:].dropna()
    

In [8]:
class StepCV():
    
    def __init__(self, full_data=None, full_market=None, cv_periods=None, proc_data_func=None, 
                 proc_mkt_func=None, settings=None):
        
        self.full_data = full_data
        self.full_market = full_market
        self.cv_periods = cv_periods
        self.settings = settings
        self.proc_data_func = proc_data_func
        self.proc_mkt_func = proc_mkt_func
        
        if self.settings == None:
            self.genmodel = LGBMClassifier
            check_cv_error(self.cv_periods, verbose=False)
            self.split()
            self.preproc_data()
            self.model = self.genmodel(random_state=1, n_jobs=-1)
            
        else:
            self.genmodel = self.settings['genmodel']
            check_cv_error(cv_periods, verbose=self.settings['verbose'])
            self.split()
            self.preproc_data()
            self.model = self.genmodel(random_state=1, n_jobs=-1, **settings['model_settings'])
            
        self.train_X = self.train.drop(columns = ['ycol', 'fwd', 'vlt'])
        self.train_y = self.train['ycol']
        val_X = self.val.drop(columns = ['ycol', 'fwd', 'vlt'])
        val_y = self.val['ycol']
        
        try:
            self.model.fit(self.train_X, self.train_y)
        except:
            raise Exception('Error')
        
        if self.settings == None:
            val_X['proby'] = self.model.predict_proba(val_X)[:,1]
            val_X['predy'] = np.where(val_X['proby'] > 0.5, 1, 0)
        else:
            val_X['proby'] = self.model.predict_proba(val_X)[:,1]
            val_X['predy'] = np.where(val_X['proby'] > settings['threshold'], 1, 0)
            
        val_X['fwd'] = self.val['fwd']
        val_X['testy'] = val_y
        self.val_X = val_X
        self.valx = self.val_X[['testy','predy','proby','fwd']]
        self.val_score = prod_return(self.valx.copy())
        self.matt_score = matt( self.valx['testy'], self.valx['predy'] )
    
    def split(self):
        
        self.sub_df = {}
        self.val_df = {}

        for k, v in self.full_data.items():

            self.sub_df[k] = v.loc[self.cv_periods['sub_start']:self.cv_periods['sub_end']]
            self.val_df[k] = v.loc[self.cv_periods['sub_start']:self.cv_periods['val_end']]

        self.market_sub = self.full_market.loc[self.cv_periods['sub_start']:self.cv_periods['sub_end']].copy()
        self.market_val = self.full_market.loc[self.cv_periods['sub_start']:self.cv_periods['val_end']].copy()
        
    def preproc_data(self):
        
        self.train = {}
        self.val = {}
        self.vols = {}
        
        if self.settings == None:
            self.market_sub, _ = self.proc_mkt_func(self.market_sub)
            self.market_val, self.market_cols = self.proc_mkt_func(self.market_val)
            
            for tick in tqdm_notebook(self.sub_df.copy().keys()):
                self.train[tick] = self.proc_data_func(tick, self.sub_df, self.market_sub, self.market_cols)
                self.val[tick] = self.proc_data_func(tick, self.val_df, self.market_val, 
                                                     self.market_cols).loc[self.cv_periods['val_start']:]
        
        else:
            self.market_sub, _ = self.proc_mkt_func(self.market_sub, **self.settings['preproc_mkt'])
            self.market_val, self.market_cols = self.proc_mkt_func(self.market_val, **self.settings['preproc_mkt'])
            
            for tick in self.sub_df.copy().keys():
                self.train[tick] = self.proc_data_func(tick, self.sub_df, self.market_sub, 
                                                       self.market_cols, **self.settings['preproc_tick'])
                try:
                    self.vols[tick] = self.train[tick]['vlt'].iloc[-1]
                except:
                    self.vols[tick] = 0
                self.val[tick] = self.proc_data_func(tick, self.val_df, self.market_val, self.market_cols, 
                                                     **self.settings['preproc_tick']).loc[self.cv_periods['val_start']:]
        
        nlargest = int(len(self.sub_df.keys())*self.settings['top_vol'])
        self.vols = pd.Series(self.vols).nlargest(nlargest)
        self.train = {c: self.train[c] for c in self.vols.index}
        self.val = {c: self.val[c] for c in self.vols.index}             
        
        self.train = pd.concat( self.train.values() ).sort_index()
        self.train['tick date'] = self.train.index.astype(str) + ':' + self.train['tick']
        self.train = self.train.set_index('tick date').drop(columns=['tick'])

        self.val = pd.concat( self.val.values() ).sort_index()
        self.val['tick date'] = self.val.index.astype(str) + ':' + self.val['tick']
        self.val = self.val.set_index('tick date').drop(columns=['tick'])
        
        pass



In [9]:
settings = {
    'genmodel': LGBMClassifier, 
    'verbose': False, 
    'model_settings': {
        'boosting_type': 'gbdt', 
        'num_leaves': 2663, 
        'max_depth': -1, 
        'learning_rate': 0.08882030882895775, 
        'n_estimators': 844, 
        'subsample_for_bin': 200000, 
        'objective': None, 
        'class_weight': None, 
        'min_split_gain': 0.0, 
        'min_child_weight': 0.00034988764861549547, 
        'min_child_samples': 732, 
        'subsample': 0.670359813425697, 
        'subsample_freq': 1, 
#         'colsample_bytree': 1.0, 
        'reg_alpha': 0.0, 
        'reg_lambda': 0.0, 
        'importance_type': 'split', 
        'colsample_bytree': 0.599917250527772}, 
    'preproc_mkt': {'price_lags': [1, 5, 20, 250], 'vol_lags': [2, 10]}, 
    'preproc_tick': {
        'fwd': 8, 
        'cut_first': 150, 'price_lags': [1, 2, 5, 10, 20, 50, 100, 250], 'vol_lags': [2, 5, 10, 20]}, 
    'threshold': 0.7554631330855286, 
    'top_vol': 0.6940517879788671, 
    'drag': False}

cv_periods = {
        'sub_start': pd.to_datetime('2002-07-01'),
        'sub_end': pd.to_datetime('2005-07-01'),
        'val_start': pd.to_datetime('2005-08-01'),
        'val_end': pd.to_datetime('2006-01-01')
    }

prep = StepCV(full_data=train_df, 
                  full_market=train_market, 
                  cv_periods=cv_periods, 
                  proc_data_func=preproc_tick, 
                  proc_mkt_func=preproc_mkt,
                  settings=settings
        )

In [10]:
calc_inv = Calc_invest(fwd=settings['preproc_tick']['fwd'])
portf = Portfolio(prep.valx['proby'], train_df, days_shift=settings['preproc_tick']['fwd'], 
                      threshold=settings['threshold'], calc_invest=calc_inv)
portf.portf_df

HBox(children=(IntProgress(value=0, max=94), HTML(value='')))




Unnamed: 0_level_0,spx,inXdays,portfolio,part,part1,part2,part3,part4,part5,part6,part7,part8,chosen,spx100,p100
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2005-08-08,1223.13,1219.02,200000.00,1,25000.00,25000.00,25000.00,25000.00,25000.00,25000.00,25000.0,25000.00,['bmy'],100.000000,100.000000
2005-08-09,1231.38,1219.71,200000.00,2,25000.00,25000.00,25000.00,25000.00,25000.00,25000.00,25000.0,25000.00,['all'],100.674499,100.000000
2005-08-10,1229.13,1221.73,200000.00,3,25000.00,25000.00,25000.00,25000.00,25000.00,25000.00,25000.0,25000.00,['fmc'],100.490545,100.000000
2005-08-11,1237.81,1217.57,200000.00,4,25000.00,25000.00,25000.00,25000.00,25000.00,25000.00,25000.0,25000.00,['isrg'],101.200199,100.000000
2005-08-12,1230.39,1209.59,200000.00,5,25000.00,25000.00,25000.00,25000.00,25000.00,25000.00,25000.0,25000.00,['isrg'],100.593559,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-12-13,1267.43,1259.92,224175.15,2,22346.17,26742.02,28922.59,31705.32,31714.86,31999.46,23190.6,27554.13,spx,103.621855,112.087575
2005-12-14,1272.74,1259.92,224343.67,3,22346.17,26742.02,29091.11,31705.32,31714.86,31999.46,23190.6,27554.13,spx,104.055988,112.171835
2005-12-15,1270.94,1259.92,224564.92,4,22346.17,26742.02,29091.11,31926.57,31714.86,31999.46,23190.6,27554.13,['amt'],103.908824,112.282460
2005-12-16,1267.32,1259.92,223946.80,5,22346.17,26742.02,29091.11,31926.57,31096.74,31999.46,23190.6,27554.13,['eog'],103.612862,111.973400


In [11]:
cv_periods = {
        'sub_start': pd.to_datetime('2002-07-01'),
        'sub_end': pd.to_datetime('2005-07-01'),
        'val_start': pd.to_datetime('2005-08-01'),
        'val_end': pd.to_datetime('2006-01-01')
    }

res = {}

while cv_periods['val_end'] <= split:
    prep = StepCV(full_data=train_df, 
                  full_market=train_market, 
                  cv_periods=cv_periods, 
                  proc_data_func=preproc_tick, 
                  proc_mkt_func=preproc_mkt,
                  settings=settings
        )
    
    calc_inv = Calc_invest(fwd=settings['preproc_tick']['fwd'])
    
    if prep.valx['predy'].sum() > 0:
        portf = Portfolio(prep.valx['proby'], train_df, days_shift=settings['preproc_tick']['fwd'], 
                      threshold=settings['threshold'], calc_invest=calc_inv)

        print(prep.matt_score, prep.val_score, '\n', portf.portf_df[['spx100', 'p100']].iloc[-1], cv_periods['val_end'])
    else:
        
        print(prep.matt_score, prep.val_score, [100, 100], cv_periods['val_end'])
        
    res[cv_periods['val_end']] = prep
    
    cv_periods = shift_cv(cv_periods, drag=settings['drag'])

HBox(children=(IntProgress(value=0, max=94), HTML(value='')))


0.08683619980953124 0.01577198947397518 
 spx100    103.007857
p100      112.114170
Name: 2005-12-19 00:00:00, dtype: float64 2006-01-01 00:00:00


HBox(children=(IntProgress(value=0, max=83), HTML(value='')))


0.018659410840884554 0.000712875424545834 
 spx100    111.095410
p100      112.717995
Name: 2006-12-07 00:00:00, dtype: float64 2007-01-01 00:00:00


HBox(children=(IntProgress(value=0, max=98), HTML(value='')))


0.027324552716074486 0.0055063367822394405 
 spx100     99.261159
p100      120.911795
Name: 2007-12-18 00:00:00, dtype: float64 2008-01-01 00:00:00


HBox(children=(IntProgress(value=0, max=99), HTML(value='')))


0.13827117974924627 -0.002102890563888238 
 spx100    69.851189
p100      58.375585
Name: 2008-12-18 00:00:00, dtype: float64 2008-12-31 00:00:00


HBox(children=(IntProgress(value=0, max=77), HTML(value='')))


0.006435418823848173 0.00046473343123025924 
 spx100    112.267666
p100      114.337980
Name: 2009-12-03 00:00:00, dtype: float64 2009-12-31 00:00:00


HBox(children=(IntProgress(value=0, max=99), HTML(value='')))


0.03556921684599532 0.006595689231006149 
 spx100    110.766880
p100      120.698905
Name: 2010-12-20 00:00:00, dtype: float64 2010-12-31 00:00:00


HBox(children=(IntProgress(value=0, max=97), HTML(value='')))


0.011017265401223327 0.0005576413200168455 
 spx100    95.636892
p100      89.997135
Name: 2011-12-19 00:00:00, dtype: float64 2011-12-31 00:00:00


HBox(children=(IntProgress(value=0, max=92), HTML(value='')))


0.013948785965334293 0.0004935950960588839 
 spx100    102.400202
p100      105.959815
Name: 2012-12-10 00:00:00, dtype: float64 2012-12-30 00:00:00


HBox(children=(IntProgress(value=0, max=92), HTML(value='')))


0.03216661907984389 0.004402162048000461 
 spx100    104.920235
p100      124.040210
Name: 2013-12-17 00:00:00, dtype: float64 2013-12-30 00:00:00


HBox(children=(IntProgress(value=0, max=99), HTML(value='')))


0.22859364705596638 0.038753547882319106 
 spx100    102.173527
p100       99.683020
Name: 2014-12-17 00:00:00, dtype: float64 2014-12-30 00:00:00


HBox(children=(IntProgress(value=0, max=96), HTML(value='')))


0.09104012497050509 0.022137699839269276 
 spx100    97.543137
p100      91.304350
Name: 2015-12-17 00:00:00, dtype: float64 2015-12-30 00:00:00


HBox(children=(IntProgress(value=0, max=2), HTML(value='')))


-0.004594585788264864 -8.704303471631661e-05 
 spx100    100.020164
p100      100.000000
Name: 2016-10-14 00:00:00, dtype: float64 2016-12-29 00:00:00


In [None]:
search_space = [
    
    Categorical(['gbdt', 'dart', 'goss', 'rf'], name='boosting_type'),
    Integer(2, 4096, name='num_leaves'),
    Real(0.0001, 0.3, prior='log-uniform', name='learning_rate'),
    Integer(2, 1000, name='n_estimators'),
    Real(0.01, 0.999, name='subsample'),
    Real(0.0001, 0.3, prior='log-uniform', name='min_child_weight'),
    Integer(5, 1000, name='min_child_samples'),
    Real(0.2, 0.999, prior='uniform', name='colsample_bytree'),
    Integer(2, 15, name='fwd'),
    Real(0.5, 0.9, prior='uniform', name='threshold'),
    Categorical([None, 'balanced'], name='class_weight'),
    Categorical([True, False], name='drag'),
    Real(0.1, 1.0, prior='uniform', name='top_vol'),
]

@use_named_args(search_space)
def cv_round(**opt_sets):

    cv_periods = {
        'sub_start': pd.to_datetime('2002-07-01'),
        'sub_end': pd.to_datetime('2005-12-01'),
        'val_start': pd.to_datetime('2006-01-01'),
        'val_end': pd.to_datetime('2007-01-01'),
    }
    
    settings = {
        'genmodel': LGBMClassifier,
        'verbose': False,
        'model_settings': {
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'max_depth': -1,
            'learning_rate': 0.1,
            'n_estimators': 100,
            'subsample_for_bin': 200000,
            'objective': None,
            'class_weight': None, #'balanced'
            'min_split_gain': 0.0,
            'min_child_weight': 0.001,
            'min_child_samples': 20,
            'subsample': 1.0,
            'subsample_freq': 1,
            'colsample_bytree': 1.0,
            'reg_alpha': 0.0,
            'reg_lambda': 0.0,
            'importance_type': 'split'
        },
        'preproc_mkt': {
            'price_lags': [1,5,20,250], 
            'vol_lags': [2,10]
        },
        'preproc_tick': {
            'fwd': 10,
            'cut_first': 150, 
            'price_lags': [1,2,5,10,20,50,100,250], 
            'vol_lags': [2,5,10,20]
        }, 
        'threshold': 0.5,
        'top_vol': 0.2, 
        'drag': False
    }
    
    if opt_sets != None:
        
        if ('boosting_type' in opt_sets.keys()) and (opt_sets['boosting_type'] == 'goss') and \
            ('subsample' in opt_sets.keys()):
            del opt_sets['subsample']
            
        
        for k, v in opt_sets.items():
            
            if k in settings.keys():
                settings[k] = v
            elif k in settings['preproc_tick'].keys():
                settings['preproc_tick'][k] = v
            else:
                settings['model_settings'][k] = v
    
    val_dfs = []
    zroz = 0
    
    while cv_periods['val_end'] <= split:
        prep = StepCV(full_data=train_df, 
                      full_market=train_market, 
                      cv_periods=cv_periods, 
                      proc_data_func=preproc_tick, 
                      proc_mkt_func=preproc_mkt,
                      settings=settings
            )
        
        if (prep.valx.predy.sum() == 0):
            print("Futile, next!")
            myfile = open('results.txt', 'a+')
            myfile.write('\n' + str(-prep.valx.testy.mean()) + '\n' + str(settings) + \
                         '\n' + datetime.now().strftime("%H:%M:%S"))
            myfile.close()
            return prep.valx.testy.mean()
        
        val_dfs.append(prep.valx)
        cv_periods = shift_cv(cv_periods, drag=settings['drag'])

#     score = -prod_return(pd.concat(val_dfs))
    val_dfs = pd.concat(val_dfs)
    score = -matt( val_dfs['testy'], val_dfs['predy'] )
    
    calc_inv = Calc_invest(fwd=settings['preproc_tick']['fwd'])
    portf = Portfolio(val_dfs['proby'], train_df, days_shift=settings['preproc_tick']['fwd'], 
                      threshold=settings['threshold'], calc_invest=calc_inv)

    portf_return = round(portf.portf_df['p100'].iloc[-1],2)
    spx_return = round(portf.portf_df['spx100'].iloc[-1],2)
    
    myfile = open('results.txt', 'a+')
    myfile.write('\nMatt score: ' + str(round(-score, 5)) + '\tReturn: ' + str(portf_return) + 
                 '\tSPX return: ' + str(spx_return) + 
                 '\n' + str(settings) + '\n' + datetime.now().strftime("%H:%M:%S"))
    myfile.close()
    

    if np.isnan(score):
        return 0
    else:
        return score 

In [None]:
myfile = open('results.txt', 'w')
myfile.write("started\n")
myfile.close()

# result = gp_minimize(cv_round, search_space, random_state=1, n_calls=200, verbose=True, n_initial_points=25)
result = gp_minimize(cv_round, search_space, random_state=1, n_calls=30, verbose=True, n_initial_points=10)

In [None]:
found = {'genmodel': LGBMClassifier,
 'verbose': False,
 'model_settings': {'boosting_type': 'gbdt',
  'num_leaves': 2663,
  'max_depth': -1,
  'learning_rate': 0.08882030882895775,
  'n_estimators': 844,
  'subsample_for_bin': 200000,
  'objective': None,
  'class_weight': None,
  'min_split_gain': 0.0,
  'min_child_weight': 0.00034988764861549547,
  'min_child_samples': 732,
  'subsample': 0.670359813425697,
  'subsample_freq': 1,
  'reg_alpha': 0.0,
  'reg_lambda': 0.0,
  'importance_type': 'split',
  'colsample_bytree': 0.599917250527772},
 'preproc_mkt': {'price_lags': [1, 5, 20, 250], 'vol_lags': [2, 10]},
 'preproc_tick': {'fwd': 8,
  'cut_first': 150,
  'price_lags': [1, 2, 5, 10, 20, 50, 100, 250],
  'vol_lags': [2, 5, 10, 20]},
 'threshold': 0.7554631330855286,
 'top_vol': 0.6940517879788671,
 'drag': False}

In [None]:
search_space = [
    
    Categorical(['gbdt', 'dart', 'goss', 'rf'], name='boosting_type'),
    Integer(2, 4096, name='num_leaves'),
    Real(0.0001, 0.3, prior='log-uniform', name='learning_rate'),
    Integer(2, 1000, name='n_estimators'),
    Real(0.01, 0.999, name='subsample'),
    Real(0.0001, 0.3, prior='log-uniform', name='min_child_weight'),
    Integer(5, 1000, name='min_child_samples'),
    Real(0.2, 0.999, prior='uniform', name='colsample_bytree'),
    Integer(2, 15, name='fwd'),
    Real(0.5, 0.9, prior='uniform', name='threshold'),
    Categorical([None, 'balanced'], name='class_weight'),
    Categorical([True, False], name='drag'),
    Real(0.1, 1.0, prior='uniform', name='top_vol'),
]

opt_sets = {}
x = 0

for opt in result.x:
    opt_sets[search_space[x].name] = opt
    x+=1
    

cv_periods = {
    'sub_start': pd.to_datetime('2013-12-01'),
    'sub_end': pd.to_datetime('2016-12-01'),
    'val_start': pd.to_datetime('2017-01-01'),
    'val_end': pd.to_datetime('2017-04-01'),
}

settings = {
        'genmodel': LGBMClassifier,
        'verbose': False,
        'model_settings': {
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'max_depth': -1,
            'learning_rate': 0.1,
            'n_estimators': 100,
            'subsample_for_bin': 200000,
            'objective': None,
            'class_weight': None, #'balanced'
            'min_split_gain': 0.0,
            'min_child_weight': 0.001,
            'min_child_samples': 20,
            'subsample': 1.0,
            'subsample_freq': 1,
            'colsample_bytree': 1.0,
            'reg_alpha': 0.0,
            'reg_lambda': 0.0,
            'importance_type': 'split'
        },
        'preproc_mkt': {
            'price_lags': [1,5,20,250], 
            'vol_lags': [2,10]
        },
        'preproc_tick': {
            'fwd': 10,
            'cut_first': 150, 
            'price_lags': [1,2,5,10,20,50,100,250], 
            'vol_lags': [2,5,10,20]
        }, 
        'threshold': 0.5,
        'top_vol': 0.2, 
        'drag': False
    }
    
if opt_sets != None:

    if ('boosting_type' in opt_sets.keys()) and (opt_sets['boosting_type'] == 'goss') and \
        ('subsample' in opt_sets.keys()):
        del opt_sets['subsample']


    for k, v in opt_sets.items():

        if k in settings.keys():
            settings[k] = v
        elif k in settings['preproc_tick'].keys():
            settings['preproc_tick'][k] = v
        else:
            settings['model_settings'][k] = v

val_dfs = []

while cv_periods['val_end'] < test_df['aa'].index[-1]:
    prep = StepCV(full_data=test_df, 
                  full_market=test_market, 
                  cv_periods=cv_periods, 
                  proc_data_func=preproc_tick, 
                  proc_mkt_func=preproc_mkt,
                  settings=settings
        )

    val_dfs.append(prep.valx)
    print("Calculated until", cv_periods['val_end'], "Local valscore is", round(prep.val_score*100,2),
         "Local genscore is", round( gmean(prep.valx.fwd + 1)*100 - 100, 2 ))
    cv_periods = shift_cv(cv_periods, drag=settings['drag'], shift=3)

score = prod_return(pd.concat(val_dfs))
print("Testing score", round(score*100,2))

In [None]:
calc_inv = Calc_invest(fwd=settings['preproc_tick']['fwd'])
portf = Portfolio(pd.concat(val_dfs)['proby'], test_df, days_shift=settings['preproc_tick']['fwd'], 
                      threshold=settings['threshold'], calc_invest=calc_inv)

In [None]:
portf.portf_df[['spx100', 'p100']].plot(figsize=[16,8])