In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from ta import add_all_ta_features
from ta.trend import ema_indicator
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import datetime

from strategy import *

In [14]:
# Code reference: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 250, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2] # [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1] # [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
# Use the random grid to search for best hyperparameters
# First create the base model to tune
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores


In [24]:
random_grid

{'n_estimators': [200, 205, 211, 216, 222, 227, 233, 238, 244, 250],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2],
 'min_samples_leaf': [1],
 'bootstrap': [False]}

In [15]:
def run_cerebro(dataname,train_start,random_grid,strategy):
    df = pd.read_csv(dataname+'.csv',index_col=0)
    

    df.index = pd.to_datetime(df.index)
    X_train, y_train = df.loc[train_start:'2018-01-01',:].iloc[:,:-2], df.loc[train_start:'2018-01-01']['state']
    X_test, y_test = df.loc['2018-01-01':'2022-01-01',:].iloc[:,:-2], df.loc['2018-01-01':'2022-01-01']['state']

    rf = RandomForestClassifier()
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
    rf_random.fit(X_train, y_train)
    params = rf_random.best_params_

    model = RandomForestClassifier(**params).fit(X_train, y_train)

    # model = RandomForestClassifier().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    score = model.score(X_test,y_test)
    f1 = f1_score(y_pred,y_test,average='weighted')

    df.loc['2018-01-01':'2022-01-01',['state']] = y_pred.reshape(-1,1)



    # Backtesting
    # df['Date']=pd.to_datetime(df.index)
    df = df.reset_index()
    # state_column = df.shape[1]-1
    #     # Instantiate Cerebro engine
    # cerebro = bt.Cerebro()

    # data = PandasData(
    #     dataname = df,
    #     fromdate=datetime.datetime(2018, 1, 2),
    #     todate=datetime.datetime(2021, 12, 30),
    #     datetime = 0,
    #     open = 1,
    #     high = 2,
    #     low = 3,
    #     close = 4,
    #     state = state_column,
    # )

    # # Add data to Cerebro
    # cerebro.adddata(data)

    # # Add strategy to Cerebro
    # cerebro.addstrategy(strategy)

    # # Default position size
    # cerebro.addsizer(bt.sizers.SizerFix, stake=1)

    # # Add analytics to Cerebro
    # cerebro.addanalyzer(btanalyzers.SharpeRatio, _name='SharpeRatio')
    # cerebro.addanalyzer(btanalyzers.AnnualReturn, _name='AnnualReturn')
    # cerebro.addanalyzer(btanalyzers.DrawDown, _name='DrawDown')
    # cerebro.addanalyzer(btanalyzers.TimeDrawDown, _name='TimeDrawDown')
    # cerebro.addanalyzer(btanalyzers.PositionsValue, _name='PositionsValue')
    # cerebro.addanalyzer(btanalyzers.LogReturnsRolling, _name='LogReturnsRolling')
    # cerebro.addanalyzer(btanalyzers.PeriodStats, _name='PeriodStats')
    # cerebro.addanalyzer(btanalyzers.Returns, _name='Returns')
    # cerebro.addanalyzer(btanalyzers.TradeAnalyzer, _name='TradeAnalyzer')
    # cerebro.addanalyzer(btanalyzers.Transactions, _name='Transactions')

    # # Run Cerebro Engine
    # start_portfolio_value = cerebro.broker.getvalue()

    # # cerebro.run()
    # thestrats = cerebro.run()
    # thestrat = thestrats[0]

    # end_portfolio_value = cerebro.broker.getvalue()
    # pnl = end_portfolio_value - start_portfolio_value

    # print(f'Starting Portfolio Value: {start_portfolio_value:2f}')
    # print(f'Final Portfolio Value: {end_portfolio_value:2f}')
    # print(f'PnL: {pnl:.2f}\n')


    # sharpe = thestrat.analyzers.SharpeRatio.get_analysis()['sharperatio']
    # # print(thestrat.analyzers.AnnualReturn.get_analysis())
    # # print(thestrat.analyzers.DrawDown.get_analysis())
    # # # print(thestrat.analyzers.TimeDrawDown.get_analysis())
    # # print(thestrat.analyzers.PositionsValue.get_analysis())
    # # print(thestrat.analyzers.LogReturnsRolling.get_analysis())
    # # print(thestrat.analyzers.PeriodStats.get_analysis())
    # # print(thestrat.analyzers.Returns.get_analysis())
    # # print(thestrat.analyzers.TradeAnalyzer.get_analysis())
    # # print(thestrat.analyzers.Transactions.get_analysis())
    return df,score,f1,params

In [16]:
# datanames = ['y1_2003','y2_2003','y3_2003','y4_2003','y5_2003','gmm_1_labeled_2003','hmm_1_labeled_2003']
datanames = ['y1_2003','y2_2003','y5_2003','gmm_1_labeled_2003','hmm_1_labeled_2003']
train_start_list = ['2003-01-01','2010-01-04','2015-01-02']

pnl_result = []
df_result = []
pnl = 0
for dataname in datanames:
    for train_start in train_start_list:
        df_final,score,f1,params = run_cerebro(dataname,train_start,random_grid,MarketStatus)
        # pnl_dic = {'dataname':dataname,
        #             'train_start':train_start,
        #             'pnl':pnl,
        #             'sharpe':sharpe,
        #             'score':score,
        #             'f1':f1,
        #             'params':params
        #             }
        df_dic = {'dataname':dataname,
                    'train_start':train_start,
                    'df':df_final,
                    'score':score,
                    'f1':f1,
                    'params':params
        }
        df_result.append(df_dic)
df_result

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[{'dataname': 'y1_2003',
  'train_start': '2003-01-01',
  'df':            Date         Open         High          Low        Close  \
  0    2003-01-02   489.489990   505.179993   489.489990   505.179993   
  1    2003-01-03   505.179993   506.230011   502.100006   504.769989   
  2    2003-01-06   504.769989   517.530029   504.769989   516.039978   
  3    2003-01-07   516.039978   516.650024   510.769989   512.450012   
  4    2003-01-08   512.450012   512.450012   504.529999   505.309998   
  ...         ...          ...          ...          ...          ...   
  4749 2021-12-23  2750.840088  2776.620117  2750.840088  2768.360107   
  4750 2021-12-27  2769.340088  2803.879883  2769.340088  2803.739990   
  4751 2021-12-28  2803.899902  2813.590088  2795.709961  2798.350098   
  4752 2021-12-29  2798.379883  2807.370117  2791.860107  2801.870117   
  4753 2021-12-30  2802.909912  2813.250000  2794.189941  2795.909912   
  
          Adj Close  volatility_bbm  volatility_bbh  volati

In [18]:
df_result_list = df_result

In [23]:
for result in df_result_list:
    name = 'rf_'+result['dataname'][0:2]+'_'+result['train_start'][0:4]+'.csv'
    # print("'"+name+"',")
    result['df'].to_csv(name)

In [17]:
pnl_result_df = pd.DataFrame(pnl_result)
pnl_result_df.sort_values(by='pnl',ascending=False)
pnl_result_df.to_excel('0 rf_result.xlsx')

KeyError: 'pnl'

In [None]:
def StateSmooth(data, n):
    X_train, y_train = data.loc['2000-01-01':'2018-01-01',:].iloc[:,:-1], data.loc['2000-01-01':'2018-01-01',:].iloc[:,-1]
    X_test, y_test = data.loc['2018-01-01':'2022-01-01',:].iloc[:,:-1], data.loc['2018-01-01':'2022-01-01',:].iloc[:,-1]
    model = RandomForestClassifier().fit(X_train, y_train)
    strategy = data.loc['2018-01-01':'2022-01-01',['Open','High','Low','Close','Adj Close']]
    strategy.insert(len(strategy.columns), 'State', '')
    strategy.loc['2018-01-01':'2022-01-01',['State']] = model.predict(X_test)
    for i in range(n):
        strategy[i]=strategy['State'].shift(i)
    strategy.insert(len(strategy.columns), 'New State', '')
    for i in range(len(strategy)):
        strategy.iloc[i,-1] = int(strategy.iloc[i,5:5+n+1].mode().max())
    strategy.insert(len(strategy.columns), 'Position Change', '')
    strategy['Position Change'] = strategy['New State'] - strategy['New State'].shift(1)
    strategy.iloc[0,-1] = strategy.iloc[0,5]
    strategy.iloc[-1,-1] = -strategy.iloc[-1,5]
    strategy.insert(len(strategy.columns), 'Money', 0)
    strategy['Money'] = -strategy['Adj Close'] * strategy['Position Change']
    return strategy

In [None]:
sns.set(rc = {'figure.figsize':(15,8)})
sns.scatterplot(data=data.loc['2000-01-01':'2022-01-01',:], x='Date', y='Close', hue='state',style='state',palette='flare')

NameError: name 'data' is not defined

In [None]:
from sklearn.metrics import confusion_matrix
def plot_cm(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots(figsize=(10, 10)) 
    ax = sns.heatmap(cm, annot=True, fmt="d", cmap=sns.diverging_palette(220, 20, n=7), ax=ax)

    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    ax.set_xticklabels(['Bear','Static','Bull'])
    ax.set_yticklabels(['Bear','Static','Bull'])
    b, t = plt.ylim() # discover the values for bottom and top
    b += 0.5 # Add 0.5 to the bottom
    t -= 0.5 # Subtract 0.5 from the top
    plt.ylim(b, t) # update the ylim(bottom, top) values
    plt.show() # ta-da!

In [None]:
df = pd.read_csv('hmm_1_labeled_2003'+'.csv',index_col=0)


df.index = pd.to_datetime(df.index)
X_train, y_train = df.loc['2015-01-02':'2018-01-01',:].iloc[:,:-2], df.loc['2015-01-02':'2018-01-01']['state']
X_test, y_test = df.loc['2018-01-01':'2022-01-01',:].iloc[:,:-2], df.loc['2018-01-01':'2022-01-01']['state']

# rf = RandomForestClassifier()
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
# rf_random.fit(X_train, y_train)
params = {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 100, 'bootstrap': False}


model = RandomForestClassifier(**params).fit(X_train, y_train)

# model = RandomForestClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)

score = model.score(X_test,y_test)
f1 = f1_score(y_pred,y_test,average='weighted')

df.loc['2018-01-01':'2022-01-01',['predict']] = y_pred.reshape(-1,1)



# Backtesting
# df['Date']=pd.to_datetime(df.index)
df = df.loc['2018-01-01':'2022-01-01'].reset_index()
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,volatility_bbm,volatility_bbh,volatility_bbl,volatility_bbw,...,momentum_kama,others_dr,others_dlr,others_cr,EMA10,EMA100,T10Y3M,T10YIE,state,predict
0,2018-01-02,1585.089966,1595.939941,1585.089966,1595.869995,1595.869995,1578.270001,1602.487458,1554.052544,3.068861,...,1578.753438,0.827661,0.824255,215.901266,1586.432730,1523.688249,1.02,2.00,1,1.0
1,2018-01-03,1595.839966,1605.890015,1595.839966,1605.189941,1605.189941,1580.344501,1606.262649,1554.426353,3.280063,...,1580.169292,0.584004,0.582305,217.746143,1589.843132,1525.302144,1.03,1.98,1,1.0
2,2018-01-04,1605.699951,1613.839966,1605.699951,1611.020020,1611.020020,1583.055499,1609.882381,1556.228617,3.389254,...,1584.576146,0.363202,0.362544,218.900203,1593.693476,1526.999528,1.05,2.01,1,1.0
3,2018-01-05,1612.140015,1621.479980,1612.140015,1621.369995,1621.369995,1586.348499,1614.951917,1557.745080,3.606196,...,1591.759284,0.642449,0.640394,220.948972,1598.725570,1528.868250,1.08,2.01,1,1.0
4,2018-01-08,1621.280029,1624.780029,1617.640015,1624.349976,1624.349976,1589.498999,1620.194113,1558.803885,3.862238,...,1598.050163,0.183794,0.183625,221.538857,1603.384553,1530.758977,1.04,2.02,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
973,2021-12-23,2750.840088,2776.620117,2750.840088,2768.360107,2768.360107,2717.716003,2786.561838,2648.870169,5.066448,...,2723.913013,0.648974,0.646877,447.994803,2730.984040,2676.300265,1.43,2.47,1,1.0
974,2021-12-27,2769.340088,2803.879883,2769.340088,2803.739990,2803.739990,2722.416504,2800.638241,2644.194767,5.746493,...,2725.955470,1.278009,1.269911,454.998225,2744.212394,2678.823824,1.42,2.50,1,1.0
975,2021-12-28,2803.899902,2813.590088,2795.709961,2798.350098,2798.350098,2725.344507,2810.062637,2640.626377,6.217058,...,2729.177026,-0.192239,-0.192424,453.931299,2754.055613,2681.190680,1.43,2.50,1,1.0
976,2021-12-29,2798.379883,2807.370117,2791.860107,2801.870117,2801.870117,2731.161011,2820.021515,2642.300506,6.507160,...,2734.793633,0.125789,0.125710,454.628085,2762.749159,2683.580372,1.50,2.53,1,1.0


In [None]:
df['state'].value_counts()

 1    831
-1     98
 0     49
Name: state, dtype: int64