In [None]:
!pip install backtesting
!pip install pandas_ta
!pip install catboost

In [None]:
import numpy as np
import pandas as pd
import pandas_ta as ta

In [None]:
!unzip data.zip

In [None]:
# Load the TSLA price bars used for the single-ticker walkthrough and preview the first rows.
df = pd.read_parquet('data/TSLA.parquet')
df.head()

Unnamed: 0_level_0,high,low,open,close,volume,ticker
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-03-12,231.626663,222.046661,223.333328,231.243332,100751400,TSLA
2021-03-15,237.726669,228.013336,231.363327,235.979996,88006800,TSLA
2021-03-16,235.973328,223.666672,234.449997,225.626663,96587100,TSLA
2021-03-17,234.57666,217.003326,218.956665,233.936661,121117500,TSLA
2021-03-18,229.743332,217.333328,228.096664,217.720001,99674400,TSLA


In [None]:
def process_data(df):
  """Append indicator features to ``df`` and return a chronological train/test split.

  Indicators are computed with the pandas_ta ``df.ta`` accessor: rolling standard
  deviation and EMA (lengths 20, 40 and 60), the ultimate oscillator, and Aroon
  (only the oscillator column is kept). ``df`` is modified in place.

  The target for row ``i`` is 1 when the next row's ``open`` is higher than row
  ``i``'s ``open`` (see ``get_target``), so the last row never has a label.

  Args:
    df: price frame with at least the columns the indicators and ``get_target``
      read (``open`` is required by ``get_target``).

  Returns:
    ``(X_train, y_train, X_test, y_test)``. ``X_train`` and ``y_train`` always
    have the same number of rows; ``X_test`` has exactly one more row than
    ``y_test`` (its final row is unlabelled). X column names are capitalized.
  """
  for length in (20, 40, 60):
    df.ta.stdev(length=length, append=True)
  df.ta.uo(append=True)
  df.ta.aroon(append=True)
  for length in (20, 40, 60):
    df.ta.ema(length=length, append=True)
  df.drop(['AROOND_14', 'AROONU_14'], axis=1, inplace=True)

  # Skip the first 59 rows; presumably the warm-up of the longest (60-bar) windows.
  X = df[59:]
  y = get_target(X)

  # Split X and y at the SAME row index. Halving each independently (len(X) // 2
  # vs. len(y) // 2 with len(y) == len(X) - 1) gave X_train one more row than
  # y_train whenever len(X) was even, which breaks model.fit.
  split_at = len(y) // 2
  X_train, X_test = X[:split_at], X[split_at:]
  y_train, y_test = y[:split_at], y[split_at:]

  X_train.columns = [x.capitalize() for x in X_train.columns]
  X_test.columns = [x.capitalize() for x in X_test.columns]

  return X_train, y_train, X_test, y_test

In [None]:
def half_split(data):
  """Cut a sliceable sequence at its midpoint and return ``(first_half, second_half)``.

  The midpoint is ``len(data) // 2``, so for an odd length the second half
  receives the extra element.
  """
  midpoint = len(data) // 2
  first_half = data[:midpoint]
  second_half = data[midpoint:]
  return first_half, second_half

def get_target(data):
    """Build the binary ``Buy`` label from consecutive open prices.

    Row ``i`` of the result is 1 when ``open[i + 1] > open[i]`` and 0 otherwise,
    so the result has one row fewer than ``data`` and a fresh 0-based index.

    Args:
        data: mapping/frame exposing an ``'open'`` column of prices.

    Returns:
        Single-column integer DataFrame named ``Buy``.
    """
    opens = np.asarray(data['open'])
    rose = np.diff(opens) > 0
    return pd.DataFrame({'Buy': rose}).astype(int)

In [None]:
# Build the TSLA train/test features and targets.
# Note: process_data also appends the indicator columns to `df` in place.
X_train, y_train, X_test, y_test = process_data(df)

In [None]:
X_train

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Ticker,Stdev_20,Stdev_40,Stdev_60,Uo_7_14_28,Aroonosc_14,Ema_20,Ema_40,Ema_60
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-06-07,203.333328,194.293335,197.276672,201.710007,67631100,TSLA,7.008890,20.468548,17.505259,50.893560,50.000000,204.510929,210.717234,219.265778
2021-06-08,207.696671,198.500000,207.669998,201.196671,78160200,TSLA,6.611952,20.470696,17.586395,46.249171,50.000000,204.195285,210.252816,218.673348
2021-06-09,203.929993,199.210007,200.723328,199.593338,49753800,TSLA,6.431328,19.755295,17.610054,45.024655,50.000000,203.757004,209.732841,218.047774
2021-06-10,205.529999,200.166672,201.293335,203.373337,71758800,TSLA,6.484550,19.308623,17.684285,45.996957,42.857143,203.720465,209.422622,217.566645
2021-06-11,204.186661,200.506668,203.410004,203.296661,48615900,TSLA,6.211091,18.701376,17.652414,50.218345,-21.428571,203.680102,209.123794,217.098777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-08,349.480011,340.813324,347.736664,341.829987,55013700,TSLA,36.436536,37.196101,32.253058,49.055019,78.571429,337.622925,324.047064,320.206380
2022-04-11,336.156677,324.880005,326.799988,325.309998,59357100,TSLA,31.728192,37.151353,32.001549,44.715536,71.428571,336.450265,324.108670,320.373712
2022-04-12,340.396667,325.533325,332.546661,328.983337,65976000,TSLA,27.427548,37.198345,31.658069,42.677242,-28.571429,335.739129,324.346459,320.655995
2022-04-13,342.079987,324.366669,327.026672,340.790009,55121100,TSLA,23.683031,37.533023,31.608932,41.312278,-42.857143,336.220165,325.148583,321.316126


In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost classifier capped at 100 trees; every other setting is left at the library default.
# `model` is a notebook-level global that catboost_strategy.next() reads directly.
model = CatBoostClassifier(n_estimators=100)

In [None]:
# Colab-only: turn on the custom widget manager.
from google.colab import output
output.enable_custom_widget_manager()
# NOTE(review): process_data already capitalizes these column names, so the two
# assignments below look redundant -- confirm before removing.
X_train.columns = [x.capitalize() for x in X_train.columns]
X_test.columns = [x.capitalize() for x in X_test.columns]

In [None]:
# Train on the indicator columns only: .iloc[:, 6:] skips the first six columns
# (High, Low, Open, Close, Volume, Ticker).
model.fit(X_train.iloc[:, 6:], y_train)

In [None]:
from backtesting import Strategy

class catboost_strategy(Strategy):
    """Trade one unit per bar in the direction predicted by the global ``model``.

    The classifier is not passed in: ``next`` reads whatever object is bound to
    the notebook-level name ``model`` at the time the backtest runs.
    """

    def init(self):
        """Nothing to precompute; every decision is made in ``next``."""

    def next(self):
        """Predict from the latest bar and place the matching order.

        The feature vector is the most recent row of the backtest data, from
        column index 6 onward. A truthy prediction closes any short position and
        buys one unit; a falsy prediction closes any long position and sells one
        unit.
        """
        latest_bar = self.data.df.iloc[-1:]
        go_long = model.predict(latest_bar.iloc[0, 6:])

        if go_long:
            if self.position.size < 0:
                self.position.close()
            self.buy(size=1)
        else:
            if self.position.size > 0:
                self.position.close()
            self.sell(size=1)

In [None]:
from backtesting import Backtest

# Backtest the strategy on the held-out (second) half of the data,
# starting with 10000 in cash and commission=.001.
bt = Backtest(X_test, catboost_strategy, cash=10000, commission=.001)
stats = bt.run()
stats

Start                     2022-04-18 00:00:00
End                       2023-03-01 00:00:00
Duration                    317 days 00:00:00
Exposure Time [%]                   99.086758
Equity Final [$]                 13010.477618
Equity Peak [$]                  13209.636507
Return [%]                          30.104776
Buy & Hold Return [%]              -39.428849
Return (Ann.) [%]                   35.367843
Volatility (Ann.) [%]               18.976978
Sharpe Ratio                         1.863724
Sortino Ratio                        4.135696
Calmar Ratio                          6.75403
Max. Drawdown [%]                   -5.236554
Avg. Drawdown [%]                   -1.712238
Max. Drawdown Duration      172 days 00:00:00
Avg. Drawdown Duration       21 days 00:00:00
# Trades                                  217
Win Rate [%]                        60.829493
Best Trade [%]                      76.751606
Worst Trade [%]                    -15.610755
Avg. Trade [%]                    

In [None]:
# Out-of-sample accuracy. The model was trained with feature row i paired with
# label row i, so the test features must be paired the same way. X_test carries
# one extra, unlabelled final row, which is dropped with [:-1]. Slicing with
# [1:] instead would score each prediction against the previous row's label.
np.mean(model.predict(X_test.iloc[:-1, 6:]).reshape(y_test.shape) == y_test)

Buy    0.555046
dtype: float64

In [None]:
bt.plot();

In [None]:
# Rank the model's input columns (everything from column index 6 on) by
# CatBoost feature importance, highest first, with a fresh 0-based index.
importances = (
    pd.DataFrame({
        'feature_importance': model.get_feature_importance(),
        'feature_names': X_train.columns[6:],
    })
    .sort_values(by=['feature_importance'], ascending=False, ignore_index=True)
)

importances

Unnamed: 0,feature_importance,feature_names
0,40.906803,Uo_7_14_28
1,11.321434,Stdev_60
2,10.646421,Aroonosc_14
3,10.450536,Stdev_20
4,7.225592,Ema_40
5,7.185881,Ema_20
6,6.247278,Ema_60
7,6.016055,Stdev_40


In [None]:
import os
from collections import defaultdict
PATH = 'data/'
stds = defaultdict(float)
metrics = {'Sharpe Ratio': [], 'Return [%]': [], 'Volatility (Ann.) [%]': [], 'Accuracy': []}
tickers = []
# os.listdir returns entries in arbitrary order and may contain stray files, so
# sort for a reproducible row order and keep only the parquet data files.
for filename in sorted(os.listdir(PATH)):
  if not filename.endswith('.parquet'):
    continue
  f = os.path.join(PATH, filename)
  data = pd.read_parquet(f)
  # .iloc: plain [0] / [-1] on a label-indexed Series is deprecated positional access.
  tickers.append(data['ticker'].iloc[0])
  X_train, y_train, X_test, y_test = process_data(data)
  # Rebinds the global `model` that catboost_strategy.next() reads during bt.run().
  # silent=True keeps the per-iteration training log of every ticker out of the output.
  model = CatBoostClassifier(n_estimators=100, silent=True)
  model.fit(X_train.iloc[:, 6:], y_train)
  stds[filename] = data.ta.stdev(length=365).iloc[-1]
  bt = Backtest(X_test, catboost_strategy, cash=10000, commission=.001)
  stats = bt.run()
  metrics['Sharpe Ratio'].append(stats['Sharpe Ratio'])
  metrics['Return [%]'].append(stats['Return [%]'])
  metrics['Volatility (Ann.) [%]'].append(stats['Volatility (Ann.) [%]'])
  # Feature row i is labelled by y row i (as in training); X_test has one extra,
  # unlabelled final row, so drop it with [:-1] rather than shifting with [1:].
  predictions = model.predict(X_test.iloc[:-1, 6:]).reshape(y_test.shape)
  metrics['Accuracy'].append(np.mean(predictions == y_test.to_numpy()))

In [None]:
# One row per ticker, one column per collected metric.
dfs = pd.DataFrame(metrics, index=tickers)

In [None]:
# Partition tickers by the sign of their total return (best winners / worst
# losers first), then show the share of tickers that did not lose money.
wins = dfs.loc[dfs['Return [%]'].ge(0)].sort_values(by='Return [%]', ascending=False)
losses = dfs.loc[dfs['Return [%]'].lt(0)].sort_values(by='Return [%]')
len(wins) / (len(wins) + len(losses))

0.46

In [None]:
wins

Unnamed: 0,Sharpe Ratio,Return [%],Volatility (Ann.) [%],Accuracy
TSLA,1.863724,30.104776,18.976978,0.555046
REGN,0.629764,20.675583,38.334753,0.5
ISRG,0.599584,18.255651,35.494015,0.513761
XOM,1.241132,16.360529,15.347169,0.550459
V,0.701463,15.348507,25.457135,0.495413
AXP,2.319032,14.732491,7.388113,0.518349
MS,0.792034,13.903581,20.403197,0.536697
COST,0.840983,13.38939,18.498395,0.582569
MSFT,0.866433,10.592143,14.17618,0.587156
LMT,0.553025,10.452752,21.915602,0.536697


In [None]:
losses

Unnamed: 0,Sharpe Ratio,Return [%],Volatility (Ann.) [%],Accuracy
GS,0.0,-19.9787,8.850346,0.490826
UNH,0.0,-17.841485,6.655772,0.536697
VRTX,0.0,-14.453478,11.004551,0.541284
AMD,0.0,-13.478768,12.602792,0.53211
HD,0.0,-12.893705,9.120972,0.577982
ADBE,0.0,-12.667744,16.600819,0.582569
EL,0.0,-12.122031,27.19584,0.53211
CI,0.0,-11.355302,11.515986,0.495413
INTU,0.0,-10.90895,22.542147,0.555046
TMO,0.0,-10.379513,12.637541,0.56422


In [None]:
# Pairwise correlation of every metric except the first column ('Sharpe Ratio').
dfs[dfs.columns[1:]].corr()

Unnamed: 0,Return [%],Volatility (Ann.) [%],Accuracy
Return [%],1.0,0.233582,0.037299
Volatility (Ann.) [%],0.233582,1.0,-0.394081
Accuracy,0.037299,-0.394081,1.0


# Second feature set: STDEV, MACD, ADX, RSI and Aroon oscillator

In [None]:
def process_data2(df):
  """Append the second indicator set to ``df`` and return a chronological train/test split.

  Indicators are computed with the pandas_ta ``df.ta`` accessor: rolling standard
  deviation (lengths 20 and 60), MACD, ADX, RSI and Aroon (only the oscillator
  column is kept). ``df`` is modified in place.

  The target for row ``i`` is 1 when the next row's ``open`` is higher than row
  ``i``'s ``open`` (see ``get_target``), so the last row never has a label.

  Args:
    df: price frame with at least the columns the indicators and ``get_target``
      read (``open`` is required by ``get_target``).

  Returns:
    ``(X_train, y_train, X_test, y_test)``. ``X_train`` and ``y_train`` always
    have the same number of rows; ``X_test`` has exactly one more row than
    ``y_test`` (its final row is unlabelled). X column names are capitalized.
  """
  for length in (20, 60):
    df.ta.stdev(length=length, append=True)
  df.ta.macd(append=True)
  df.ta.adx(append=True)
  df.ta.rsi(append=True)
  df.ta.aroon(append=True)

  df.drop(['AROOND_14', 'AROONU_14'], axis=1, inplace=True)

  # Skip the first 59 rows; presumably the warm-up of the longest (60-bar) window.
  X = df[59:]
  y = get_target(X)

  # Split X and y at the SAME row index. Halving each independently (len(X) // 2
  # vs. len(y) // 2 with len(y) == len(X) - 1) gave X_train one more row than
  # y_train whenever len(X) was even, which breaks model.fit.
  split_at = len(y) // 2
  X_train, X_test = X[:split_at], X[split_at:]
  y_train, y_test = y[:split_at], y[split_at:]

  X_train.columns = [x.capitalize() for x in X_train.columns]
  X_test.columns = [x.capitalize() for x in X_test.columns]

  return X_train, y_train, X_test, y_test

In [None]:
stds = defaultdict(float)
metrics = {'Sharpe Ratio': [], 'Return [%]': [], 'Volatility (Ann.) [%]': [], 'Accuracy': []}
tickers = []
# os.listdir returns entries in arbitrary order and may contain stray files, so
# sort for a reproducible row order and keep only the parquet data files.
for filename in sorted(os.listdir(PATH)):
  if not filename.endswith('.parquet'):
    continue
  f = os.path.join(PATH, filename)
  data = pd.read_parquet(f)
  # .iloc: plain [0] / [-1] on a label-indexed Series is deprecated positional access.
  tickers.append(data['ticker'].iloc[0])
  X_train, y_train, X_test, y_test = process_data2(data)
  # Rebinds the global `model` that catboost_strategy.next() reads during bt.run().
  # silent=True keeps the per-iteration training log of every ticker out of the output.
  model = CatBoostClassifier(n_estimators=100, silent=True)
  model.fit(X_train.iloc[:, 6:], y_train)
  stds[filename] = data.ta.stdev(length=365).iloc[-1]
  bt = Backtest(X_test, catboost_strategy, cash=10000, commission=.001)
  stats = bt.run()
  metrics['Sharpe Ratio'].append(stats['Sharpe Ratio'])
  metrics['Return [%]'].append(stats['Return [%]'])
  metrics['Volatility (Ann.) [%]'].append(stats['Volatility (Ann.) [%]'])
  # Feature row i is labelled by y row i (as in training); X_test has one extra,
  # unlabelled final row, so drop it with [:-1] rather than shifting with [1:].
  predictions = model.predict(X_test.iloc[:-1, 6:]).reshape(y_test.shape)
  metrics['Accuracy'].append(np.mean(predictions == y_test.to_numpy()))

In [None]:
# Collect the per-ticker metrics, partition tickers by the sign of their total
# return, and show the share of tickers that did not lose money.
dfs = pd.DataFrame(data=metrics, index=tickers)
wins = dfs.loc[dfs['Return [%]'].ge(0)].sort_values(by='Return [%]', ascending=False)
losses = dfs.loc[dfs['Return [%]'].lt(0)].sort_values(by='Return [%]')
len(wins) / (len(wins) + len(losses))

0.54

In [None]:
wins

Unnamed: 0,Sharpe Ratio,Return [%],Volatility (Ann.) [%],Accuracy
BLK,1.086235,28.149243,30.407106,0.642202
TSLA,0.873928,25.879451,34.695824,0.504587
ELV,1.738574,19.751885,13.257459,0.614679
INTU,0.533312,14.353862,31.292323,0.555046
BRK-B,0.919735,10.659075,13.439637,0.605505
META,0.748944,10.342826,16.011185,0.587156
AMGN,1.329637,9.454419,8.238763,0.605505
UPS,1.127507,8.633924,8.86738,0.59633
COST,0.614099,8.455286,15.941936,0.619266
MRNA,0.367597,8.025969,25.272206,0.582569


In [None]:
losses

Unnamed: 0,Sharpe Ratio,Return [%],Volatility (Ann.) [%],Accuracy
NOW,0.0,-31.436723,21.460221,0.559633
TMO,0.0,-19.595336,11.091137,0.619266
NOC,0.0,-13.032332,11.158939,0.600917
ADBE,0.0,-11.502689,17.899277,0.582569
DHR,0.0,-11.201554,5.008413,0.655963
HD,0.0,-9.927825,7.58251,0.587156
AVGO,0.0,-8.925433,10.785828,0.633028
MSFT,0.0,-7.67066,4.607971,0.688073
CRM,0.0,-6.579018,12.311208,0.582569
LOW,0.0,-6.174339,7.061826,0.614679


In [None]:
# Correlation between 'Return [%]' and 'Volatility (Ann.) [%]' only: the [1:-1]
# slice drops the first ('Sharpe Ratio') and last ('Accuracy') metric columns.
dfs[dfs.columns[1:-1]].corr()

Unnamed: 0,Return [%],Volatility (Ann.) [%]
Return [%],1.0,0.286343
Volatility (Ann.) [%],0.286343,1.0


In [None]:
# Re-run the second feature pipeline on BLK alone and display its backtest stats.
blk = pd.read_parquet('data/BLK.parquet')
X_train, y_train, X_test, y_test = process_data2(blk)
# silent=True suppresses CatBoost's training log; `model` is the global read by catboost_strategy.
model = CatBoostClassifier(n_estimators=100, silent=True)
model.fit(X_train.iloc[:, 6:], y_train)
bt = Backtest(X_test, catboost_strategy, cash=10000, commission=.001)
stats = bt.run()
stats

Start                     2022-04-18 00:00:00
End                       2023-03-01 00:00:00
Duration                    317 days 00:00:00
Exposure Time [%]                   99.086758
Equity Final [$]                 12814.924277
Equity Peak [$]                   13351.47254
Return [%]                          28.149243
Buy & Hold Return [%]                0.155882
Return (Ann.) [%]                   33.029276
Volatility (Ann.) [%]               30.407106
Sharpe Ratio                         1.086235
Sortino Ratio                        2.541902
Calmar Ratio                         2.133796
Max. Drawdown [%]                  -15.479118
Avg. Drawdown [%]                   -3.988243
Max. Drawdown Duration      116 days 00:00:00
Avg. Drawdown Duration       30 days 00:00:00
# Trades                                  198
Win Rate [%]                        56.060606
Best Trade [%]                      29.375924
Worst Trade [%]                    -12.425815
Avg. Trade [%]                    

In [None]:
bt.plot()

In [None]:
# Rank the model's input columns (everything from column index 6 on) by
# CatBoost feature importance, highest first, with a fresh 0-based index.
importances = (
    pd.DataFrame({
        'feature_importance': model.get_feature_importance(),
        'feature_names': X_train.columns[6:],
    })
    .sort_values(by=['feature_importance'], ascending=False, ignore_index=True)
)

importances

Unnamed: 0,feature_importance,feature_names
0,19.753951,Rsi_14
1,10.657918,Dmn_14
2,10.548444,Macds_12_26_9
3,10.053788,Adx_14
4,9.752753,Macd_12_26_9
5,9.509601,Stdev_60
6,9.385178,Macdh_12_26_9
7,8.758356,Stdev_20
8,6.265477,Dmp_14
9,5.314533,Aroonosc_14
