# Workflow

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# the local `scripts` package are picked up without restarting the kernel.
%load_ext autoreload

%autoreload 2

In [2]:
import warnings

import pandas as pd

from scripts.data_repo import DataRepository
from scripts.transform import TransformData
from scripts.train import SimulationParams, TrainModel

Workflow configuration

In [3]:
# Local data folder: passed as `data_dir` to every persist()/load() call in this notebook.
# Replace the "<path-to-folder>" placeholder with a real path before running.
# NOTE(review): the trailing slash is kept as-is; consumers may build file paths by
# string concatenation — confirm in scripts/ before changing the format.
LOCAL_DATA_FOLDER = "<path-to-folder>/data/"

In [4]:
# Data loading mode: True runs a full fetch and saves the result to LOCAL_DATA_FOLDER;
# False loads the previously saved files from disk instead.
FETCH_REPO = True

In [5]:
# True: transform the data into a single dataset and persist it to LOCAL_DATA_FOLDER.
# False: load the previously persisted dataset from local storage.
TRANSFORM_DATA = True

In [6]:
# True: train the model and persist it to LOCAL_DATA_FOLDER.
# False: load a previously persisted model from local storage.
TRAIN_MODEL = True

## Step 1: Fetch data from APIs or load from disk

In [7]:
repo = DataRepository()

if not FETCH_REPO:
    # Reuse the files saved by an earlier run
    repo.load(data_dir=LOCAL_DATA_FOLDER)
else:
    # Pull all 3 datasets for all dates from the APIs ...
    repo.fetch()
    # ... and store them in the local directory
    repo.persist(data_dir=LOCAL_DATA_FOLDER)

Fetching Tickers info from YFinance
Going download data for these tickers: ['MSFT', 'AAPL', 'GOOG', 'NVDA', 'AMZN', 'META', 'BRK-B', 'LLY', 'AVGO', 'V', 'JPM', 'TSLA', 'WMT', 'XOM', 'UNH', 'MA', 'PG', 'ORCL', 'COST', 'JNJ', 'HD', 'MRK', 'BAC', 'ABBV', 'CVX', 'NFLX', 'KO', 'AMD', 'ADBE', 'CRM', 'PEP', 'QCOM', 'TMO', 'TMUS', 'WFC', 'CSCO', 'AMAT', 'DHR', 'MCD', 'DIS', 'ABT', 'TXN', 'GE', ' INTU', 'VZ', 'AMGN', 'AXP', 'CAT', 'IBM', 'PFE', 'PM', 'MS', 'NVO', 'MC.PA', 'ASML', 'RMS.PA', 'OR.PA', 'SAP', 'ACN', 'TTE', 'SIE.DE', 'IDEXY', 'CDI.PA', 'RELIANCE.NS', 'TCS.NS', 'HDB', 'BHARTIARTL.NS', 'IBN', 'SBIN.NS', 'LICI.NS', 'INFY', 'ITC.NS', 'HINDUNILVR.NS', 'LT.NS', 'TCEHY', '1398.HK', '601857.SS', '600519.SS', '0941.HK', '601288.SS', 'PDD', 'BABA', '601939.SS', '601988.SS']


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

Fetching Indexes info from YFinance


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Fetching Macro info from FRED (Pandas_datareader)


## Step 2: Transform data into one dataframe

In [8]:
transformed = TransformData(repo=repo)

if not TRANSFORM_DATA:
    # Reuse the dataset persisted by an earlier run
    transformed.load(data_dir=LOCAL_DATA_FOLDER)
else:
    # Build the dataset from the repository data, then store it locally
    transformed.transform()
    transformed.persist(data_dir=LOCAL_DATA_FOLDER)

601988.SS: 100%|██████████| 84/84 [00:17<00:00,  4.87it/s]    


## Step 3: Train or load the model

In [9]:
# Silence all warnings from here on (not recommended in production unless necessary)
warnings.filterwarnings("ignore")

trained = TrainModel(transformed=transformed)

# Dataframes are prepared on both paths: they are needed for training and for inference
trained.prepare_dataframe()

if TRAIN_MODEL:
    trained.train_random_forest()  # fit the model
    trained.persist(data_dir=LOCAL_DATA_FOLDER)  # write the fitted model to disk
else:
    trained.load(data_dir=LOCAL_DATA_FOLDER)  # reuse the model saved by an earlier run

Prepare the dataframe: define feature sets, add dummies, temporal split
length: X_train (310453, 352),  X_validation (76373, 352), X_test (76883, 352)
  X_train_valid = (386826, 352),  all combined: X_all (463709, 352)
Training the best model (RandomForest (max_depth=18, n_estimators=500))


## Step 4: Inference

Inference and simulation settings

In [10]:
sim_params = SimulationParams(
    initial_capital=10_000,  # starting capital: $10k
    threshold=0.55,  # keep binary predictions with probability >= 0.55
    fees=0.002,  # trading fees: 0.2% (buy + sell)
    top_k=5,  # take the top_k predictions
    portfolio_optimization=False,  # not functional at the moment
    stop_loss=0.8,  # forced sell (at a loss) if the price on any of the next 5 days falls 20% below Adj.Close
    take_profit=1.02,  # forced sell (at a profit) if the price on any of the next 5 days rises 2% above Adj.Close
    lower_entry=0.99,  # next-day buy order at [Adj.Close] * 0.99 (try to buy cheaper)
)

In [11]:
# Run inference with the simulation settings configured in `sim_params`.
# NOTE(review): presumably this stores the predictions on `trained` for the
# later get_last_signals()/simulate() calls — confirm in scripts.train.
trained.make_inference(sim_params)

Making inference


In [12]:
# Number of most recent signals to request and show. A single constant drives
# the header text, the query and the displayed slice so they cannot drift apart.
NUM_SIGNALS = 10

print(f'Results of the estimation (last {NUM_SIGNALS}):')
# Lift the pandas display limits so the table is rendered without truncation
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

predicted_signals = trained.get_last_signals(num=NUM_SIGNALS)

# Last expression of the cell: rendered as the cell output
predicted_signals.tail(NUM_SIGNALS)

Results of the estimation (last 10):


Unnamed: 0,Adj Close,Ticker,Date,pred_rf_best,pred_rf_best_rank
7165,859.700012,SBIN.NS,2024-07-12 00:00:00+00:00,1,7.0
532,1051.550049,LICI.NS,2024-07-12 00:00:00+00:00,1,8.0
6306,708.799988,MC.PA,2024-07-15 00:00:00+00:00,1,1.0
6306,2139.0,RMS.PA,2024-07-15 00:00:00+00:00,1,2.0
6306,410.649994,OR.PA,2024-07-15 00:00:00+00:00,1,3.0
7094,183.320007,SIE.DE,2024-07-15 00:00:00+00:00,1,4.0
8376,664.0,CDI.PA,2024-07-15 00:00:00+00:00,1,5.0
7165,3193.199951,RELIANCE.NS,2024-07-15 00:00:00+00:00,1,6.0
5441,4198.149902,TCS.NS,2024-07-15 00:00:00+00:00,1,7.0
5470,1435.849976,BHARTIARTL.NS,2024-07-15 00:00:00+00:00,1,8.0


In [13]:
# Run the trading simulation with the configured parameters; the call prints a
# summary (bids, stop-loss/take-profit events, resulting capital, CAGR).
# NOTE(review): `res` and `capital` are presumably the detailed simulation results
# and the resulting capital — confirm the return contract in scripts.train.
res, capital = trained.simulate(sim_params)

SIMULATION STARTED
Simulations params: SimulationParams(initial_capital=10000, threshold=0.55, fees=0.002, top_k=5, portfolio_optimization=False, stop_loss=0.8, take_profit=1.02, lower_entry=0.99)
 Count bids 2452 in total, avg.bids per day 4.019672131147541,  filled bids 621, fill bids percent = 0.2532626427406199
  Stop loss events: count = 2, net loss = -373.8079053388201 
  Take profit events: count = 438, net profit = 5873.229442572396 
  Start capital = 10000, Resulting capital: 13608.626598714014 
  CAGR in 4 years: 1.08 or 8.01 % of avg. growth per year
