In [1]:
# ! pip install scikit-learn pandas joblib tomli

In [26]:
import tomli
import joblib
import logging
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score

In [34]:
# Read the project configuration once and derive every path and
# hyper-parameter the rest of the notebook relies on.
# NOTE(review): the file extension is `.tomli` rather than the usual `.toml` —
# confirm this matches the file on disk.
CONFIG_FILE_PATH = "../config.tomli"

# tomli.load expects a binary file handle, hence mode 'rb'.
with open(CONFIG_FILE_PATH, 'rb') as config_file:
    config = tomli.load(config_file)

# --- Data locations (each configured value is prefixed with "../") ---
_data_cfg = config['data']
ETF_DATA_DRIVE_PATH = f"../{_data_cfg['etfs']}"
STOCK_DATA_DRIVE_PATH = f"../{_data_cfg['stocks']}"
PROCESSED_DATA_DRIVE_PATH = f"../{_data_cfg['processed']}"
ENGINEERED_DATA_DRIVE_PATH = f"../{_data_cfg['engineered']}"
SYMBOLS_FILE_PATH = f"../{_data_cfg['symbols']}"

# Parquet datasets inside the processed / engineered folders.
DATASET_PATH = f"{PROCESSED_DATA_DRIVE_PATH}/dataset.parquet"
ENG_DATASET_PATH = f"{ENGINEERED_DATA_DRIVE_PATH}/dataset.parquet"

# --- Model output location ---
_model_cfg = config['model']
MODEL_DIR = f"../{_model_cfg['model_dir']}"

# --- Dtype and date-format settings read straight from the config ---
data_dtypes = config['etf_stock_data_type']
symbols_dtype = config['symbols_data_types']

date_format = config['format']['date_format']

# The config key is spelled `rl_model`; its value is used as the file name of
# the saved random-forest model.
rf_model_path = f"{MODEL_DIR}/{_model_cfg['rl_model']}"

# --- Random-forest hyper-parameters and train/test split settings ---
_rf_cfg = config['random_forest']
N_JOBS = _rf_cfg['n_jobs']
TEST_SIZE = _rf_cfg['test_size']
MAX_DEPTH = _rf_cfg['max_depth']
N_ESTIMATORS = _rf_cfg['n_estimators']
RANDOM_STATE = _rf_cfg['random_state']

In [22]:
# Configure a dedicated file logger for this notebook's Random Forest run.
logger = logging.getLogger('my_logger')
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise attach one more FileHandler each time and every
# message would be written to the log file multiple times. Detach and close
# file handlers left over from earlier runs before attaching a fresh one.
for stale_handler in list(logger.handlers):
    if isinstance(stale_handler, logging.FileHandler):
        logger.removeHandler(stale_handler)
        stale_handler.close()

# Record format: timestamp - logger name - level - message
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Write everything from DEBUG upwards to the run log.
file_handler = logging.FileHandler('../logs/random_forest.log')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)

# Attach the single file handler to the logger
logger.addHandler(file_handler)

In [4]:
%time
# load dataset
data = pd.read_parquet(ENG_DATASET_PATH)
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)

CPU times: total: 0 ns
Wall time: 0 ns


In [5]:
# Columns fed to the model and the column it should predict.
target = 'Volume'
features = ['vol_moving_avg', 'adj_close_rolling_med']

# Feature matrix and target vector.
X, y = data[features], data[target]

# Hold out a TEST_SIZE share of the rows for evaluation; RANDOM_STATE makes the
# split reproducible.
# NOTE(review): train_test_split shuffles rows by default, so with this
# Date-indexed data the test rows are interleaved in time with the training
# rows — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Drop the reference to the full frame; only the split pieces are used from here on.
del data

In [12]:
# Collect the hyper-parameters read from the config, then build the
# (still unfitted) random-forest regressor from them.
rf_params = {
    'n_estimators': N_ESTIMATORS,
    'max_depth': MAX_DEPTH,
    'random_state': RANDOM_STATE,
    'n_jobs': N_JOBS,
}
model = RandomForestRegressor(**rf_params)

In [13]:
# Fit the random forest on the training split
# (the fitted estimator is the cell's displayed output).
model.fit(X=X_train, y=y_train)

In [15]:
# Predict the target for the held-out test rows.
y_pred = model.predict(X=X_test)

In [36]:
# Score the test-set predictions: mean absolute error, mean squared error,
# explained variance score and R^2.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Write one framed report per run to the log: the random-forest config used,
# followed by each metric.
report_lines = [
    "----------------      Random Forest      --------------------",
    f"Random Forest Config: {config['random_forest']}",
    f"Random Forest MAE: {mae}",
    f"Random Forest MSE: {mse}",
    f"Random Forest EVS: {evs}",
    f"Random Forest R^2: {r2}",
    "-------------------------------------------------------------",
]
for report_line in report_lines:
    logger.info(report_line)


In [17]:
# Persist the fitted model with joblib
# (the list of written file names is the cell's displayed output).
# A better approach would be to track the model with MLflow.
joblib.dump(value=model, filename=rf_model_path)

['.././models/model_random_forest.joblib']

In [18]:
! pip freeze > ../requirements.txt