In [74]:
import numpy as np
import pandas as pd

In [75]:
data = pd.read_csv('../data/punisher_domain.csv')

In [76]:
data.head()

Unnamed: 0,average,date,highest,lowest,order_count,volume
0,255300.0,2022-07-01,256550,253100,115,195
1,275000.0,2022-07-02,335000,245900,101,138
2,332450.0,2022-07-03,365000,243100,103,158
3,250150.0,2022-07-04,367350,90030,90,112
4,249161.92,2022-07-05,325000,230300,73,73


In [77]:
# Change the index to the date column since we are doing time series predictions
data = data.set_index('date')


In [78]:
# Plot out each of our values to try and find a stable one to predict decided on average
# Comment in the graphs
# data.plot.line(y="average", use_index=True)
# data.plot.line(y="lowest", use_index=True)
# data.plot.line(y="highest", use_index=True)

In [79]:
# Create our tomrrow column from our historic data
data["tomorrow"] = data["average"].shift(-1)

In [80]:
# Create our target column for our ML that just determines if the average will increase or decrease
data["target"] = (data["tomorrow"] > data["average"]).astype(int)
data.loc[data["tomorrow"] > data["average"], "movement"] = 1
data.loc[data["tomorrow"] < data["average"], "movement"] = -1

data = data.dropna()
data.isna().value_counts()

average  highest  lowest  order_count  volume  tomorrow  target  movement
False    False    False   False        False   False     False   False       403
Name: count, dtype: int64

In [81]:
# Default model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import accuracy_score

In [82]:
# Adding horizons to the data to help with predictions
horizons = [2,7,14]
new_predictors = []

for horizon in horizons:
    rolling_averages = data.rolling(horizon).mean()

    ratio_column = f"close_ratio_{horizon}"
    data[ratio_column] = data['average'] / rolling_averages['average']

    trend_column = f"trend_{horizon}"
    data[trend_column] = data.shift(1).rolling(horizon).sum()["movement"]

    new_predictors += [ratio_column, trend_column]

data = data.dropna()

In [83]:
predictors = ["average", "highest", "lowest", "order_count", "volume"]

X = data[new_predictors]
y = data['target']

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [84]:
# Define time-based cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Perform Randomized Search with Time Series Cross-Validation
randomized_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=1),
    param_distributions=param_grid,
    n_iter=10,
    cv=tscv,
    scoring='accuracy'
)

# Fit the model
randomized_search.fit(X, y)

In [85]:
data = data.drop(["average", "highest", "lowest", "tomorrow"], axis=1)
data.iloc[300:301]


Unnamed: 0_level_0,order_count,volume,target,movement,close_ratio_2,trend_2,close_ratio_7,trend_7,close_ratio_14,trend_14
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-05-18,85,93,1,1.0,1.053205,0.0,0.970984,-1.0,0.912801,-2.0


In [86]:
best_params = randomized_search.best_params_
best_model = randomized_search.best_estimator_

# Split the data into training and test sets (you can use your own data split)
split_date = "2023-01-03"
X_train = X[X.index < split_date]
y_train = y[y.index < split_date]
X_test = X[X.index >= split_date]
y_test = y[y.index >= split_date]

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Best Hyperparameters:", best_params)
print("Accuracy on Test Set:", accuracy)

Best Hyperparameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 20}
Accuracy on Test Set: 0.8248847926267281


In [89]:
import joblib
joblib.dump(best_model, "../production-models/production_rf_model.joblib")

['../production-models/production_rf_model.joblib']