In [254]:
import numpy as np
import pandas as pd

In [255]:
# Load the daily price history; the path is relative to the notebook's working directory
data = pd.read_csv('../data/punisher_domain.csv')

In [256]:
# Preview the first rows to check which columns were loaded
data.head()

Unnamed: 0,average,date,highest,lowest,order_count,volume
0,255300.0,2022-07-01,256550,253100,115,195
1,275000.0,2022-07-02,335000,245900,101,138
2,332450.0,2022-07-03,365000,243100,103,158
3,250150.0,2022-07-04,367350,90030,90,112
4,249161.92,2022-07-05,325000,230300,73,73


In [257]:
# Index the frame by the date column since we are doing time series predictions.
# NOTE(review): later steps (shift, rolling, iloc splits) depend on row order, so this
# assumes the CSV is already sorted by date — TODO confirm.
# Re-running this cell on the already-indexed frame fails because 'date' is no longer a column.
data = data.set_index('date')


In [258]:
# Plot each price series to find a stable one to predict; we decided on "average".
# Uncomment the lines below to render the graphs.
# data.plot.line(y="average", use_index=True)
# data.plot.line(y="lowest", use_index=True)
# data.plot.line(y="highest", use_index=True)

In [259]:
# Create our "tomorrow" column from the historic data: shift(-1) moves each value up
# one row, so every row holds the next row's average (the last row gets NaN)
data["tomorrow"] = data["average"].shift(-1)

In [260]:
# Create the target column for the classifier: 1 when the next row's average is
# higher than this row's, otherwise 0.
data["target"] = (data["tomorrow"] > data["average"]).astype(int)

# Signed direction of the next move: +1 up, -1 down, 0 unchanged.
# Using np.sign (instead of two masked assignments) gives unchanged days an explicit 0
# rather than leaving them NaN, so the dropna() below no longer silently deletes those
# days and leaves gaps in the time series. The final row still becomes NaN because its
# "tomorrow" is NaN, and is dropped as before.
data["movement"] = np.sign(data["tomorrow"] - data["average"])

# Remove rows with missing values, then confirm none remain
data = data.dropna()
data.isna().value_counts()

average  highest  lowest  order_count  volume  tomorrow  target  movement
False    False    False   False        False   False     False   False       403
Name: count, dtype: int64

In [261]:
# Default model
from sklearn.ensemble import RandomForestClassifier

In [262]:
# Baseline classifier; the fixed random_state keeps repeated runs identical
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=100,
    random_state=1,
)

# Raw daily columns used as features
predictors = ["average", "highest", "lowest", "order_count", "volume"]

# Positional hold-out: the last 100 rows are the test set, everything before them trains
train, test = data.iloc[:-100], data.iloc[-100:]

model.fit(train[predictors], train["target"])

In [263]:
from sklearn.metrics import precision_score

# Predict on the hold-out rows and re-attach the test index so the
# predictions align row-for-row with test["target"]
preds = pd.Series(model.predict(test[predictors]), index=test.index)


In [264]:
# Score the model on the hold-out rows: precision is the fraction of rows predicted
# as 1 (increase) whose target really is 1
precision_score(test["target"], preds)

0.8666666666666667

In [265]:
# Put the actual targets and the predictions side by side for plotting
combined = pd.concat([test["target"], preds], axis=1)
# Uncomment to render the graph
# combined.plot()

In [266]:
# Start of initial backtest
def predict(train, test, predictors, model):
    """Fit ``model`` on ``train`` and return hard class predictions for ``test``.

    Args:
        train: DataFrame holding the ``predictors`` columns and a "target" column.
        test: DataFrame with the same columns, scored after fitting.
        predictors: List of feature column names.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refit in place.

    Returns:
        DataFrame indexed like ``test`` with two columns: "target" (the actual
        labels) and "predictions" (the model's output).
    """
    features_train, labels_train = train[predictors], train["target"]
    model.fit(features_train, labels_train)

    # Wrap the raw model output so it carries the test index and a column name
    hard_labels = pd.Series(
        model.predict(test[predictors]),
        index=test.index,
        name="predictions",
    )
    return pd.concat([test["target"], hard_labels], axis=1)
    

In [267]:
def backtest(data, model, predictors, start=100, step=2):
    """Walk forward through ``data``, refitting and predicting one slice at a time.

    For each position ``i`` in ``range(start, len(data), step)`` the rows before
    ``i`` are used for training and the next ``step`` rows for testing, so every
    prediction comes from a model that only saw earlier rows. The module-level
    ``predict`` function is looked up at call time and does the fit and predict.

    Args:
        data: DataFrame with the ``predictors`` columns and a "target" column.
        model: Estimator passed through to ``predict``; it is refit on every fold.
        predictors: List of feature column names.
        start: Number of leading rows reserved for the first training window.
        step: Number of rows predicted per fold.

    Returns:
        The per-fold frames returned by ``predict``, concatenated in order.

    Raises:
        ValueError: If no fold can be formed (for example ``start`` is not
            smaller than the number of rows), or if ``step`` is 0.
    """
    all_predictions = []

    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i + step)].copy()
        all_predictions.append(predict(train, test, predictors, model))

    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # raise the same exception type with the actual cause instead.
    if not all_predictions:
        raise ValueError(
            f"backtest produced no folds: start={start}, step={step}, "
            f"rows={data.shape[0]}"
        )
    return pd.concat(all_predictions)

In [268]:
# Backtest the baseline model on the raw predictors with the default start and step
predictions = backtest(data, model, predictors)

In [269]:
# How many rows the model predicted as 0 versus 1 across all folds
predictions["predictions"].value_counts()

predictions
0    263
1     40
Name: count, dtype: int64

In [270]:
# Precision over all backtest folds combined: of the rows predicted as 1,
# the fraction whose target is actually 1
precision_score(predictions["target"], predictions["predictions"])

0.6

In [271]:
# Share of each class among the backtested targets; the share of 1s is the precision
# a model would get by always predicting 1, so it is the baseline to beat
predictions["target"].value_counts() / predictions.shape[0]

target
0    0.50165
1    0.49835
Name: count, dtype: float64

In [272]:
# Add rolling-window ("horizon") features to help with predictions.
# For every horizon we add:
#   close_ratio_<h>: this row's average divided by the mean of the last <h> averages
#                    (the window includes the current row)
#   trend_<h>:       sum of "movement" over the <h> rows before the current one
horizons = [2, 7, 14]
new_predictors = []

for horizon in horizons:
    # Roll over only the column we need instead of the whole frame: the values are
    # identical, it avoids aggregating every column on each pass, and it does not
    # break if the frame ever contains a non-numeric column.
    ratio_column = f"close_ratio_{horizon}"
    data[ratio_column] = data["average"] / data["average"].rolling(horizon).mean()

    # shift(1) excludes the current row, whose movement describes the move
    # into the next row, i.e. the very thing being predicted.
    trend_column = f"trend_{horizon}"
    data[trend_column] = data["movement"].shift(1).rolling(horizon).sum()

    new_predictors += [ratio_column, trend_column]

In [273]:
# Model for the probability-threshold approach: more trees and a smaller
# min_samples_split than the baseline, same fixed random_state
model = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=50,
    random_state=1,
)

In [274]:
def predict(train, test, predictors, model, threshold=0.6):
    """Fit ``model`` on ``train`` and label ``test`` rows by a probability cutoff.

    This definition replaces the earlier hard-label ``predict`` in the notebook.
    ``backtest`` resolves ``predict`` by name when it runs, so backtests executed
    after this cell use this version.

    Args:
        train: DataFrame holding the ``predictors`` columns and a "target" column.
        test: DataFrame with the same columns, scored after fitting.
        predictors: List of feature column names.
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``; it is
            refit in place.
        threshold: Minimum probability of the second class needed to predict 1.
            Defaults to 0.6, the cutoff that was previously hard-coded.

    Returns:
        DataFrame indexed like ``test`` with two columns: "target" (the actual
        labels) and "predictions" (1.0 where the probability is at least
        ``threshold``, otherwise 0.0).
    """
    model.fit(train[predictors], train["target"])

    # Column 1 of predict_proba is the second class (label 1 when the training
    # rows contain both labels 0 and 1).
    positive_proba = model.predict_proba(test[predictors])[:, 1]

    # A single comparison replaces the two in-place masked assignments; casting to
    # float keeps the 0.0 / 1.0 output the original produced.
    preds = pd.Series(
        (positive_proba >= threshold).astype(float),
        index=test.index,
        name="predictions",
    )
    return pd.concat([test["target"], preds], axis=1)
    

In [275]:
# Drop rows with missing feature values (the rolling-window columns are NaN until
# enough earlier rows exist), then display the resulting frame
data = data.dropna()
data

Unnamed: 0_level_0,average,highest,lowest,order_count,volume,tomorrow,target,movement,close_ratio_2,trend_2,close_ratio_7,trend_7,close_ratio_14,trend_14
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-07-15,231000.00,231000,231000,86,112,231100.00,1,1.0,0.994147,-2.0,0.935523,-1.0,0.901016,0.0
2022-07-16,231100.00,275000,231000,100,120,255550.00,1,1.0,1.000216,0.0,0.997107,-1.0,0.912568,0.0
2022-07-17,255550.00,290000,231000,77,102,250483.52,0,-1.0,1.050241,2.0,1.086163,1.0,1.031489,0.0
2022-07-18,250483.52,270000,231000,56,91,231100.00,0,-1.0,0.989988,0.0,1.059139,-1.0,1.010942,0.0
2022-07-19,231100.00,290100,231000,82,118,232000.00,1,1.0,0.959750,-2.0,0.965696,-1.0,0.937593,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-11,374900.00,495500,299900,106,140,393000.00,1,1.0,1.032785,0.0,0.918361,1.0,0.875478,0.0
2023-08-12,393000.00,509450,361200,92,122,485500.00,1,1.0,1.023571,2.0,0.975714,1.0,0.922626,2.0
2023-08-13,485500.00,489000,381700,86,139,381700.00,0,-1.0,1.105293,2.0,1.186165,1.0,1.129409,2.0
2023-08-14,381700.00,473600,380200,81,122,469900.00,1,1.0,0.880304,0.0,0.971136,-1.0,0.912317,0.0


In [276]:
# Backtest again, this time using only the rolling ratio/trend features
predictions = backtest(data, model, new_predictors)

In [277]:
# How many rows were predicted as 0.0 versus 1.0 across all folds
predictions["predictions"].value_counts()

predictions
0.0    205
1.0     84
Name: count, dtype: int64

In [278]:
# Precision of the thresholded predictions over all backtest folds combined
precision_score(predictions["target"], predictions["predictions"])

0.7619047619047619