In [1]:
import pandas as pd
import numpy as np

# Read the CSV with "Fdate" parsed as dates and used as the index.
df = pd.read_csv('df_20_dados_fdate.csv', parse_dates=["Fdate"], index_col="Fdate")

# Column groups are picked by substring: all 'Pa_'/'Sa_' matches first,
# followed by all 'Pb_'/'Sb_' matches, each in the frame's own column order.
price_columns = [name for tag in ('Pa_', 'Pb_') for name in df.columns if tag in name]
volume_columns = [name for tag in ('Sa_', 'Sb_') for name in df.columns if tag in name]

# The i-th price column is paired positionally with the i-th volume column;
# each product column is named "<price>_<volume>".
result = pd.DataFrame({
    f'{price}_{volume}': df[price] * df[volume]
    for price, volume in zip(price_columns, volume_columns)
})

# Row-wise totals: sum of price*volume products divided by sum of volumes.
total_value = result.sum(axis=1)
total_volume = df[volume_columns].sum(axis=1)
df['VWAP'] = total_value.div(total_volume)

# #################
# VWAP = sum over every x_y of Px_y * (Sx_y / SumS),
# where SumS is the sum of all Sx_y volume columns in the same row.
# #################


In [2]:
def get_renamed_df(df, i):
    """Return a copy of ``df`` whose column names carry the suffix ``-<i>``.

    The input frame is left untouched: the previous implementation assigned
    to ``df.columns`` directly, which silently renamed the caller's frame
    unless the caller remembered to pass a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be suffixed.
    i : object
        Value formatted into the suffix (``f'{col}-{i}'``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same data and index, and renamed columns.
    """
    renamed = df.copy()
    renamed.columns = [f'{col}-{i}' for col in df.columns]
    return renamed

window = 3

# One suffixed frame per lag: "-0" is the unshifted data, "-k" is shifted
# down by k rows for k = 1 .. window - 1.
lagged_frames = [get_renamed_df(df.copy(), 0)]
lagged_frames.extend(
    get_renamed_df(df.copy().shift(lag), lag) for lag in range(1, window)
)

# The un-suffixed columns come from shift(-1), i.e. the following row's values.
temp_df = df.copy().shift(-1)
lagged_frames.append(temp_df)

# Join everything side by side and discard rows containing any NaN.
df_lag = pd.concat(lagged_frames, axis=1).dropna()

In [3]:
threshold = 0.002

# Relative band of +/- threshold around the 'VWAP-0' column.
upper_band = df_lag['VWAP-0'] * (1 + threshold)
lower_band = df_lag['VWAP-0'] * (1 - threshold)
is_above = df_lag['VWAP'] > upper_band
is_below = df_lag['VWAP'] < lower_band

# Default label first, then overwrite; 'venda' is applied last so it wins
# if a row satisfies both conditions.
df_lag['Action'] = 'manter'
for label, mask in (('compra', is_above), ('venda', is_below)):
    df_lag.loc[mask, 'Action'] = label

df_lag['Action'].value_counts()

Action
manter    27961
venda       323
compra      288
Name: count, dtype: int64

In [4]:
# Keep only the lag-suffixed columns (names matching ".*-<digit>") plus the
# 'Action' label. The pattern is a raw string: '\d' inside a normal string
# literal is an invalid escape sequence and triggers a warning on import.
lagged_columns = df_lag.columns[df_lag.columns.str.match(r'.*-\d')]
df_lag2 = df_lag[lagged_columns.append(pd.Index(['Action']))]

# Drop every column whose name starts with 'VWAP' from the selection.
df_lag2 = df_lag2[df_lag2.columns[~df_lag2.columns.str.match('VWAP.*')]]

df_lag2['Action'].value_counts()

Action
manter    27961
venda       323
compra      288
Name: count, dtype: int64

In [5]:
# Target is the 'Action' column; features are every other column of df_lag2.
y = df_lag2['Action']
X = df_lag2.drop(columns='Action')


In [6]:
# from sklearn.model_selection import StratifiedShuffleSplit

# splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_index, test_index in splitter.split(X, y):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# display(df_lag2['Action'].value_counts() / df_lag2['Action'].count())
# display(y_train.value_counts() / y_train.count())

In [7]:
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(test_size=int(len(X) * 0.1), n_splits=5, gap=3)

# Only the indices of the last split produced by tscv are kept, so take
# that split directly instead of overwriting the variables on each pass.
train_index, test_index = list(tscv.split(X))[-1]

X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Fixed random_state keeps the fitted forest reproducible across runs.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

In [9]:
# Take the 'VWAP-0' column, expose it as "Current Price", and attach the
# test-set labels by matching on the index (inner merge, so only rows
# present in y_test remain).
vwap_data = (
    df_lag
    .filter(items=["VWAP-0"])
    .rename(columns={"VWAP-0": "Current Price"})
    .merge(y_test, left_index=True, right_index=True)
)
vwap_data["Action"].value_counts()

Action
manter    2836
compra      13
venda        8
Name: count, dtype: int64

In [10]:
# Write the price/label table (index included) to a CSV in the working directory.
output_path = "stock_action_data.csv"
vwap_data.to_csv(output_path)