In [1]:
import pandas as pd
from indicators import RSI, extract_bb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import numpy as np
import warnings
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from IPython.display import clear_output
import plotly.graph_objects as go
import plotly.express as px

import gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from gym import spaces
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import ProgressBarCallback

warnings.filterwarnings("ignore")

In [32]:
df = pd.read_csv("gzpn_data.csv")
df = df.dropna().reset_index(drop=True)  # .sample(frac=1)
df = df.drop(columns=["<TICKER>", "<PER>", "<DATE>", "<TIME>"])
df.columns = ["open", "high", "low", "close", "volume"]
df

Unnamed: 0,open,high,low,close,volume
0,162.55,162.83,162.50,162.63,214810
1,162.65,162.70,162.43,162.49,218280
2,162.49,162.50,162.31,162.42,174320
3,162.43,162.49,162.30,162.43,128060
4,162.45,162.46,162.28,162.29,125240
...,...,...,...,...,...
2128,163.37,163.40,163.30,163.40,171700
2129,163.40,163.82,163.39,163.71,623050
2130,163.70,163.82,163.61,163.80,314660
2131,163.80,163.82,163.60,163.70,136580


In [33]:
n_steps = 11

prices = df["close"]

rsi_values = RSI(prices=prices, n_steps=n_steps)
bb_values = extract_bb(prices=prices, n_steps=n_steps)

assert len(rsi_values) == len(
    bb_values
), f"Indicators length don't coincide: {len(rsi_values)} and {len(bb_values)}"

In [34]:
def prepare_target(df, steps_obs: int = 3):
    targets = []
    for i in range(0, len(df) - steps_obs):
        current_price = df["close"].iloc[i]
        max_price = df["high"].iloc[i + 1 : i + 1 + steps_obs].max()
        targets.append(max_price > current_price)
    targets += [0] * steps_obs
    return np.array(targets, dtype=np.int32)


steps_obs = 3

targets = prepare_target(df=df, steps_obs=steps_obs)

In [35]:
indicator_data = (
    pd.DataFrame(
        data=np.array([rsi_values, bb_values, targets]).T,
        columns=["rsi", "bb", "target"],
    )
    .dropna()
    .reset_index(drop=True)
    .astype(np.float64)
)
indicator_data["target"] = indicator_data["target"].astype(np.int32)
indicator_data

Unnamed: 0,rsi,bb,target
0,42.465753,-0.450672,1
1,53.947368,0.979324,1
2,53.333333,1.004554,1
3,63.492063,0.659716,1
4,59.649123,0.850015,1
...,...,...,...
2117,59.782609,1.467361,1
2118,66.363636,2.596038,1
2119,78.846154,2.145840,0
2120,74.545455,1.372398,0


In [36]:
indicator_data.describe()

Unnamed: 0,rsi,bb,target
count,2122.0,2122.0,2122.0
mean,49.882666,-0.018201,0.874647
std,19.02269,1.211967,0.331197
min,0.0,-3.059143,0.0
25%,36.390374,-1.013441,1.0
50%,49.382716,-0.019851,1.0
75%,63.106796,0.947435,1.0
max,100.0,3.140485,1.0


In [37]:
all_data = indicator_data.copy()
all_data[df.columns] = df.iloc[n_steps:, :].values

all_data

Unnamed: 0,rsi,bb,target,open,high,low,close,volume
0,42.465753,-0.450672,1,162.41,162.50,162.37,162.38,102660.0
1,53.947368,0.979324,1,162.37,162.50,162.37,162.48,70060.0
2,53.333333,1.004554,1,162.48,162.50,162.46,162.48,18360.0
3,63.492063,0.659716,1,162.49,162.49,162.39,162.46,39080.0
4,59.649123,0.850015,1,162.46,162.50,162.45,162.48,156850.0
...,...,...,...,...,...,...,...,...
2117,59.782609,1.467361,1,163.37,163.40,163.30,163.40,171700.0
2118,66.363636,2.596038,1,163.40,163.82,163.39,163.71,623050.0
2119,78.846154,2.145840,0,163.70,163.82,163.61,163.80,314660.0
2120,74.545455,1.372398,0,163.80,163.82,163.60,163.70,136580.0


# Обработка фичей

In [38]:
scaler = MinMaxScaler()
scaler.fit(indicator_data.iloc[:, :-1])
indicator_data.iloc[:, :-1] = scaler.transform(indicator_data.iloc[:, :-1])
indicator_data

Unnamed: 0,rsi,bb,target
0,0.424658,0.420746,1
1,0.539474,0.651405,1
2,0.533333,0.655474,1
3,0.634921,0.599852,1
4,0.596491,0.630547,1
...,...,...,...
2117,0.597826,0.730125,1
2118,0.663636,0.912181,1
2119,0.788462,0.839564,0
2120,0.745455,0.714808,0


In [39]:
indicator_data[["rsi", "bb"]].iloc[1].values

array([0.53947368, 0.65140469])

In [40]:
X = indicator_data.iloc[:, :]
y = indicator_data.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, shuffle=False
)

In [41]:
step_data = df.iloc[:50]
x = step_data.index
price = step_data["close"]

x_buy = np.array([i for i in x[::2]])
y_buy = price[x_buy]

In [42]:
all_data

Unnamed: 0,rsi,bb,target,open,high,low,close,volume
0,42.465753,-0.450672,1,162.41,162.50,162.37,162.38,102660.0
1,53.947368,0.979324,1,162.37,162.50,162.37,162.48,70060.0
2,53.333333,1.004554,1,162.48,162.50,162.46,162.48,18360.0
3,63.492063,0.659716,1,162.49,162.49,162.39,162.46,39080.0
4,59.649123,0.850015,1,162.46,162.50,162.45,162.48,156850.0
...,...,...,...,...,...,...,...,...
2117,59.782609,1.467361,1,163.37,163.40,163.30,163.40,171700.0
2118,66.363636,2.596038,1,163.40,163.82,163.39,163.71,623050.0
2119,78.846154,2.145840,0,163.70,163.82,163.61,163.80,314660.0
2120,74.545455,1.372398,0,163.80,163.82,163.60,163.70,136580.0


In [43]:
indicator_data = indicator_data.iloc[:, :2]

In [59]:
# Your DataFrame `df` should contain all observations and you will have your reward function `reward_function`
# df = all_da.copy()
# df.columns = [col.lower() for col in df.columns]

steps_obs = 3
commision_bps = 0
rewards = []
Statistics = []

init_balance = 200
init_active = 10

start_price = all_data["close"].iloc[0]
init_pv = init_balance + init_active * start_price

MAX_BALANCE = 2 * init_pv
MAX_ACTIVE = 2 * init_pv / start_price


class CustomEnv(gym.Env):
    def __init__(self, df, init_balance, init_active, test=False):
        super(CustomEnv, self).__init__()
        self.df = df
        self.test = test
        self.seed = lambda x: 0
        self.current_step = 0
        # Assume observation space is a 1D array of features (adjust according to your data)
        self.observation_space = spaces.Box(low=0, high=1, shape=(4,), dtype=np.float32)
        self.balance = init_balance
        self.active = init_active
        self.init_balance = init_balance
        self.init_active = init_active
        self.current_price = self.df["close"].iloc[self.current_step]
        self.pv = self.balance + self.active * self.current_price
        # Adjust according to your action space
        self.action_space = spaces.Box(
            low=np.array([-1]),
            high=np.array([1]),
            dtype=np.float32,
        )

    def reset(self):
        # Reset the state of the environment to an initial state
        # print("reseting...")
        self.current_step = 0
        if self.test:
            rewards.append([])
            Statistics.append([])

        init_state = np.append(
            self.df[indicator_data.columns].iloc[0].values,
            np.array([self.init_balance / MAX_BALANCE, self.init_active / MAX_ACTIVE]),
        )
        self.balance = init_balance
        self.active = init_active
        self.current_price = self.df["close"].iloc[self.current_step]
        self.pv = self.balance + self.init_active * self.current_price
        return init_state

    def reward_function(self, action):
        next_price = self.df["close"].iloc[self.current_step + 1]
        # покупаем
        if action > 0:
            money_to_spend = self.balance * action
            self.active += money_to_spend / self.current_price
            self.balance -= money_to_spend
        # продаем
        elif action < 0:
            active_to_sell = self.active * abs(action)
            self.active -= active_to_sell
            self.balance += active_to_sell * self.current_price
        new_pv = self.active * next_price + self.balance
        reward = (new_pv - self.pv) / self.pv * 1e4

        return reward

    def step(self, action):
        # Calculate the reward

        reward = self.reward_function(action)
        self.current_step += 1
        if self.current_step < len(self.df.index) - (steps_obs + 1):
            next_state = np.append(
                self.df[indicator_data.columns].iloc[self.current_step].values,
                np.array([self.balance / MAX_BALANCE, self.active / MAX_ACTIVE]),
            )
            done = False
        else:
            # The episode is finished
            next_state = np.zeros(self.observation_space.shape)
            done = True

        # Placeholder for info, additional data can be added if needed
        info = {}
        # print(next_state)
        if self.test:
            rewards[-1].append(reward)
        self.current_price = self.df["close"].iloc[self.current_step]
        self.pv = self.active * self.current_price + self.balance

        return next_state, reward, done, info


# Create multiple environments
env = CustomEnv(df=all_data, init_balance=init_balance, init_active=init_active)
env = DummyVecEnv([lambda: env])

In [60]:
policy_kwargs = dict(activation_fn=nn.ReLU, net_arch=dict(pi=[20, 20], vf=[20, 20]))

# Create the PPO model
model = PPO("MlpPolicy", env, verbose=1, device="cpu", policy_kwargs=policy_kwargs)

# Train the model
model.learn(total_timesteps=2500)  # , progress_bar=True)

# Evaluate the trained model
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=2)
print(f"Mean reward: {mean_reward} +/- {std_reward}")

Using cpu device
-----------------------------
| time/              |      |
|    fps             | 125  |
|    iterations      | 1    |
|    time_elapsed    | 16   |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 119          |
|    iterations           | 2            |
|    time_elapsed         | 34           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0027503548 |
|    clip_fraction        | 0.0197       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | -0.0656      |
|    learning_rate        | 0.0003       |
|    loss                 | 95.6         |
|    n_updates            | 10           |
|    policy_gradient_loss | 0.000572     |
|    std                  | 0.985        |
|    value_loss           | 193          |

In [61]:
# Reset the environment to get the initial observation
obs = env.reset()

num_iterations = 20
actions_taken = []
rewards = []

for _ in range(num_iterations):
    print(obs)
    # Choose an action randomly (replace this with your own policy)
    action, _ = model.predict(obs, deterministic=False)

    # Take a step in the environment with the predicted action
    obs, reward, done, _ = env.step(action)

    # Save the action taken
    actions_taken.append(action)
    rewards.append(reward)

    if done:
        break

[[42.46575    -0.45067176  0.05483057  0.44516942]]
[[53.94737     0.9793236   0.26998982  0.23001018]]
[[5.3333332e+01 1.0045544e+00 4.6473113e-01 3.5388723e-02]]
[[63.49206     0.6597164   0.          0.49983382]]
[[59.649124    0.850015    0.          0.49983382]]
[[48.88889     0.86629796  0.          0.49983382]]
[[51.11111     0.86253893  0.          0.49983382]]


[[82.14286     0.86253893  0.          0.49983382]]
[[87.17949     2.0831559   0.22073391  0.27926293]]
[[85.29412     1.8733509   0.2286256   0.27138337]]
[[93.75        1.6657321   0.          0.49962947]]
[[96.36364     2.6980762   0.          0.49962947]]
[[82.8125      1.6870966   0.          0.49962947]]
[[86.56716     1.5599154   0.          0.49962947]]
[[76.712326    0.9766236   0.13574617  0.36435813]]
[[69.620255    0.5112316   0.07972115  0.42021456]]
[[71.084335    0.64203644  0.50102264  0.        ]]
[[72.72727     0.7772954   0.32246393  0.17804332]]
[[63.75        0.35981113  0.41902605  0.08178944]]
[[64.19753     0.43969142  0.50105226  0.        ]]


In [62]:
test_df = all_data.iloc[:num_iterations, :].copy()
test_df["actions"] = np.array(actions_taken).T[0][0]
test_df["rewards"] = np.array(rewards)

test_df

Unnamed: 0,rsi,bb,target,open,high,low,close,volume,actions,rewards
0,42.465753,-0.450672,1,162.41,162.5,162.37,162.38,102660.0,-0.48332,2.832556
1,53.947368,0.979324,1,162.37,162.5,162.37,162.48,70060.0,-0.846143,0.0
2,53.333333,1.004554,1,162.48,162.5,162.46,162.48,18360.0,1.0,-1.229859
3,63.492063,0.659716,1,162.49,162.49,162.39,162.46,39080.0,1.0,1.23001
4,59.649123,0.850015,1,162.46,162.5,162.45,162.48,156850.0,0.469128,0.616268
5,48.888889,0.866298,1,162.48,162.5,162.48,162.49,83300.0,0.308973,0.614892
6,51.111111,0.862539,1,162.5,162.5,162.49,162.5,52370.0,1.0,0.0
7,82.142857,0.862539,1,162.49,162.5,162.49,162.5,28020.0,-0.441288,4.469901
8,87.179487,2.083156,1,162.5,162.63,162.49,162.63,62910.0,-0.028216,0.668079
9,85.294118,1.873351,1,162.62,162.69,162.62,162.65,54210.0,1.0,0.615207


In [63]:
fig = px.line(
    test_df, x=test_df.index, y="close", hover_data=["rsi", "bb", "actions", "rewards"]
)
fig.update_traces(mode="markers+lines")

fig.show()