<a href="https://colab.research.google.com/github/Billy-Drunkenstein/RL_CSI_300/blob/main/Bill/Neural%20Network%20Implementation%200.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gym
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
import random

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random

In [3]:
data = pd.read_csv('000300.SH.csv', index_col = 0, parse_dates = True)
data.tail()

Unnamed: 0,OPEN,HIGH,LOW,CLOSE,VWAP,VOLUME,AMT,TURN,TOTAL_SHARES,FREE_FLOAT_SHARES,MKT_CAP_ARD,MKT_FREESHARES,PE_TTM,VAL_PB_WGT,DIVIDENDYIELD2
2025-05-12,3866.5305,3890.7342,3859.0346,3890.6096,19.312476,15333250000.0,296122900000.0,0.4696,4210860000000.0,1109568000000.0,57048000000000.0,18703830000000.0,12.5363,1.3089,3.4887
2025-05-13,3915.5924,3915.5924,3890.6855,3896.2583,18.16358,14732030000.0,267586400000.0,0.4512,4210862000000.0,1109540000000.0,57207610000000.0,18731350000000.0,12.5713,1.3127,3.485
2025-05-14,3894.8386,3960.5228,3890.6504,3943.2108,18.569913,17203380000.0,319465200000.0,0.5269,4210862000000.0,1109540000000.0,57880740000000.0,18954250000000.0,12.7193,1.3282,3.4508
2025-05-15,3934.0273,3936.4653,3904.9068,3907.1992,17.487101,13808170000.0,241464900000.0,0.4229,4210863000000.0,1109530000000.0,57498590000000.0,18780550000000.0,12.6353,1.3196,3.4779
2025-05-16,3898.6662,3903.8528,3874.8619,3889.0854,17.637351,11840530000.0,208835700000.0,0.3626,4210807000000.0,1109475000000.0,57160820000000.0,18693070000000.0,12.5611,1.3119,3.4436


In [4]:
WINDOWS = {"3y": 756, "1y": 252, "6m": 126, "3m": 63, "1m": 21, "2w": 10}

new_columns = {}
for col in data.columns:
    for window_name, window in WINDOWS.items():
        rolling_mean = data[col].rolling(window=window).mean()
        rolling_std = data[col].rolling(window=window).std()

        z_score = (data[col] - rolling_mean) / rolling_std
        z_score = z_score.clip(lower=-10, upper=10)

        new_columns[f"{col}_{window_name}_Z"] = z_score

data = pd.concat([data, pd.DataFrame(new_columns)], axis = 1)

data["DAILY_RETURN"] = data["OPEN"].pct_change()
data["DAILY_RETURN_DIFF1"] = data["DAILY_RETURN"].diff(1)
data["VOLUME_DIFF1"] = data["VOLUME"].diff(1)
data["VOLUME_DIFF2"] = data["VOLUME"].diff(2)

In [5]:
open_prices = data["OPEN"].values

for n in [3, 5, 7, 10, 15]:
    max_dailyized_returns = []

    for i in range(len(open_prices)):
        if i < n:
            max_dailyized_returns.append(np.nan)
            continue

        max_ret = float("-inf")
        for k in range(1, n + 1):  # subwindow length
            for offset in range(i - n, i - k + 1):
                if offset < 0:
                    continue
                p_start = open_prices[offset]
                p_end = open_prices[offset + k]
                if p_start <= 0:
                    continue
                dailyized_ret = (p_end / p_start) ** (1 / k) - 1
                max_ret = max(max_ret, dailyized_ret)

        max_dailyized_returns.append(max_ret)

    data[f"MAX_DAILYIZED_{n}"] = max_dailyized_returns

data.dropna(inplace = True)

In [6]:
data.head()

Unnamed: 0,OPEN,HIGH,LOW,CLOSE,VWAP,VOLUME,AMT,TURN,TOTAL_SHARES,FREE_FLOAT_SHARES,...,DIVIDENDYIELD2_2w_Z,DAILY_RETURN,DAILY_RETURN_DIFF1,VOLUME_DIFF1,VOLUME_DIFF2,MAX_DAILYIZED_3,MAX_DAILYIZED_5,MAX_DAILYIZED_7,MAX_DAILYIZED_10,MAX_DAILYIZED_15
2008-05-20,3911.096,3946.441,3698.554,3710.818,16.263919,3970263000.0,64572030000.0,1.2424,1605192000000.0,282996700000.0,...,1.811529,-0.003876,0.001626,999304800.0,-1912300.0,-0.003876,0.042459,0.042459,0.049507,0.058347
2008-05-21,3668.181,3801.067,3591.566,3783.049,15.868166,4359616000.0,69179110000.0,1.3618,1606971000000.0,283567700000.0,...,1.113035,-0.062109,-0.058233,389353300.0,1388658000.0,-0.003876,0.042459,0.042459,0.049507,0.058347
2008-05-22,3732.641,3783.849,3704.157,3711.444,16.268236,4074272000.0,66281210000.0,1.2715,1611218000000.0,283889400000.0,...,1.132834,0.017573,0.079682,-285344500.0,104008800.0,0.017573,0.017573,0.042459,0.049507,0.058347
2008-05-23,3697.89,3741.06,3629.009,3675.147,15.708758,3344043000.0,52530760000.0,1.0413,1612052000000.0,284925500000.0,...,1.308002,-0.00931,-0.026883,-730228600.0,-1015573000.0,0.017573,0.017573,0.042459,0.042459,0.058347
2008-05-26,3643.012,3643.012,3555.762,3559.217,16.154744,2818019000.0,45524370000.0,0.8768,1612599000000.0,285315700000.0,...,1.619508,-0.01484,-0.00553,-526024100.0,-1256253000.0,0.017573,0.017573,0.017573,0.042459,0.049507


# Trade Parameters

In [7]:
transaction_cost = 0.0015

# Reinforcement Learning Parameters

In [8]:
gamma = 0.9     # Reward Decay Coefficient

# CSI 300 Trading Environment

In [17]:
class CSI300TradingEnv(Env):

    def __init__(self, data: pd.DataFrame, window_size: int = 1, lookback: int = 50, ignored: int = 3, mode: str = "train",
                 transaction_cost: float = 0.0015, gamma: float = 0.9):
        super().__init__()

        self.data = data
        self.raw_data = data.copy(deep = True)
        self.window_size = window_size
        self.num_features = self.data.shape[1]
        self.reward_length = lookback
        self.ignored = ignored

                                #  0 = Purchase
                                #  1 = Liquidate
        self.action_space = Discrete(2)

        self.transaction_cost = transaction_cost

        self.gamma = gamma

        # Action Mapping for Reward Calculation
        self.action_mapping = {0: -1, 1: 1}

        self.action_label = {
            0: "liquidate",
            1: "enter"
        }

        self.observation_space = Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.window_size * self.num_features,),
            dtype=np.float32
        )

        self.index_position = self.window_size
        self.position = -1
        self.done = False

        self.position_history = [] # len = len(return_history) + 1
        self.return_history = []

        self.cumulative_log_return = 0.0

        self.data_all = data
        self.splits = {
            'train': (0, int(0.75 * len(self.data_all))),
            'test': (int(0.75 * len(self.data_all)), int(0.9 * len(self.data_all))),
            'oos': (int(0.9 * len(self.data_all)), len(self.data_all))
        }

        train_start, train_end = self.splits['train']
        scaler = StandardScaler()
        scaler.fit(self.data_all.iloc[train_start:train_end])

        scaled_values = scaler.transform(self.data_all)
        self.data_all = pd.DataFrame(scaled_values, columns=self.data_all.columns, index=self.data_all.index)

        self.mode = mode
        self._set_data_for_mode()


        # Temporary Debug Variables
        self.return_history_dated = []     # List of (date, return)
        self.position_history_dated = []   # List of (date, position)

    def _set_data_for_mode(self):
        start, end = self.splits[self.mode]
        self.data = self.data_all.iloc[start:end]

    def set_mode(self, mode: str):
        if mode not in self.splits:
            raise ValueError(f"Invalid mode: {mode}. Must be one of {list(self.splits.keys())}")
        self.mode = mode
        self._set_data_for_mode()
        self.reset()

    def change_lookback(self, new_lookback: int):
        self.reward_length = new_lookback

    def reset(self):
        self.index_position = self.window_size
        self.position = -1
        self.done = False
        self.position_history = []
        self.return_history = []
        self.cumulative_log_return = 0.0

        self.return_history_dated = []
        self.position_history_dated = []

        return self._get_observation()

    def step(self, action):
        mapped_action = self.action_mapping[action]

        # Update Portfolio Status:
        self.position = self.action_mapping[action]

        date_today = self.data.index[self.index_position]
        date_prev = self.data.index[self.index_position - 1]

        open_today = self.raw_data.loc[date_today, "OPEN"]
        open_prev = self.raw_data.loc[date_prev, "OPEN"]

        if self.index_position == len(self.data) - 1:
            daily_return = 0.0
        else:
            date_next = self.data.index[self.index_position + 1]
            open_next = self.raw_data.loc[date_next, "OPEN"]
            daily_return = open_next / open_today - 1.0

        self.position_history.append(self.position)
        self.position_history_dated.append((date_today, self.position))

        # Skip the first day
        if self.index_position > self.window_size:
            self.return_history.append(daily_return)
            self.return_history_dated.append((date_prev, daily_return))

        if len(self.return_history) > self.reward_length:
            self.return_history.pop(0)
            self.return_history_dated.pop(0)
        if len(self.position_history) > self.reward_length + 1:
            self.position_history.pop(0)
            self.position_history_dated.pop(0)

        log_daily_return = np.log(1 + daily_return) if self.position == 1 else 0.0

        if len(self.position_history) >= 2:
            prev_position = self.position_history[-2]
            transaction_cost = self.transaction_cost if prev_position != self.position else 0.0
        else:
            transaction_cost = 0.0

        self.cumulative_log_return += log_daily_return - transaction_cost

        # Reward and Return
        reward = 0.0
        total_return = 0.0
        cutoff = self.reward_length - self.ignored
        if len(self.return_history_dated) >= self.reward_length and len(self.position_history_dated) >= self.reward_length + 1:
            returns_slice = self.return_history_dated[:cutoff]
            positions_slice = self.position_history_dated[:cutoff]

            dates_match = all(r[0] == p[0] for r, p in zip(returns_slice, positions_slice))
            if dates_match:
                n = len(returns_slice)

                effective_returns = np.array([r[1] for r in returns_slice])
                effective_positions = np.array([p[1] for p in positions_slice])
                weights = np.array([self.gamma ** (n - 1 - i) for i in range(n)])

                reward = np.sum(weights * effective_positions * effective_returns)

                log_returns = [
					np.log(1 + r) if p == 1 else 0.0
					for p, r in zip(effective_positions, effective_returns)
				]
                total_return = np.exp(np.sum(log_returns)) - 1.0

                transition_count = np.sum(np.abs(np.diff(effective_positions)))
                transaction_cost = transition_count * self.transaction_cost

                reward -= transaction_cost
                total_return -= transaction_cost
            else:
                print("[Warning] Mismatched dates in return and position histories!")


        self.index_position += 1
        if self.index_position >= len(self.data):
            self.done = True

        obs = self._get_observation()

        info = {
            "date": date_today,
            "position": self.position,
            "mapped_action": mapped_action,
            "action_str": self.action_label[action],
            "return": daily_return,
            "total_return": np.exp(self.cumulative_log_return) - 1,
            "reward": reward
        }

        return obs, reward, self.done, info

    def _get_observation(self):
        indices = self.data.index[self.index_position - self.window_size : self.index_position]
        window = self.data.loc[indices]
        return window.values.flatten().astype(np.float32)

    def close(self):
        pass

In [10]:
# splits = {
#         'train': (0, int(0.75 * len(data))),
#         'test': (int(0.75 * len(data)), int(0.9 * len(data))),
#         'oos': (int(0.9 * len(data)), len(data))
#         }

# train_start, train_end = splits['train']
# scaler = StandardScaler()
# scaler.fit(data.iloc[train_start:train_end])

# scaled_values = scaler.transform(data)
# temp = pd.DataFrame(scaled_values, columns=data.columns, index=data.index)

# Environment Class Debug

In [18]:
env = CSI300TradingEnv(data = data, mode = 'train', lookback = 1, ignored = 0, transaction_cost = transaction_cost)
obs = env.reset()

rows = []

for step in range(30):
    action = random.randint(0, 1)

    obs, reward, done, info = env.step(action)

    row = {
        "Action": f"{action:<2} {info['action_str']:<9}",
        "Mapped": info["mapped_action"],
        "Position": info["position"],
        "Return": round(info["return"], 5),
        "Cumulative_Return": round(info["total_return"], 5),
        "Reward": round(info["reward"], 5)
    }
    rows.append((info["date"], row))

df = pd.DataFrame.from_dict(dict(rows), orient="index")

df.head(30)

Unnamed: 0,Action,Mapped,Position,Return,Cumulative_Return,Reward
2008-05-21,0 liquidate,-1,-1,0.01757,0.0,0.0
2008-05-22,1 enter,1,1,-0.00931,-0.01079,0.00931
2008-05-23,0 liquidate,-1,-1,-0.01484,-0.01228,-0.01484
2008-05-26,0 liquidate,-1,-1,-0.02554,-0.01228,0.02554
2008-05-27,0 liquidate,-1,-1,0.00938,-0.01228,-0.00938
2008-05-28,0 liquidate,-1,-1,0.02314,-0.01228,-0.02314
2008-05-29,1 enter,1,1,-0.02198,-0.03544,0.02198
2008-05-30,0 liquidate,-1,-1,0.00408,-0.03688,0.00408
2008-06-02,1 enter,1,1,0.00676,-0.03183,-0.00676
2008-06-03,1 enter,1,1,-0.00635,-0.03798,-0.00635


# Shallow NN Trader

In [19]:
class Shallow_NN_Agent:
    def __init__(self, input_dim, hidden_dims = [128, 64, 32], action_dim = 3, lr = 1e-3, gamma = 0.9, lookback = 50, ignored = 5):
        self.gamma = gamma
        self.lookback = lookback
        self.ignored = ignored
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        layers = []
        prev_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.ReLU())
            prev_dim = hidden_dim

        layers.append(nn.Linear(prev_dim, 1))
        layers.append(nn.Sigmoid())

        self.net = nn.Sequential(*layers).to(self.device)
        self.optimizer = optim.Adam(self.net.parameters(), lr = lr)

        self.history = []

    def act(self, filtration, do_debug):
        x = torch.tensor(filtration, dtype=torch.float32, device=self.device)
        prob = self.net(x).squeeze()
        dist = torch.distributions.Bernoulli(probs=prob)
        action = dist.sample()

        if do_debug:
            print(f"Probs: {prob.detach().cpu().numpy()}, Action: {action.item()}")


        self.history.append((x, action, dist.log_prob(action)))

        return int(action.item())

    def update(self, reward):
        if len(self.history) < self.lookback:
            return

        useful = self.history[:self.lookback - self.ignored]

        loss = 0
        n = len(useful)

        for t, (x, action, log_prob) in enumerate(useful):
            discounted = reward * (self.gamma ** (n - 1 - t))
            loss -= log_prob * discounted

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.history.clear()

# Training
## Model and Training Variables

In [20]:
lookback = 20
ignored = 3
window_size = 1
gamma = 0.9

transaction_cost = 0.0015

## Environment and Agent Startup

In [21]:
env = CSI300TradingEnv(
    data=data,
    window_size=window_size,
    lookback=lookback,
    ignored=ignored,
    transaction_cost=transaction_cost,
    gamma=gamma,
    mode="train"
)

agent = Shallow_NN_Agent(
    input_dim=env.observation_space.shape[0],
    hidden_dims=[128, 64, 32],
    action_dim=3,
    lr=1e-3,
    gamma=gamma,
    lookback=lookback,
    ignored=ignored
)

## Training

In [22]:
agent = Shallow_NN_Agent(
    input_dim=114,
    hidden_dims=[128, 64, 32],
    action_dim=1,
    lr=1e-3,
    gamma=agent.gamma,
    lookback=agent.lookback,
    ignored=agent.ignored
)

agent.net.load_state_dict(torch.load("best_shallow_nn_weights.pth"))

<All keys matched successfully>

In [23]:
env.set_mode("train")
obs = env.reset()
done = False

rows = []

while not done:
    action = agent.act(obs, False)
    obs, reward, done, info = env.step(action)

    if rows:
        rows[-1][1]["Return"] = round(info["return"], 5)

    row = {
        "Action": f"{action:<2} {info['action_str']:<9}",
        "Mapped": info["mapped_action"],
        "Position": info["position"],
        "Return": None,
        "Cumulative_Return": round(info["total_return"], 5),
        "Reward": round(info["reward"], 5)
    }
    rows.append((info["date"], row))

    # if reward != 0:
    #     agent.update(reward)


df_train = pd.DataFrame.from_dict(dict(rows), orient="index")

print(df_train["Action"].value_counts())

Action
0  liquidate    1622
1  enter        1432
Name: count, dtype: int64


In [24]:
df_train.index = pd.to_datetime(df_train.index).strftime("%Y-%m-%d")

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_train.index,
    y=100 * df_train["Cumulative_Return"],
    mode='lines',
    name='Cumulative Return'
))

fig.update_layout(
    title="Train Cumulative PnL",
    xaxis_title="Date",
    yaxis_title="Cumulative Return",
    yaxis=dict(
        tickformat=".2f",
        ticksuffix="%"
    ),
    xaxis=dict(
        type="category",
        tickmode='linear',
        tick0=0,
        dtick=max(len(df_train) // 15, 1),
        tickangle=-45,
        showgrid=False
    ),
    template="plotly_dark",
    height=500
)

fig.show()

# Test

In [25]:
env.set_mode("test")
obs = env.reset()
done = False

rows = []

while not done:
    action = agent.act(obs, False)  # Use the trained agent
    obs, reward, done, info = env.step(action)

    # Apply today's return to yesterday's row
    if rows:
        rows[-1][1]["Return"] = round(info["return"], 5)

    row = {
        "Action": f"{action:<2} {info['action_str']:<9}",
        "Mapped": info["mapped_action"],
        "Position": info["position"],
        "Return": None,  # Will be filled next step
        "Cumulative_Return": round(info["total_return"], 5),
        "Reward": round(info["reward"], 5)
    }
    rows.append((info["date"], row))

# Final test log as DataFrame
df_test = pd.DataFrame.from_dict(dict(rows), orient="index")

In [26]:
df_test.index = pd.to_datetime(df_test.index).strftime("%Y-%m-%d")

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_test.index,
    y=100 * df_test["Cumulative_Return"],
    mode='lines',
    name='Cumulative Return'
))

fig.update_layout(
    title="Test Cumulative PnL",
    xaxis_title="Date",
    yaxis_title="Cumulative Return",
    yaxis=dict(
        tickformat=".2f",
        ticksuffix="%"
    ),
    xaxis=dict(
        type="category",
        tickmode='linear',
        tick0=0,
        dtick=max(len(df_test) // 15, 1),
        tickangle=-45,
        showgrid=False
    ),
    template="plotly_dark",
    height=500
)

fig.show()

In [27]:
env.set_mode("oos")
obs = env.reset()
done = False

rows = []

while not done:
    action = agent.act(obs, False)
    obs, reward, done, info = env.step(action)

    if rows:
        rows[-1][1]["Return"] = round(info["return"], 5)

    row = {
        "Action": f"{action:<2} {info['action_str']:<9}",
        "Mapped": info["mapped_action"],
        "Position": info["position"],
        "Return": None,
        "Cumulative_Return": round(info["total_return"], 5),
        "Reward": round(info["reward"], 5)
    }

    rows.append((info["date"], row))

df_oos = pd.DataFrame.from_dict(dict(rows), orient="index")
df_oos.index = pd.to_datetime(df_oos.index).strftime("%Y-%m-%d")

In [28]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_oos.index,
    y=100 * df_oos["Cumulative_Return"],
    mode='lines',
    name='Cumulative Return'
))

fig.update_layout(
    title="OOS Cumulative PnL",
    xaxis_title="Date",
    yaxis_title="Cumulative Return",
    yaxis=dict(
        tickformat=".2f",
        ticksuffix="%"
    ),
    xaxis=dict(
        type="category",
        tickmode='linear',
        tick0=0,
        dtick=max(len(df_oos) // 15, 1),
        tickangle=-45,
        showgrid=False,
    ),
    template="plotly_dark",
    height=500
)

fig.show()

In [None]:
print(df_oos["Action"].value_counts())

Action
0  liquidate    240
1  enter        155
Name: count, dtype: int64


In [149]:
torch.save(agent.net.state_dict(), "best_shallow_nn_weights.pth")

In [51]:
env.set_mode("oos")
env.index_position = len(env.data)
final_obs = env._get_observation()

action = agent.act(final_obs, do_debug=False)
action_str = env.action_label[action]

print(f"Final action on last OOS day: {action} → '{action_str}'")

Final action on last OOS day: 1 → 'enter'
