In [None]:
from stable_baselines3 import PPO

# Build a PPO agent for CartPole-v1, then train it for 10k timesteps.
# `model` ends up bound to whatever `learn` returns, exactly as in the chained form.
model = PPO("MlpPolicy", "CartPole-v1")
model = model.learn(total_timesteps=10_000)

In [None]:
import gymnasium as gym
from stable_baselines3 import PPO

# Train PPO on CartPole with an on-screen window, then watch the trained policy.
env = gym.make("CartPole-v1", render_mode="human")
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10_000)

# Roll the policy out through the vectorized wrapper the model holds for `env`.
vec_env = model.get_env()
obs = vec_env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    vec_env.render()
    # No manual reset: the VecEnv resets automatically when an episode ends, so
    # `obs` is already the first observation of the next episode. Calling
    # `env.reset()` on the raw Gymnasium env here would hand back an
    # `(observation, info)` tuple and break the next `model.predict(obs)` call.

env.close()

# Deliberately no `exit()`: inside a notebook it terminates the kernel session,
# which prevents every later cell from running.

In [None]:
import gymnasium as gym
from stable_baselines3 import PPO

# Very short PPO training run on CartPole with the render window enabled.
# The step budget is kept in a variable so a rollout loop can reuse it.
total_timesteps = 100

env = gym.make("CartPole-v1", render_mode="human")
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=total_timesteps)

# observation, info = env.reset(seed=42)
# for _ in range(total_timesteps):
#     # action = env.action_space.sample()
#     action, _states = model.predict(observation, deterministic=True)
#     observation, reward, terminated, truncated, info = env.step(action)
#     env.render()
    
#     if terminated or truncated:
#         observation, info = env.reset()
        
# env.close()

In [None]:
import numpy as np

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import BaseCallback

# SAC agent on Pendulum-v1; TensorBoard event files go under the given log directory.
model = SAC(
    "MlpPolicy",
    "Pendulum-v1",
    tensorboard_log="./a2c_cartpole_tensorboard/",
    verbose=1,
)


class TensorboardCallback(BaseCallback):
    """Callback that records one extra scalar through the training logger on every step.

    The recorded value is just a random number; it stands in for a real metric.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose=verbose)

    def _on_step(self) -> bool:
        """Record a fresh random number under the key ``random_value``; always returns True."""
        self.logger.record("random_value", np.random.random())
        return True


# Train for 50k timesteps under the run name "4_run", with the custom callback attached.
model.learn(
    total_timesteps=50_000,
    tb_log_name="4_run",
    callback=TensorboardCallback(),
)

In [None]:
from stable_baselines3 import A2C

model = A2C(
    "MlpPolicy",
    "CartPole-v1",
    verbose=1,
    tensorboard_log="./a2c_cartpole_tensorboard/",
)

# First training segment.
model.learn(total_timesteps=10_000, tb_log_name="first_run")

# Two follow-up segments on the same model. reset_num_timesteps=False is passed
# so the timestep counter is not reset between calls; by default a new curve
# would be created. Each segment is logged under its own tb_log_name here.
for run_name in ("second_run", "third_run"):
    model.learn(total_timesteps=10_000, tb_log_name=run_name, reset_num_timesteps=False)

In [None]:
from stable_baselines3 import A2C

# Fresh A2C agent on CartPole-v1, trained for 100k timesteps with TensorBoard logging.
model = A2C(
    "MlpPolicy",
    "CartPole-v1",
    verbose=1,
    tensorboard_log="./a2c_cartpole_tensorboard/",
)
model.learn(total_timesteps=100_000)

In [None]:
# Display the policy of the current `model`. This cell does not define `model`;
# it relies on whichever cell that assigns `model` was executed last.
model.policy

In [None]:
import gymnasium as gym
import torch as th

from stable_baselines3 import PPO

# Separate actor (pi) and value-function (vf) networks, each with two hidden
# layers of 32 units and ReLU activations.
# Note: an extra linear layer will be added on top of the pi and the vf nets, respectively
policy_kwargs = {
    "activation_fn": th.nn.ReLU,
    "net_arch": {"pi": [32, 32], "vf": [32, 32]},
}

# Build the agent with the custom architecture.
model = PPO(
    "MlpPolicy",
    "CartPole-v1",
    policy_kwargs=policy_kwargs,
    verbose=1,
)

# Keep a handle on the environment the model holds.
env = model.get_env()

# Train.
model.learn(total_timesteps=20_000)
# Save the agent
# model.save("ppo_cartpole")

# del model
# the policy_kwargs are automatically loaded
# model = PPO.load("ppo_cartpole", env=env)

In [None]:
import gymnasium as gym

from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env

# Parallel environments
env = make_vec_env("CartPole-v1", n_envs=4)

# The checkpoint loaded below is produced by this (currently disabled) training
# run; enable it once if "a2c_cartpole" has not been saved yet.
# model = A2C("MlpPolicy", env, verbose=1)
# model.learn(total_timesteps=25000)
# model.save("a2c_cartpole")

# del model # remove to demonstrate saving and loading

model = A2C.load("a2c_cartpole")

# Bounded rollout: an endless `while True` loop never lets the cell finish, so
# "Run All" would hang here and the environments would never be closed.
N_DEMO_STEPS = 1_000

obs = env.reset()
for _ in range(N_DEMO_STEPS):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

env.close()

In [None]:
import gym
import gym_anytrading
from gym_anytrading.envs import TradingEnv, ForexEnv, StocksEnv, Actions, Positions 
from gym_anytrading.datasets import FOREX_EURUSD_1H_ASK, STOCKS_GOOGL
import matplotlib.pyplot as plt

# Forex environment restricted to rows 50..100 with a 10-step look-back window.
env = gym.make('forex-v0', frame_bound=(50, 100), window_size=10)
# env = gym.make('stocks-v0', frame_bound=(50, 100), window_size=10)

# Play one episode with uniformly random actions.
observation = env.reset()
done = False
while not done:
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    # env.render()

# Summary of the finished episode (the info dict from the final step).
print("info:", info)

# Draw the whole episode on a cleared axes.
plt.cla()
env.render_all()
plt.show()

In [None]:
# Importing the necessary modules
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Defining some constants and helper functions
# Number of steps shown on each side of the current step by TradingEnv.render.
WINDOW_SIZE = 10
# Upper limits for balance, share count and episode length.
# NOTE(review): none of these three is referenced by the code visible in this cell.
MAX_ACCOUNT_BALANCE = 2147483647
MAX_NUM_SHARES = 2147483647
MAX_STEPS = 20000
# Cash balance every episode starts with; also the baseline for the reported
# profit and the scale of the penalty applied when the balance runs out.
INITIAL_ACCOUNT_BALANCE = 10000

# A function to format a price as a string
def format_price(n):
    """Return ``n`` as a dollar string with two decimals, e.g. ``$12.00`` or ``-$3.50``.

    The minus sign, when present, goes in front of the dollar sign.
    """
    prefix = "-$" if n < 0 else "$"
    return f"{prefix}{abs(n):.2f}"

# A function to get the state vector from the data frame and the current step
def get_state(data, t, n):
    """Return the ``n - 1`` consecutive differences of the window of ``data`` ending at ``t``.

    The window covers positions ``t - n + 1 .. t``. When it would start before
    position 0, the missing leading entries are filled with ``data[0]``, so the
    corresponding differences are zero.

    Args:
        data: Positionally indexable sequence of numbers (list, tuple or 1-D
            NumPy array).
        t: Position of the last element of the window.
        n: Window length.

    Returns:
        ``np.ndarray`` of shape ``(1, n - 1)``.
    """
    d = t - n + 1
    # Materialise slices as plain lists. The padding branch concatenates with
    # `+`, which raises for tuples and turns into element-wise addition for
    # NumPy arrays if the slice is left in its original type.
    if d >= 0:
        block = list(data[d:t + 1])
    else:
        block = -d * [data[0]] + list(data[0:t + 1])  # pad with t0
    res = [block[i + 1] - block[i] for i in range(n - 1)]
    return np.array([res])

# Defining the TradingEnv class, which inherits from the gym.Env class and implements the methods for creating, resetting, stepping, rendering, and closing the environment
class TradingEnv(gym.Env):
    """Single-asset trading environment driven by a price DataFrame.

    Actions are discrete: 0 = buy one share, 1 = sell one share, 2 = hold.
    An observation is the block of the last ``window_size`` rows of the
    signal-feature matrix built by :meth:`_process_data`.

    The class follows the classic gym calling convention: ``reset()`` returns
    only the observation and ``step()`` returns ``(obs, reward, done, info)``.
    """

    def __init__(self, df, window_size, frame_bound):
        """Set up spaces and preprocess the data.

        Args:
            df: 2-D DataFrame; must contain a ``'Close'`` column.
            window_size: Number of consecutive rows in one observation.
            frame_bound: ``(start, end)`` pair; ``reset`` draws the episode
                start step from this range. ``start`` must exceed
                ``window_size``.
        """
        assert df.ndim == 2  # rows x columns
        assert len(frame_bound) == 2  # (start, end)
        assert frame_bound[0] > window_size  # leaves room for a full look-back window

        self.seed()  # create the generator used by reset()
        self.df = df
        self.window_size = window_size
        self.frame_bound = frame_bound
        self.prices, self.signal_features = self._process_data()

        # One observation = window_size rows x number of feature columns.
        self.shape = (window_size, self.signal_features.shape[1])

        # Episode state; populated by reset().
        self.current_step = None
        self.balance = None
        self.shares_held = None
        self.cost_basis = None
        self.total_shares_sold = None
        self.total_sales_value = None

        # One dict per executed buy/sell in the current episode.
        self.trades = []

        # 0 = buy, 1 = sell, 2 = hold
        self.action_space = spaces.Discrete(3)

        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=self.shape, dtype=np.float32)

    def seed(self, seed=None):
        """(Re)create the random generator that :meth:`reset` draws the start step from.

        Args:
            seed: Seed for the generator; ``None`` gives a non-deterministic one.

        Returns:
            ``[seed]``.
        """
        self._rng = np.random.default_rng(seed)
        return [seed]

    def _process_data(self):
        """Return ``(prices, signal_features)`` built from the ``'Close'`` column.

        ``signal_features`` has two columns: the price and the difference to
        the previous price (0 for the first row).
        """
        prices = self.df.loc[:, 'Close'].to_numpy()

        prices[self.frame_bound[0] - self.window_size]  # validate index; start point should be >= 0

        diff = np.insert(np.diff(prices), 0, 0)
        signal_features = np.column_stack((prices, diff))

        return prices, signal_features

    def _update_profit(self, action, current_price):
        """Apply a buy (0) or sell (1) of one share at ``current_price``; any other action is a no-op.

        A buy only happens when the balance exceeds the price; a sell only
        happens when at least one share is held. Every executed trade is
        appended to ``self.trades``.
        """
        if action == 0:  # buy
            if self.balance > current_price:
                self.balance -= current_price
                self.cost_basis += current_price
                self.shares_held += 1
                self.trades.append({'step': self.current_step,
                                    'shares': 1,
                                    'total': current_price,
                                    'type': "buy"})
        elif action == 1:  # sell
            if self.shares_held > 0:
                self.balance += current_price
                self.shares_held -= 1
                # NOTE(review): the basis is reduced by the sale price rather
                # than by the purchase cost of the share sold -- confirm intent.
                self.cost_basis -= current_price
                self.total_shares_sold += 1
                self.total_sales_value += current_price
                self.trades.append({'step': self.current_step,
                                    'shares': 1,
                                    'total': current_price,
                                    'type': "sell"})

    def _next_observation(self):
        """Return the ``window_size`` feature rows ending at (and including) ``current_step``."""
        frame = np.arange(self.current_step - self.window_size + 1, self.current_step + 1)
        return self.signal_features[frame, :]

    def _take_action(self, action):
        """Execute ``action`` at the current price and compute ``(reward, done, info)``."""
        current_price = self.prices[self.current_step]
        self._update_profit(action, current_price)

        # Average cost per held share; the epsilon avoids dividing by zero when
        # nothing is held (the result is then multiplied by zero shares below).
        prev_cost = self.cost_basis / (self.shares_held + 1e-15)
        additional_cost = current_price - prev_cost

        # NOTE(review): this is negative whenever the price is above the average
        # cost, i.e. it penalises unrealised gains -- confirm the intended sign.
        reward = -additional_cost * self.shares_held

        done = False

        if self.balance <= 0:  # out of cash: end the episode with a penalty
            done = True
            reward -= INITIAL_ACCOUNT_BALANCE / 2

        if self.current_step == len(self.prices) - 1:  # last price row reached
            done = True

        net_worth = self.balance + (self.shares_held * current_price)
        info = {'balance': self.balance,
                'shares_held': self.shares_held,
                'total_shares_sold': self.total_shares_sold,
                'total_sales_value': self.total_sales_value,
                'cost_basis': self.cost_basis,
                'net_worth': net_worth,
                'profit': net_worth - INITIAL_ACCOUNT_BALANCE}

        return reward, done, info

    def reset(self):
        """Start a new episode at a random step and return the first observation.

        Raises:
            ValueError: If no start step inside ``frame_bound`` leaves at least
                one further price row to step onto.
        """
        # step() advances current_step before reading the price, so the start
        # must leave one more row after it: the latest valid start is len - 2.
        first_start = self.frame_bound[0]
        last_start = min(self.frame_bound[1], len(self.prices) - 2)
        if last_start < first_start:
            raise ValueError(
                f"frame_bound {self.frame_bound} leaves no valid start step "
                f"for {len(self.prices)} price rows"
            )
        self.current_step = int(self._rng.integers(first_start, last_start, endpoint=True))

        self.balance = INITIAL_ACCOUNT_BALANCE
        self.shares_held = 0
        self.cost_basis = 0
        self.total_shares_sold = 0
        self.total_sales_value = 0

        # Drop trades from the previous episode so render() only shows this one.
        self.trades = []

        return self._next_observation()

    def step(self, action):
        """Advance one step, apply ``action`` and return ``(obs, reward, done, info)``.

        ``info['prev_net_worth']`` holds the net worth valued at the price of
        the step that was current before advancing.
        """
        assert action in [0, 1, 2]

        prev_net_worth = (self.balance + (self.shares_held * self.prices[self.current_step]))

        self.current_step += 1

        reward, done, info = self._take_action(action)

        next_state = self._next_observation()

        info['prev_net_worth'] = prev_net_worth

        return next_state, reward, done, info

    def render(self, mode='human'):
        """Show the environment state.

        Args:
            mode: ``'system'`` prints step and net worth, ``'none'`` does
                nothing, anything else plots prices around the current step
                together with the trades recorded in that range.
        """
        if mode == 'system':
            print(f'Step: {self.current_step}, '
                  f'Net Worth: {self.balance + (self.shares_held * self.prices[self.current_step])}')
            return

        elif mode == 'none':
            return

        # Plot window: WINDOW_SIZE steps on each side, clipped to the data.
        window_start = max(self.current_step - WINDOW_SIZE, 0)
        window_end = min(self.current_step + WINDOW_SIZE + 1, len(self.prices))

        # Take the x values straight from the DataFrame index so there is
        # exactly one label per plotted price. A generated calendar range can
        # have a different length than the slice (gaps in the index) and
        # requires a datetime index.
        dates = self.df.index[window_start:window_end]

        plt.figure(figsize=(10, 6))
        plt.title('Trade History')
        plt.xlabel('Date')
        plt.ylabel('Price')
        plt.plot(dates, self.prices[window_start:window_end], label='price', color='g')

        for trade in self.trades:
            if trade['step'] >= window_start and trade['step'] < window_end:
                date = self.df.index[trade['step']]
                # sells: red downward triangle; buys: blue upward triangle
                color = 'r' if trade['type'] == 'sell' else 'b'
                marker = 'v' if trade['type'] == 'sell' else '^'
                plt.scatter(date, trade['total'], color=color, marker=marker, s=100)

        plt.legend()
        plt.show()

    def close(self):
        """Nothing to release."""
        pass

    def save_rendered_image(self, path):
        """Save the current pyplot figure to ``path``."""
        fig = plt.gcf()
        fig.savefig(path)

# Defining the StocksEnv class, which inherits from the TradingEnv class and overrides some methods to customize the environment for stocks trading
class StocksEnv(TradingEnv):
    """TradingEnv variant for stocks: adds ``'macd'`` and ``'rsi'`` columns to the features."""

    def __init__(self, df, window_size):
        """Create the environment over ``df``.

        Args:
            df: DataFrame with ``'Close'``, ``'macd'`` and ``'rsi'`` columns.
            window_size: Number of consecutive rows in one observation.
        """
        # Derive the start bound from the window_size argument, not from the
        # global WINDOW_SIZE: the parent asserts frame_bound[0] > window_size,
        # which a fixed start of WINDOW_SIZE violates as soon as
        # window_size >= WINDOW_SIZE.
        super().__init__(df, window_size, (window_size + 1, len(df) - 1))

    def _process_data(self):
        """Return ``(prices, signal_features)`` with columns price, price diff, macd, rsi."""
        # The parent builds the price and price-diff columns (and performs the
        # start-index validation); only the two indicator columns are added here.
        prices, base_features = super()._process_data()

        macd = self.df.loc[:, 'macd'].to_numpy()
        rsi = self.df.loc[:, 'rsi'].to_numpy()

        signal_features = np.column_stack((base_features, macd, rsi))

        return prices, signal_features

# Defining the ForexEnv class, which inherits from the TradingEnv class and overrides some methods to customize the environment for forex trading
class ForexEnv(TradingEnv):
    """TradingEnv variant for forex: prices and price differences are rescaled."""

    def __init__(self, df, window_size):
        """Create the environment over ``df``.

        Args:
            df: DataFrame with a ``'Close'`` column.
            window_size: Number of consecutive rows in one observation.
        """
        # Derive the start bound from the window_size argument, not from the
        # global WINDOW_SIZE: the parent asserts frame_bound[0] > window_size,
        # which a fixed start of WINDOW_SIZE + 1 violates as soon as
        # window_size > WINDOW_SIZE.
        super().__init__(df, window_size, (window_size + 1, len(df)))

    def _process_data(self):
        """Return ``(prices, signal_features)`` with columns scaled price and scaled price diff.

        Prices are the ``'Close'`` values divided by 10000.0; the diff column is
        the step-to-step change of those scaled prices divided by 100.0, with 0
        for the first row.
        """
        prices = self.df.loc[:, 'Close'].to_numpy() / 10000.0

        prices[self.frame_bound[0] - self.window_size]  # validate index; start point should be >= 0

        diff = np.insert(np.diff(prices), 0, 0) / 100.0

        signal_features = np.column_stack((prices, diff))

        return prices, signal_features
