In [1]:
import random
import torch
from torch import nn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import keyboard
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import gymnasium as gym
from torch.distributions.normal import Normal
from tqdm import tqdm
from tqdm import trange
from maglev_env import MagneticEnv, DT

In [2]:
class Policy_Network(nn.Module):
    """Parametrized Policy Network."""

    def __init__(self, obs_space_dims: int, action_space_dims: int):
        """Initializes a neural network that estimates the mean and standard deviation
         of a normal distribution from which an action is sampled from.

        Args:
            obs_space_dims: Dimension of the observation space
            action_space_dims: Dimension of the action space
        """
        super().__init__()

        # NOTE think more about these values
        hidden_space1 = 16
        hidden_space2 = 16

        # Shared Network
        self.shared_net = nn.Sequential(
            nn.Linear(obs_space_dims, hidden_space1),
            nn.Tanh(),
            nn.Linear(hidden_space1, hidden_space2),
            nn.Tanh(),
        )

        # Policy Mean specific Linear Layer
        self.policy_mean_net = nn.Sequential(
            nn.Linear(hidden_space2, action_space_dims)
        )

        # Policy Std Dev specific Linear Layer
        # NOTE do we want relu on this?
        self.policy_stddev_net = nn.Sequential(
            nn.Linear(hidden_space2, action_space_dims),
        )

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Conditioned on the observation, returns the mean and standard deviation
         for each normal distribution from which an action is sampled from.

        Args:
            x: Observation from the environment

        Returns:
            action_means: predicted means of the action space's normal distribution
            action_stddevs: predicted standard deviation of the action space's normal distribution
        """
        shared_features = self.shared_net(x.float())

        action_means = self.policy_mean_net(shared_features)
        action_stddevs = torch.log(
            1 + torch.exp(self.policy_stddev_net(shared_features))
        )

        return action_means, action_stddevs

class Policy:
    """REINFORCE algorithm."""

    def __init__(self, obs_space_dims: int, action_space_dims: int):
        """Initializes an agent that learns a policy via REINFORCE algorithm.
        Args:
            obs_space_dims: Dimension of the observation space
            action_space_dims: Dimension of the action space
        """
        self.action_space_dims = action_space_dims

        # Hyperparameters
        self.learning_rate = 1e-4  # Learning rate for policy optimization
        self.gamma = 0.99  # Discount factor
        self.eps = 1e-6  # small number for mathematical stability

        self.probs = []  # Stores probability values of the sampled action
        self.rewards = []  # Stores the corresponding rewards

        self.net = Policy_Network(obs_space_dims, action_space_dims)
        self.optimizer = torch.optim.AdamW(self.net.parameters(), lr=self.learning_rate)

    def sample_action(self, state: np.ndarray) -> float:
        """Returns action(s), conditioned on the policy and observation.

        Args:
            state: Observation from the environment

        Returns:
            action: Action(s) to be performed
        """
        state = torch.tensor(np.array([state]))
        action_means, action_stddevs = self.net(state)

        action_means = action_means.squeeze()
        action_stddevs = action_stddevs.squeeze()
        # create a normal distribution from the predicted
        #   mean and standard deviation and sample all actions action
        actions = np.zeros(self.action_space_dims)
        for action_dim in range(self.action_space_dims):
            distrib = Normal(action_means[action_dim] + self.eps, action_stddevs[action_dim] + self.eps)
            action = distrib.sample()
            prob = distrib.log_prob(action)
            actions[action_dim] = action.numpy()

            self.probs.append(prob)

        return actions

    def update(self):
        """Updates the policy network's weights."""
        running_g = 0
        gs = []

        # Discounted return (backwards) - [::-1] will return an array in reverse
        for R in self.rewards[::-1]:
            running_g = R + self.gamma * running_g
            gs.insert(0, running_g)

        deltas = torch.tensor(gs)

        loss = 0
        # minimize -1 * prob * reward obtained
        for log_prob, delta in zip(self.probs, deltas):
            loss += log_prob.mean() * delta * (-1)

        # Update the policy network
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Empty / zero out all episode-centric/related variables
        self.probs = []
        self.rewards = []

In [3]:
device = ("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device") 

DO_RENDER = False
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)

mag_coords = [np.array([0.,-1.,3.]),np.array([0.,1.,3.])]
spawn_range = ((-0.1,0.1),(-0.1,0.1),(0,1))
desired_range = ((0,0),(0,0),(0.1,0.9))
# Create and wrap the environment
env = MagneticEnv(mag_coords, DT)
wrapped_env = gym.wrappers.RecordEpisodeStatistics(env, 50)  # Records episode-reward

total_num_episodes = int(100)  # Total number of episodes NOTE switch back to int(5e3) once done debugging
obs_space_dims = env.observation_space.shape[0]
action_space_dims = env.action_space.shape[0]

# Reinitialize agent every seed
agent = Policy(obs_space_dims, action_space_dims)
reward_over_episodes = []

#Tqdm progress bar object contains a list of the batch indices to train over
progress_bar = tqdm(range(total_num_episodes), desc='Training...', leave=False, disable=False)

for episode in progress_bar:
    obs, info = wrapped_env.reset(seed=RANDOM_SEED, options=(spawn_range,desired_range))
    done = False

    while not done:
        action = agent.sample_action(obs)
        obs, reward, terminated, truncated, info = wrapped_env.step(action)
        agent.rewards.append(reward)
        if DO_RENDER: env.render()

        done = terminated or truncated
    

    avg_reward = sum(agent.rewards)/len(agent.rewards)
    progress_bar.set_postfix({"Episode Reward:": f"Epoch: {episode} Reward: {avg_reward}"})
    print(f"Avg Reward Episode {episode}: {avg_reward}")
    reward_over_episodes.append(avg_reward)
    agent.update()

Using cuda device


Training...:   6%|▌         | 6/100 [00:00<00:03, 27.86it/s, Episode Reward:=Epoch: 5 Reward: -28.101566359561172]

Avg Reward Episode 0: -25.073585825322265
Avg Reward Episode 1: -28.80051930614039
Avg Reward Episode 2: -27.630517923777745
Avg Reward Episode 3: -29.26097186549937
Avg Reward Episode 4: -29.315144118485996
Avg Reward Episode 5: -28.101566359561172


Training...:   9%|▉         | 9/100 [00:00<00:03, 25.74it/s, Episode Reward:=Epoch: 11 Reward: -28.69666634747203] 

Avg Reward Episode 6: -29.98484976532892
Avg Reward Episode 7: -28.465264022737383
Avg Reward Episode 8: -27.78811007354065
Avg Reward Episode 9: -28.60130389492188
Avg Reward Episode 10: -29.363820437929633
Avg Reward Episode 11: -28.69666634747203


Training...:  17%|█▋        | 17/100 [00:00<00:02, 28.74it/s, Episode Reward:=Epoch: 18 Reward: -29.636492781256525]

Avg Reward Episode 12: -27.708396934592884
Avg Reward Episode 13: -28.360880877294772
Avg Reward Episode 14: -28.577541207695855
Avg Reward Episode 15: -29.362511377080462
Avg Reward Episode 16: -29.1845941280002
Avg Reward Episode 17: -29.2245177396441


Training...:  23%|██▎       | 23/100 [00:00<00:02, 28.71it/s, Episode Reward:=Epoch: 24 Reward: -29.769743198644182]

Avg Reward Episode 18: -29.636492781256525
Avg Reward Episode 19: -28.603039963498034
Avg Reward Episode 20: -27.838064619113016
Avg Reward Episode 21: -29.035965922554013
Avg Reward Episode 22: -27.79604304363328
Avg Reward Episode 23: -29.549806172310955


Training...:  27%|██▋       | 27/100 [00:01<00:02, 29.54it/s, Episode Reward:=Epoch: 30 Reward: -29.76017907193722] 

Avg Reward Episode 24: -29.769743198644182
Avg Reward Episode 25: -28.879933820448873
Avg Reward Episode 26: -27.618261139362392
Avg Reward Episode 27: -28.48046642789545
Avg Reward Episode 28: -29.557622010351874
Avg Reward Episode 29: -27.191142169602724


Training...:  34%|███▍      | 34/100 [00:01<00:02, 29.77it/s, Episode Reward:=Epoch: 35 Reward: -28.16894566398108] 

Avg Reward Episode 30: -29.76017907193722
Avg Reward Episode 31: -28.652410733969557
Avg Reward Episode 32: -28.744126978679667
Avg Reward Episode 33: -26.386956879234233
Avg Reward Episode 34: -30.015453689675994
Avg Reward Episode 35: -28.16894566398108


Training...:  40%|████      | 40/100 [00:01<00:02, 29.34it/s, Episode Reward:=Epoch: 41 Reward: -29.003321780908077]

Avg Reward Episode 36: -28.147443598962603
Avg Reward Episode 37: -28.82108676269764
Avg Reward Episode 38: -28.38581882053384
Avg Reward Episode 39: -29.106267088817276
Avg Reward Episode 40: -27.88374882939069
Avg Reward Episode 41: -29.003321780908077


Training...:  46%|████▌     | 46/100 [00:01<00:01, 28.42it/s, Episode Reward:=Epoch: 47 Reward: -29.366944382897675]

Avg Reward Episode 42: -28.48737593127528
Avg Reward Episode 43: -26.160059091879642
Avg Reward Episode 44: -27.369309665537394
Avg Reward Episode 45: -28.282330902732436
Avg Reward Episode 46: -28.697380926516136
Avg Reward Episode 47: -29.366944382897675


Training...:  52%|█████▏    | 52/100 [00:01<00:01, 28.65it/s, Episode Reward:=Epoch: 54 Reward: -28.303684968512204]

Avg Reward Episode 48: -27.656856599820955
Avg Reward Episode 49: -28.661682008518337
Avg Reward Episode 50: -28.963454700094307
Avg Reward Episode 51: -28.40714964359916
Avg Reward Episode 52: -29.047752452346653
Avg Reward Episode 53: -29.009977139678295


Training...:  58%|█████▊    | 58/100 [00:02<00:01, 25.67it/s, Episode Reward:=Epoch: 58 Reward: -27.621714692117838]

Avg Reward Episode 54: -28.303684968512204
Avg Reward Episode 55: -27.017169459738945
Avg Reward Episode 56: -28.49285169542332
Avg Reward Episode 57: -29.210331599048505
Avg Reward Episode 58: -27.621714692117838


Training...:  64%|██████▍   | 64/100 [00:02<00:01, 27.28it/s, Episode Reward:=Epoch: 65 Reward: -29.736424409562407]

Avg Reward Episode 59: -28.112935373714077
Avg Reward Episode 60: -28.800645147161138
Avg Reward Episode 61: -29.0391051775712
Avg Reward Episode 62: -28.071035369076913
Avg Reward Episode 63: -29.590372351903376
Avg Reward Episode 64: -28.74379140618233


Training...:  70%|███████   | 70/100 [00:02<00:01, 27.65it/s, Episode Reward:=Epoch: 70 Reward: -28.602637888173305]

Avg Reward Episode 65: -29.736424409562407
Avg Reward Episode 66: -25.662392512227406
Avg Reward Episode 67: -26.934082988637215
Avg Reward Episode 68: -28.858684760326955
Avg Reward Episode 69: -27.24842557425831


Training...:  73%|███████▎  | 73/100 [00:02<00:00, 27.53it/s, Episode Reward:=Epoch: 75 Reward: -29.489551024183406]

Avg Reward Episode 70: -28.602637888173305
Avg Reward Episode 71: -29.181979863625322
Avg Reward Episode 72: -29.221052344933028
Avg Reward Episode 73: -25.333161636324192
Avg Reward Episode 74: -28.949943498495166
Avg Reward Episode 75: -29.489551024183406


Training...:  82%|████████▏ | 82/100 [00:02<00:00, 27.93it/s, Episode Reward:=Epoch: 81 Reward: -29.031400648657034]

Avg Reward Episode 76: -29.288436708162426
Avg Reward Episode 77: -26.298428938453977
Avg Reward Episode 78: -26.48007493790066
Avg Reward Episode 79: -28.869000606454033
Avg Reward Episode 80: -28.153051395157178
Avg Reward Episode 81: -29.031400648657034


Training...:  85%|████████▌ | 85/100 [00:03<00:00, 27.45it/s, Episode Reward:=Epoch: 86 Reward: -29.29743833338935] 

Avg Reward Episode 82: -29.347075978893876
Avg Reward Episode 83: -27.185803290080585
Avg Reward Episode 84: -29.514413146333872
Avg Reward Episode 85: -27.671224899351547
Avg Reward Episode 86: -29.29743833338935


Training...:  91%|█████████ | 91/100 [00:03<00:00, 24.97it/s, Episode Reward:=Epoch: 92 Reward: -29.184979068656936]

Avg Reward Episode 87: -29.810438945689672
Avg Reward Episode 88: -29.21750161636382
Avg Reward Episode 89: -29.215208479931313
Avg Reward Episode 90: -27.676486893395477
Avg Reward Episode 91: -27.912941903100762


Training...:  97%|█████████▋| 97/100 [00:03<00:00, 26.04it/s, Episode Reward:=Epoch: 97 Reward: -29.6903153152279]  

Avg Reward Episode 92: -29.184979068656936
Avg Reward Episode 93: -28.879336780790098
Avg Reward Episode 94: -28.390825723372167
Avg Reward Episode 95: -27.704010217923972
Avg Reward Episode 96: -29.735791041820068


                                                                                                                     

Avg Reward Episode 97: -29.6903153152279
Avg Reward Episode 98: -28.112210452350478
Avg Reward Episode 99: -28.496289726213565




In [4]:
plt.figure()
xs = np.arange(len(reward_over_episodes))
# Fit a linear trendline
coefficients = np.polyfit(xs, reward_over_episodes, 1)
trendline = np.polyval(coefficients, xs)

# Plot the original data points 
plt.plot(xs, reward_over_episodes, label='Data Points')

# Plot the trendline
plt.plot(xs, trendline, color='red', label='Trendline')
slope = coefficients[0]
plt.text(0.5, 25, f'Slope: {slope:.2f}', fontsize=12, color='green')
plt.legend()
plt.xlabel('Episode')
plt.ylabel('Average Reward')
plt.title('Reward Over Episodes')
plt.show()