## Import

In [58]:
import gym
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete
from gym import spaces
import numpy as np
import random
import os

import pandas as pd
from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

Day-of-week encoding (the `Day` column of each CSV): 1 = Sunday, …, 7 = Saturday

## Getting files for each day of the month and storing in pd_refs

In [17]:
# List the per-day CSV files in the DayCSV folder.
# NOTE(review): os.listdir returns names in arbitrary order, so the "day" order
# of pd_refs follows the filesystem, not the calendar — confirm this is acceptable.
files = os.listdir("DayCSV")

In [18]:
# One DataFrame per entry of `files`; the environments index this list by day number.
pd_refs = []

In [19]:
# Read every day's CSV into a DataFrame and keep them in listing order.
# os.path.join builds the path with the platform's separator instead of a
# hard-coded Windows backslash, so the notebook also runs on Linux/macOS.
for file_name in files:
    pd_refs.append(pd.read_csv(os.path.join("DayCSV", file_name)))

## Environment - 1

### State
The data was modeled to function like an environment where the state space of the agent is an array consisting of
- Hour of the day of the most recent reading (0–23)
- Energy consumption (MW) at that hour
- Day number (1 indicates Sunday and 7 indicates Saturday)

### Action
Using the given state, the agent is required to predict whether the energy requirement will rise, fall or stay constant over the next hour, giving the agent 3 possible actions - Increase, Decrease and Constant. 

### Reward
For a correct prediction of the change in power consumption, the agent would be given a reward of +10 whereas a wrong prediction would give the agent a reward of -100.

In cases where the agent chose the action as constant, if the power consumption of the next hour is within 5% of the current hour's consumption, then the agent is given a positive reward; otherwise it is given a negative reward. This condition is represented as follows

$0.95 \times Energy_t < Energy_{t+1} < 1.05 \times Energy_t$

where $t$ is timestep t.


In [89]:
class PowerPredEnv(Env):
    """Environment 1: predict the direction of the next hour's power demand.

    The environment walks through the per-day DataFrames in the module-level
    ``pd_refs`` list (columns ``"MW"`` and ``"Day"``), one row (hour) per step.

    Observation: ``[hour, MW at that hour, day of week]``.
    Actions: 0 = increase, 1 = decrease, 2 = roughly constant.
    Reward: +10 for a correct prediction, -100 for a wrong one.

    NOTE(review): observations are plain Python lists rather than arrays of
    ``observation_space.dtype`` — confirm the training wrappers accept this.
    """

    # Number of per-day tables walked through in one episode.
    NUM_DAYS = 30
    # Row index at which the current day is finished (predictions target rows 1..23).
    LAST_HOUR = 23
    # Action 2 is correct when the next value lies strictly inside
    # (low * current, high * current).
    CONSTANT_BAND = (0.95, 1.05)

    def __init__(self):
        self.action_space = Discrete(3)  # Increase, Decrease, keep same
        # [Hour of day, Power in MW, Day of week]
        self.observation_space = Box(low=np.array([0,0,1]), high=np.array([23,63000, 7]), dtype=np.int32)
        self.day = 0   # Current day - index into pd_refs
        self.hour = 0  # Hour - row number inside the current day's table
        self.state = self._observe()

    def _observe(self):
        """Build the observation for the current (day, hour) position."""
        return [self.hour, self.get_current_power(), pd_refs[self.day].loc[self.hour,"Day"]]

    def get_current_power(self):
        """Return the "MW" value at the current day and hour."""
        return pd_refs[self.day].loc[self.hour,"MW"]

    def next_day(self):
        """Advance to the next day's table.

        Returns:
            True when all days have been consumed (episode over), else False
            after rewinding to hour 0 and refreshing ``self.state``.
        """
        self.day += 1
        # Also stop when fewer than NUM_DAYS tables were loaded, instead of
        # failing with an IndexError on pd_refs.
        if self.day >= min(self.NUM_DAYS, len(pd_refs)):
            return True
        self.hour = 0
        self.state = self._observe()
        return False

    def step(self, action):
        """Score ``action`` against the next hour's value and advance one hour.

        Returns:
            ``(state, reward, False, done, info)``. The end-of-data flag is
            reported in the fourth slot; the third slot is always False.
            After the final step ``state`` is left at its last value.
        """
        actual_power = pd_refs[self.day].loc[self.hour+1,"MW"]  # Power at hour t+1
        prev_power = self.state[1]  # Power at the current hour t
        low, high = self.CONSTANT_BAND

        ## 0 - Increase, 1 - Decrease, 2 - Same
        if action == 0:
            correct = actual_power > prev_power
        elif action == 1:
            correct = actual_power < prev_power
        elif action == 2:
            correct = low*prev_power < actual_power < high*prev_power
        else:
            correct = None  # unknown action: neutral reward, as before

        if correct is None:
            reward = 0
        elif correct:
            reward = 10
        else:
            reward = -100

        self.hour += 1
        done = False
        if self.hour == self.LAST_HOUR:
            ## All hours of the day are completed, move to the next table
            done = self.next_day()
        if not done:
            self.state = self._observe()
        info = {
            "day":self.day+1,
            "reward":reward,
            "hour":self.hour-1,
            "prev":prev_power,
            "predicted":action,
            "actual":actual_power
        }
        return self.state, reward, False, done, info

    def render(self):
        """Rendering is not implemented for this environment."""
        pass

    def reset(self, seed=None, options=None):
        """Rewind to day 0, hour 0 and return ``(state, info)``.

        ``seed`` and ``options`` are accepted for compatibility with the Gym
        reset API; the environment itself is deterministic.
        """
        super().reset(seed=seed)
        self.day = 0
        self.hour = 0
        self.state = self._observe()
        # No prediction has been made yet, so there is no reward/action to
        # report (the previous version read undefined names here).
        info = {
            "day":self.day+1,
            "hour":self.hour,
        }
        return self.state, info

In [71]:
# Instantiate Environment 1; the random baseline and the PPO/DQN/A2C cells below use this `env`.
env = PowerPredEnv()

## Result using random action selection for 5 episodes

In [72]:
# Baseline: play `episodes` full episodes with uniformly sampled actions and
# print the summed reward of each one.
episodes = 5
for episode in range(1, episodes+1):
    state, info = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, terminated, truncated, info = env.step(action)
        # Stop on either end-of-episode flag rather than relying on one slot.
        done = terminated or truncated
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-26100
Episode:2 Score:-27860
Episode:3 Score:-27530
Episode:4 Score:-26210
Episode:5 Score:-27970


## PPO

In [82]:
# TensorBoard logs are written under Training/logs (one sub-directory per run).
log_path = os.path.join('Training', 'logs')
# PPO agent with the 'MlpPolicy' network, bound to the current `env`.
model1 = PPO('MlpPolicy', env, verbose=1, tensorboard_log = log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [83]:
# Train the PPO agent for 1,000,000 environment steps.
model1.learn(total_timesteps=1000000)

Logging to Training\logs\PPO_7
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 690       |
|    ep_rew_mean     | -2.68e+04 |
| time/              |           |
|    fps             | 499       |
|    iterations      | 1         |
|    time_elapsed    | 4         |
|    total_timesteps | 2048      |
----------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 690         |
|    ep_rew_mean          | -2.58e+04   |
| time/                   |             |
|    fps                  | 405         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013047331 |
|    clip_fraction        | 0.0662      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 1.7

<stable_baselines3.ppo.ppo.PPO at 0x1a096e3cc40>

PPO

Average reward after 1,000,000 timesteps = -17600 

![image.png](attachment:9c204888-1f2f-499b-8267-50ace9e9ec2f.png)

The PPO model also stagnated in terms of performance and showed minor improvement on further training


## DQN

In [80]:
# DQN agent on the same `env`; reuses `log_path` defined in the PPO cell.
model2 = DQN('MlpPolicy', env, verbose=1, tensorboard_log = log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [81]:
# Train the DQN agent for 1,000,000 environment steps.
model2.learn(total_timesteps = 1000000)

Logging to Training\logs\DQN_3
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 690       |
|    ep_rew_mean      | -2.65e+04 |
|    exploration_rate | 0.974     |
| time/               |           |
|    episodes         | 4         |
|    fps              | 3967      |
|    time_elapsed     | 0         |
|    total_timesteps  | 2760      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 690       |
|    ep_rew_mean      | -2.66e+04 |
|    exploration_rate | 0.948     |
| time/               |           |
|    episodes         | 8         |
|    fps              | 5368      |
|    time_elapsed     | 1         |
|    total_timesteps  | 5520      |
-----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 690      |
|    ep_rew_mean      | -2.7e+04 |
|    exploration_rate | 0.921    |
| 

<stable_baselines3.dqn.dqn.DQN at 0x1a0b02db8e0>

DQN

![image.png](attachment:c93c86bb-5b87-4583-b00b-2d8b29d1dc34.png)

Average reward after 1,000,000 timesteps = -5605

## A2C

In [77]:
# A2C agent on the same `env`; reuses `log_path` defined in the PPO cell.
model3 = A2C('MlpPolicy', env, verbose=1, tensorboard_log = log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [79]:
# Train the A2C agent for up to 1,000,000 environment steps.
# NOTE: the recorded run below was stopped early (KeyboardInterrupt in the output).
model3.learn(total_timesteps = 1000000)

Logging to Training\logs\A2C_3
------------------------------------
| time/                 |          |
|    fps                | 258      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.0263  |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 20099    |
|    policy_loss        | 0.124    |
|    value_loss         | 1.44e+03 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 690       |
|    ep_rew_mean        | -1.75e+04 |
| time/                 |           |
|    fps                | 246       |
|    iterations         | 200       |
|    time_elapsed       | 4         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -0.0523   |
|    explained_variance | 0      

KeyboardInterrupt: 

A2C

Average reward after 130000 timesteps = -17700

![Pasted image 20231110214713.png](attachment:6cdb0c86-e9c2-4c7c-bcb1-f0cc9c923062.png)

The average episode reward stays the same even after further training and has minimal change 

## Comparison

![image.png](attachment:d0553499-4aa1-47d2-994f-31d94d9bc0c2.png)

- DQN - Pink
- PPO - Green
- A2C - Blue

The DQN model was able to better fit on the data while A2C and PPO reached stagnation 

# Environment - 2

Modifying the state space and reducing the variables to achieve convergence faster as the previous models may have been slow in converging due to a complex state space. The average rewards achieved by the models did not show much improvement over the reward gained using random actions.
### State

The data was modeled to function like an environment where the state space of the agent is an array consisting of
- Hour of the day of the most recent reading (0–23)
- Energy consumption (MW) at that hour

### Action
Using the given state, the agent is required to predict whether the energy requirement will rise, fall or stay constant over the next hour, giving the agent 3 possible actions - Increase, Decrease and Constant. 

### Reward
For a correct prediction of the change in power consumption, the agent would be given a reward of +10 whereas a wrong prediction would give the agent a reward of -100.

In cases where the agent chose the action as constant, if the power consumption of the next hour is within 5% of the current hour's consumption, then the agent is given a positive reward; otherwise it is given a negative reward. This condition is represented as follows

$0.95 \times Energy_t < Energy_{t+1} < 1.05 \times Energy_t$

where $t$ is timestep t.

In [None]:
class PowerPredEnv2(Env):
    """Environment 2: next-hour demand direction without the day-of-week feature.

    The environment walks through the per-day DataFrames in the module-level
    ``pd_refs`` list (column ``"MW"``), one row (hour) per step.

    Observation: ``[hour, MW at that hour]``.
    Actions: 0 = increase, 1 = decrease, 2 = roughly constant.
    Reward: +10 for a correct prediction, -100 for a wrong one.

    NOTE(review): observations are plain Python lists rather than arrays of
    ``observation_space.dtype`` — confirm the training wrappers accept this.
    """

    # Number of per-day tables walked through in one episode.
    NUM_DAYS = 30
    # Row index at which the current day is finished (predictions target rows 1..23).
    LAST_HOUR = 23
    # Action 2 is correct when the next value lies strictly inside
    # (low * current, high * current).
    CONSTANT_BAND = (0.95, 1.05)

    def __init__(self):
        self.action_space = Discrete(3)  # Increase, Decrease, keep same
        # [Hour of day, Power in MW]
        self.observation_space = Box(low=np.array([0,0]), high=np.array([23,63000]), dtype=np.int32)
        self.day = 0   # Current day - index into pd_refs
        self.hour = 0  # Hour - row number inside the current day's table
        self.state = self._observe()

    def _observe(self):
        """Build the observation for the current (day, hour) position."""
        return [self.hour, self.get_current_power()]

    def get_current_power(self):
        """Return the "MW" value at the current day and hour."""
        return pd_refs[self.day].loc[self.hour,"MW"]

    def next_day(self):
        """Advance to the next day's table.

        Returns:
            True when all days have been consumed (episode over), else False
            after rewinding to hour 0 and refreshing ``self.state``.
        """
        self.day += 1
        # Also stop when fewer than NUM_DAYS tables were loaded, instead of
        # failing with an IndexError on pd_refs.
        if self.day >= min(self.NUM_DAYS, len(pd_refs)):
            return True
        self.hour = 0
        self.state = self._observe()
        return False

    def step(self, action):
        """Score ``action`` against the next hour's value and advance one hour.

        Returns:
            ``(state, reward, False, done, info)``. The end-of-data flag is
            reported in the fourth slot; the third slot is always False.
            After the final step ``state`` is left at its last value.
        """
        actual_power = pd_refs[self.day].loc[self.hour+1,"MW"]  # Power at hour t+1
        prev_power = self.state[1]  # Power at the current hour t
        low, high = self.CONSTANT_BAND

        ## 0 - Increase, 1 - Decrease, 2 - Same
        if action == 0:
            correct = actual_power > prev_power
        elif action == 1:
            correct = actual_power < prev_power
        elif action == 2:
            correct = low*prev_power < actual_power < high*prev_power
        else:
            correct = None  # unknown action: neutral reward, as before

        if correct is None:
            reward = 0
        elif correct:
            reward = 10
        else:
            reward = -100

        self.hour += 1
        done = False
        if self.hour == self.LAST_HOUR:
            ## All hours of the day are completed, move to the next table
            done = self.next_day()
        if not done:
            self.state = self._observe()
        info = {
            "day":self.day+1,
            "reward":reward,
            "hour":self.hour-1,
            "prev":prev_power,
            "predicted":action,
            "actual":actual_power
        }
        return self.state, reward, False, done, info

    def render(self):
        """Rendering is not implemented for this environment."""
        pass

    def reset(self, seed=None, options=None):
        """Rewind to day 0, hour 0 and return ``(state, info)``.

        ``seed`` and ``options`` are accepted for compatibility with the Gym
        reset API; the environment itself is deterministic.
        """
        super().reset(seed=seed)
        self.day = 0
        self.hour = 0
        self.state = self._observe()
        # No prediction has been made yet, so there is no reward/action to
        # report (the previous version read undefined names here).
        info = {
            "day":self.day+1,
            "hour":self.hour,
        }
        return self.state, info

In [94]:
# Rebind `env` to Environment 2; the cells below (baseline and models) now use this instance.
env = PowerPredEnv2()

## Result using random action selection for 5 episodes

In [95]:
# Baseline: play `episodes` full episodes with uniformly sampled actions and
# print the summed reward of each one.
episodes = 5
for episode in range(1, episodes+1):
    state, info = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, terminated, truncated, info = env.step(action)
        # Stop on either end-of-episode flag rather than relying on one slot.
        done = terminated or truncated
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-24120
Episode:2 Score:-27420
Episode:3 Score:-28190
Episode:4 Score:-25440
Episode:5 Score:-28080


## PPO

In [96]:
# TensorBoard logs are written under Training/logs (one sub-directory per run).
log_path = os.path.join('Training', 'logs')
# PPO agent on the current `env`; rebinds `model1`, replacing the earlier PPO model.
model1 = PPO('MlpPolicy', env, verbose=1, tensorboard_log = log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [102]:
# Train the PPO agent for up to 1,000,000 environment steps.
# NOTE: the recorded run below was stopped early (KeyboardInterrupt in the output).
model1.learn(total_timesteps=1000000)

Logging to Training\logs\PPO_9
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 690       |
|    ep_rew_mean     | -1.78e+04 |
| time/              |           |
|    fps             | 441       |
|    iterations      | 1         |
|    time_elapsed    | 4         |
|    total_timesteps | 2048      |
----------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 690           |
|    ep_rew_mean          | -1.77e+04     |
| time/                   |               |
|    fps                  | 363           |
|    iterations           | 2             |
|    time_elapsed         | 11            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00010378775 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.02         |
|  

KeyboardInterrupt: 

PPO

![image.png](attachment:505c964d-5ac2-4a7d-a11e-73b8292fb8da.png)

Average reward after 100,000 timesteps = -17900 


## DQN

In [103]:
# DQN agent on the current `env`; rebinds `model2` and reuses `log_path` from the PPO cell.
model2 = DQN('MlpPolicy', env, verbose=1, tensorboard_log = log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [104]:
# Train the DQN agent for 100,000 environment steps.
model2.learn(total_timesteps = 100000)

Logging to Training\logs\DQN_5
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 690       |
|    ep_rew_mean      | -2.68e+04 |
|    exploration_rate | 0.738     |
| time/               |           |
|    episodes         | 4         |
|    fps              | 4151      |
|    time_elapsed     | 0         |
|    total_timesteps  | 2760      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 690       |
|    ep_rew_mean      | -2.68e+04 |
|    exploration_rate | 0.476     |
| time/               |           |
|    episodes         | 8         |
|    fps              | 5444      |
|    time_elapsed     | 1         |
|    total_timesteps  | 5520      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 690       |
|    ep_rew_mean      | -2.64e+04 |
|    exploration_rate | 0.213    

<stable_baselines3.dqn.dqn.DQN at 0x1a0ad0c6f20>

DQN

![image.png](attachment:dce45d2e-8cd5-4a1e-88b9-12911da93625.png)

Average reward after 100,000 timesteps = -22700

## A2C

In [105]:
# A2C agent on the current `env`; rebinds `model3` and reuses `log_path` from the PPO cell.
model3 = A2C('MlpPolicy', env, verbose=1, tensorboard_log = log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [106]:
# Train the A2C agent for 100,000 environment steps.
model3.learn(total_timesteps = 100000)

Logging to Training\logs\A2C_5
------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.754   |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 9.35     |
|    value_loss         | 1.09e+03 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 690       |
|    ep_rew_mean        | -2.32e+04 |
| time/                 |           |
|    fps                | 254       |
|    iterations         | 200       |
|    time_elapsed       | 3         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -0.16     |
|    explained_variance | 0      

<stable_baselines3.a2c.a2c.A2C at 0x1a0bd953010>

A2C
![image.png](attachment:38a4fe31-93db-40f5-9aaf-a654dd45203c.png)

Average reward after 100000 timesteps = -17600

A2C algorithm stagnates

## Comparison

![image.png](attachment:f419579e-6628-46bc-b504-e84988b211c7.png)

![image.png](attachment:30899391-12d9-409a-a371-5a4729e6c37f.png)

The DQN algorithm was still increasing in average reward, while PPO and A2C had stagnated in performance.

## Environment - 3

Modifying reward functions slightly by providing higher margin for constant power

### State

The data was modeled to function like an environment where the state space of the agent is an array consisting of
- Hour of the day of the most recent reading (0–23)
- Energy consumption (MW) at that hour

### Action
Using the given state, the agent is required to predict whether the energy requirement will rise, fall or stay constant over the next hour, giving the agent 3 possible actions - Increase, Decrease and Constant. 

### Reward
For a correct prediction of the change in power consumption, the agent would be given a reward of +10 whereas a wrong prediction would give the agent a reward of -100.

In cases where the agent chose the action as constant, if the power consumption of the next hour is within 8% of the current hour's consumption (widened from 5% in the previous environments), then the agent is given a positive reward; otherwise it is given a negative reward. This condition is represented as follows

$0.92 \times Energy_t < Energy_{t+1} < 1.08 \times Energy_t$

where $t$ is timestep t.

In [108]:
class PowerPredEnv3(Env):
    """Environment 3: as Environment 2 but with a wider "constant" band (8%).

    The environment walks through the per-day DataFrames in the module-level
    ``pd_refs`` list (column ``"MW"``), one row (hour) per step.

    Observation: ``[hour, MW at that hour]``.
    Actions: 0 = increase, 1 = decrease, 2 = roughly constant.
    Reward: +10 for a correct prediction, -100 for a wrong one.

    NOTE(review): observations are plain Python lists rather than arrays of
    ``observation_space.dtype`` — confirm the training wrappers accept this.
    """

    # Number of per-day tables walked through in one episode.
    NUM_DAYS = 30
    # Row index at which the current day is finished (predictions target rows 1..23).
    LAST_HOUR = 23
    # Action 2 is correct when the next value lies strictly inside
    # (low * current, high * current), i.e. within 8% of the current value.
    CONSTANT_BAND = (0.92, 1.08)

    def __init__(self):
        self.action_space = Discrete(3)  # Increase, Decrease, keep same
        # [Hour of day, Power in MW]
        self.observation_space = Box(low=np.array([0,0]), high=np.array([23,63000]), dtype=np.int32)
        self.day = 0   # Current day - index into pd_refs
        self.hour = 0  # Hour - row number inside the current day's table
        self.state = self._observe()

    def _observe(self):
        """Build the observation for the current (day, hour) position."""
        return [self.hour, self.get_current_power()]

    def get_current_power(self):
        """Return the "MW" value at the current day and hour."""
        return pd_refs[self.day].loc[self.hour,"MW"]

    def next_day(self):
        """Advance to the next day's table.

        Returns:
            True when all days have been consumed (episode over), else False
            after rewinding to hour 0 and refreshing ``self.state``.
        """
        self.day += 1
        # Also stop when fewer than NUM_DAYS tables were loaded, instead of
        # failing with an IndexError on pd_refs.
        if self.day >= min(self.NUM_DAYS, len(pd_refs)):
            return True
        self.hour = 0
        self.state = self._observe()
        return False

    def step(self, action):
        """Score ``action`` against the next hour's value and advance one hour.

        Returns:
            ``(state, reward, False, done, info)``. The end-of-data flag is
            reported in the fourth slot; the third slot is always False.
            After the final step ``state`` is left at its last value.
        """
        actual_power = pd_refs[self.day].loc[self.hour+1,"MW"]  # Power at hour t+1
        prev_power = self.state[1]  # Power at the current hour t
        low, high = self.CONSTANT_BAND

        ## 0 - Increase, 1 - Decrease, 2 - Same
        if action == 0:
            correct = actual_power > prev_power
        elif action == 1:
            correct = actual_power < prev_power
        elif action == 2:
            correct = low*prev_power < actual_power < high*prev_power
        else:
            correct = None  # unknown action: neutral reward, as before

        if correct is None:
            reward = 0
        elif correct:
            reward = 10
        else:
            reward = -100

        self.hour += 1
        done = False
        if self.hour == self.LAST_HOUR:
            ## All hours of the day are completed, move to the next table
            done = self.next_day()
        if not done:
            self.state = self._observe()
        info = {
            "day":self.day+1,
            "reward":reward,
            "hour":self.hour-1,
            "prev":prev_power,
            "predicted":action,
            "actual":actual_power
        }
        return self.state, reward, False, done, info

    def render(self):
        """Rendering is not implemented for this environment."""
        pass

    def reset(self, seed=None, options=None):
        """Rewind to day 0, hour 0 and return ``(state, info)``.

        ``seed`` and ``options`` are accepted for compatibility with the Gym
        reset API; the environment itself is deterministic.
        """
        super().reset(seed=seed)
        self.day = 0
        self.hour = 0
        self.state = self._observe()
        # No prediction has been made yet, so there is no reward/action to
        # report (the previous version read undefined names here).
        info = {
            "day":self.day+1,
            "hour":self.hour,
        }
        return self.state, info

In [109]:
# Rebind `env` to Environment 3; the cells below (baseline and models) now use this instance.
env = PowerPredEnv3()

## Result using random action selection for 5 episodes

In [110]:
# Baseline: play `episodes` full episodes with uniformly sampled actions and
# print the summed reward of each one.
episodes = 5
for episode in range(1, episodes+1):
    state, info = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, terminated, truncated, info = env.step(action)
        # Stop on either end-of-episode flag rather than relying on one slot.
        done = terminated or truncated
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-21040
Episode:2 Score:-19940
Episode:3 Score:-16640
Episode:4 Score:-18950
Episode:5 Score:-19720


## PPO

In [111]:
# TensorBoard logs are written under Training/logs (one sub-directory per run).
log_path = os.path.join('Training', 'logs')
# PPO agent on the current `env`; rebinds `model1`, replacing the earlier PPO model.
model1 = PPO('MlpPolicy', env, verbose=1, tensorboard_log = log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [112]:
# Train the PPO agent for 100,000 environment steps.
model1.learn(total_timesteps=100000)

Logging to Training\logs\PPO_10
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 690       |
|    ep_rew_mean     | -1.91e+04 |
| time/              |           |
|    fps             | 395       |
|    iterations      | 1         |
|    time_elapsed    | 5         |
|    total_timesteps | 2048      |
----------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 690         |
|    ep_rew_mean          | -1.86e+04   |
| time/                   |             |
|    fps                  | 348         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011559788 |
|    clip_fraction        | 0.204       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 1.

<stable_baselines3.ppo.ppo.PPO at 0x1a0bd9534c0>

PPO

![image.png](attachment:a9661434-b138-404a-b8eb-b519341c71ce.png)

Average reward after 100,000 timesteps = 19100


## DQN

In [113]:
# DQN agent on the current `env`; rebinds `model2` and reuses `log_path` from the PPO cell.
model2 = DQN('MlpPolicy', env, verbose=1, tensorboard_log = log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [114]:
# Train the DQN agent for 100,000 environment steps.
model2.learn(total_timesteps = 100000)

Logging to Training\logs\DQN_6
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 690       |
|    ep_rew_mean      | -2.02e+04 |
|    exploration_rate | 0.738     |
| time/               |           |
|    episodes         | 4         |
|    fps              | 7056      |
|    time_elapsed     | 0         |
|    total_timesteps  | 2760      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 690       |
|    ep_rew_mean      | -2.08e+04 |
|    exploration_rate | 0.476     |
| time/               |           |
|    episodes         | 8         |
|    fps              | 7945      |
|    time_elapsed     | 0         |
|    total_timesteps  | 5520      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 690       |
|    ep_rew_mean      | -2.08e+04 |
|    exploration_rate | 0.213    

<stable_baselines3.dqn.dqn.DQN at 0x1a106f50220>

DQN

![image.png](attachment:ef56bcf0-b442-4c65-91c5-7300f54a4642.png)

Average reward after 100,000 timesteps = -7190

## A2C

In [115]:
# A2C agent on the current `env`; rebinds `model3` and reuses `log_path` from the PPO cell.
model3 = A2C('MlpPolicy', env, verbose=1, tensorboard_log = log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [116]:
# Train the A2C agent for 100,000 environment steps.
model3.learn(total_timesteps = 100000)

Logging to Training\logs\A2C_6
------------------------------------
| time/                 |          |
|    fps                | 291      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.128   |
|    explained_variance | 1.79e-07 |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.717    |
|    value_loss         | 1.05e+03 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 690       |
|    ep_rew_mean        | 520       |
| time/                 |           |
|    fps                | 277       |
|    iterations         | 200       |
|    time_elapsed       | 3         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -0.0859   |
|    explained_variance | -1.19e-

<stable_baselines3.a2c.a2c.A2C at 0x1a106f51a50>

A2C

![image.png](attachment:0f46f98f-e4a9-498d-a7cb-c5d7b99ae6c6.png)

Average reward after 100,000 timesteps = 1950

## Comparison

![image.png](attachment:11ea39c3-9724-46a6-869a-da9567f07b1d.png)

![image.png](attachment:233c7c4e-3089-442b-836f-94c2cc6b4b0d.png)

The DQN algorithm was not able to converge as fast as the PPO and A2C algorithms, with DQN getting negative average rewards while the other algorithms got a positive reward after 100,000 steps