# MC REINFORCE - Type I



# IMPORTS


In [None]:
# '''
# Installing packages for rendering the game on Colab
# '''

# !pip install gym pyvirtualdisplay > /dev/null 2>&1
# !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
# !apt-get update > /dev/null 2>&1
# !apt-get install cmake > /dev/null 2>&1
# !pip install --upgrade setuptools 2>&1
# !pip install ez_setup > /dev/null 2>&1
# !pip install gym[atari] > /dev/null 2>&1
# !pip install git+https://github.com/tensorflow/docs > /dev/null 2>&1
# !pip install gym[classic_control]

In [None]:
!pip install pyvirtualdisplay

  and should_run_async(code)


Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-3.0-py3-none-any.whl (15 kB)
Installing collected packages: pyvirtualdisplay
Successfully installed pyvirtualdisplay-3.0


In [None]:
'''
A bunch of imports, you don't have to worry about these
'''

import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import namedtuple, deque
import torch.optim as optim
import datetime
import gym
from gym.wrappers.record_video import RecordVideo
import glob
import io
import base64
import matplotlib.pyplot as plt
from IPython.display import HTML
from pyvirtualdisplay import Display
import tensorflow as tf
from IPython import display as ipythondisplay
from PIL import Image
import tensorflow_probability as tfp

  if (distutils.version.LooseVersion(tf.__version__) <


In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
'''
Please refer to the first tutorial for more details on the specifics of environments
We've only added important commands you might find useful for experiments.
'''

'''
List of example environments
(Source - https://gym.openai.com/envs/#classic_control)

'Acrobot-v1'
'CartPole-v1'
'MountainCar-v0'
'''

# Build the CartPole task and fix the seed of its random number generator.
env = gym.make('CartPole-v1')
env.seed(0)

# Length of one observation vector and the number of discrete actions.
state_shape = env.observation_space.shape[0]
no_of_actions = env.action_space.n

print(state_shape, no_of_actions, env.action_space.sample(), "----", sep="\n")

# Understanding State, Action, Reward Dynamics
#
# The agent decides an action to take depending on the state.
#
# The environment keeps a variable specifically for the current state.
# - Every time an action is passed to the environment, it calculates the new
#   state and updates the current state variable.
# - It returns the new current state and the reward, so the agent can take
#   the next action.

# reset() returns the initial state of a fresh episode.
state = env.reset()
print(state, "----", sep="\n")

# Take one random action.
action = env.action_space.sample()
print(action, "----", sep="\n")

# step() computes the new state and the reward from the old state and the action taken.
next_state, reward, done, info = env.step(action)
print(next_state, reward, done, info, "----", sep="\n")


4
2
0
----
[ 0.01369617 -0.02302133 -0.04590265 -0.04834723]
----
1
----
[ 0.01323574  0.17272775 -0.04686959 -0.3551522 ]
1.0
False
{}
----


# NETWORK FOR J(θ)

In [None]:
'''
### Q Network & Some 'hyperparameters'

QNetwork1:
Input Layer - 4 nodes (State Shape) \
Hidden Layer 1 - 128 nodes \
Hidden Layer 2 - 64 nodes \
Output Layer - 2 nodes (Action Space) \
Optimizer - Adam (gradients are reset with zero_grad() before every update)
'''

import torch
import torch.nn as nn
import torch.nn.functional as F


'''
Bunch of Hyper parameters (Which you might have to tune later)
'''
# Module-level hyperparameters read by TutorialAgent.
BUFFER_SIZE = int(1e5)  # replay buffer size (passed to ReplayBuffer)
BATCH_SIZE = 64         # minibatch size; learning starts once the buffer holds this many samples
GAMMA = 0.99            # discount factor used when building the Q targets
LR = 5e-4               # learning rate of the Adam optimizer
UPDATE_EVERY = 20       # how often to update the network (When Q target is present): target sync period in steps


class QNetwork1(nn.Module):
    """Three-layer fully connected network mapping a state to one value per action."""

    def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
        """Create the three linear layers.

        Params
        ======
            state_size (int): Number of input features (size of one state)
            action_size (int): Number of outputs (one per action)
            seed (int): Value handed to torch.manual_seed before the layers are built
            fc1_units (int): Width of the first hidden layer
            fc2_units (int): Width of the second hidden layer
        """
        super().__init__()
        # Seeding happens before layer construction, so the initial weights
        # depend only on `seed`.
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(in_features=state_size, out_features=fc1_units)
        self.fc2 = nn.Linear(in_features=fc1_units, out_features=fc2_units)
        self.fc3 = nn.Linear(in_features=fc2_units, out_features=action_size)

    def forward(self, state):
        """Return the action values for `state` (ReLU after each hidden layer, none on the output)."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.fc3(hidden)

In [None]:
class TutorialAgent():
    """DQN-style agent: epsilon-greedy acting, replay-based learning, periodic target sync."""

    def __init__(self, state_size, action_size, seed):
        """Build the online/target networks, the optimizer and the replay memory."""
        # Agent / environment interface
        self.state_size = state_size
        self.action_size = action_size
        # Seeds Python's `random`; random.seed() returns None, so None is what gets stored.
        self.seed = random.seed(seed)

        # Q-networks: `local` is trained, `target` only receives copies of its weights
        self.qnetwork_local = QNetwork1(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork1(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Step counter, wraps around every UPDATE_EVERY steps (drives the target sync)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition, learn from a sampled batch, and sync the target network when due."""
        self.memory.add(state, action, reward, next_state, done)

        # Learn as soon as the memory can supply a full minibatch
        if len(self.memory) >= BATCH_SIZE:
            self.learn(self.memory.sample(), GAMMA)

        # Copy the online weights into the target network every UPDATE_EVERY steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    def act(self, state, eps=0.):
        """Pick an action for `state` epsilon-greedily with exploration probability `eps`."""
        observation = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Evaluate the online network without tracking gradients, then restore train mode
        self.qnetwork_local.eval()
        with torch.no_grad():
            q_values = self.qnetwork_local(observation)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() <= eps:
            return random.choice(np.arange(self.action_size))
        return np.argmax(q_values.cpu().data.numpy())

    def learn(self, experiences, gamma):
        """Run one gradient step on a batch of (states, actions, rewards, next_states, dones)."""
        states, actions, rewards, next_states, dones = experiences

        # Largest target-network Q value of each next state, kept as a column
        next_q = self.qnetwork_target(next_states).detach()
        best_next_q = next_q.max(1)[0].unsqueeze(1)

        # Bootstrapped targets; the (1 - dones) factor removes the bootstrap term where dones is 1
        td_targets = rewards + (gamma * best_next_q * (1 - dones))

        # Online-network Q values of the actions that were actually taken
        predicted_q = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(predicted_q, td_targets)

        self.optimizer.zero_grad()
        loss.backward()

        # Clamp every gradient element to [-1, 1] before the optimizer step
        for weight in self.qnetwork_local.parameters():
            weight.grad.data.clamp_(-1, 1)

        self.optimizer.step()

In [None]:
''' Defining DQN Algorithm '''

# Size of one observation and number of discrete actions of the current `env`;
# both are passed to TutorialAgent in the trial run below.
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.n


def dqn(n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Train the module-level `agent` on the module-level `env` with epsilon-greedy DQN.

    Params
    ======
        n_episodes (int): Maximum number of episodes to run
        max_t (int): Maximum number of steps per episode
        eps_start (float): Initial epsilon
        eps_end (float): Lower bound for epsilon
        eps_decay (float): Factor epsilon is multiplied by after each episode

    Returns the list of per-episode scores. Training stops early once the mean
    of the most recent (up to 100) scores reaches 195.0.
    """
    recent_scores = deque(maxlen=100)  # sliding window used for the running average
    rewards = []                       # score of every episode played
    eps = eps_start

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break

        recent_scores.append(score)
        rewards.append(score)

        # Multiplicative epsilon decay, floored at eps_end
        eps = max(eps_end, eps_decay * eps)

        average = np.mean(recent_scores)
        # Progress line overwritten in place; every 100th episode is kept on its own line
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average))

        if average >= 195.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, average))
            break

    return rewards

''' Trial run to check if algorithm runs and saves the data '''

# Wall-clock start of the trial run (agent construction is included in the timing).
begin_time = datetime.datetime.now()

# `agent` must be a module-level name: dqn() reads it as a global.
agent = TutorialAgent(state_size=state_shape,action_size = action_shape,seed = 0)
rewards = dqn()

# Elapsed time for building the agent and training it.
time_taken = datetime.datetime.now() - begin_time

print(time_taken)

NameError: name 'ReplayBuffer' is not defined

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import datetime
import gym

# Define the policy network
class PolicyNetwork(nn.Module):
    """Three-layer fully connected network mapping a state to one unnormalised score per action."""

    def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
        """Create the layers state_size -> fc1_units -> fc2_units -> action_size."""
        super().__init__()
        # Seed torch before the layers are built so the initial weights depend only on `seed`
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(in_features=state_size, out_features=fc1_units)
        self.fc2 = nn.Linear(in_features=fc1_units, out_features=fc2_units)
        self.fc3 = nn.Linear(in_features=fc2_units, out_features=action_size)

    def forward(self, state):
        """Return the action logits for `state` (ReLU after each hidden layer, none on the output)."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.fc3(hidden)

# Define the REINFORCE agent
class REINFORCEAgent():
    """Monte-Carlo policy-gradient (REINFORCE) agent for discrete action spaces.

    `act` samples an action from the policy network and remembers its
    log-probability; the caller appends the received reward to `self.rewards`;
    `learn` performs one policy update per finished episode.
    """

    def __init__(self, state_size, action_size, seed, lr=0.01):
        """Build the policy network and its Adam optimizer.

        Params
        ======
            state_size (int): Number of features in one state
            action_size (int): Number of discrete actions
            seed (int): Seed for Python's `random` and for the network weights
            lr (float): Learning rate of the Adam optimizer
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG and keep the value separately
        random.seed(seed)
        self.seed = seed

        self.policy_network = PolicyNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=lr)

        # Per-episode buffers, emptied by learn()
        self.saved_log_probs = []
        self.rewards = []

    def act(self, state):
        """Sample an action for `state` (a numpy array) and record its log-probability.

        Returns the sampled action as a Python int.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        logits = self.policy_network(state)
        # Building the distribution from logits avoids the precision loss of an
        # explicit softmax followed by a log when probabilities get tiny.
        m = torch.distributions.Categorical(logits=logits)
        action = m.sample()
        self.saved_log_probs.append(m.log_prob(action))
        return action.item()

    def learn(self, gamma):
        """Update the policy from the episode collected since the last call.

        Params
        ======
            gamma (float): Discount factor

        Both episode buffers are emptied. An episode without any recorded
        action is skipped instead of raising.
        """
        log_probs, episode_rewards = self.saved_log_probs, self.rewards
        # Start the next episode with clean buffers, even on the early return below
        self.saved_log_probs = []
        self.rewards = []

        if not log_probs:
            # Nothing was sampled (e.g. max_t == 0): there is no gradient to compute
            return

        # Discounted return of the whole episode: sum_t gamma^t * r_t.
        # NOTE(review): every step is weighted by this same episode return, not
        # by the reward-to-go from that step; this keeps the original estimator.
        R = sum(gamma ** t * r for t, r in enumerate(episode_rewards))

        # stack() accepts both 0-d and 1-d log-prob tensors
        policy_loss = (-torch.stack(log_probs) * R).sum()

        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

# Define the REINFORCE algorithm
def reinforce(env, agent, n_episodes=10000, max_t=1000, gamma=0.99, solved_score=195.0):
    """Train `agent` on `env` with one policy update per episode.

    Params
    ======
        env: Environment exposing reset() and step(action); both the legacy gym
            API (reset -> obs, step -> 4-tuple) and the gym >= 0.26 API
            (reset -> (obs, info), step -> 5-tuple) are accepted
        agent: Object with act(state), a `rewards` list and learn(gamma)
        n_episodes (int): Maximum number of episodes
        max_t (int): Maximum number of steps per episode
        gamma (float): Discount factor forwarded to agent.learn
        solved_score (float): Training stops once the mean of the most recent
            (up to 100) episode scores reaches this value; the default is the
            threshold originally hard-coded here

    Returns the list of undiscounted per-episode scores.
    """
    scores_window = deque(maxlen=100)  # sliding window for the running average
    rewards = []
    for i_episode in range(1, n_episodes + 1):
        reset_result = env.reset()
        # gym >= 0.26 returns (observation, info); older versions return the observation alone
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

        score = 0
        for _ in range(max_t):
            action = agent.act(state)
            step_result = env.step(action)
            if len(step_result) == 5:
                # New API: termination and truncation are reported separately
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
            agent.rewards.append(reward)
            state = next_state
            score += reward
            if done:
                break

        scores_window.append(score)
        rewards.append(score)

        agent.learn(gamma)

        average = np.mean(scores_window)
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average))

        if average >= solved_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, average))
            break

    return rewards

# Run on the first CUDA device when one is available, otherwise on the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed every random number generator used during training
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Hyperparameters
LR = 1e-04  # Learning rate
gamma = 0.99  # Discount factor
n_episodes = 10000  # Maximum number of episodes
max_t = 10000  # Maximum number of timesteps per episode

# Environment to train on; replace 'CartPole-v1' with the name of your environment
env = gym.make('CartPole-v1')

# The environment has its own RNG; without seeding it the run is not
# reproducible even though random/numpy/torch are seeded above.
if hasattr(env, "seed"):
    env.seed(seed)  # legacy gym API
else:
    env.reset(seed=seed)  # gym >= 0.26 seeds through reset()

# State and action sizes of the chosen environment
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Create an instance of the REINFORCE agent
agent = REINFORCEAgent(state_size, action_size, seed=seed, lr=LR)

# Run the REINFORCE algorithm; the environment is closed even if training
# is interrupted or raises.
begin_time = datetime.datetime.now()
try:
    rewards = reinforce(env, agent, n_episodes=n_episodes, max_t=max_t, gamma=gamma)
    time_taken = datetime.datetime.now() - begin_time
    print("Time taken:", time_taken)
finally:
    env.close()


Episode 100	Average Score: 23.47
Episode 200	Average Score: 22.42
Episode 300	Average Score: 24.22
Episode 400	Average Score: 26.12
Episode 500	Average Score: 25.37
Episode 600	Average Score: 27.47
Episode 700	Average Score: 29.40
Episode 800	Average Score: 28.49
Episode 900	Average Score: 32.54
Episode 1000	Average Score: 32.50
Episode 1100	Average Score: 36.19
Episode 1200	Average Score: 44.04
Episode 1300	Average Score: 45.18
Episode 1400	Average Score: 43.45
Episode 1500	Average Score: 42.07
Episode 1600	Average Score: 50.26
Episode 1700	Average Score: 52.52
Episode 1800	Average Score: 53.87
Episode 1900	Average Score: 62.72
Episode 2000	Average Score: 59.66
Episode 2100	Average Score: 60.15
Episode 2200	Average Score: 63.79
Episode 2300	Average Score: 61.84
Episode 2400	Average Score: 71.07
Episode 2500	Average Score: 73.48
Episode 2600	Average Score: 66.21
Episode 2700	Average Score: 76.29
Episode 2800	Average Score: 82.82
Episode 2900	Average Score: 84.93
Episode 3000	Average Sc

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import datetime
import gym

# Define the policy network
class PolicyNetwork(nn.Module):
    """Three-layer fully connected network mapping a state to one unnormalised score per action."""

    def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
        """Create the layers state_size -> fc1_units -> fc2_units -> action_size."""
        super().__init__()
        # Seed torch before the layers are built so the initial weights depend only on `seed`
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(in_features=state_size, out_features=fc1_units)
        self.fc2 = nn.Linear(in_features=fc1_units, out_features=fc2_units)
        self.fc3 = nn.Linear(in_features=fc2_units, out_features=action_size)

    def forward(self, state):
        """Return the action logits for `state` (ReLU after each hidden layer, none on the output)."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.fc3(hidden)

# Define the REINFORCE agent
class REINFORCEAgent():
    """Monte-Carlo policy-gradient (REINFORCE) agent for discrete action spaces.

    `act` samples an action from the policy network and remembers its
    log-probability; the caller appends the received reward to `self.rewards`;
    `learn` performs one policy update per finished episode.
    """

    def __init__(self, state_size, action_size, seed, lr=0.01):
        """Build the policy network and its Adam optimizer.

        Params
        ======
            state_size (int): Number of features in one state
            action_size (int): Number of discrete actions
            seed (int): Seed for Python's `random` and for the network weights
            lr (float): Learning rate of the Adam optimizer
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the RNG and keep the value separately
        random.seed(seed)
        self.seed = seed

        self.policy_network = PolicyNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=lr)

        # Per-episode buffers, emptied by learn()
        self.saved_log_probs = []
        self.rewards = []

    def act(self, state):
        """Sample an action for `state` (a numpy array) and record its log-probability.

        Returns the sampled action as a Python int.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        logits = self.policy_network(state)
        # Building the distribution from logits avoids the precision loss of an
        # explicit softmax followed by a log when probabilities get tiny.
        m = torch.distributions.Categorical(logits=logits)
        action = m.sample()
        self.saved_log_probs.append(m.log_prob(action))
        return action.item()

    def learn(self, gamma):
        """Update the policy from the episode collected since the last call.

        Params
        ======
            gamma (float): Discount factor

        Both episode buffers are emptied. An episode without any recorded
        action is skipped instead of raising.
        """
        log_probs, episode_rewards = self.saved_log_probs, self.rewards
        # Start the next episode with clean buffers, even on the early return below
        self.saved_log_probs = []
        self.rewards = []

        if not log_probs:
            # Nothing was sampled (e.g. max_t == 0): there is no gradient to compute
            return

        # Discounted return of the whole episode: sum_t gamma^t * r_t.
        # NOTE(review): every step is weighted by this same episode return, not
        # by the reward-to-go from that step; this keeps the original estimator.
        R = sum(gamma ** t * r for t, r in enumerate(episode_rewards))

        # stack() accepts both 0-d and 1-d log-prob tensors
        policy_loss = (-torch.stack(log_probs) * R).sum()

        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

# Define the REINFORCE algorithm
def reinforce(env, agent, n_episodes=10000, max_t=1000, gamma=0.99, solved_score=195.0):
    """Train `agent` on `env` with one policy update per episode.

    Params
    ======
        env: Environment exposing reset() and step(action); both the legacy gym
            API (reset -> obs, step -> 4-tuple) and the gym >= 0.26 API
            (reset -> (obs, info), step -> 5-tuple) are accepted
        agent: Object with act(state), a `rewards` list and learn(gamma)
        n_episodes (int): Maximum number of episodes
        max_t (int): Maximum number of steps per episode
        gamma (float): Discount factor forwarded to agent.learn
        solved_score (float): Training stops once the mean of the most recent
            (up to 100) episode scores reaches this value; the default is the
            threshold originally hard-coded here, so pass a task-appropriate
            value for environments whose scores never reach it

    Returns the list of undiscounted per-episode scores.
    """
    scores_window = deque(maxlen=100)  # sliding window for the running average
    rewards = []
    for i_episode in range(1, n_episodes + 1):
        reset_result = env.reset()
        # gym >= 0.26 returns (observation, info); older versions return the observation alone
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

        score = 0
        for _ in range(max_t):
            action = agent.act(state)
            step_result = env.step(action)
            if len(step_result) == 5:
                # New API: termination and truncation are reported separately
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
            agent.rewards.append(reward)
            state = next_state
            score += reward
            if done:
                break

        scores_window.append(score)
        rewards.append(score)

        agent.learn(gamma)

        average = np.mean(scores_window)
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average))

        if average >= solved_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, average))
            break

    return rewards

# Run on the first CUDA device when one is available, otherwise on the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed every random number generator used during training
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Hyperparameters
LR = 1e-04  # Learning rate
gamma = 0.99  # Discount factor
n_episodes = 10000  # Maximum number of episodes
max_t = 10000  # Maximum number of timesteps per episode

# Environment to train on; replace 'Acrobot-v1' with the name of your environment
env = gym.make('Acrobot-v1')

# The environment has its own RNG; without seeding it the run is not
# reproducible even though random/numpy/torch are seeded above.
if hasattr(env, "seed"):
    env.seed(seed)  # legacy gym API
else:
    env.reset(seed=seed)  # gym >= 0.26 seeds through reset()

# State and action sizes of the chosen environment
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Create an instance of the REINFORCE agent
agent = REINFORCEAgent(state_size, action_size, seed=seed, lr=LR)

# Run the REINFORCE algorithm; the environment is closed even if training
# is interrupted or raises.
begin_time = datetime.datetime.now()
try:
    rewards = reinforce(env, agent, n_episodes=n_episodes, max_t=max_t, gamma=gamma)
    time_taken = datetime.datetime.now() - begin_time
    print("Time taken:", time_taken)
finally:
    env.close()


Episode 100	Average Score: -496.05
Episode 200	Average Score: -491.20
Episode 300	Average Score: -493.67
Episode 400	Average Score: -498.33
Episode 500	Average Score: -495.48
Episode 600	Average Score: -496.77
Episode 700	Average Score: -493.16
Episode 800	Average Score: -489.44
Episode 900	Average Score: -494.39
Episode 1000	Average Score: -486.38
Episode 1100	Average Score: -475.50
Episode 1200	Average Score: -460.22
Episode 1300	Average Score: -438.56
Episode 1400	Average Score: -469.49
Episode 1500	Average Score: -456.82
Episode 1600	Average Score: -454.15
Episode 1700	Average Score: -464.88
Episode 1800	Average Score: -477.82
Episode 1900	Average Score: -480.87
Episode 2000	Average Score: -495.31
Episode 2100	Average Score: -462.76
Episode 2200	Average Score: -479.16
Episode 2300	Average Score: -487.86
Episode 2400	Average Score: -470.57
Episode 2500	Average Score: -480.80
Episode 2600	Average Score: -476.02
Episode 2700	Average Score: -487.22
Episode 2800	Average Score: -457.36
E