In [1]:
!pip uninstall -y gym
!pip install gymnasium

[0mCollecting gymnasium
  Downloading gymnasium-1.2.2-py3-none-any.whl.metadata (10 kB)
Collecting cloudpickle>=1.2.0 (from gymnasium)
  Downloading cloudpickle-3.1.2-py3-none-any.whl.metadata (7.1 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.2.2-py3-none-any.whl (952 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m952.1/952.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m-:--:--[0m
[?25hDownloading cloudpickle-3.1.2-py3-none-any.whl (22 kB)
Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, cloudpickle, gymnasium
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [gymnasium]
[1A[2KSuccessfully installed cloudpickle-3.1.2 farama-notifications-0.0.4 gymnasium-1.2.2


In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import gymnasium as gym # Changed from 'import gym'
from collections import deque
import matplotlib.pyplot as plt

In [4]:
class PolicyNetwork(nn.Module):
    """Two-hidden-layer MLP that turns a state vector into action probabilities.

    The last layer produces one logit per action; ``forward`` applies a softmax
    over the final dimension so each output row sums to 1.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        # Layers are created in the same order (fc1, fc2, fc3) so that seeded
        # parameter initialisation and state_dict keys stay the same.
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        """Return softmax action probabilities for the input state(s)."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        logits = self.fc3(hidden)
        return torch.softmax(logits, dim=-1)

In [6]:
class ValueNetwork(nn.Module):
    """Two-hidden-layer MLP that maps a state vector to one scalar value estimate."""

    def __init__(self, state_dim, hidden_dim=128):
        super().__init__()
        # Same construction order and attribute names as before, so seeded
        # initialisation and state_dict keys are unaffected.
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return the value estimate; the trailing dimension has size 1."""
        features = torch.relu(self.fc1(x))
        features = torch.relu(self.fc2(features))
        value = self.fc3(features)
        return value

In [7]:
class REINFORCE:
    """Monte-Carlo policy-gradient (REINFORCE) agent for discrete actions.

    Per episode: call ``select_action`` for each state, ``store_reward`` for
    each reward received, then ``update`` once after the episode has ended.
    """

    def __init__(self, state_dim, action_dim, lr=0.001, gamma=0.99):
        """Create the policy network and its Adam optimizer.

        Args:
            state_dim: Size of the state vector fed to the policy network.
            action_dim: Number of discrete actions the policy chooses from.
            lr: Learning rate for Adam.
            gamma: Discount factor applied to future rewards.
        """
        self.gamma = gamma
        self.policy = PolicyNetwork(state_dim, action_dim)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)

        # Per-episode buffers; both are emptied by update().
        self.log_probs = []
        self.rewards = []

    def select_action(self, state):
        """Sample an action from the policy and record its log-probability.

        Args:
            state: A single (unbatched) state; a batch dimension is added here.

        Returns:
            The sampled action as a Python int.
        """
        # as_tensor replaces the legacy FloatTensor constructor, which
        # interprets a bare integer argument as a size instead of a value.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        probs = self.policy(state)
        dist = Categorical(probs)
        action = dist.sample()
        # Kept attached to the graph: update() backpropagates through it.
        self.log_probs.append(dist.log_prob(action))
        return action.item()

    def store_reward(self, reward):
        """Store the reward received for the most recent step."""
        self.rewards.append(reward)

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return the discounted return of every step, in time order."""
        returns = []
        running = 0
        for reward in reversed(rewards):
            running = reward + gamma * running
            returns.append(running)
        # Append + one reverse is linear; insert(0, ...) per step was quadratic.
        returns.reverse()
        return returns

    def update(self):
        """Run one REINFORCE update from the stored episode and clear it.

        Returns are normalised to zero mean / unit std when the episode has
        more than one step. A single-step episode uses its raw return, because
        the sample standard deviation of one value is NaN and would poison the
        loss, the gradients and, after the optimizer step, the policy weights.

        Returns:
            The policy loss as a float, or ``0.0`` (without an optimizer step)
            if no step was recorded.
        """
        if not self.log_probs or not self.rewards:
            # torch.stack on an empty list raises; there is nothing to learn from.
            self.log_probs = []
            self.rewards = []
            return 0.0

        returns = torch.tensor(
            self._discounted_returns(self.rewards, self.gamma),
            dtype=torch.float32,
        )
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # Policy-gradient loss: sum over steps of -log pi(a_t | s_t) * G_t.
        policy_loss = torch.stack(
            [-log_prob * G for log_prob, G in zip(self.log_probs, returns)]
        ).sum()

        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

        # Clear episode storage
        self.log_probs = []
        self.rewards = []

        return policy_loss.item()

In [8]:
class ActorCritic:
    """Episodic actor-critic agent for discrete actions.

    The actor is a softmax policy; the critic predicts a state value that is
    used as a baseline. Advantages are Monte-Carlo discounted returns minus
    the critic's prediction (not a one-step TD error). Per episode: call
    ``select_action`` for each state, ``store_reward`` for each reward, then
    ``update`` once after the episode has ended.
    """

    def __init__(self, state_dim, action_dim, lr_actor=0.001, lr_critic=0.001, gamma=0.99):
        """Create actor and critic networks with one Adam optimizer each.

        Args:
            state_dim: Size of the state vector fed to both networks.
            action_dim: Number of discrete actions the actor chooses from.
            lr_actor: Adam learning rate for the actor.
            lr_critic: Adam learning rate for the critic.
            gamma: Discount factor applied to future rewards.
        """
        self.gamma = gamma

        # Actor (policy) and Critic (value function)
        self.actor = PolicyNetwork(state_dim, action_dim)
        self.critic = ValueNetwork(state_dim)

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic)

        # Per-episode buffers; all three are emptied by update().
        self.log_probs = []
        self.values = []
        self.rewards = []

    def select_action(self, state):
        """Sample an action and record its log-probability and the state value.

        Args:
            state: A single (unbatched) state; a batch dimension is added here.

        Returns:
            The sampled action as a Python int.
        """
        # as_tensor replaces the legacy FloatTensor constructor, which
        # interprets a bare integer argument as a size instead of a value.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)

        # Get action probabilities from actor
        probs = self.actor(state)
        dist = Categorical(probs)
        action = dist.sample()

        # Get value estimate from critic
        value = self.critic(state)

        # Both stay attached to their graphs; update() backpropagates through them.
        self.log_probs.append(dist.log_prob(action))
        self.values.append(value)

        return action.item()

    def store_reward(self, reward):
        """Store the reward received for the most recent step."""
        self.rewards.append(reward)

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return the discounted return of every step, in time order."""
        returns = []
        running = 0
        for reward in reversed(rewards):
            running = reward + gamma * running
            returns.append(running)
        # Append + one reverse is linear; insert(0, ...) per step was quadratic.
        returns.reverse()
        return returns

    def update(self):
        """Update actor and critic from the stored episode, then clear it.

        Advantages are normalised to zero mean / unit std when the episode has
        more than one step. A single-step episode uses its raw advantage,
        because the sample standard deviation of one value is NaN and would
        poison the actor loss, its gradients and the actor weights.

        Returns:
            ``(actor_loss, critic_loss)`` as floats, or ``(0.0, 0.0)`` (without
            any optimizer step) if no step was recorded.
        """
        if not self.log_probs or not self.values or not self.rewards:
            # torch.cat / torch.stack on empty lists raise; nothing to learn from.
            self.log_probs = []
            self.values = []
            self.rewards = []
            return 0.0, 0.0

        returns = torch.tensor(
            self._discounted_returns(self.rewards, self.gamma),
            dtype=torch.float32,
        )
        # reshape(-1) always yields a 1-D tensor; a bare squeeze() collapses a
        # one-step episode to 0-d, which mismatches the shape of `returns`.
        values = torch.cat(self.values).reshape(-1)

        # Advantage = Monte-Carlo return minus critic baseline. Detached so the
        # actor loss does not push gradients into the critic.
        advantages = returns - values.detach()
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # Actor loss (policy gradient with advantage)
        actor_loss = torch.stack(
            [-log_prob * advantage for log_prob, advantage in zip(self.log_probs, advantages)]
        ).sum()

        # Critic loss (MSE between predicted value and actual return)
        critic_loss = F.mse_loss(values, returns)

        # Update actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Clear episode storage
        self.log_probs = []
        self.values = []
        self.rewards = []

        return actor_loss.item(), critic_loss.item()
