In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import matplotlib.pyplot as plt
from collections import deque

# Policy Network Definition
class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_size=128):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_dim)
        self.softmax = nn.Softmax(dim=-1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.softmax(self.fc2(x))
        return x

# REINFORCE Algorithm Implementation
class REINFORCE:
    def __init__(self, state_dim, action_dim, learning_rate=0.01, gamma=0.99):
        self.gamma = gamma
        self.policy_network = PolicyNetwork(state_dim, action_dim)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=learning_rate)
        self.episode_rewards = []
        self.episode_log_probs = []
    
    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32)
        action_probs = self.policy_network(state)
        action_dist = torch.distributions.Categorical(action_probs)
        action = action_dist.sample()
        self.episode_log_probs.append(action_dist.log_prob(action))
        return action.item()
    
    def update_policy(self):
        G = 0
        policy_loss = []
        for log_prob, reward in zip(reversed(self.episode_log_probs), reversed(self.episode_rewards)):
            G = reward + self.gamma * G
            policy_loss.append(-log_prob * G)
        
        self.optimizer.zero_grad()
        loss = torch.stack(policy_loss).sum()
        loss.backward()
        self.optimizer.step()
        
        self.episode_rewards = []
        self.episode_log_probs = []

# Train REINFORCE on CartPole-v1
def train_reinforce(env_name='CartPole-v1', episodes=1000):
    env = gym.make(env_name)
    agent = REINFORCE(state_dim=env.observation_space.shape[0], action_dim=env.action_space.n)
    reward_history = []
    
    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0
        
        for t in range(200):
            action = agent.select_action(state)
            state, reward, done, _ = env.step(action)
            agent.episode_rewards.append(reward)
            episode_reward += reward
            if done:
                break
        
        agent.update_policy()
        reward_history.append(episode_reward)
        if episode % 50 == 0:
            print(f"Episode {episode}, Reward: {episode_reward}")
    
    env.close()
    return reward_history

# Run training and plot results
reward_history = train_reinforce()
plt.plot(reward_history)
plt.xlabel("Episodes")
plt.ylabel("Total Reward")
plt.title("REINFORCE Training Performance")
plt.show()


  state = torch.tensor(state, dtype=torch.float32)


ValueError: expected sequence of length 4 at dim 1 (got 0)