In [66]:
import gymnasium as gym 
import torch
import numpy as np
import random

episodes = 5000
lr = 1e-3
discount = 0.9
batch = 10
resets = 50
epsilon = 0.1



env = gym.make("CartPole-v1")

q_net = torch.nn.Sequential(
    torch.nn.Linear(4,32),
    torch.nn.ReLU(),
    torch.nn.Linear(32,32),
    torch.nn.ReLU(),
    torch.nn.Linear(32,env.action_space.n),
)

q_net2  = torch.nn.Sequential(
    torch.nn.Linear(4,32),
    torch.nn.ReLU(),
    torch.nn.Linear(32,32),
    torch.nn.ReLU(),
    torch.nn.Linear(32,env.action_space.n),

)

optimizer = torch.optim.Adam(q_net.parameters(), lr=lr)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [67]:
replay = []
totalsteps = 0


for ep in range(episodes):
    steps = 0
    obs, _ = env.reset()
    obs = torch.tensor(obs, dtype=torch.float32).to(device)
    done = False
    while not done:
        #sample action based on greedy or Q values
        if random.random() < epsilon:
            action = random.randint(0, 1)
        else:
            action = torch.argmax(q_net(obs)).item()

        #execute action
        newobs, reward, terminated, truncated, _ = env.step(action)
        steps += 1
        totalsteps += 1
        done = terminated or truncated

        #store transition
        replay.append([obs, action, reward, newobs, done])
        
        #reset current state
        obs = torch.tensor(newobs, dtype=torch.float32).to(device)
    
    if (ep + 1) % 500 == 0:
        print(f"Episode {ep+1}: steps = {steps}")

    #check if enough transitions to sample a whole batch
    if len(replay) > batch:
        sample =  random.sample(replay, batch)
    else:
        sample = replay

    #sample batch
    for s in sample:
        if s[4]:
            #if transiiton is the last in episode set target to the reward
            target = s[2]
        else:
            #if transition is not the last episode
            #collect future q_vals from state transitioned to
            future_qvals = q_net2(torch.tensor(s[3], dtype=torch.float32).to(device))
            #target is the immideate reward + best reward possible from future
            target = s[2] + discount * torch.max(future_qvals).item()
        
        actions = q_net(torch.tensor(s[0], dtype=torch.float32).to(device))
        #collect qval for the action taken in obs
        qval = actions[s[1]]

        #find mean squared error between the target qval found from looking at future rewards
        loss = (target - qval)**2

        #update params based on loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    #every so often update the behavior policy to the target policy
    if totalsteps % 100 == 0:
        q_net2.load_state_dict(q_net.state_dict())
        
env.close()

  actions = q_net(torch.tensor(s[0], dtype=torch.float32).to(device))


Episode 500: steps = 10
Episode 1000: steps = 17
Episode 1500: steps = 72
Episode 2000: steps = 138
Episode 2500: steps = 126
Episode 3000: steps = 165
Episode 3500: steps = 119
Episode 4000: steps = 218
Episode 4500: steps = 213
Episode 5000: steps = 259


In [68]:
env = gym.make("CartPole-v1", render_mode="human")

for episode in range(10):  # Show 3 episodes
    obs, _ = env.reset()
    done = False
    total_reward = 0

    while not done:
        obs_tensor = torch.tensor(obs, dtype=torch.float32).to(device)
        probs = q_net(obs_tensor)
        action = torch.argmax(probs).item()

        obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        total_reward += reward

    print(f"Episode {episode+1} finished with reward {total_reward}")

env.close()


Episode 1 finished with reward 500.0
Episode 2 finished with reward 311.0
Episode 3 finished with reward 500.0
Episode 4 finished with reward 500.0
Episode 5 finished with reward 500.0
Episode 6 finished with reward 284.0
Episode 7 finished with reward 330.0
Episode 8 finished with reward 317.0
Episode 9 finished with reward 500.0
Episode 10 finished with reward 500.0
