In [None]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [None]:
# Install system dependencies
!apt-get install -y swig

# Install alternative Box2D package
!pip install Box2D-kengz

# Install gymnasium with Box2D support
!pip install gymnasium[box2d]


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 45 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 1s (1,478 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 121925 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubu

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
import numpy as np

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# The network and every tensor fed to it below are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class QNetwork(nn.Module):
    """Small fully connected network mapping a state vector to one value per action.

    Layer widths are state_size -> 8 -> 8 -> 4 -> action_size, with a ReLU
    after every layer except the last, whose raw outputs are returned.

    Args:
        state_size: Number of features in the input state vector.
        action_size: Number of outputs (one per action).
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        wide, narrow = 8, 4
        # Attribute names (and creation order) are kept as fc1..fc4 / relu so
        # parameter names in the state_dict are unchanged.
        self.fc1 = nn.Linear(state_size, wide)
        self.fc2 = nn.Linear(wide, wide)
        self.fc3 = nn.Linear(wide, narrow)
        self.fc4 = nn.Linear(narrow, action_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the network output for `x`; the last dimension of `x` must be state_size."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        # No activation on the output layer.
        return self.fc4(hidden)

# initialize the environment
env = gym.make("CartPole-v1")

# HyperParameters
lr = 0.001
episodes = 15000
epsilon = 1
min_epsilon = 0.08
epsilon_decay = 0.99
gamma = 0.7
alpha = 0.1  # not referenced anywhere in this cell; the optimizer step size is `lr`

# defining the q network (sizes are read from the environment: 4 inputs, 2 actions)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
q_network = QNetwork(state_size, action_size).to(device)
optimizer = optim.Adam(q_network.parameters(), lr=lr)
criterion = nn.MSELoss()

# training loop: online one-step Q-learning, one gradient update per environment step
for episode in range(episodes):
    q_network.train()
    state, _ = env.reset()
    total_reward = 0
    done = False

    while not done:
        state_tensor = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

        # epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                action = q_network(state_tensor).argmax(dim=1).item()

        # Gymnasium returns two end-of-episode flags. The episode must stop on
        # either one; ignoring `truncated` lets the loop run forever once the
        # agent survives past the environment's time limit.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        total_reward += reward

        next_state_tensor = torch.as_tensor(next_state, dtype=torch.float32, device=device).unsqueeze(0)

        # Q values for the current state (gradients flow through these)
        q_values = q_network(state_tensor)

        # TD target. Computed under no_grad so the bootstrap term is treated as
        # a constant, and masked only on true termination: a time-limit
        # truncation is not a terminal state, so it still bootstraps.
        with torch.no_grad():
            next_q_values = q_network(next_state_tensor).max(dim=1).values
            target_q_value = float(reward) + gamma * next_q_values * (1.0 - float(terminated))

        # Compute loss on the Q value of the action actually taken
        loss = criterion(q_values[:, action], target_q_value)

        # Optimize the model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # NOTE(review): epsilon decays once per *step*, so it reaches
        # min_epsilon within the first few hundred steps of training.
        # Confirm this is intended rather than a per-episode decay.
        epsilon = max(min_epsilon, epsilon * epsilon_decay)
        state = next_state

    if (episode + 1) % 100 == 0:
        print(f"Episode: {episode + 1} of {episodes} || Reward: {total_reward}")

# Release the training environment before `env` is rebound below.
env.close()

# Persist the trained weights right after training so a rendering failure
# cannot lose them. nn.Module has no .save() method; torch.save on the
# state_dict is the supported way to write weights.
torch.save(q_network.state_dict(), "q_network.pth")

# Testing
env = gym.make("CartPole-v1", render_mode="human")
q_network.eval()

for _ in range(5):
    state, _ = env.reset()
    done = False
    total_reward = 0

    while not done:
        state_tensor = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
        with torch.no_grad():
            action = q_network(state_tensor).argmax(dim=1).item()
        # With render_mode="human" the window is refreshed inside step(),
        # so no explicit env.render() call is needed.
        state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        total_reward += reward

    print(f"Test episode reward: {total_reward}")

env.close()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym

class QNetwork(nn.Module):
    """Two-hidden-layer MLP (32 units each, ReLU) from a state vector to `action_space` outputs.

    Args:
        state_space: Number of input features.
        action_space: Number of output units.
    """

    def __init__(self, state_space, action_space):
        super().__init__()
        hidden_units = 32
        stack = [
            nn.Linear(state_space, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_space),
        ]
        # The whole stack lives under the `fc1` attribute, so parameter names
        # are fc1.0.*, fc1.2.* and fc1.4.*.
        self.fc1 = nn.Sequential(*stack)

    def forward(self, x):
        """Run `x` through the layer stack and return the raw (un-activated) outputs."""
        outputs = self.fc1(x)
        return outputs

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One gravity value shared by the training and evaluation environments.
# (The original trained with g=1.25 but evaluated with g=1.2.)
# NOTE(review): assumed the mismatch was a typo — confirm.
gravity = 1.25
env = gym.make("Pendulum-v1", g=gravity)

# HyperParameters
lr = 0.0003
gamma = 0.99
epsilon = 1
min_epsilon = 0.03
epsilon_decay = 0.9995
episodes = 2000
n_torque_bins = 11  # number of discrete torques the agent chooses between

# Pendulum has a continuous 1-D action (torque). Q-learning needs a finite
# action set to take an argmax over, so the torque range is split into evenly
# spaced bins and the network predicts one Q-value per bin. The original fed
# the network's single output to the environment as the action *and* used it
# as the Q-value, which is not a valid TD update.
torques = np.linspace(
    env.action_space.low[0], env.action_space.high[0], n_torque_bins, dtype=np.float32
)

state_dim = env.observation_space.shape[0]
q_network = QNetwork(state_dim, n_torque_bins).to(device)
optimizer = optim.Adam(q_network.parameters(), lr=lr)
criterion = nn.MSELoss()

# Training loop: online one-step Q-learning, one gradient update per environment step
for episode in range(episodes):
    state, _ = env.reset()
    done = False
    total_reward = 0

    while not done:
        state_tensor = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

        # epsilon-greedy over the discrete torque indices
        if np.random.rand() < epsilon:
            action_idx = np.random.randint(n_torque_bins)
        else:
            with torch.no_grad():
                action_idx = q_network(state_tensor).argmax(dim=1).item()

        # The slice keeps the action as a shape-(1,) float32 array, matching
        # the environment's Box action space.
        next_state, reward, terminated, truncated, _ = env.step(
            torques[action_idx:action_idx + 1]
        )
        done = terminated or truncated
        next_state_tensor = torch.as_tensor(next_state, dtype=torch.float32, device=device).unsqueeze(0)

        # TD target, treated as a constant. Only true termination stops the
        # bootstrap: a time-limit truncation is not a terminal state.
        with torch.no_grad():
            next_q_value = q_network(next_state_tensor).max(dim=1).values
            target_q_value = float(reward) + gamma * next_q_value * (1.0 - float(terminated))

        # Q-value of the action that was actually taken
        current_q_value = q_network(state_tensor)[:, action_idx]

        loss = criterion(current_q_value, target_q_value)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        state = next_state
        total_reward += reward

    # epsilon decays once per episode
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    if ((episode+1) % 100) == 0:
        print(f"Episode {episode+1} of {episodes} || Total Reward: {total_reward:.2f}")

# Release the training environment before `env` is rebound below.
env.close()

# Save only the weights (state_dict) rather than pickling the whole module, so
# the file does not depend on this notebook's class definition being importable.
torch.save(q_network.state_dict(), "pendulum_q_network")

# test the model
env = gym.make("Pendulum-v1", g=gravity, render_mode="human")
q_network.eval()
for _ in range(5):
    state, _ = env.reset()
    test_reward = 0
    done = False
    while not done:
        state_tensor = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
        with torch.no_grad():
            # Feed the tensor (not the raw NumPy state) and act greedily.
            action_idx = q_network(state_tensor).argmax(dim=1).item()

        # With render_mode="human" the window is refreshed inside step(),
        # so no explicit env.render() call is needed.
        next_state, reward, terminated, truncated, _ = env.step(
            torques[action_idx:action_idx + 1]
        )
        state = next_state
        # Accumulate into the per-episode test counter, not the leftover
        # training `total_reward`.
        test_reward += reward
        done = terminated or truncated

    print("test Reward : ", test_reward)

env.close()

Episode 100 of 2000 || Total Reward: -1276.91
Episode 200 of 2000 || Total Reward: -1462.79
Episode 300 of 2000 || Total Reward: -1438.94
Episode 400 of 2000 || Total Reward: -991.01
Episode 500 of 2000 || Total Reward: -1353.44
Episode 600 of 2000 || Total Reward: -1384.72
Episode 700 of 2000 || Total Reward: -1204.71
Episode 800 of 2000 || Total Reward: -1572.27
Episode 900 of 2000 || Total Reward: -1646.71
Episode 1000 of 2000 || Total Reward: -1526.03
Episode 1100 of 2000 || Total Reward: -1730.93
Episode 1200 of 2000 || Total Reward: -1774.94
Episode 1300 of 2000 || Total Reward: -1753.70
Episode 1400 of 2000 || Total Reward: -1710.29


KeyboardInterrupt: 