In [81]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [82]:
import gymnasium as gym
from tqdm import tqdm
from matplotlib import pyplot as plt
%matplotlib inline
from gymnasium.wrappers import RecordEpisodeStatistics, RecordVideo

In [84]:
class Actor(nn.Module):
    def __init__(self,inp,out):
        super(Actor,self).__init__()
        self.fc_layer = nn.Sequential(
            nn.Linear(inp, 256),
            nn.LayerNorm(256),  
            
            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.Dropout(0.1),
            
            nn.Linear(128, 64),
            nn.LayerNorm(64),
            
            nn.Linear(64, out),
            nn.Softmax()
        )
    def forward(self,x):
        out=self.fc_layer(torch.tensor(x))
        return out

In [85]:
class Value(nn.Module):
    def __init__(self,inp,out):
        super(Value,self).__init__()
        self.fc_layer=nn.Sequential(
            nn.Linear(inp, 256),
            nn.LayerNorm(256),  
            
            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.Dropout(0.1),
            
            nn.Linear(128, 64),
            nn.LayerNorm(64),
            
            nn.Linear(64, out),
            nn.Softmax())
                        
    def forward(self,x):
        x=torch.tensor(x)
        out=self.fc_layer(x)
        return out
        

In [103]:
import sys
class Agent():
    def __init__(self) -> None:

        self.seed=np.random.seed(0)
        self.batch_size=10
        self.gamma=0.99
        self.lamda=0.95
        self.clip_param = 0.2
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.policy = Actor(4, 2).to(self.device)
        self.value = Value(4, 1).to(self.device)
        self.policy_opti = torch.optim.Adam(self.policy.parameters(), lr=1e-3)
        self.value_opti = torch.optim.Adam(self.value.parameters(), lr=1e-4)
        self.traj=[]

    def save_model(self,epoch):
        torch.save({
            'policy_net': self.policy.state_dict(),
            'policy_opti': self.policy_opti.state_dict()
        }, f'dqn_model_epoch {epoch}.pth')
        
    def act(self, state):
        with torch.no_grad():
            out=(self.policy.forward(state))
            val=self.value.forward(state)
            return val.detach(), out.detach()
   
    
    def teach(self):
        T=len(self.traj)
        value_col=np.zeros(T+1)
        return_col=np.zeros(T)
        advantage=np.zeros(T)
        prob_col=np.zeros(T)
        reward_col=np.zeros(T)
        dones=np.zeros(T)
        last_state=0
        states=[]
        actions=[]
        i=0
        for e in self.traj:
            state,act,reward,next_state,log_prob,done,val=e
            value_col[i]=(val)
            dones[i]=done
            states.append(torch.tensor(state))
            actions.append(torch.tensor(act))
            prob_col[i]=(log_prob)
            reward_col[i]=(reward)
            last_state=next_state
            i+=1
        
        states = torch.stack(states).to(self.device)
        actions = torch.tensor(actions).to(self.device)
        value_col[-1]=self.value.forward(self.traj[-1][3]).item()
        gae=0
        for t in reversed(range(T)):
            delta = reward_col[t] + self.gamma * value_col[t + 1] * (1 - dones[t]) - value_col[t]
            gae = delta + self.gamma * self.lamda * gae * (1 - dones[t])
            advantage[t] = gae
            return_col[t] = gae + value_col[t]
        advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
        reward_col = (reward_col - reward_col.mean()) / (reward_col.std() + 1e-8)
        return_col = (return_col - return_col.mean()) / (return_col.std() + 1e-8)
        adv_col=torch.FloatTensor(advantage).to(self.device)
        return_col = torch.FloatTensor(return_col).to(self.device)
        
        for _ in range(10):
            new_prob=(torch.clamp(self.policy.forward(states), min=1e-8)).to(self.device)
            prob_col=torch.clamp(torch.tensor(prob_col).to(self.device),min=1e-8)
    
            dist=Categorical(new_prob)
            new_log_prob=dist.log_prob(actions)
            ratio = torch.exp(new_log_prob-prob_col)
            clipped_ratio = torch.clamp(ratio, 1-self.clip_param, 1+self.clip_param)
            policy_loss=-torch.min((ratio * adv_col),clipped_ratio* adv_col).mean() 
            value_loss= nn.MSELoss()(self.value(states).squeeze().to(self.device),return_col)
            if torch.isnan(policy_loss).any() or torch.isinf(policy_loss).any() or torch.isnan(value_loss).any() or torch.isinf(value_loss).any():
                print("NaN or Inf in policy_loss")
                break
            self.policy_opti.zero_grad()
            policy_loss.backward()
            self.policy_opti.step()
            self.value_opti.zero_grad()
            value_loss.backward()
            self.value_opti.step()
        self.traj.clear()
            

In [104]:
def train(epoch, batch=20):
    agent=Agent()
    env = gym.make("CartPole-v1", render_mode="rgb_array")
    # env = RecordVideo(env, video_folder="cartpole-agent", name_prefix="eval",
    #               episode_trigger=lambda x: True)
    k=0
    for i in tqdm(range(epoch), desc="Training Epochs"):
        state,info=env.reset()
        episode_done=False
        step=0
        true_rew=0
        while not episode_done and step<500 :
            val, prob=agent.act(state)
            dist = torch.distributions.Categorical(probs=prob)
            action=dist.sample()
            log_prob=dist.log_prob(action)
            last_state=state
            state, reward, terminated, truncated, info = env.step(action.item() )
            if terminated or truncated:
                episode_done=True;
            agent.traj.append((last_state,action,reward,state,log_prob,episode_done,val))
            k+=1
            true_rew+=reward
            step+=1
            if(k>=batch):
                k=0
                agent.teach()
        if(i%100==0):
            print(true_rew)
    env.close()
    return agent
    

In [108]:
agent=train(3000,512)

Training Epochs:   0%|          | 14/3000 [00:00<00:44, 67.11it/s]

20.0


  actions.append(torch.tensor(act))
  out=self.fc_layer(torch.tensor(x))
  prob_col=torch.clamp(torch.tensor(prob_col).to(self.device),min=1e-8)
Training Epochs:   4%|▎         | 107/3000 [00:02<01:18, 36.93it/s]

16.0


Training Epochs:   7%|▋         | 207/3000 [00:05<01:19, 35.31it/s]

24.0


Training Epochs:  10%|█         | 308/3000 [00:08<01:12, 36.90it/s]

19.0


Training Epochs:  14%|█▎        | 408/3000 [00:11<01:06, 39.09it/s]

23.0


Training Epochs:  17%|█▋        | 503/3000 [00:14<01:06, 37.80it/s]

15.0


Training Epochs:  20%|██        | 606/3000 [00:17<01:27, 27.31it/s]

25.0


Training Epochs:  23%|██▎       | 701/3000 [00:23<04:45,  8.05it/s]

104.0


Training Epochs:  27%|██▋       | 802/3000 [00:43<08:15,  4.43it/s]

170.0


Training Epochs:  30%|███       | 912/3000 [00:54<00:31, 66.58it/s]

13.0


Training Epochs:  34%|███▎      | 1012/3000 [00:55<00:24, 81.01it/s]

11.0


Training Epochs:  37%|███▋      | 1115/3000 [00:57<00:25, 72.59it/s]

11.0


Training Epochs:  40%|████      | 1214/3000 [00:58<00:26, 67.42it/s]

12.0


Training Epochs:  44%|████▎     | 1312/3000 [01:00<00:20, 81.80it/s]

11.0


Training Epochs:  47%|████▋     | 1414/3000 [01:01<00:19, 79.77it/s]

11.0


Training Epochs:  50%|█████     | 1515/3000 [01:03<00:19, 77.29it/s]

10.0


Training Epochs:  54%|█████▍    | 1618/3000 [01:04<00:17, 80.27it/s]

11.0


Training Epochs:  57%|█████▋    | 1717/3000 [01:05<00:16, 79.47it/s]

10.0


Training Epochs:  60%|██████    | 1813/3000 [01:07<00:16, 74.01it/s]

10.0


Training Epochs:  64%|██████▍   | 1913/3000 [01:08<00:14, 77.21it/s]

10.0


Training Epochs:  67%|██████▋   | 2017/3000 [01:10<00:12, 75.78it/s]

12.0


Training Epochs:  71%|███████   | 2116/3000 [01:11<00:11, 77.52it/s]

10.0


Training Epochs:  74%|███████▎  | 2212/3000 [01:13<00:10, 74.47it/s]

10.0


Training Epochs:  77%|███████▋  | 2317/3000 [01:14<00:08, 79.24it/s]

9.0


Training Epochs:  80%|████████  | 2408/3000 [01:16<00:07, 76.59it/s]

10.0


Training Epochs:  83%|████████▎ | 2498/3000 [01:17<00:07, 66.38it/s]

12.0


Training Epochs:  87%|████████▋ | 2602/3000 [01:24<01:13,  5.38it/s]

140.0


Training Epochs:  90%|█████████ | 2701/3000 [01:44<00:58,  5.08it/s]

147.0


Training Epochs:  93%|█████████▎| 2801/3000 [02:24<01:55,  1.72it/s]

342.0


Training Epochs:  97%|█████████▋| 2901/3000 [03:32<01:05,  1.51it/s]

500.0


Training Epochs: 100%|██████████| 3000/3000 [04:37<00:00, 10.80it/s]


In [109]:
agent

<__main__.Agent at 0x7f7408d02170>

In [None]:
env = gym.make("CartPole-v1", render_mode="rgb_array")
env = RecordVideo(env, video_folder="cartpole-agent", name_prefix="eval",
                  episode_trigger=lambda x: True)
state,_=env.reset()
episode_done=False
true_rew=0
while not episode_done:
    val, prob=agent.act(state)
    print(prob)
    dist = torch.distributions.Categorical(probs=prob)
    action=dist.sample()
    log_prob=dist.log_prob(action)
    last_state=state
    state, reward, terminated, truncated, info = env.step(action.item() )
    true_rew+=reward
    if terminated :
        episode_done=True;
print(true_rew)

In [None]:
import shutil
shutil.rmtree("/kaggle/working/")