In [1]:
import numpy as np
from torch import nn
import torch
import random
from torch.distributions import Categorical

from collections import deque

# Pick the compute device once for the whole notebook: use the GPU when
# PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [2]:
class My_env:
    """Toy fixed-horizon environment with three discrete actions.

    The observation is a float32 tensor of length 4:
    ``[accumulated score, count of action 0, count of action 1, count of action 2]``.

    Actions:
        0 -- adds ``action_0_reward`` (5) points.
        1 -- adds ``action_1_reward`` (20) points, except that with probability
             ``minus_pt_prob`` the accumulated score is wiped to 0 and the
             reward is -20.
        2 -- adds ``action_2_reward`` (0) points.

    An episode ends after ``max_step`` calls to :meth:`step`.
    """

    def __init__(self):
        self.state = torch.tensor(data = [0, 0, 0, 0], dtype = torch.float32)
        self.action_0_reward = 5
        self.action_1_reward = 20
        self.action_2_reward = 0

        # Probability that action 1 backfires (score reset + negative reward).
        self.minus_pt_prob = 0.1

        self.step_counter = 0
        self.max_step = 10
        # Upper bound on the score of one episode: best reward at every step.
        self.max_score = self.max_step * max(self.action_0_reward, self.action_1_reward, self.action_2_reward)

    def reset(self):
        """Reset the environment to its initial state and return the first observation.

        Returns:
            A copy of the all-zero state tensor, so callers may modify the
            observation without touching the environment's internal state.
        """
        self.state = torch.tensor(data = [0, 0, 0, 0], dtype = torch.float32)
        self.step_counter = 0
        return self.state.clone()

    def step(self, action):
        """Apply ``action`` and advance the episode by one step.

        Args:
            action: 0, 1 or 2 (see the class docstring).

        Returns:
            ``(observation, reward, done)`` where ``observation`` is a copy of
            the updated state, ``reward`` is the reward of this step and
            ``done`` is True once ``max_step`` steps have been taken.

        Raises:
            ValueError: if ``action`` is not 0, 1 or 2. Checked before any
                state is modified.
        """
        if action not in (0, 1, 2):
            raise ValueError(f"action must be 0, 1 or 2, got {action!r}")

        self.step_counter += 1
        # Slots 1..3 count how many times each action has been taken.
        self.state[action + 1] += 1

        if action == 0: # action A -> +5 points
            self.state[0] += self.action_0_reward
            reward = self.action_0_reward
        elif action == 1: # action B -> +20 pts, or score wiped with prob. minus_pt_prob
            # Use the configured probability instead of a duplicated literal.
            if random.uniform(0, 1) <= self.minus_pt_prob:
                # Fixed penalty; the lost score itself (-self.state[0]) is
                # deliberately not used as the reward.
                reward = -20
                self.state[0] = 0
            else:
                self.state[0] += self.action_1_reward
                reward = self.action_1_reward
        else: # action C -> +0 pts
            self.state[0] += self.action_2_reward
            reward = self.action_2_reward

        # ">=" keeps reporting done if the caller steps past the horizon
        # without calling reset().
        done = self.step_counter >= self.max_step
        # Return a copy: the internal tensor is mutated on every step, so
        # handing it out would alias all previously returned observations.
        return self.state.clone(), reward, done

In [3]:
class My_policy(nn.Module):
    """Small stochastic policy mapping a 4-dim state to 3 action probabilities.

    Args:
        env: object exposing ``max_score`` and ``max_step``; they are used to
            scale the raw state before it is fed to the network.
        n_hiddens: width of the hidden layer.
    """

    def __init__(self, env, n_hiddens = 5):
        super().__init__()
        self.env = env
        self.fc1 = nn.Linear(4, n_hiddens)
        self.fc2 = nn.Linear(n_hiddens, 3)

    def normalize(self, state):
        """Return a scaled copy of ``state``; the input tensor is left untouched.

        Component 0 is divided by ``env.max_score`` and components 1..3 by
        ``env.max_step``. The division is out-of-place on purpose: the state
        passed in may be the very tensor the environment keeps internally
        (e.g. when everything runs on the CPU and no device copy happens), and
        an in-place division would silently rescale the environment's state.

        Args:
            state: tensor whose last dimension has size 4.

        Returns:
            A new tensor of the same shape as ``state``.
        """
        scale = torch.as_tensor(
            [self.env.max_score, self.env.max_step, self.env.max_step, self.env.max_step],
            dtype = torch.float32,
            device = state.device,
        )
        return state / scale

    def forward(self, state):
        """Compute action probabilities for ``state``.

        Args:
            state: tensor (or array-like) of shape ``(4,)`` or ``(..., 4)``.

        Returns:
            Tensor of shape ``(3,)`` or ``(..., 3)`` that sums to 1 along the
            last dimension, located on the same device as the parameters.
        """
        # Follow the module's own parameters rather than a notebook-level
        # global, so the policy works wherever it has been moved.
        param = next(self.parameters())
        state = torch.as_tensor(state, dtype = param.dtype, device = param.device)
        state = self.normalize(state)
        # NOTE(review): there is no nonlinearity between fc1 and fc2, so the
        # logits are an affine function of the normalized state. Left as-is to
        # keep the model's behavior unchanged.
        h = self.fc1(state)
        output = self.fc2(h)
        # dim=-1 equals dim=0 for a single 1-D state and also handles batches.
        return torch.softmax(output, dim = -1)

    def act(self, state):
        """Sample one action for a single (unbatched) state.

        Args:
            state: tensor of shape ``(4,)``.

        Returns:
            ``(action, log_prob)`` where ``action`` is a Python int in
            ``{0, 1, 2}`` and ``log_prob`` is the log-probability of that
            action as a tensor still attached to the autograd graph.
        """
        probs = self.forward(state).cpu()
        # e.g. probs = [0.4, 0.2, 0.4] over the actions [0, 1, 2]
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

In [4]:
# Create the environment first, then wrap it in a policy and move the policy's
# parameters onto the selected device.
env = My_env()
policy = My_policy(env)
policy = policy.to(device)

In [5]:
# Step size used by Adam; the same value is also passed to the training call.
# (A class-based `ReinforceTrainer` from `trainer` used to be wired up here and
# is currently disabled.)
learning_rate = 0.01
optimizer = torch.optim.Adam(policy.parameters(), lr=learning_rate)

In [6]:
from trainer import reinforce

# NOTE(review): `reinforce` is defined in the project-local `trainer` module,
# which is not visible here. The meaning of each positional argument below is
# a guess and should be confirmed against its signature.
reinforce(
    policy,
    env,
    optimizer,
    1000,           # presumably the number of training episodes -- confirm
    20,             # presumably a per-episode step cap -- confirm
    1.0,            # presumably the discount factor -- confirm
    10,             # presumably the logging interval (output prints every 10 episodes) -- confirm
    learning_rate,
)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mducanh2002add[0m ([33mducanh2002add-hanoi-university-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Episode 10	Average Score: 59.00
Episode 20	Average Score: 72.75
Episode 30	Average Score: 78.67
Episode 40	Average Score: 90.88
Episode 50	Average Score: 97.90
Episode 60	Average Score: 107.67
Episode 70	Average Score: 113.79
Episode 80	Average Score: 116.12
Episode 90	Average Score: 118.61
Episode 100	Average Score: 123.45
Episode 110	Average Score: 133.30
Episode 120	Average Score: 137.80
Episode 130	Average Score: 145.10
Episode 140	Average Score: 145.40
Episode 150	Average Score: 145.35
Episode 160	Average Score: 146.00
Episode 170	Average Score: 146.70
Episode 180	Average Score: 147.75
Episode 190	Average Score: 147.45
Episode 200	Average Score: 143.40
Episode 210	Average Score: 140.55
Episode 220	Average Score: 140.85
Episode 230	Average Score: 142.80
Episode 240	Average Score: 145.15
Episode 250	Average Score: 145.85
Episode 260	Average Score: 143.85
Episode 270	Average Score: 140.20
Episode 280	Average Score: 141.50
Episode 290	Average Score: 143.20
Episode 300	Average Score: 1