In [9]:
import pandas as pd
# Toy per-key reward values for a single episode.
reward = {"1": 10, "2": 20, "3": 40}
# Copy the dict for every episode: repeating the same object would make all five
# entries aliases of `reward`, so editing one episode would silently edit them all.
all_rewards = [dict(reward) for _ in range(5)]
all_rewards

[{'1': 10, '2': 20, '3': 40},
 {'1': 10, '2': 20, '3': 40},
 {'1': 10, '2': 20, '3': 40},
 {'1': 10, '2': 20, '3': 40},
 {'1': 10, '2': 20, '3': 40}]

In [8]:
# Sum every reward key over all episodes (column-wise), then average those per-key totals.
(
    pd.DataFrame(all_rewards)
    .sum(axis=0)
    .mean()
)

116.66666666666667

In [20]:
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 11 12:11:49 2024

@author: naftabi
"""

import torch
import torch.nn as nn
import torch.optim as optim

import random

from utils import Memory
from model import QNetwork

class DQLAgent:
    """Deep Q-learning agent with epsilon-greedy exploration and a target network.

    The agent owns two ``QNetwork`` instances: an online network that is trained
    and used to pick actions, and a target network that supplies the bootstrap
    values for the TD target and is periodically blended towards the online
    network (soft update with factor ``tau``).
    """

    def __init__(self, 
                 state_dim: int, 
                 action_dim: int, 
                 hidden_dim: int = 64, 
                 lr: float = 1e-4, 
                 gamma: float = 0.99,
                 tau: float = 0.05,
                 update_every: int = 20,
                 epsilon_start: float = 0.9,
                 epsilon_end: float = 0.05,
                 epsilon_decay: float = 0.995,
                 buffer_size: int = 10000,
                 seed: int = None):
        """Build the networks, optimizer, replay memory and exploration schedule.

        Args:
            state_dim: Passed to ``QNetwork`` as its first argument; presumably the
                length of a state vector (confirm against ``model.QNetwork``).
            action_dim: Number of discrete actions; also the upper bound used when
                sampling a random action.
            hidden_dim: Passed to ``QNetwork`` as its third argument.
            lr: Learning rate of the RMSprop optimizer.
            gamma: Discount factor applied to the bootstrapped next-state value.
            tau: Blend factor of the soft target update.
            update_every: The target network is soft-updated every this many
                calls to ``update``.
            epsilon_start: Initial exploration probability.
            epsilon_end: Lower bound of the exploration probability.
            epsilon_decay: Multiplicative decay applied to epsilon on every ``act``.
            buffer_size: Passed to ``Memory``; presumably the replay capacity
                (confirm against ``utils.Memory``).
            seed: If given, seeds Python's ``random`` and torch's CPU/CUDA generators.
        """
        if seed is not None:
            random.seed(seed)
            torch.manual_seed(seed)
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
        
        self.gamma = gamma
        self.tau = tau
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.step_count = 0
        self.update_every = update_every
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        
        # Networks: the target starts as an exact copy of the online network and
        # is kept in eval mode because it is never trained directly.
        self.q_network = QNetwork(state_dim, action_dim, hidden_dim)
        self.target_q_network = QNetwork(state_dim, action_dim, hidden_dim)
        self.target_q_network.load_state_dict(self.q_network.state_dict())
        self.target_q_network.eval()
        
        self.optimizer = optim.RMSprop(self.q_network.parameters(), lr=lr)
        self.memory = Memory(buffer_size)
        self.criterion = nn.MSELoss()
        
    def update_epsilon(self):
        """Multiply epsilon by ``epsilon_decay``, never going below ``epsilon_end``."""
        self.epsilon = max(self.epsilon_end, self.epsilon_decay * self.epsilon)  
        
    def act(self, state):
        """Choose an action epsilon-greedily and decay epsilon.

        Args:
            state: A single state, convertible by ``torch.FloatTensor``; a batch
                dimension is added before it is fed to the online network.

        Returns:
            int: Index of the chosen action, in ``[0, action_dim)``.
        """
        self.q_network.eval()
        if random.random() > self.epsilon:
            # Exploit: greedy action under the online network.
            state = torch.FloatTensor(state).unsqueeze(0)
            with torch.no_grad():
                # Call the module (not .forward) so registered hooks still run.
                q_value = self.q_network(state)
            action = q_value.max(1)[1].item()
        else:
            # Explore: uniformly random action.
            action = random.randrange(self.action_dim)
        self.update_epsilon()
        return action
    
    def update(self, batch_size):
        """Run one gradient step on a sampled batch and maybe soft-update the target.

        The TD target is ``reward + gamma * max_a Q_target(next_state, a)``, where
        ``Q_target`` is the target network; using the online network here would
        make the target network dead weight and chase a moving target.

        NOTE(review): ``Memory.sample`` returns no terminal flag, so the
        bootstrap term is always added; confirm episodes are non-terminating or
        extend the memory if terminal transitions must be masked.

        Args:
            batch_size: Number of transitions requested from ``self.memory.sample``.
        """
        self.q_network.train()
        state, action, reward, next_state = self.memory.sample(batch_size)
        state = torch.FloatTensor(state)
        next_state = torch.FloatTensor(next_state)
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        
        # Q(s, a) of the actions that were actually taken.
        q_values = self.q_network(state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        # Bootstrap from the (slow-moving) target network; no gradient flows through it.
        with torch.no_grad():
            next_q_values = self.target_q_network(next_state)
            max_next_q_value = next_q_values.max(1)[0]
            expected_q_value = reward + self.gamma * max_next_q_value
        
        loss = self.criterion(q_value, expected_q_value)
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        self.step_count += 1
        if self.step_count % self.update_every == 0:
            self._soft_update_target()

    def _soft_update_target(self):
        """Blend the online weights into the target: target <- tau*online + (1-tau)*target."""
        with torch.no_grad():
            for target_params, params in zip(self.target_q_network.parameters(), self.q_network.parameters()):
                target_params.copy_(self.tau * params + (1.0 - self.tau) * target_params)
                
    def load_policy(self, idx):
        """Load online-network weights from ``./saved_q/agent_{idx}.pkl``.

        The target network is synchronised with the loaded weights so that any
        further training bootstraps from the loaded policy instead of from the
        random initialisation.

        Args:
            idx: Identifier interpolated into the checkpoint file name.

        Raises:
            FileNotFoundError: If the checkpoint file does not exist.
        """
        filepath = f'./saved_q/agent_{idx}.pkl'
        try:
            # SECURITY: torch.load unpickles the file; only load checkpoints from
            # a trusted source (consider weights_only=True where supported).
            state_dict = torch.load(filepath, map_location=torch.device('cpu'))
        except FileNotFoundError as exc:
            raise FileNotFoundError(f"Error: Policy file not found at {filepath}") from exc
        self.q_network.load_state_dict(state_dict)
        self.target_q_network.load_state_dict(self.q_network.state_dict())
            

# Rebuild the agent with the same layer sizes as the saved checkpoint, then restore its weights.
agent = DQLAgent(state_dim=16, action_dim=4, hidden_dim=128)
agent.load_policy(idx="i_cr30_101")

Policy loaded successfully from ./saved_q/agent_i_cr30_101.pkl


In [5]:
# Read the saved Q-network weights straight from disk, mapping every tensor to the CPU.
state_dict = torch.load(
    "./saved_q/agent_i_cr30_101.pkl",
    map_location="cpu",
)

In [16]:
# Show only the shape of the first layer's weights rather than dumping the whole
# tensor; this is what the recorded output (torch.Size([128, 16])) displays.
state_dict["fc1.weight"].shape

torch.Size([128, 16])