# Notebook settings

In [None]:
# Install the system tool and Python packages this notebook uses (shell commands run via `!`).
# NOTE(review): none of the versions are pinned, so a fresh run may pull different releases — consider pinning.
!apt-get -qq install neofetch
!pip --quiet install wandb -qU
!pip --quiet install profilegraph

In [13]:
# Print the GPU status (nvidia-smi) and a host-system summary (neofetch) for this session.
!nvidia-smi
!neofetch

Sun Feb 19 11:27:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8     9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Imports 

In [4]:
# https://dvelopery0115.github.io/2021/08/01/Introduction_to_W&B.html
# https://www.tensorflow.org/guide/tf_numpy # numpy report
import random
import copy
import pickle
import math
import time
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pprint

from collections import namedtuple

# loading profiler to check usage
%load_ext profilegraph

def asHHMMSS(s):
    """Format a duration in seconds as an ``'<h>h:<m>m:<s>s'`` string.

    Whole minutes and hours are obtained by flooring; every part is rendered
    with ``%d``, so a fractional number of seconds is truncated in the output.
    """
    whole_minutes = math.floor(s / 60)
    hours = math.floor(whole_minutes / 60)
    leftover_minutes = whole_minutes - hours * 60
    leftover_seconds = s - whole_minutes * 60
    return '%dh:%dm:%ds' % (hours, leftover_minutes, leftover_seconds)


print(f"Timing: {asHHMMSS(162)}")
N_agents = 2

# classic_online - simple
# setting seed (numpy and the stdlib `random` module are seeded with the same value)
SEED= 1111
np.random.seed(SEED)
random.seed(SEED)

# Log in to your W&B account
import wandb
import os
from getpass import getpass

# SECURITY: never commit the W&B API key to the notebook. The key is taken from
# the environment when it is already set, otherwise it is asked for
# interactively (the prompt does not echo the input). Any key that was
# previously stored in this cell must be treated as leaked and rotated in the
# W&B account settings.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("W&B API key: ")
# os.environ["WANDB_MODE"] = "online"
os.environ["WANDB_NOTEBOOK_NAME"] = "DQN model"


# 25 integer seeds in [0, 1000) drawn from the numpy RNG seeded above.
SEEDS = np.random.randint(1000, size=25)
SEEDS # 788 

Timing: 0h:2m:42s


array([412, 311, 741, 337, 396, 674, 692, 152, 918, 788, 907, 270, 776,
       486, 908, 494, 754, 854, 392, 105, 958, 317, 691, 554,  76])

In [None]:
from utils import foo
from models import DQN

In [5]:
# set up matplotlib
# `display` is only imported when the active matplotlib backend name contains 'inline'.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Switch matplotlib to interactive mode.
plt.ion()

def make_plots(step_values, ylabel="Rewards", title='Player 1',
               p_last_mean=False):
    """Plot a per-step series together with its 100-step rolling mean.

    The plot is drawn into matplotlib figure number 2, which is cleared on
    every call.

    Args:
        step_values: sequence of numbers, one value per step.
        ylabel: label of the y axis.
        title: prefix of the figure title.
        p_last_mean: when True, the last rolling-mean value is appended to the
            title. If fewer than 100 values were given there is no rolling
            mean yet, and the plain title is used instead of failing.
    """
    # Imported explicitly so the formatter below does not rely on pyplot
    # having loaded the `matplotlib.ticker` submodule as a side effect.
    from matplotlib import ticker

    plt.figure(2, figsize=(14, 9))
    plt.clf()
    durations_t = np.array(step_values, dtype=float)

    plt.xlabel('Steps')
    plt.ylabel(ylabel)
    plt.grid(True, alpha=.45)
    plt.plot(durations_t, alpha=.6)

    # Rolling mean over windows of n_EPI consecutive values. The curve is
    # drawn starting at x=0, i.e. its i-th point averages values i..i+n_EPI-1.
    n_EPI = 100
    means = None
    if len(durations_t) >= n_EPI:
        means = [np.mean(durations_t[i-n_EPI:i]) for i in range(n_EPI, len(durations_t) + 1)]
        plt.plot(means)

    # `means` only exists once a full window is available; guard the title so
    # short series no longer raise UnboundLocalError.
    if p_last_mean and means is not None:
        plt.title(f'{title} training... stabilizes at {means[-1]:.4f}')
    else:
        plt.title(f'{title} training...')

    ax = plt.gca()
    # Thousands separator on the step axis.
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))

###### Functions for the Agents

def replay_classic_reward(action):
    """Return the per-agent reward for the chosen price indices.

    `action` indexes into `actions_space` to obtain the prices. Each agent's
    demand share is exp((QUALITY - price) / HORIZON) divided by the sum of
    those terms plus 1 (the original inline note relates this constant to
    exp(a0 / HORIZON)). The reward is (price - MARGIN_COST) times that share.
    """
    chosen_prices = actions_space[action]
    attraction = np.exp((QUALITY - chosen_prices) / HORIZON)
    shares = attraction / (attraction.sum() + 1.)
    return (chosen_prices - MARGIN_COST) * shares


# Discrete price grid; an action index selects one of these prices.
actions_space = np.arange(1.43, 2.0, 0.04) # 15 actions
# One entry per agent.
# NOTE(review): N_agents is read here but only re-assigned further down in this
# cell, so these two lines rely on the value set in the earlier imports cell.
QUALITY = np.full((N_agents), 2) #np.ones(2) * 2
MARGIN_COST = np.ones(N_agents)
# Divisor of (QUALITY - price) inside the exponential of replay_classic_reward.
HORIZON = 1 / 4
# NOTE(review): a0 only appears in a commented-out expression of
# replay_classic_reward (exp(a0 / HORIZON)); the "initial action" label may be
# stale — confirm.
a0 = 0  # initial action
N_actions = actions_space.size
N_agents = 2

# Hard-coded per-action values, passed as `tuned_q` when the agents are built.
# NOTE(review): how these numbers were computed is not shown in this notebook — confirm.
reward_sum = np.array([5.58232796, 5.78802889, 5.92606135, 5.99644584, 6.00067233,
                       5.94172477, 5.82402394, 5.65328833, 5.43631956, 5.18072579,
                       4.89460298, 4.58619785, 4.26357789, 3.93433261, 3.60532586])


print(f'Action space size: {N_actions}') # 15

Action space size: 15


# Agent

## DQN 

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd 
import torch.nn.functional as F
import torchvision.transforms as T
from torch.nn.utils import clip_grad_norm_

from itertools import count
from collections import namedtuple, deque


# Discount factor multiplied with the next-state value in the TD target.
DRATE = 0.95
# NOTE(review): in the visible notebook MAX_NORM is only referenced from
# commented-out gradient-clipping code.
MAX_NORM = 1.0
# TARGET_UPDATE = 10000

# Number of transitions sampled from the replay memory per training step.
# BATCH_SIZE = 256
BATCH_SIZE = 128
# BATCH_SIZE = 64
# LR = 1e-4
# LR = 4e-4 # 0.0004


# Run on the GPU whenever CUDA is available.
# USE_CUDA = False
USE_CUDA = torch.cuda.is_available()
# Helper: wraps its arguments in autograd.Variable and, when USE_CUDA is set,
# moves the result to the GPU with .cuda().
Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)

device = torch.device("cuda" if USE_CUDA else "cpu")
print(device)
torch.cuda.empty_cache() # cleaning CUDA

class DQN_agent(nn.Module):
    """Fully connected Q-network with an epsilon-greedy action policy.

    The network maps a state vector of length ``len(N_space)`` through two
    hidden layers (512 and 256 units, ReLU) to one Q-value per action.

    Args:
        N_space: sequence whose *length* is the state dimension, e.g.
            ``(15, 15)`` gives a 2-feature state.
        N_actions: number of discrete actions (size of the output layer).
        tuned_q: accepted for backward compatibility; currently unused.
        agent_name: label shown in the setup message.
    """

    def __init__(self, N_space, N_actions, tuned_q, agent_name="DQN"):
        super(DQN_agent, self).__init__()

        self.epsilon = 1.  # initial exploration prob
        self.ep_end = 0.01 # exploration floor reached after annealing
        self.decay_rate = .00001   # 1e-5, rate of the exponential schedule
        self.action_size = N_actions
        self.state_size = len(N_space) # (15, 15) -> 2 input features

        self.fc1 = nn.Linear(self.state_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, self.action_size)
        # Registered but not applied in forward(); kept so that the keys of
        # state_dict() stay the same as before.
        self.bn1 = nn.BatchNorm1d(num_features=512)

        print(f"\n[ DQN pyTorch's {agent_name.title()} AGENT SETUP]")

    def forward(self, input):
        """Return the Q-values, shape (batch, action_size), for a batch of states."""
        x = F.relu(self.fc1(input))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def select_action(self, state):
        """Pick an action index for `state` (a numpy array) epsilon-greedily.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the largest predicted Q-value. The greedy
        branch switches the module to eval mode and does not switch it back.
        """
        # Exploration VS Exploitation
        if random.random() > self.epsilon:
            self.eval()
            with torch.no_grad():
                # Put the input on whatever device the network lives on,
                # instead of relying on a module-level helper.
                target_device = self.fc1.weight.device
                state_t = torch.from_numpy(state.astype(np.float32)).unsqueeze(0).to(target_device)
                return self.forward(state_t).argmax().item()
        return random.randrange(self.action_size)

    def update_exp_epsilon(self, step):
        '''Exponential update of the e-greedy rate.

        epsilon = ep_end + (1 - ep_end) * exp(-decay_rate * step)
        '''
        self.epsilon = self.ep_end + (1. - self.ep_end) * \
                            np.exp(-self.decay_rate * step )

    def update_lin_epsilon(self, step, total_steps=None):
        '''Linear update of the e-greedy rate.

        Epsilon goes linearly from 1.0 to ``self.ep_end`` over the first 10%
        of ``total_steps`` and stays at ``self.ep_end`` afterwards.

        Args:
            step: current training step.
            total_steps: planned number of training steps. When omitted, a
                module-level ``STEPS`` is used if one is defined.

        Raises:
            ValueError: if no positive total number of steps is available.
        '''
        if total_steps is None:
            # Backward compatibility with the former reliance on a global.
            total_steps = globals().get("STEPS")
        if total_steps is None or total_steps <= 0:
            raise ValueError(
                "update_lin_epsilon needs a positive total_steps "
                "(pass it explicitly or define a module-level STEPS)")

        start = 1.
        end = self.ep_end   # minimum epsilon, shared with the exponential schedule
        end_fraction = 0.1  # fraction of total steps used to anneal epsilon

        progress = step / total_steps
        if progress > end_fraction:
            self.epsilon = end
        else:
            self.epsilon = start + progress * (end - start) / end_fraction


cuda


# RUN

In [None]:
# One stored environment step.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-capacity FIFO buffer of `Transition` tuples for experience replay.

    Once `capacity` transitions are stored, pushing a new one discards the
    oldest (behaviour of ``deque(maxlen=capacity)``).
    """

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, state, action, next_state, reward):
        """Save a transition"""
        self.memory.append(Transition(state, action, next_state, reward))

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random.

        Raises ValueError (from ``random.sample``) when fewer than
        `batch_size` transitions are stored.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored (not the capacity)."""
        # Previously len() was taken of the integer `maxlen`, which raises
        # TypeError; the buffer itself is what must be measured.
        return len(self.memory)

def train_step_td_loss_batch(idx, agent):
    '''One optimisation step on a replay mini-batch using the Bellman target.

    Uses the module-level `agents`, `target_agents`, `optimizers`, `memory`,
    `criterion`, `BATCH_SIZE` and `DRATE`.

    Args:
        idx: position of this agent inside each stored action / reward vector
            (the second axis of the batched arrays is indexed with it).
        agent: key of the agent in `agents`, `target_agents` and `optimizers`.

    Returns:
        Tuple ``(loss, mean_q)``: the loss value and the mean of the Q-values
        predicted for the sampled (state, action) pairs, both Python floats.
    '''
    agents[agent].train() # set to train due to the batchNorm
    transitions = memory.sample(BATCH_SIZE)
    # transposing the batch: list of Transitions -> Transition of tuples
    batch = Transition(*zip(*transitions))

    states_batch = Variable(torch.from_numpy(np.array(batch.state, dtype=np.float32)))
    next_states_batch = Variable(torch.from_numpy(np.array(batch.next_state, dtype=np.float32)))
    # Explicit dtypes: gather() needs int64 indices (the default integer type
    # is platform dependent), and float32 rewards keep the TD target in the
    # same precision as the network output instead of promoting to float64.
    action_batch = Variable(torch.from_numpy(np.array(batch.action, dtype=np.int64)))
    reward_batch = Variable(torch.from_numpy(np.array(batch.reward, dtype=np.float32)))

    # Q(s_t, a): pick, per row, the Q-value of the action this agent took.
    own_actions = action_batch[:, idx].unsqueeze(1)
    state_action_values = agents[agent](states_batch).gather(dim=1, index=own_actions)

    with torch.no_grad():
        # V(s_{t+1}): largest Q-value of the target network for each next state.
        next_state_values = target_agents[agent](next_states_batch).amax(dim=1)

        # Expected Q values (TD target)
        expected_state_action_values = reward_batch[:, idx] + (DRATE * next_state_values)

    optimizers[agent].zero_grad()  # reset gradients from the previous step
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(dim=1))
    loss.backward()
    # No gradient clipping is applied here.
    optimizers[agent].step()

    return loss.item(), state_action_values.mean().item()

## Hard target-network update

In [None]:
# Compute Huber loss
criterion = nn.SmoothL1Loss()
TARGET_UPDATE = 5   # episodes between hard copies of the online weights into the target networks
AGENTS = ['player_1', 'player_2']
LR = 0.001
MEMORY_CAPACITY = 100000   # maximum number of transitions kept in the replay memory
N_EPISODES = 4000
STEPS_PER_EPISODE = 500
LOG_EVERY = 25      # W&B logging period, in training steps


def test(agents):
    """Evaluate the agents greedily and return a joint reward score.

    Sets ``epsilon = -1`` on every agent (so `select_action` always takes the
    greedy branch) and puts them in eval mode; neither is restored afterwards.
    Plays 20 episodes of 500 steps from random initial states and returns
    ``r1 + r2 - abs(r1 - r2)`` where r1, r2 are the two mean rewards, i.e.
    twice the smaller of the two means.
    """
    for agent in AGENTS:
        agents[agent].epsilon = -1
        agents[agent].eval()
    EPISODES = 20
    STEPS = 500
    rew1, rew2 = [], []
    for ep in range(EPISODES):

        state = np.random.randint(0, N_actions, size=N_agents)
        for _ in range(STEPS):
            action = np.zeros(N_agents, dtype=int)
            for idx, agent in enumerate(AGENTS):
                action[idx] = agents[agent].select_action(state)

            reward = replay_classic_reward(action)
            next_state = action

            # gathering episode rewards
            rew1.append(reward[0])
            rew2.append(reward[1])
            state = next_state

    r1 = np.mean(rew1)
    r2 = np.mean(rew2)

    return r1 + r2 - abs(r1 - r2)


# for ii, SEED in enumerate([412, 311, 741, 337, 396], 1):
for ii, SEED in enumerate([412], 0):

    print(f"\n\n\tSession: {ii}, seed: {SEED}, LR: {LR}\n")

    # Seed every RNG *before* the networks are built: the initial weights are
    # drawn from torch's generator, so seeding only afterwards leaves the
    # initialisation non-reproducible.
    np.random.seed(SEED)
    random.seed(SEED)
    torch.manual_seed(SEED)

    torch.cuda.empty_cache()

    memory = ReplayMemory(capacity=MEMORY_CAPACITY)

    agents = {}
    target_agents = {}
    optimizers = {}
    loss_hist = {}
    rewards = {}
    q_val_hist = {}

    # setting agents
    for agent in AGENTS: # state(actions p1, actions p2), possible_actions to take

        # online network + its optimizer
        agents[agent] = DQN_agent(N_space=(N_actions, N_actions), N_actions=N_actions, tuned_q=reward_sum, agent_name=agent).to(device)
        optimizers[agent] = optim.Adam(agents[agent].parameters(), lr=LR)

        ## creating target networks
        target_agents[agent] = DQN_agent(N_space=(N_actions, N_actions), N_actions=N_actions, tuned_q=reward_sum, agent_name=f"target_{agent}").to(device)
        target_agents[agent].load_state_dict( agents[agent].state_dict() ) # copying values from learning model
        # Q_target parameters are frozen.
        for p in target_agents[agent].parameters():
            p.requires_grad = False
        target_agents[agent].eval()

        loss_hist[agent] = []
        rewards[agent] = []
        q_val_hist[agent] = []

    # Every agent uses the same optimizer class; take the name from the first.
    optim_name = type(optimizers[AGENTS[0]]).__name__

    config = {
            'discount rate': DRATE,
            'exploration type': 'exponential epsilon',
            'decay rate': agents[AGENTS[0]].decay_rate,
            'seed': SEED,

            'q-values tuned': False,
            # Replay capacity. len(memory) is the number of transitions stored
            # so far, which is not what should be recorded here.
            'exp replay': MEMORY_CAPACITY,
            'batch sz': BATCH_SIZE,
            'target upd': TARGET_UPDATE,
            'batchNorm': False,
            'clipping' : False,
            'optim': optim_name,
        }

    NAME = f"hard{ii}"
    run = wandb.init(project='Stability', name=NAME, notes="DQN with expReplay -- hard upd"
                    , group=optim_name
                    , job_type = f"{LR}"
                    , tags=['relu', f'{optim_name} {LR}', 'DQN', 'exp replay']
                    , config=config, reinit=True
                    )

    wandb.watch(agents['player_1'], log='gradients')
    wandb.watch(agents['player_2'], log='gradients')

    # TRAINING LOOP
    # Re-seed right before training so the exploration / sampling streams do
    # not depend on whatever the setup above consumed.
    np.random.seed(SEED)
    random.seed(SEED)
    torch.manual_seed(SEED)

    state_hist = []
    step = 0

    start_training = time.time()
    for epi in range(N_EPISODES):

        state = np.random.randint(0, N_actions, size=N_agents)
        for _ in range(STEPS_PER_EPISODE):

            action = np.zeros(N_agents, dtype=int)
            for idx, agent in enumerate(AGENTS):
                action[idx] = agents[agent].select_action(state)

            # play both actions on the given state and get rewards for p1&p2
            reward = replay_classic_reward(action)
            # the next state is the pair of actions that was just played
            next_state = action

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Learning (and the step counter) only start once the memory
            # holds more than one batch of transitions.
            if len(memory) > BATCH_SIZE:

                for idx, agent in enumerate(AGENTS):
                    # update agent
                    loss, q_val = train_step_td_loss_batch(idx, agent)

                    # update epsilon
                    agents[agent].update_exp_epsilon( step ) # exponential

                    # logging to wandb every LOG_EVERY steps
                    if step % LOG_EVERY == 0:
                        wandb.log({
                                f'q-value_{agent}': q_val,
                                f'loss_{agent}': loss,
                                f'reward_{agent}': reward[idx],
                                f'epsilon_{agent}': agents[agent].epsilon,
                                },
                                step= step
                            )
                step +=1 # updating steps counter
            state = next_state

        # Update the target network, copying all weights and biases to target DQN
        if epi % TARGET_UPDATE == 0:
            for agent in AGENTS:
                target_agents[agent].load_state_dict(agents[agent].state_dict())

    mean_both_reward = test(agents)
    wandb.log({'mean_reward_test': mean_both_reward})

    time_taken = asHHMMSS(time.time() - start_training)
    print(f'\n\t[TOTAL TRAINING TOOK {time_taken} ] \n')
    # wandb.config.update({'time taken': time_taken})

    wandb.finish() # finishing session