#Import dataset and build train/test loader + train/test function

In [None]:
#!pip install stable_baselines3

In [None]:
import gym
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms, utils, models,datasets
import torchvision.transforms as transforms
from torch.utils.data import Subset
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal

import numpy as np


import math
import random

import time
from matplotlib import pyplot as plt
from copy import deepcopy
import itertools

from __future__ import print_function
import os
import csv
import math
from typing import Optional, Union

from gym import logger, spaces
from gym.envs.classic_control import utils
from gym.error import DependencyNotInstalled

##Train and test function

In [None]:
def train(model, device, train_loader, optimizer, epoch, loss_fn):
    """Run one optimisation pass over ``train_loader``.

    Puts ``model`` in training mode, then for every batch moves the inputs
    and targets to ``device``, computes ``loss_fn(model(inputs), targets)``,
    back-propagates and applies one ``optimizer`` step. A progress line is
    printed for every tenth batch (batch indices 0, 10, 20, ...); ``epoch``
    is only used in that progress line. Returns nothing.
    """
    progress_template = 'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        batch_loss = loss_fn(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()

        # Only every tenth batch is reported.
        if step % 10 != 0:
            continue
        samples_seen = step * len(inputs)
        percent_done = 100. * step / len(train_loader)
        print(progress_template.format(
            epoch, samples_seen, len(train_loader.dataset), percent_done,
            batch_loss.item()))
            


def test(model, device, test_loader, loss_fn):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            #test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    #test_loss /= len(test_loader.dataset)

    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    return correct / len(test_loader.dataset)

##Import dataset and build train/test loaders

In [None]:
# Batch-size settings for the train and test sides.
# NOTE(review): presumably meant as DataLoader keyword arguments; nothing in
# the visible code reads these two dicts.
train_kwargs = dict(batch_size=64)
test_kwargs = dict(batch_size=64)

In [None]:
# Training-time preprocessing: pad by 4 pixels, flip horizontally at random,
# take a random 32x32 crop, then convert the image to a tensor.
transform = transforms.Compose([
    transforms.Pad(4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32),
    transforms.ToTensor(),
])

# CIFAR-10 training split, downloaded on demand, with the preprocessing above.
train_data = torchvision.datasets.CIFAR10(
    root='../../data/', train=True, transform=transform, download=True)

# CIFAR-10 test split: tensor conversion only, and no download flag is passed.
test_data = torchvision.datasets.CIFAR10(
    root='../../data/', train=False, transform=transforms.ToTensor())

# Keep only the first tenth of each split.
train_dataset = Subset(train_data, indices=range(len(train_data) // 10))
test_dataset = Subset(test_data, indices=range(len(test_data) // 10))

Files already downloaded and verified


In [None]:
# Report the dataset sizes before and after sub-sampling, one line each.
print(
    f'Size training set before subset:{len(train_data)}',
    f'Size test set before subset:{len(test_data)}',
    f'Training set size :{len(train_dataset)}',
    f'Test set size :{len(test_dataset)}',
    sep='\n',
)

Size training set before subset:50000
Size test set before subset:10000
Training set size :5000
Test set size :1000


## Device selection

In [None]:
# Device selection: first CUDA device when one is available, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


##Class MetaRLEnv

In [None]:
class MetaRLEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
    """
    ### Description

    Gym environment in which the agent schedules the learning rate of an
    inner image classifier. Every `step` trains a ResNet-18 for one pass over
    `train_dataset` with Adam, evaluates it on `test_dataset` and exposes the
    resulting accuracy. `reset` starts a new episode with a freshly
    initialised classifier and optimizer (Adam, lr=1e-3).

    ### Action Space

    The action is a `ndarray` with shape `(1,)`. Its first entry is the
    proposed learning rate; it is clipped to `[1e-5, 1e-2]` before use.

    | Num | Action | Min  | Max  |
    |-----|--------|------|------|
    | 0   | lr     | 1e-5 | 1e-2 |

    The proposed learning rate is written into the optimizer only when the
    accuracy decreased between the two previous evaluations, or when the
    current epoch counter is a multiple of 3.

    ### Observation Space

    The observation is a `ndarray` with shape `(3,)`:

    | Num | Observation               | Min  | Max  |
    |-----|---------------------------|------|------|
    | 0   | last proposed (clipped) lr| 1e-5 | 1e-2 |
    | 1   | accuracy on test_dataset  | 0    | 1    |
    | 2   | batch size                | 20   | 200  |

    The batch size is currently fixed to 64; the data loaders are rebuilt
    from it at every step.

    ### Rewards

    Reward = difference of accuracy with the previous step:
    r = new_accuracy - prev_accuracy

    ### Starting State

    Init state = [1e-3, 0, 64]

    ### Episode End

    The environment itself never ends an episode: `step` always returns
    `False` for both the terminated and the truncated flag. The caller is
    responsible for bounding the number of steps per episode.

    ### Side effects

    Every `step` appends one row to `rewards.csv`, and every `reset` after
    the first episode appends the previous episode's history to
    `output.csv`. Both paths are relative to the current working directory.
    """

    def __init__(self, train_dataset, test_dataset, device):
        """Store the datasets and device and define the Gym spaces.

        The inner model and optimizer are NOT created here; `reset` must be
        called before the first `step`.
        """
        super(MetaRLEnv, self).__init__()
        self.max_lr = 1e-2
        self.min_lr = 1e-5
        self.min_batch = 20
        self.max_batch = 200

        # Inner-model training setup.
        self.device = device
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.loss_fn = nn.CrossEntropyLoss()

        # Bookkeeping.
        self.prev_state = None
        self.lr_list = []
        self.batch_size_list = []
        self.episode_number = 0
        self.epoch = 1

        # Observation bounds, ordered as [lr, accuracy, batch_size].
        high = np.array([self.max_lr, 1, self.max_batch], dtype=np.float32)
        low = np.array([self.min_lr, 0, self.min_batch], dtype=np.float32)

        # Only the learning rate is an action for now; the batch-size bounds
        # above are not part of the action space.
        self.action_space = spaces.Box(low=self.min_lr, high=self.max_lr, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(low, high, dtype=np.float32)

        self.history_params = {"lr": [1e-3], "accuracy": [0], "batch_size": [64]}

    def step(self, lr_pred):
        """Train the inner model for one epoch and return the new observation.

        Args:
            lr_pred: array-like whose first element is the proposed learning
                rate; it is clipped to ``[min_lr, max_lr]``.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where the
            two flags are always ``False`` and ``info`` is an empty dict.
        """
        # Decide from the two most recent evaluations whether accuracy fell.
        # This must happen BEFORE prev_state is overwritten below: comparing
        # afterwards would compare the current state with itself and the
        # condition could never be true.
        accuracy_dropped = (
            self.prev_state is not None
            and self.prev_state[1] > self.state[1]
        )
        self.prev_state = self.state

        # NOTE(review): the PPO actor in this file ends with Tanh and samples
        # with std 0.5, so raw actions are roughly in [-1, 1]; after clipping
        # to [1e-5, 1e-2] they will mostly sit on one of the two bounds.
        lr_pred = np.clip(lr_pred, self.min_lr, self.max_lr)[0]
        self.last_lr = lr_pred
        self.batch_size = 64

        # Loaders are rebuilt every step from the current batch size.
        train_loader = torch.utils.data.DataLoader(dataset=self.train_dataset,
                                                   batch_size=self.batch_size,
                                                   shuffle=True)
        test_loader = torch.utils.data.DataLoader(dataset=self.test_dataset,
                                                  batch_size=self.batch_size,
                                                  shuffle=False)

        # Scheduling step: apply the proposed lr only if the accuracy dropped
        # or on every third epoch.
        if accuracy_dropped or self.epoch % 3 == 0:
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = lr_pred
            self.lr_list.append(lr_pred)

        # Environment transition: one training pass, then evaluation.
        train(self.model, self.device, train_loader, self.optimizer, self.epoch, self.loss_fn)
        new_acc = test(self.model, self.device, test_loader, self.loss_fn)

        # NOTE(review): state[0] records the proposed lr even on steps where
        # it was not written into the optimizer.
        self.state = np.array([lr_pred, new_acc, self.batch_size], dtype=np.float32)
        self.epoch += 1

        print(f"Episode:{self.episode_number} Epoch:{self.epoch} : current lr:{self.state[0]} and previous:{self.prev_state[0]} and new_acc:{self.state[1]} and previous :{self.prev_state[1]}")

        reward = self.reward_function("acc_diff")
        self.save_reward(reward, self.epoch)
        self.write_history_params()

        return np.array(self.state, dtype=np.float32), reward, False, False, {}

    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
        """Start a new episode with a fresh inner model and optimizer.

        Returns ``(observation, info)`` with the fixed initial state
        ``[1e-3, 0, 64]`` and an empty info dict. If at least one episode has
        already been started, its history is first flushed to ``output.csv``.
        """
        super().reset(seed=seed)

        # Deterministic (non-random) initial state.
        self.state = np.array([1e-3, 0, 64])
        # No previous evaluation exists yet in the new episode.
        self.prev_state = None
        self.last_lr = None

        # Fresh classifier and optimizer, placed on the device given to
        # __init__ rather than on a module-level global.
        self.model = models.resnet18(pretrained=False).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)

        # Flush the previous episode's history, if training already started.
        if self.episode_number > 0:
            self.output_data()

        # Re-initialise the history buffers for the new episode.
        self.history_params = {"lr": [self.state[0]], "accuracy": [self.state[1]], "batch_size": [64]}
        self.episode_number += 1
        self.epoch = 1

        return np.array(self.state, dtype=np.float32), {}

    def reward_function(self, reward_fn):
        '''
        Take as input the choice of reward function used and return the reward computed.

        Both supported names, "cartpole_like" and "acc_diff", currently give
        the accuracy difference between the current and the previous state.

        Raises:
            ValueError: if ``reward_fn`` is not a supported name.
        '''
        if reward_fn in ("cartpole_like", "acc_diff"):
            return self.state[1] - self.prev_state[1]
        raise ValueError(f"Unknown reward function: {reward_fn!r}")

    def write_history_params(self):
        """Append the current lr, accuracy and batch size to the history."""
        self.history_params["lr"].append(self.state[0])
        self.history_params["accuracy"].append(self.state[1])
        self.history_params["batch_size"].append(self.batch_size)

    @staticmethod
    def _append_csv_rows(name, header, rows):
        """Append ``rows`` to the ';'-separated file ``name``.

        The header row is written first only when the file does not exist yet.
        """
        write_header = not os.path.exists(name)
        with open(name, "a") as csv_file:
            spam_writer = csv.writer(csv_file, delimiter=";", lineterminator="\n")
            if write_header:
                spam_writer.writerow(header)
            spam_writer.writerows(rows)

    def save_reward(self, reward, epoch):
        '''
        Outputs a file with the reward every epoch of the learning episode
        '''
        self._append_csv_rows(
            "rewards.csv",
            ["Episode", "Epoch", "Reward"],
            [[self.episode_number, epoch, reward]],
        )

    def output_data(self):
        '''
        Output .csv file with all the training data
        '''
        history = self.history_params
        rows = [
            [self.episode_number, epoch, lr, batch_size, accuracy]
            for epoch, (lr, batch_size, accuracy) in enumerate(
                zip(history["lr"], history["batch_size"], history["accuracy"]))
        ]
        self._append_csv_rows(
            "output.csv",
            ["Episode", "Epoch", "lr", "batch_size", "accuracy"],
            rows,
        )

    


#Import v2 PPO

In [None]:
"""
Proximal Policy Optimization (PPO) version 3
----------------------------
1 actor and 1 critic
This one is basically the same as PPO_continuous_v2 with slightly different coding style.
* It uses batch of samples for update (which can be more than an episode).
* It merge the losses of critic and actor into one update manner, using a single optimizer 
instead of one for actor and one for critic.
* It uses the min of clipping value loss and non-clipping value loss.
* It additionally has a policy entropy bonus in loss (line 146).
* It uses MultivariateNormal for policy distribution instead of Normal.
To run
------
python ***.py
"""
import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal
import gym
import numpy as np

##Class Memory

In [None]:
class Memory:
    """Rollout buffer made of five parallel lists.

    ``actions``, ``states``, ``logprobs``, ``rewards`` and ``is_terminals``
    are filled by the caller, one entry per environment step.
    """

    def __init__(self):
        self.actions, self.states, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear_memory(self):
        """Empty every list in place (existing references stay valid)."""
        for buffer in (self.actions, self.states, self.logprobs,
                       self.rewards, self.is_terminals):
            buffer.clear()

##Class ActorCritic

In [None]:
class ActorCritic(nn.Module):
    """Actor and critic MLPs with a fixed-variance Gaussian policy.

    The actor maps a state to an action mean in [-1, 1] (final Tanh); the
    critic maps a state to a scalar value. Actions are drawn from a
    MultivariateNormal with diagonal covariance ``action_std ** 2``.

    Args:
        state_dim: size of the state vector.
        action_dim: size of the action vector.
        action_std: constant standard deviation of every action component.
    """

    def __init__(self, state_dim, action_dim, action_std):
        super(ActorCritic, self).__init__()
        # action mean range -1 to 1
        self.actor = nn.Sequential(
                nn.Linear(state_dim, 64),
                nn.Tanh(),
                nn.Linear(64, 32),
                nn.Tanh(),
                nn.Linear(32, action_dim),
                nn.Tanh()
                )
        # critic
        self.critic = nn.Sequential(
                nn.Linear(state_dim, 64),
                nn.Tanh(),
                nn.Linear(64, 32),
                nn.Tanh(),
                nn.Linear(32, 1)
                )
        # Registered as a buffer so it follows the module through `.to(...)`
        # instead of depending on a module-level `device`. It is
        # non-persistent so the state_dict keys stay exactly as before.
        self.register_buffer(
            "action_var",
            torch.full((action_dim,), action_std * action_std),
            persistent=False,
        )

    def forward(self):
        raise NotImplementedError

    def act(self, state, memory):
        """Sample an action for ``state`` and record the transition.

        Appends ``state``, the sampled action and its log-probability to
        ``memory.states``, ``memory.actions`` and ``memory.logprobs``.
        Returns the sampled action, detached from the graph.
        """
        action_mean = self.actor(state)
        cov_mat = torch.diag(self.action_var)

        dist = MultivariateNormal(action_mean, cov_mat)
        action = dist.sample()
        action_logprob = dist.log_prob(action)

        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(action_logprob)

        return action.detach()

    def evaluate(self, state, action):
        """Score a batch of stored actions under the current policy.

        Returns ``(log_probs, state_values, entropy)``. ``log_probs`` and
        ``entropy`` hold one value per sample; ``state_values`` is the critic
        output with size-1 dimensions squeezed away.
        """
        action_dim = self.action_var.shape[0]
        # Keep an explicit (batch, action_dim) layout for means and actions.
        # Squeezing here would, for action_dim == 1, turn the batch into ONE
        # batch-sized Gaussian and collapse log_prob / entropy to a single
        # joint scalar instead of per-sample values.
        action_mean = self.actor(state).reshape(-1, action_dim)
        action = action.reshape(-1, action_dim)

        action_var = self.action_var.expand_as(action_mean)
        cov_mat = torch.diag_embed(action_var)

        dist = MultivariateNormal(action_mean, cov_mat)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_value = self.critic(state)

        return action_logprobs, torch.squeeze(state_value), dist_entropy

##Class PPO

In [None]:
class PPO:
    """Clipped-surrogate PPO with a shared optimizer for actor and critic.

    Two ``ActorCritic`` networks are kept: ``policy`` is trained, while
    ``policy_old`` is a frozen copy used to collect rollouts and is
    overwritten with the trained weights at the end of every ``update``.

    Args:
        state_dim, action_dim, action_std: forwarded to ``ActorCritic``.
        lr, betas: Adam settings for ``policy``.
        gamma: discount factor for the Monte Carlo returns.
        K_epochs: number of gradient steps per ``update`` call.
        eps_clip: clipping range of the probability ratio.
    """

    def __init__(self, state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)

        self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state, memory):
        """Sample an action from ``policy_old`` and log it into ``memory``.

        ``state`` is a NumPy array; the action is returned as a flat NumPy
        array.
        """
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        # policy_old is never optimised directly (only policy's parameters
        # are given to the optimizer), so no autograd graph is needed here.
        with torch.no_grad():
            action = self.policy_old.act(state, memory)
        return action.detach().cpu().numpy().flatten()

    def update(self, memory):
        """Run ``K_epochs`` PPO gradient steps on the transitions in ``memory``.

        Returns are discounted Monte Carlo sums that restart at every entry
        flagged in ``memory.is_terminals``; they are normalised before use.
        """
        # Monte Carlo estimate of rewards, accumulated back-to-front. The
        # list is appended to and reversed once, rather than inserting at
        # the front on every iteration.
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        # Normalizing the rewards, then casting to float32 in place on the
        # current device (no round trip through the CPU).
        rewards = torch.tensor(returns).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
        rewards = rewards.float()

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(memory.states).to(device)).detach()
        old_actions = torch.squeeze(torch.stack(memory.actions).to(device)).detach()
        old_logprobs = torch.squeeze(torch.stack(memory.logprobs)).to(device).detach()

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values :
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta__old):
            ratios = torch.exp(logprobs - old_logprobs)

            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
            # Clipped policy loss + 0.5 * value MSE - 0.01 * entropy bonus.
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())

#Main Training Loop

In [None]:
if __name__ == '__main__':
    # Meta-training loop: a PPO agent proposes learning rates for the inner
    # classifier wrapped by MetaRLEnv.

    ############## Hyperparameters ##############
    env_name = "MetaModel"
    solved_reward = 300              # NOTE(review): defined but never checked below
    log_interval = 1     #init 20    # print avg reward in the interval
    max_episodes = 100   #init 10000 # max training episodes
    inner_epochs = 10    #init 150   # max timesteps in one episode

    update_timestep = 30 #init 500         # update policy every n timesteps=epochs
    action_std = 0.5            # constant std for action distribution (Multivariate Normal)
    K_epochs = 40 #init80               # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    gamma = 0.95 #init 0.99                # discount factor

    meta_lr = 0.0003                 # parameters for Adam optimizer
    betas = (0.9, 0.999)

    random_seed = None
    #############################################

    # creating environment
    env = MetaRLEnv(train_dataset, test_dataset, device)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    # `is not None` so that a seed of 0 is honoured as well.
    if random_seed is not None:
        print("Random Seed: {}".format(random_seed))
        torch.manual_seed(random_seed)
        env.reset(seed=random_seed)
        np.random.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, action_std, meta_lr, betas, gamma, K_epochs, eps_clip)
    print(meta_lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    time_step = 0

    # training loop
    for i_episode in range(1, max_episodes+1):
        state, _ = env.reset()
        for epoch in range(inner_epochs):
            time_step += 1
            # Running policy_old:
            action = ppo.select_action(state, memory)
            state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # The environment never signals the end of an episode itself, so
            # the last inner epoch is flagged as terminal here. Without this
            # the discounted returns computed in ppo.update would run across
            # episode boundaries, i.e. across independently re-initialised
            # inner models.
            episode_over = done or epoch == inner_epochs - 1

            # Saving reward and is_terminals:
            memory.rewards.append(reward)
            memory.is_terminals.append(episode_over)

            # update if its time
            if time_step % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0
            running_reward += reward
            if done:
                break

        # `epoch` is a 0-based index, so the number of steps taken is epoch + 1.
        avg_length += epoch + 1

        # save every 5 episodes
        if i_episode % 5 == 0:
            torch.save(ppo.policy.state_dict(), './PPO_continuous_{}.pth'.format(env_name))

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            # Rewards are accuracy differences in [-1, 1]; keep them as floats
            # because truncating to int would log 0 almost every time.
            running_reward = float(running_reward/log_interval)

            print('Episode {} \t Avg length: {} \t Avg reward: {:.4f}'.format(i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

0.0003 (0.9, 0.999)




[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m

Episode:59 Epoch:8 : current lr:0.009999999776482582 and previous:9.999999747378752e-06 and new_acc:0.46700000762939453 and previous :0.47099998593330383

Test set: Accuracy: 470/1000 (47%)

Episode:59 Epoch:9 : current lr:0.009999999776482582 and previous:0.009999999776482582 and new_acc:0.4699999988079071 and previous :0.46700000762939453

Test set: Accuracy: 162/1000 (16%)

Episode:59 Epoch:10 : current lr:0.009999999776482582 and previous:0.009999999776482582 and new_acc:0.16200000047683716 and previous :0.4699999988079071

Test set: Accuracy: 296/1000 (30%)

Episode:59 Epoch:11 : current lr:9.999999747378752e-06 and previous:0.009999999776482582 and new_acc:0.29600000381469727 and previous :0.16200000047683716
Episode 59 	 Avg length: 9 	 Avg reward: 0

Test set: Accuracy: 302/1000 (30%)

Episode:60 Epoch:2 : current lr:0.009999999776482582 and previous:0.001 and new_acc:0.3019999861717224