In [9]:
import gym 
from gym.spaces import Discrete, Box

import numpy as np 
import os 
import random 
import torch as T 

from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback

In [10]:
class ShiftEnv1(gym.Env):
    
    def __init__(self, reset_num_hour, appliance_hourly_power_consumption, hourly_total_power_consumption, max_on_times):
        self.action_space= Discrete(2)
        self.observation_space= Box(low=0, high= hourly_total_power_consumption, shape=(1,), dtype= float)
        self.reset_num_hour= reset_num_hour
        self.appliance_hourly_power_consumption= appliance_hourly_power_consumption
        self.state= hourly_total_power_consumption # state is going to be the hourly power consumption
        self.max_on_times= max_on_times
        self.on_times= 0
        self.curr_hour=0 
        
    def reset(self, seed= None, options= None):
        self.curr_hour= 0 
        self.on_times= 0 
        return self.curr_hour, {}
    
    def step(self,action):
        
        reward= 0 
        self.curr_hour+=1
        
        if action==1:# Appliance is turned ON 
            
            
            self.state -= self.appliance_hourly_power_consumption
            self.on_times+=1
            
            if self.on_times > self.max_on_times:
                reward= -15
            else:
                reward= 25
        
        else:# Appliance is turned OFF
            #reward= -10
            if self.curr_hour in [9, 10, 11, 12, 13, 14, 15, 16, 6, 7, 8]: # intermidate hours
                reward= -10
            elif self.curr_hour in [0, 1, 2, 3, 4, 5]: #off beak hours
                reward= -5
            elif self.curr_hour in [17, 18, 19, 20, 21, 22, 23]: # peak hours
                reward= 25
                
            
        
        done = self.curr_hour == self.reset_num_hour
        
        if done:
            self.curr_hour= 0
        
        next_state= round(self.state,4) # the new total power consumption is the next_state 
        
        info = {'Action taken: ': action, 'Total Power Consumption ': self.state,
                'Hour of the day': self.curr_hour, 'ON Times ': self.on_times}
        #print(f"Current hour: {info['Hour of the day']}, Action taken: {info['Action taken: ']}\
        #\nTotal Power Consumption {info['Total Power Consumption ']}, ON Times: {self.on_times}")
        #print("_"*20)
        
        return next_state, reward, done, info, {}

In [3]:
### New Reward Function 
class ShiftEnv2(Env):
    
    def __init__(self, reset_num_hour, appliance_hourly_power_consumption, hourly_total_power_consumption, max_on_times):
        self.action_space= Discrete(2)
        self.observation_space= Box(low=0, high= hourly_total_power_consumption, shape=(1,), dtype= float)
        self.reset_num_hour= reset_num_hour
        self.appliance_hourly_power_consumption= appliance_hourly_power_consumption
        self.state= hourly_total_power_consumption # state is going to be the hourly power consumption
        self.max_on_times= max_on_times
        self.on_times= 0
        self.curr_hour=0 
        
    def reset(self, seed= None, options= None):
        self.curr_hour=0 
        self.on_times= 0
        return self.curr_hour, {}
    
    def step(self,action):
        
        reward= 0 
        self.curr_hour+=1
        
        if action==1:# Appliance is turned ON 
            
            
            self.state -= self.appliance_hourly_power_consumption
            self.on_times+=1
            if self.on_times > self.max_on_times:
                reward= -15
            else:
                reward= 25
                
            if self.curr_hour in [9, 10, 11, 12, 13, 14, 15, 16, 6, 7, 8]:# intermidate hours
                
                reward= 25
                
            elif self.curr_hour in [0, 1, 2, 3, 4, 5]: #off beak hours
                reward= 15
            
            elif self.curr_hour in [17, 18, 19, 20, 21, 22, 23]: # peak hours
                reward= -15
    
        else:# Appliance is turned OFF
            #reward= -10
            if self.curr_hour in [9, 10, 11, 12, 13, 14, 15, 16, 6, 7, 8]: # intermidate hours
                reward= -15
            elif self.curr_hour in [0, 1, 2, 3, 4, 5]: #off beak hours
                reward= -10
            elif self.curr_hour in [17, 18, 19, 20, 21, 22, 23]: # peak hours
                reward= 25
                
            
        
        done = self.curr_hour == self.reset_num_hour
        
        if done:
            self.curr_hour= 0
        
        next_state= round(self.state,4) # the new total power consumption is the next_state 
        
        info = {'Action taken: ': action, 'Total Power Consumption ': self.state,
                'Hour of the day': self.curr_hour}
        #print(f"Current hour: {info['Hour of the day']}, Action taken: {info['Action taken: ']}\
        #\nTotal Power Consumption {info['Total Power Consumption ']}, ON Times: {self.on_times}")
        #print("_"*20)
        
        return next_state, reward, done, info, {}

In [4]:
from stable_baselines3.common.callbacks import BaseCallback

class CustomCallback(BaseCallback):
    def __init__(self, verbose=0):
        super(CustomCallback, self).__init__(verbose)

    def _on_step(self):
        # Perform any desired actions or data collection at each training step
        pass

    def _on_training_end(self):
        return self.model, self.logger.episode_info

In [5]:
def agent(custom_env, lr, gamma, epsilon_dec, epsilon_min, log_path= None):
          
    policy_args= dict(net_arch=[256,256])# two hidden layers each with 256 neurons
    
    model= DQN(env= custom_env, policy= 'MlpPolicy', learning_rate=lr, gamma= gamma,
               exploration_initial_eps= 1.0, exploration_fraction= epsilon_dec, 
               exploration_final_eps= epsilon_min, buffer_size= 100000, verbose= 1, 
               batch_size= 64, policy_kwargs= policy_args, tensorboard_log= log_path)
    
    
    return model

# Testing Enviroment


In [None]:
env= ShiftEnv1(reset_num_hour=23, appliance_hourly_power_consumption=0.025,
               hourly_total_power_consumption= 250, max_on_times=3)
episodes= 48
for episode in range(episodes):
    state= env.reset()
    score= 0
    done= False
    while not done:
        action= env.action_space.sample()
        next_state, reward, done, info, _= env.step(action)
        score+=reward
        state= next_state
    #print(f'Time:{episode}\nScore: {score}\n Info: {info}')
    print('+*'*20)
env.close()

In [None]:
env= ShiftEnv2(reset_num_hour=24, appliance_hourly_power_consumption=0.025,
               hourly_total_power_consumption= 250, max_on_times=3)
episodes= 48
for episode in range(episodes):
    state= env.reset()
    score= 0
    done= False
    while not done:
        action= env.action_space.sample()
        next_state, reward, done, info, _= env.step(action)
        score+=reward
        state= next_state
    #print(f'Time:{episode}\nScore: {score}\n Info: {info}')
    print('+*'*20)
env.close()

# Training Model


## First Env

In [14]:
env= ShiftEnv1(reset_num_hour=24, appliance_hourly_power_consumption=0.025,
               hourly_total_power_consumption= 250, max_on_times=3)
log_path= os.path.join('ShiftableDQN', 'Env1')
eval_callback= EvalCallback(eval_env= env,log_path= log_path, deterministic=True)
# increasing training timesteps
model= agent(custom_env= env, lr= 0.03, gamma= 0.99,epsilon_dec= 0.05, epsilon_min= 0.01)
model.learn(total_timesteps=17520, log_interval=876)
env.close()



Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1        |
|    ep_rew_mean      | 23.8     |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 876      |
|    fps              | 807      |
|    time_elapsed     | 1        |
|    total_timesteps  | 876      |
| train/              |          |
|    learning_rate    | 0.03     |
|    loss             | 1.67     |
|    n_updates        | 193      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1        |
|    ep_rew_mean      | 25       |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 1752     |
|    fps              | 768      |
|    time_elapsed     | 2        |
|    total_timesteps  | 1752     |
| train/              |        

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1        |
|    ep_rew_mean      | 25       |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 14892    |
|    fps              | 590      |
|    time_elapsed     | 25       |
|    total_timesteps  | 14892    |
| train/              |          |
|    learning_rate    | 0.03     |
|    loss             | 1.23e+03 |
|    n_updates        | 3697     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1        |
|    ep_rew_mean      | 25       |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 15768    |
|    fps              | 587      |
|    time_elapsed     | 26       |
|    total_timesteps  | 15768    |
| train/              |          |
|    learning_rate    | 0.03     |
|    loss             | 1.07e+03 |
|    n_updates      

In [15]:
np.mean([item['r'] for item in model.ep_info_buffer])


25.0

In [None]:
agent_params= {
    'lr': [0.03, 0.02, 0.01],
    'gamma': [0.99, 0.85, 0.80, 0.70],
    'eps_min': [0.01, 0.1, 0.15],
    'eps_dec': [0.05, 0.15, 0.25, 0.50, 0.75, 0.85],
}
best_reward=0
best_params= {}


for lr in agent_params['lr']:
    for gamma in agent_params['gamma']:
        for eps_min in agent_params['eps_min']:
            for eps_dec in agent_params['eps_dec']:
                
                env= ShiftEnv1(reset_num_hour=24, appliance_hourly_power_consumption=0.025,
                               hourly_total_power_consumption= 250, max_on_times=3)
                model= agent(custom_env= env, lr= lr, gamma= gamma,
                             epsilon_dec= eps_dec, epsilon_min= eps_min)
                
                print('Parameters\n')
                print(f'Learning Rate: {lr}, Gamma: {gamma}, Epsilon Min: {eps_min}, Epsilon Decrement: {eps_dec}')
                model.learn(total_timesteps= 4320, log_interval=24)
                
                episode_reward_mean= np.mean([item['r'] for item in model.ep_info_buffer])
                if episode_reward_mean > best_reward:
                    best_reward= episode_reward_mean
                    best_params= {
                        'lr': lr,
                        'gamma': gamma,
                        'eps_min': eps_min,
                        'eps_dec': eps_dec
                    }
                    print(f'BEST REWARD: {best_reward}\nPARAMETERS: {best_params}')
                    
                
                print('+*'*40)
                env.close()


print(f'BEST REWARD: {best_reward}\n')
print(f'BEST PARAMS: {best_params}')

In [18]:
env= ShiftEnv1(reset_num_hour=24, appliance_hourly_power_consumption=0.025,
               hourly_total_power_consumption= 250, max_on_times=3)
log_path= os.path.join('ShiftableDQN', 'Env1')
eval_callback= EvalCallback(eval_env= env,log_path= log_path, deterministic=True)
# increasing training timesteps
model_env1= agent(custom_env= env, lr= 0.03, gamma= 0.99,epsilon_dec= 0.05, epsilon_min= 0.01)
model_env1.learn(total_timesteps=17520, log_interval=876)
env.close()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1        |
|    ep_rew_mean      | 24.7     |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 876      |
|    fps              | 809      |
|    time_elapsed     | 1        |
|    total_timesteps  | 876      |
| train/              |          |
|    learning_rate    | 0.03     |
|    loss             | 0.76     |
|    n_updates        | 193      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1        |
|    ep_rew_mean      | 24.7     |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 1752     |
|    fps              | 733      |
|    time_elapsed     | 2        |
|    total_timesteps  | 1752     |
| train/              |        

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1        |
|    ep_rew_mean      | 25       |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 14892    |
|    fps              | 619      |
|    time_elapsed     | 24       |
|    total_timesteps  | 14892    |
| train/              |          |
|    learning_rate    | 0.03     |
|    loss             | 1.18e+03 |
|    n_updates        | 3697     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1        |
|    ep_rew_mean      | 24.7     |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 15768    |
|    fps              | 617      |
|    time_elapsed     | 25       |
|    total_timesteps  | 15768    |
| train/              |          |
|    learning_rate    | 0.03     |
|    loss             | 1.33e+03 |
|    n_updates      

In [19]:
np.mean([item['r'] for item in model_env1.ep_info_buffer])


24.7

In [20]:
model_env1.save('env1_shiftable_dqn_model')

## Second Env

In [None]:

agent_params= {
    'lr': [0.03, 0.02, 0.01],
    'gamma': [0.99, 0.85, 0.80, 0.70],
    'eps_min': [0.01, 0.1, 0.15],
    'eps_dec': [0.05, 0.15, 0.25, 0.50, 0.75, 0.85],
}
best_reward=0
best_params= {}

best_reward=0
best_params= {}

for lr in agent_params['lr']:
    for gamma in agent_params['gamma']:
        for eps_dec in agent_params['eps_dec']:
            for eps_min in agent_params['eps_min']:
                
                env= ShiftEnv2(reset_num_hour=24, appliance_hourly_power_consumption=0.025,
                               hourly_total_power_consumption= 250, max_on_times=3)
                model= agent(custom_env= env, lr= lr, gamma= gamma, epsilon_dec= eps_dec, epsilon_min= eps_min)
                model.learn(total_timesteps= 4320, log_interval= 24)
                
                episode_reward_mean= np.mean([item['r'] for item in model.ep_info_buffer])
                if episode_reward_mean > best_reward:
                    
                    best_reward= episode_reward_mean
                    best_params= {
                        'lr': lr,
                        'gamma': gamma,
                        'eps_min': eps_min,
                        'eps_dec': eps_dec
                    }
                    print(f'BEST REWARD: {best_reward}\nPARAMETERS: {best_params}')
                
                print('+*'*40)
                env.close()

In [None]:
print(f'BEST REWARD: {best_reward}\n')
print(f'BEST PARAMS: {best_params}')



In [None]:
env= ShiftEnv2(reset_num_hour=24, appliance_hourly_power_consumption=0.025,hourly_total_power_consumption= 250, max_on_times=3)

log_path= os.path.join('ShiftableDQN', 'Env2')
eval_callback= EvalCallback(eval_env= env,log_path= log_path, deterministic=True)
model_env2= agent(custom_env=env, lr= 0.03, gamma= 0.99, epsilon_min= 0.01, epsilon_dec= 0.5)
model_env2.learn(total_timesteps=17520, log_interval=876)
env.close()

In [None]:
np.mean([item['r'] for item in model_env2.ep_info_buffer])


# Save Model

In [None]:
env1_path= os.path.join('Shiftable Agent Deployment', 'Enviromnent1')

In [None]:
model_env1.save(env1_path)

In [None]:
env2_path= os.path.join('Shiftable Agent Deployment', 'Enviromnent2')

In [None]:
model_env2.save(env2_path)

In [None]:
del model_env2

In [None]:
env2_path

In [None]:
model_env2= DQN.load('Shiftable Agent Deployment\Enviromnent2.zip', shift_env2)