In [None]:
import os
import copy
import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import torch
from classes_and_functions.train_and_evaluate import train_and_eval, train_agent, evaluate_agent
from classes_and_functions.plot import plot_graph
from classes_and_functions.serialize import serialize_loss_step_reward
from classes_and_functions.ini_agent_replay_buffer import initialize_agent_and_replay_buffer
from classes_and_functions.serialize import serialize_replay_buffer,deserialize_replay_buffer
from classes_and_functions.polynomial_decay import polynomial_decay

# Hyper-parameters shared by every phase; merged into each phase config further down.
config = {
    "DISCOUNT_FACTOR": 0.95,
    "TARGET_UPDATE": 10,   # episodes between target-network updates in get_expert_learning_buffer
    "DECAY_TIME": 100,     # episodes between epsilon / learning-rate decay steps
    "MIN_EP": 0.01,        # floor passed to polynomial_decay for epsilon
    "POWER_EP": 7,
    "MIN_LR": 0.0001,      # floor passed to polynomial_decay for the learning rate
    "POWER_LR": 2,
    "LAYERS": [64, 128],
    "ACTIVATION_FUNCTION": "Sigmoid",
    "SEED": 24,
}

# Phase 1: expert agent whose experience fills the replay buffer.
# "ep"/"lr" hold the current values and are overwritten during training;
# "INITIAL_EP"/"INITIAL_LR" are the fixed starting points of the decay schedules.
config_expert_agent = {
    "EPISODES": 10000,
    "BATCH_SIZE": 256,
    "BUFFER_SIZE": 500000,
    "ep": 1,
    "INITIAL_EP": 1,
    "lr": 0.02,
    "INITIAL_LR": 0.02,
}

# Phase 2: offline pre-training on the saved buffer.
# NOTE(review): "REPLAYS" is not read by the cells shown here (the pre-training
# loop iterates over replays_l instead) — confirm whether it is still needed.
config_pre_training = {
    "REPLAYS": 200000,
    "BATCH_SIZE": 32,
    "BUFFER_SIZE": 500000,
    "ep": 1,
    "INITIAL_EP": 1,
    "lr": 0.02,
    "INITIAL_LR": 0.02,
}

# Phase 3: online training of the pre-trained agent.
config_online_training = {
    "EPISODES": 10000,
    "BATCH_SIZE": 256,
    "BUFFER_SIZE": 100000,
    "ep": 1,
    "INITIAL_EP": 1,
    "lr": 0.005,
    "INITIAL_LR": 0.005,
}


# Every artefact of this run lives under a seed-specific folder.
path = f"comp_offline/seed_{config['SEED']}"

# Recorded before creation; nothing else in this cell reads it.
folder_is_exists = os.path.exists(path)
# exist_ok=True creates both levels and also repairs a run folder that already
# exists without its results/ sub-folder (previously that case was skipped and
# later writes into results/ would fail).
os.makedirs(f"{path}/results", exist_ok=True)

# Dump the phase-specific settings *before* the shared config is merged in,
# so each section of settings.txt lists only its own keys.
with open(f"{path}/settings.txt", "w") as f:
    for section_name, section in (
        ("config", config),
        ("config_expert_agent", config_expert_agent),
        ("config_pre_training", config_pre_training),
        ("config_online_training", config_online_training),
    ):
        f.write(f"{section_name}\n")
        f.write(str(section))
        f.write("\n")


# Give every phase config the shared hyper-parameters.
for phase_config in (config_expert_agent, config_pre_training, config_online_training):
    phase_config.update(config)

if torch.cuda.is_available():
    print("Using GPU")
else:
    print("Using CPU")


**Train an expert agent to fill a replay buffer, then save the buffer to disk**


In [None]:
def get_expert_learning_buffer(dqn_agent,replay_buffer,config):
    """Train an agent (with target network) and save the replay buffer it was trained with.

    Every episode runs one `train_agent` call followed by one `evaluate_agent`
    call, both seeded with the episode number. Training stops after
    `config['EPISODES']` episodes or as soon as `replay_buffer.buffer_size()`
    reaches `config['BUFFER_SIZE']`, whichever comes first. In both cases the
    buffer is then written with `serialize_replay_buffer(replay_buffer, path)`
    (`path` is the notebook-level run folder), so a later
    `deserialize_replay_buffer(path)` always sees the buffer from this run.

    Args:
        dqn_agent: agent exposing train_mode, evaluate_mode, set_learning_rate,
            get_learning_rate and update_target_q_network.
        replay_buffer: buffer handed to `train_agent`; must expose buffer_size().
        config: dict with EPISODES, BATCH_SIZE, BUFFER_SIZE, DECAY_TIME,
            TARGET_UPDATE, ep, INITIAL_EP, MIN_EP, POWER_EP, lr, INITIAL_LR,
            MIN_LR and POWER_LR. `ep` and `lr` are overwritten in place.

    Returns:
        (mean_loss_list, step_list, reward_list) with one entry per episode run.
    """
    mean_loss_list = []
    step_list = []
    reward_list = []

    for i in range(1,config['EPISODES']+1):
        dqn_agent.train_mode()
        mean_loss = train_agent(dqn_agent,replay_buffer,ep=config['ep'] ,batch_size=config['BATCH_SIZE'],TN=True,seed=i)
        dqn_agent.evaluate_mode()
        steps,reward = evaluate_agent(dqn_agent,graphical=False,seed=i)

        mean_loss_list.append(mean_loss)
        step_list.append(steps)
        reward_list.append(reward)

        if i % config['DECAY_TIME'] == 0:
            # Epsilon follows its schedule unconditionally.
            config['ep'] = polynomial_decay(config['INITIAL_EP'],config['MIN_EP'],i,config['EPISODES'],config['POWER_EP'])

            # The learning rate only starts decaying once the last 100 evaluation
            # rewards average above 2500.
            # NOTE(review): the strict `> 100` means the check at episode 100 can
            # never trigger (exactly 100 rewards recorded) — confirm this is intended.
            if len(reward_list) > 100 and np.mean(reward_list[-100:]) > 2500:
                config['lr'] = polynomial_decay(config['INITIAL_LR'],config['MIN_LR'],i,config['EPISODES'],config['POWER_LR'])
                dqn_agent.set_learning_rate(config['lr'])

        if i % config['TARGET_UPDATE'] == 0:
            dqn_agent.update_target_q_network()

        if i % 100 == 0:
            print(f"Episodes: {i} eb: {round(config['ep'],5)}, lr: {round(dqn_agent.get_learning_rate(),5)}, mean loss: {round(np.mean(mean_loss_list[-100:]),5)} ,Steps: {round(np.mean(step_list[-100:]),5)}, Reward: {round(np.mean(reward_list[-100:]),0)} , buffer size: {replay_buffer.buffer_size()}")

        # Stop early once the buffer holds the requested number of entries.
        if replay_buffer.buffer_size() >= config['BUFFER_SIZE']:
            break

    # Save on every exit path: previously the buffer was only written when it
    # filled up, so running out of episodes first left nothing (or a stale file
    # from an earlier run) for the pre-training cell to load.
    serialize_replay_buffer(replay_buffer,path)

    return mean_loss_list,step_list,reward_list


# Build the expert agent together with the replay buffer used during its training.
agent_TN, replay_buffer_TN = initialize_agent_and_replay_buffer(config=config_expert_agent)

# Run the expert training (target network enabled inside the function).
expert_history = get_expert_learning_buffer(
    agent_TN,
    replay_buffer_TN,
    config_expert_agent,
)
mean_loss_list, step_list, reward_list = expert_history

# Persist the per-episode curves of the expert run under the "PRE_TN" tag.
serialize_loss_step_reward(
    mean_loss_list,
    step_list,
    reward_list,
    "PRE_TN",
    f"{path}/results",
)




**Initialize a new agent, pre-train it offline on the saved replay buffer, then continue training online**

In [None]:
# Number of offline replay updates to try: 400k to 750k inclusive, in steps of 50k.
replays_l = list(range(400_000, 750_001, 50_000))

def pre_training(agent,replay_buffer,replays,config,TN=False):
    """Run `replays` offline replay updates on `agent` and collect their losses.

    Each update calls `agent.replay` on `replay_buffer` with
    `config['BATCH_SIZE']`. After every 1000th update the agent is switched to
    evaluation mode, evaluated once, and a progress line is printed.

    Args:
        agent: object exposing train_mode(), evaluate_mode() and replay().
        replay_buffer: buffer forwarded unchanged to `agent.replay`.
        replays: number of replay updates to perform.
        config: dict providing at least 'BATCH_SIZE'; also forwarded to `evaluate_agent`.
        TN: forwarded to `agent.replay` as `target_network`.

    Returns:
        List with the value returned by `agent.replay` for every update, in order.
    """
    losses = []

    for step in range(1, replays + 1):
        # Back to training mode on every update, since the periodic evaluation below leaves it.
        agent.train_mode()
        losses.append(
            agent.replay(replay_buffer, batch_size=config['BATCH_SIZE'], target_network=TN)
        )

        if step % 1000 != 0:
            continue

        agent.evaluate_mode()
        # NOTE(review): get_expert_learning_buffer calls evaluate_agent(agent, graphical=..., seed=...)
        # without a config argument — confirm which call matches the helper's signature.
        steps, reward = evaluate_agent(agent, config, graphical=False)
        recent_loss = round(np.mean(losses[-100:]), 5)
        print(f"[{step}] mean loss: {recent_loss}, steps: {int(steps)}, reward: {int(reward)}")

    return losses


# Load the replay buffer that was serialized under `path`.
replay_buffer_PRE = deserialize_replay_buffer(path)

for rep in replays_l:
    # --- Offline phase: a fresh agent gets `rep` replay updates on the stored buffer ---
    dqn_agent_PRE, _ = initialize_agent_and_replay_buffer(config=config_pre_training)
    mean_loss = pre_training(
        dqn_agent_PRE,
        replay_buffer_PRE,
        rep,
        config_pre_training,
        TN=False,
    )
    # Only losses exist for the offline phase, so steps/rewards are saved as empty lists.
    serialize_loss_step_reward(mean_loss, [], [], f"PRE_{rep}", f"{path}/results")

    # --- Online phase: same agent, schedules restarted from their initial values ---
    config_online_training.update(
        ep=config_online_training['INITIAL_EP'],
        lr=config_online_training['INITIAL_LR'],
    )
    dqn_agent_PRE.set_learning_rate(config_online_training['INITIAL_LR'])

    # Only the buffer of this pair is used; the pre-trained agent is kept.
    _, replay_buffer = initialize_agent_and_replay_buffer(config=config_online_training)
    mean_loss_list, step_list, reward_list = train_and_eval(
        dqn_agent_PRE,
        replay_buffer,
        config_online_training,
        stop_index=config_online_training['EPISODES'],
        TN=False,
    )

    serialize_loss_step_reward(
        mean_loss_list,
        step_list,
        reward_list,
        f"_{rep}",
        f"{path}/results",
    )

    # Figure 1: mean loss per episode
    plot_graph(
        mean_loss_list,
        "Episodes",
        "Mean loss",
        color="orange",
        ylim=[0,10],
        path_name=f"{path}/mean_loss_{rep}",
    )

    # Figure 2: steps per episode (bar chart)
    plot_graph(
        step_list,
        "Episodes",
        "Steps",
        type=plt.bar,
        color="green",
        path_name=f"{path}/steps_{rep}",
    )

    # Figure 3: reward per episode (line plot)
    plot_graph(
        reward_list,
        "Episodes",
        "Reward",
        type=plt.plot,
        color="blue",
        path_name=f"{path}/reward_{rep}",
    )
