In [1]:
import os
import sys
import support_functions

sys.path.append(os.path.join(os.path.abspath(os.getcwd()), "PEGNN"))
import json
import time
import myconfig_al_only as myconfig
import solver_al_only as solver
from datetime import datetime


def make_dir(path):
    """Create the single directory ``path`` on a best-effort basis.

    Only the last path component is created (``os.mkdir``); missing parent
    directories are NOT created. Filesystem failures (directory already
    exists, parent missing, permission denied, ...) are ignored, so callers
    must not assume the directory exists after this call returns.

    Args:
        path: Directory path to create.
    """
    try:
        os.mkdir(path)
    except OSError:
        # Deliberately best-effort for filesystem errors. Catching OSError
        # instead of using a bare ``except`` keeps KeyboardInterrupt,
        # SystemExit and programming errors (e.g. a non-path argument)
        # visible instead of silently discarding them.
        pass


# Recreate the working folder if it is missing.
def build_folder_and_clean(path):
    """Ensure the directory ``path`` exists, creating missing parents too.

    Despite the name, nothing is cleaned: an already existing directory and
    its contents are left untouched.

    Args:
        path: Directory path that must exist after the call.

    Raises:
        OSError: If the directory cannot be created (e.g. permission denied,
            or ``path`` exists but is not a directory).
    """
    # exist_ok=True replaces the separate "exists?" check, so there is no
    # window in which another process can create the folder between the
    # check and the creation.
    os.makedirs(path, exist_ok=True)


def train(job_id, settings):
    """Train and evaluate one job, then store a JSON summary on disk.

    Calls ``solver.training`` followed by ``solver.evaluate`` and writes the
    summary to ``<settings['agent_dir']>/<job_id>.rtn``.

    The JSON object written has these keys:
        best_err:     arithmetic mean of the ``best_err`` values returned by
                      ``solver.evaluate``.
        r_squared:    arithmetic mean of the ``r_squared`` values returned by
                      ``solver.evaluate``.
        list_total_0: first value returned by ``solver.training``.
        list_err_0:   second value returned by ``solver.training``.

    Args:
        job_id: Job identifier; forwarded to the solver and used as the
            result file name.
        settings: Run configuration dict; forwarded unchanged to the solver.
            Must contain ``'agent_dir'`` (directory for the result file).

    Raises:
        OSError: If the result directory or file cannot be created.
        ZeroDivisionError: If ``solver.evaluate`` returns an empty sequence.
    """
    # Create the output directory before the (potentially long) training so
    # a missing or unwritable location fails early instead of after all the
    # work is done and the results can no longer be saved.
    agent_dir = settings['agent_dir']
    os.makedirs(agent_dir, exist_ok=True)

    print("Start training...")
    list_total, list_err = solver.training(settings=settings, job_id=job_id)
    print("Start evaluation...")
    best_err, r_squared = solver.evaluate(settings=settings, job_id=job_id)

    # sum()/len() are applied below, so both evaluation results are expected
    # to be non-empty sequences of numbers.
    rtn = {
        "best_err": sum(best_err) / len(best_err),
        "r_squared": sum(r_squared) / len(r_squared),
        "list_total_0": list_total,
        "list_err_0": list_err,
    }

    # Serialize before opening the file so a serialization error does not
    # leave an empty or truncated result file behind.
    json_dump = json.dumps(rtn)
    with open(os.path.join(agent_dir, f'{job_id}.rtn'), 'w') as fresult:
        fresult.write(json_dump)



# Known issue: RuntimeError: mat1 and mat2 shapes cannot be multiplied (2974x42 and 46x256)
# This is caused by the dataset size, because the dataset was reduced to a minimal size here.
# If the same error also occurs on the SSH server, that is fine (it is expected).
if __name__ == '__main__':
    # Script entry point: assemble the run configuration, prepare the
    # working folder for this job and launch training + evaluation.
    job_id = '000006'

    print('Init...')

    # Run configuration. The whole dict is handed to train(), which forwards
    # it to the solver.
    settings = {
        'agent_id': '00001',
        'agent_dir': './logs',  # train() writes '<job_id>.rtn' into this folder
        'origin_path': './Dataset_res250_reg4c/',

        # Debug switch (original note: "debug mode => data_set").
        'debug': True,
        'bp': False,

        # Batching: 128 // 16 evaluates to 8 accumulation steps.
        # NOTE(review): presumably an effective batch of 128 split into
        # mini-batches of 16 -- confirm against the solver.
        'batch': 16,
        'accumulation_steps': 128 // 16,
        'test_batch': 0,

        'es_mindelta': 0.5,

        # Previously tried value: 14.
        'num_features_in': 9,

        'num_features_out': 1,
        'emb_hidden_dim': 256,

        'k': 20,
        'conv_dim': 256,

        'seed': 1,
        'model': 'PEGNN',
        'fold': 4,
        'holdout': 1,
        'lowest_rank': 1,

        'hp_marker': 'tuned',
        'nn_length': 3,
        'nn_hidden_dim': 32,
        'dropout_rate': 0.1,

        # ----Transformer----
        'd_model': 32,
        'nhead': 2,

        'dim_feedforward': 128,
        'transformer_dropout': 0.1,
        'num_encoder_layers': 2,
        'env_features_in': 11,

        'transformer_dec_output': 32,
        'emb_dim': 32,
        'epoch': 3,
        'es_endure': 5,
        'nn_lr': 1e-5,

        'aux_task_num': 1,

        # ----MAOAL----
        'hyper_lr': 1e-5,
        'hyper_decay': 0.0,
        'hyper_interval': 20,
        'hyper_aux_loss_weight': 0.001,

        # ----Task heads----
        'heads_nn_length': 2,
        'heads_nn_hidden_dim': 64,
        'heads_dropout_rate': 0.1,
    }

    # Timestamp of this run (not referenced again in the visible code).
    dt_string = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    # Per-job working folder below the configured coffer path.
    coffer_slot = f"{myconfig.coffer_path}{str(job_id)}/"

    # Create the folder if it is missing.
    make_dir(coffer_slot)
    build_folder_and_clean(coffer_slot)

    # Appended last, so these two stay the final entries of the settings.
    settings.update(coffer_slot=coffer_slot, tgt_op='mcpm10')

    train(job_id, settings)

Init...
Start training...
{
  "agent_id": "00001",
  "agent_dir": "./logs",
  "origin_path": "./Dataset_res250_reg4c/",
  "debug": true,
  "bp": false,
  "batch": 16,
  "accumulation_steps": 8,
  "test_batch": 0,
  "es_mindelta": 0.5,
  "num_features_in": 9,
  "num_features_out": 1,
  "emb_hidden_dim": 256,
  "k": 20,
  "conv_dim": 256,
  "seed": 1,
  "model": "PEGNN",
  "fold": 4,
  "holdout": 1,
  "lowest_rank": 1,
  "hp_marker": "tuned",
  "nn_length": 3,
  "nn_hidden_dim": 32,
  "dropout_rate": 0.1,
  "d_model": 32,
  "nhead": 2,
  "dim_feedforward": 128,
  "transformer_dropout": 0.1,
  "num_encoder_layers": 2,
  "env_features_in": 11,
  "transformer_dec_output": 32,
  "emb_dim": 32,
  "epoch": 3,
  "es_endure": 5,
  "nn_lr": 1e-05,
  "aux_task_num": 1,
  "hyper_lr": 1e-05,
  "hyper_decay": 0.0,
  "hyper_interval": 20,
  "hyper_aux_loss_weight": 0.001,
  "heads_nn_length": 2,
  "heads_nn_hidden_dim": 64,
  "heads_dropout_rate": 0.1,
  "coffer_slot": "./coffer_al_only/000006/",
  "t



Length of df dict: 100
Length of call list: 2304




Length of df dict: 1205
Length of call list: 30976




Length of df dict: 100
Length of call list: 3072
name: spenc.ffn.layers.0.linear.weight, param: torch.Size([256, 64])
name: spenc.ffn.layers.0.linear.bias, param: torch.Size([256])
name: spdec.0.weight, param: torch.Size([128, 256])
name: spdec.0.bias, param: torch.Size([128])
name: spdec.2.weight, param: torch.Size([64, 128])
name: spdec.2.bias, param: torch.Size([64])
name: spdec.4.weight, param: torch.Size([32, 64])
name: spdec.4.bias, param: torch.Size([32])
name: conv1.bias, param: torch.Size([256])
name: conv1.lin.weight, param: torch.Size([256, 41])
name: conv2.bias, param: torch.Size([256])
name: conv2.lin.weight, param: torch.Size([256, 256])
{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 2, 9: 2, 10: 3, 11: 3}
PEGCN(
  (spenc): GridCellSpatialRelationEncoder(
    (ffn): MultiLayerFeedForwardNN(
      (layers): ModuleList(
        (0): SingleFeedForwardNN(
          (dropout): Dropout(p=0.5, inplace=False)
          (act): ReLU()
          (linear): Linear(in_features=64,



Each epoch #real_iter: 18.0
working on training loop
Start Evaluation 0.17973297834396362 Aux_loss:[7.263245788635686e-05] - real_iter_time: 1.6361680030822754
		--------
		Iter: 18, inter_train_loss: 3.9892451763153076
		--------

		--------
		Iter: 18, inter_aux_loss: [0.0012667598202824593]
		--------

		--------
		test_loss: 413.4653015136719, last best test_loss: inf
		--------

		--------
		r_squared: -5.200649058384271, MSE: 29.349379
		--------



  vmin = self._density_vmin(array)
  vmax = self._density_vmax(array)


Current epoch: 1




Start Evaluation 0.11225584894418716 Aux_loss:[6.749433669028804e-05] - real_iter_time: 0.45812416076660156
		--------
		Iter: 36, inter_train_loss: 2.656019926071167
		--------

		--------
		Iter: 36, inter_aux_loss: [0.0011964954901486635]
		--------

		--------
		test_loss: 217.3418731689453, last best test_loss: 413.4653015136719
		--------

		--------
		r_squared: -2.259428761539274, MSE: 21.278994
		--------



  vmin = self._density_vmin(array)
  vmax = self._density_vmax(array)


Current epoch: 2




Start Evaluation 0.08269195258617401 Aux_loss:[5.745437738369219e-05] - real_iter_time: 0.51339459419250498
		--------
		Iter: 54, inter_train_loss: 1.8878557682037354
		--------

		--------
		Iter: 54, inter_aux_loss: [0.0011419253423810005]
		--------

		--------
		test_loss: 119.32720947265625, last best test_loss: 217.3418731689453
		--------

		--------
		r_squared: -0.7895241964518545, MSE: 15.767002
		--------



  vmin = self._density_vmin(array)
  vmax = self._density_vmax(array)


Current epoch: 3




Start Evaluation 0.06577761471271515 Aux_loss:[4.480565257836133e-05] - real_iter_time: 0.57837843894958523
		--------
		Iter: 72, inter_train_loss: 1.3266254663467407
		--------

		--------
		Iter: 72, inter_aux_loss: [0.0010283681331202388]
		--------

		--------
		test_loss: 83.69699096679688, last best test_loss: 119.32720947265625
		--------

		--------
		r_squared: -0.25518562415411195, MSE: 13.204875
		--------



  vmin = self._density_vmin(array)
  vmax = self._density_vmax(array)


Current epoch: 4




Finished Training
Start evaluation...
Working on CPU
Length of df dict: 100
Length of call list: 512




		--------
		r_squared: -81.17395685648698, MSE: 20.920858
		--------

		--------
		Differ: 18.136117935180664, count: 512
		--------



  vmin = self._density_vmin(array)
  vmax = self._density_vmax(array)


<Figure size 640x480 with 0 Axes>