In [1]:
import copy
import torch
from torch.utils.data import *
from transformers import *
import inspect
import sys
sys.path.insert(0, "..")

from models import *
from logic import *
from my_datasets import *

from utils import *

import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
n, r = 5, 8
ap, bp, tp, sp = 0.2, 0.2, 0.4, 0.1
num_arsteps = 3

qed_train_dataset_config = OneShotQedDatasetConfig(r,n,ap,bp,tp,dataset_len=1000,seed=1234)
qed_test_dataset_config = OneShotQedDatasetConfig(r,n,ap,bp,tp,dataset_len=500,seed=2345)
qed_train_dataset = OneShotQedDataset(qed_train_dataset_config)
qed_test_dataset = OneShotQedDataset(qed_test_dataset_config)

succ_train_dataset_config = PredictSuccDatasetConfig(r,n,ap,bp,tp,dataset_len=1000,seed=1234)
succ_test_dataset_config = PredictSuccDatasetConfig(r,n,ap,bp,tp,dataset_len=500,seed=2345)
succ_train_dataset = PredictSuccDataset(succ_train_dataset_config)
succ_test_dataset = PredictSuccDataset(succ_test_dataset_config)

arsteps_train_dataset_config = AutoRegFixedStepsDatasetConfig(r,n,ap,bp,sp,num_arsteps,dataset_len=1000,seed=1234)
arsteps_test_dataset_config = AutoRegFixedStepsDatasetConfig(r,n,ap,bp,sp,num_arsteps,dataset_len=500,seed=2345)
arsteps_train_dataset = AutoRegFixedStepsDataset(arsteps_train_dataset_config)
arsteps_test_dataset = AutoRegFixedStepsDataset(arsteps_test_dataset_config)

In [3]:
mytf_config = MyTfConfig(embed_dim=768, ffwd_width=1024, ffwd_depth=4, num_heads=2, num_layers=8)
mytf_model = get_seq2seq_model("mytf", config=mytf_config)
mytf_qed_model = OneShotQedTaskModel(OneShotQedTaskConfig(n, copy.deepcopy(mytf_model)))
mytf_succ_model = PredictSuccTaskModel(PredictSuccTaskConfig(n, copy.deepcopy(mytf_model)))
mytf_arsteps_model = AutoRegFixedStepsTaskModel(AutoRegFixedStepsTaskConfig(n, num_arsteps, copy.deepcopy(mytf_model)))

mygpt2_model = get_seq2seq_model("gpt2", use_pretrained=True)
mygpt2_qed_model = OneShotQedTaskModel(OneShotQedTaskConfig(n, copy.deepcopy(mygpt2_model)))
mygpt2_succ_model = PredictSuccTaskModel(PredictSuccTaskConfig(n, copy.deepcopy(mygpt2_model)))
mygpt2_arsteps_model = AutoRegFixedStepsTaskModel(AutoRegFixedStepsTaskConfig(n, num_arsteps, copy.deepcopy(mygpt2_model)))

loading configuration file config.json from cache at /home/antonxue/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.34.1",
  

In [4]:
qed_training_args = TrainingArguments(
    "test-trainer",
    evaluation_strategy = "epoch",
    num_train_epochs = 50,
    per_device_train_batch_size = 24,
    per_device_eval_batch_size = 24,
    logging_steps = 5
)

succ_training_args = qed_training_args
arsteps_training_args = qed_training_args

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [5]:
### QED GPT2
mygpt2_qed_trainer = Trainer(mygpt2_qed_model, qed_training_args,
    train_dataset = qed_train_dataset,
    # eval_dataset = qed_test_dataset,
    eval_dataset = qed_train_dataset,
    compute_metrics = qed_compute_metrics)
mygpt2_qed_trainer.train()

***** Running training *****
  Num examples = 1,000
  Num Epochs = 50
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 2,100
  Number of trainable parameters = 126,422,018


Epoch,Training Loss,Validation Loss,Accuracy,Avg ones
1,0.6968,0.684572,0.0,1.0
2,0.6831,0.685032,0.0,0.5
3,0.6472,0.687531,0.0,1.0
4,0.8298,0.685986,0.0,0.0
5,0.435,1.014067,0.0,0.5
6,0.5029,1.334158,0.0,0.5
7,0.2796,1.96345,0.0,0.5
8,0.1349,2.246068,0.0,0.5
9,0.2174,5.524375,0.0,0.5
10,0.0086,5.825505,0.0,0.5


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 24
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 24
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 24
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 24
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 24
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 24
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 24
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 24
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 24
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 24
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 24
Saving model checkpoint to test-trainer/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 24
***** Running Evaluatio

TrainOutput(global_step=2100, training_loss=0.12189393867894935, metrics={'train_runtime': 490.6657, 'train_samples_per_second': 101.902, 'train_steps_per_second': 4.28, 'total_flos': 0.0, 'train_loss': 0.12189393867894935, 'epoch': 50.0})

In [6]:
### QED MyTf
mytf_qed_trainer = Trainer(mytf_qed_model, qed_training_args,
    train_dataset = qed_train_dataset,
    eval_dataset = qed_test_dataset,
    compute_metrics = qed_compute_metrics)
# mytf_qed_trainer.train()

In [7]:
### SUCC GPT2
mygpt2_succ_trainer = Trainer(mygpt2_succ_model, succ_training_args,
    train_dataset = succ_train_dataset,
    eval_dataset = succ_test_dataset,
    compute_metrics = succ_compute_metrics)
mygpt2_succ_trainer.train()

***** Running training *****
  Num examples = 1,000
  Num Epochs = 50
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 2,100
  Number of trainable parameters = 126,424,325


Epoch,Training Loss,Validation Loss,Accuracy,Avg ones
1,0.5346,0.538459,0.7792,1.0
2,0.5534,0.533173,0.7792,1.0
3,0.5252,0.534825,0.7792,1.0
4,0.5273,0.535458,0.7792,1.0
5,0.4993,0.540908,0.7792,1.0
6,0.5198,0.541432,0.7792,1.0
7,0.4912,0.559266,0.7792,1.0
8,0.4462,0.596251,0.7792,1.0
9,0.4547,0.753767,0.4336,0.4
10,0.4063,0.61244,0.7792,1.0


***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
Saving model checkpoint to test-trainer/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Nu

TrainOutput(global_step=2100, training_loss=0.16895931706630757, metrics={'train_runtime': 535.1955, 'train_samples_per_second': 93.424, 'train_steps_per_second': 3.924, 'total_flos': 0.0, 'train_loss': 0.16895931706630757, 'epoch': 50.0})

In [8]:
### SUCC MyTF
mytf_succ_trainer = Trainer(mytf_succ_model, succ_training_args,
    train_dataset = succ_train_dataset,
    eval_dataset = succ_test_dataset,
    compute_metrics = succ_compute_metrics)
# mytf_succ_trainer.train()

In [9]:
### ARSteps GPT2
mygpt2_arsteps_trainer = Trainer(mygpt2_arsteps_model, arsteps_training_args,
    train_dataset = arsteps_train_dataset,
    eval_dataset = arsteps_test_dataset,
    compute_metrics = arsteps_compute_metrics)
mygpt2_arsteps_trainer.train()

***** Running training *****
  Num examples = 1,000
  Num Epochs = 50
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 2,100
  Number of trainable parameters = 126,424,325


Epoch,Training Loss,Validation Loss,Accuracy,Avg ones
1,0.6182,0.624824,0.693867,1.0
2,0.6319,0.6175,0.693867,1.0
3,0.6307,0.617782,0.693867,1.0
4,0.6329,0.618737,0.693867,1.0
5,0.6204,0.623103,0.693867,1.0
6,0.6336,0.621176,0.693867,1.0
7,0.6124,0.61909,0.693867,1.0
8,0.6049,0.638163,0.693867,1.0
9,0.5937,0.688657,0.693867,1.0
10,0.6501,0.628843,0.693867,1.0


***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
Saving model checkpoint to test-trainer/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 24
***** Running Evaluation *****
  Nu

TrainOutput(global_step=2100, training_loss=0.2682330193259709, metrics={'train_runtime': 1353.767, 'train_samples_per_second': 36.934, 'train_steps_per_second': 1.551, 'total_flos': 0.0, 'train_loss': 0.2682330193259709, 'epoch': 50.0})