In [1]:
%reset -f

Importing cellular automata & optimization classes, and other stuff

In [1]:
import os
import sys
import shutil


sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(''))))

from lamm_automata.blender import Lattice, clear_initial
from lamm_automata.ruleset import conway, seeds
from lamm_automata.genetic import Optimizer, RulesetMutator, ArbitraryRulesetMutator
from lamm_automata.objectives import surface_to_vol

import numpy as np
import torch
import pandas as pd
import random
import datetime
import decimal

Open all experiment data as Pandas DataFrames

In [3]:
list_df = []
for filename in os.listdir(os.path.join(os.getcwd(), 'data')):
    df = pd.read_hdf(f'data/{filename}')
    list_df.append(df)
    # print data to make sure they are correct
    print(f'EXPERIMENT DATA FROM {filename}:')
    # with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    #     # print(df)

EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_40.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_41.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_6.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_12.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_9.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_14.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_17.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_24.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_49.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_36.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_39.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_31.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_32.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_44.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_45.h5:
EXPERIMENT DATA FROM test_experiment_100ITERS_32GRID_4.h5:
EXPERIMENT DATA FROM test_experiment_100ITE

Converting mutation history of training samples to strings of tokens (flatten numpy arrays of states and positions, also choose encoding and what to include)

In [None]:
# DECIMALS = 3
# PRECISION = 3
# decimal.getcontext().prec = PRECISION
# decimal.getcontext().Emin = -DECIMALS
# print(decimal.getcontext(),"\n")
max_exp = 0
min_exp = 0
def decimal_to_tokens(val):
    # split to sign, digits, and exponent
    # print(f'rounded val: {round(val, NUM_DECIMALS)}')
    # s, d, e = decimal.Decimal(round(val, NUM_DECIMALS)).as_tuple()
    global min_exp, max_exp
    # max_exp = max(max_exp, e)
    # min_exp = min(min_exp, e)
    # val = round(float(val), DECIMALS)
    formatted_val = "{:.2e}".format(val)
    # print(formatted_val)
    digits, exp = formatted_val.split("e")

    digits = digits.replace(".","")
    digits = digits.replace("+","")
    digits = digits.replace("-","")
    # print(f'digits: {digits}')

    exp = int(exp)
    max_exp = max(max_exp, abs(exp))
    return f"{'[PPOS]' if val > 0 else '[PNEG]'} {' '.join(digits)} {'[PEPOS]' if exp >= 0 else '[PENEG]'} {abs(exp)}"

data_strings = []
# performance_metric_vals = []
# store performance metric values separately to put them in the input directly, they shouldn't be tokenized
for data_frame in list_df:
    # final goal: [BOS][BMB][ICM][pos of mutation in IC (2 tokens)][SRTM][pos of mutation in SRT (3 tokens)][P][performance metric after the mutation][EMB] ... [EOS]
    data_str = '[BOS] '
    # perf_met = []
    for index, row in data_frame.iterrows():
        # print(row)
        if index == 0:
            # initial conditions
            grid_sz = row["ic_cell_pos"]
            initial_ic = row["ic_state_old"]
            initial_srt = row["srt_state_old"]
            # print(f'grid_sz: {grid_sz}, initial_ic: {initial_ic}, initial_srt: {initial_srt}')
        else:
            data_str += '[BMB] '
            # columns: ic_cell_pos, ic_state_old, ic_state_new, srt_cell_pos, srt_state_old, srt_state_new, objective
            ic_mut_strings = []
            srt_mut_strings = []
            # only include mutation positions since otherwise data will be too big
            if type(row["ic_cell_pos"]) != int and len(row["ic_cell_pos"].shape) >= 1:
                if len(row["ic_cell_pos"].shape) == 2:
                    # batch update
                    for i in range(row["ic_cell_pos"].shape[0]):
                        ic_mut_strings.append(f'[ICM] {row["ic_cell_pos"][i][0]} {row["ic_cell_pos"][i][1]} ')
                else:
                    ic_mut_strings.append(f'[ICM] {row["ic_cell_pos"][0]} {row["ic_cell_pos"][1]} ')
            
            if type(row["srt_cell_pos"]) != int and len(row["srt_cell_pos"].shape) >= 1:
                if len(row["srt_cell_pos"].shape) == 2:
                    # batch update
                    for i in range(row["srt_cell_pos"].shape[0]):
                        ic_mut_strings.append(f'[SRTM] {row["srt_cell_pos"][i][0]} {row["srt_cell_pos"][i][1]} {row["srt_cell_pos"][i][2]} ')
                else:
                    ic_mut_strings.append(f'[SRTM] {row["srt_cell_pos"][0]} {row["srt_cell_pos"][1]} {row["srt_cell_pos"][2]} ')
            data_str += ''.join(ic_mut_strings)
            data_str += ''.join(srt_mut_strings)
            data_str += f'[BP] {decimal_to_tokens(row["objective"])} [EP] '
            # data_str += f'[P] # '
            # perf_met.append(row["objective"])
            data_str += '[EMB] '
    data_str += '[EOS]'
    # print(data_str)
    # print(perf_met)
    data_strings.append(data_str)
    # performance_metric_vals.append(perf_met)

Import tokenizer modules

In [3]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors
from transformers import PreTrainedTokenizerFast
from TokenizerChanger import TokenizerChanger

  from .autonotebook import tqdm as notebook_tqdm


Setup and train tokenizer

In [None]:
MODEL_NAME = "test_model"
model_dir = os.path.join("models", MODEL_NAME+datetime.datetime.now().strftime('_%m_%d_%H%M'))
print(f'model directory: {model_dir}')

if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# want to keeps numbers (& floats, which the objective value is) as they are and use trained ids for the special tokens & delimiters
num_tokens = [str(i) for i in range(min_exp, max(grid_sz, 10, max_exp))] # will surely have 0-9 for the decimals when tokenized, and also the numbers for the exponent
# print(num_tokens)
delimiter_tokens = ["[BMB]","[ICM]","[SRTM]","[EMB]"]
special_tokens = ["[UNK]", "[PAD]", "[BOS]", "[EOS]"]
# perf_num_tokens = ["[P]","#"]
perf_num_tokens = ["[BP]", "[PPOS]", "[PNEG]", "[PEPOS]", "[PENEG]", "[EP]"] # sign, digits, and exponent, inside a [BP] ... [EP] block

all_tokens = num_tokens + delimiter_tokens + special_tokens + perf_num_tokens

# set up tokenizer
tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]"))

# pre-tokenizer
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# trainer for the tokenizer
trainer = trainers.WordLevelTrainer(
    special_tokens = all_tokens
)

# train tokenizer on tokens themselves and the data as well
tokenizer.train_from_iterator(all_tokens+data_strings, trainer = trainer)

# save the tokenizer
tokenizer.save(os.path.join(model_dir,"tokenizer.json"))

NameError: name 'min_exp' is not defined

Splitting mutation history strings into tokens and converting them to tensors using the token ids

In [None]:
delimiter_tokens = ["[BMB]","[ICM]","[SRTM]","[EMB]"]
special_tokens = ["[UNK]", "[PAD]", "[BOS]", "[EOS]"]
# perf_num_tokens = ["[P]","#"]
perf_num_tokens = ["[BP]", "[PPOS]", "[PNEG]", "[PEPOS]", "[PENEG]", "[EP]"]

# load tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file = os.path.join(model_dir,"tokenizer.json"))

# special tokens
encoder_special_tokens_dict = {"additional_special_tokens": delimiter_tokens+special_tokens+perf_num_tokens}
tokenizer.add_special_tokens(encoder_special_tokens_dict)
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'bos_token': '[BOS]', 'unk_token': '[UNK]', 'eos_token': '[EOS]'})

tokenizer.padding_side = "right"

NameError: name 'data_strings' is not defined

In [None]:
# def str_to_tensor(dat_str, p_met):
#     tensor_without_p = encoder_tokenizer(dat_str, padding="longest", return_tensors="pt")["input_ids"]
#     tensor_without_p = tensor_without_p.float()
#     tensor_without_p[tensor_without_p==encoder_tokenizer.convert_tokens_to_ids("#")] = torch.Tensor(p_met)
#     return tensor_without_p

def str_to_tensor(dat_str):
    return torch.Tensor(tokenizer(dat_str, padding="longest", return_tensors="pt")["input_ids"])
    # tensor_without_p = tensor_without_p.float()
    # tensor_without_p[tensor_without_p==encoder_tokenizer.convert_tokens_to_ids("#")] = torch.Tensor(p_met)
    # return tensor_without_p

# decoder_tokenizer = PreTrainedTokenizerFast(tokenizer_file = "tokenizer/tokenizer.json")

# # special tokens, but not [P] and # (they should not be produced by the model, take care of that in training)
# decoder_special_tokens_dict = {"additional_special_tokens": delimiter_tokens+special_tokens}
# decoder_tokenizer.add_special_tokens(decoder_special_tokens_dict)
# decoder_tokenizer.add_special_tokens({'pad_token': '[PAD]', 'bos_token': '[BOS]', 'unk_token': '[UNK]', 'eos_token': '[EOS]'})
# # changer = TokenizerChanger(decoder_tokenizer)
# # changer.delete_tokens(["[P]","#"])

# print(f"Decoder vocab: {decoder_tokenizer.get_vocab()}")

# def tensor_decode(tensor):
#     if(tensor.dim()==2):
#         return [decoder_tokenizer.decode(seq.int()) for seq in tensor]
#     return decoder_tokenizer.decode(tensor.int())

def tensor_decode(tensor):
    if(tensor.dim()==2):
        return [tokenizer.decode(seq) for seq in tensor]
    return tokenizer.decode(tensor)

# test string to tensor
torch.set_printoptions(profile="full")
# encoded = [str_to_tensor(dat, p) for dat, p in zip(data_strings, performance_metric_vals)]
# encoded = [str_to_tensor(dat) for dat in data_strings]
encoded = str_to_tensor(data_strings)
# decode_data_strings = [dat.replace("[P]","").replace("#","") for dat in data_strings]
# print(decode_data_strings)
# output_encoded = [str_to_tensor(dat, []) for dat, p in zip(decode_data_strings, performance_metric_vals)]
decoded = [tensor_decode(enc_seq) for enc_seq in encoded]
# print(f"Encode:\n{encoded}")
# print(f"Decode:\n{decoded}")

Shuffling and splitting the dataset

In [8]:
random.shuffle(encoded)

PERCENT_TRAIN = 0.9
PERCENT_VALIDATE = 0.09
PERCENT_TEST = 0.01

# print(encoded[0].shape)
train_data = encoded[:int(PERCENT_TRAIN*len(encoded))]
validate_data = encoded[int(PERCENT_TRAIN*len(encoded)):int((PERCENT_TRAIN+PERCENT_VALIDATE)*len(encoded))]
test_data = encoded[int((PERCENT_TRAIN+PERCENT_VALIDATE)*len(encoded)):]

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(validate_data)}")
print(f"Testing set size: {len(test_data)}")

Training set size: 45
Validation set size: 4
Testing set size: 1


Import transformer modules

In [7]:
from x_transformers import Decoder, TransformerWrapper
from x_transformers.autoregressive_wrapper import AutoregressiveWrapper

More imports (mostly PyTorch stuff)

In [8]:
from torch.utils.data import Dataset, IterableDataset, DataLoader, get_worker_info
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F

import time
import pickle
import json

Setting up & training the transformer model

In [9]:
!export 'PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True'
# !export 'CUDA_LAUNCH_BLOCKING=1'
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# os.environ['TORCH_USE_CUDA_DSA'] = "1"

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Training with {device}')

# empty cache
torch.cuda.empty_cache()

# parameters
DIM = 50
DEPTH = 6
HEADS = 4
LIMIT_SEQ_LEN = 100
ATTN_FLASH = True
ROTARY_POS_EMB = True
MASKING = False
MASK_PROB = 0.15
EPOCHS = 1
BATCH_SIZE = 1
LEARNING_RATE = 1e-3
NUM_DATALOADING_WORKERS = 0
DROP_LAST = False
BATCH_NUM_DATA_LOGGING = 1

num_tokens = max(tokenizer.get_vocab().values())
max_seq_len = max([len(seq) for seq in encoded])

print(num_tokens, max_seq_len, LIMIT_SEQ_LEN)

# base transformer model
model = TransformerWrapper(
    num_tokens = num_tokens,
    max_seq_len = min(max_seq_len,LIMIT_SEQ_LEN),
    attn_layers = Decoder(
        dim = DIM,
        depth = DEPTH,
        heads = HEADS,
        attn_flash = ATTN_FLASH,
        rotary_pos_emb = ROTARY_POS_EMB,
    ),    
)
# wrap the transformer into an autoregressor
model = AutoregressiveWrapper(model, mask_prob=MASK_PROB if MASKING else 0)
if torch.cuda.is_available():
    model.cuda()

# save model hyperparams
with open(os.path.join(model_dir,'model_hyperparams.json'), 'w', encoding='utf-8') as f:
    json.dump({
        "model_name": MODEL_NAME,
        "num_tokens": model.net.num_tokens,
        "max_seq_len": model.max_seq_len,
        "dim": DIM, 
        "depth": DEPTH,
        "limit_seq_len": LIMIT_SEQ_LEN, 
        "heads": HEADS,
        "rotary_pos_emb": ROTARY_POS_EMB,
        "attn_flash": ATTN_FLASH,
        "masking": MASKING,
        "mask_prob": MASK_PROB
        }, f, ensure_ascii=False, indent=4)

with open(os.path.join(model_dir,'model_training_params.json'), 'w', encoding='utf-8') as f:
    json.dump({
        "epochs": EPOCHS, 
        "batch_size": BATCH_SIZE, 
        "lr": LEARNING_RATE, 
        "workers": NUM_DATALOADING_WORKERS, 
        "drop_last": DROP_LAST,
        }, f, ensure_ascii=False, indent=4)

# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# setup dataset loading
class MutationDataset(IterableDataset):
    def __init__(self, sequences):
        self.sequences = sequences
    
    def __iter__(self):
        worker_info = get_worker_info()
        if worker_info is None:
            return iter(self.sequences)
        else:
            seqs_per_worker = len(self.sequences)//worker_info.num_workers
            return iter(self.sequences[worker_info.id*seqs_per_worker:(worker_info.id+1)*seqs_per_worker])

# datasets
training_set = MutationDataset(train_data)
validation_set = MutationDataset(validate_data)
test_set = MutationDataset(test_data)

# loaders for training
training_loader = DataLoader(training_set, batch_size=BATCH_SIZE, num_workers=NUM_DATALOADING_WORKERS, drop_last=DROP_LAST)
validation_loader = DataLoader(validation_set, batch_size=BATCH_SIZE, num_workers=NUM_DATALOADING_WORKERS, drop_last=DROP_LAST)

# logging setup -> TensorBoard (for now)
log_writer = SummaryWriter(log_dir=os.path.join(model_dir, "logs"))

# time
start_train_time = time.time()

# store losses
training_losses, avg_train_losses, validation_losses, epoch_times = [], [], [], []

# print(torch.cuda.memory_summary(device=None, abbreviated=False))

# train
print('Starting training')
total_seqs = 0
for epoch_num in range(EPOCHS):
    epoch_start_time = time.time()
    print(f'Training epoch {epoch_num+1} started at {int(epoch_start_time-start_train_time)} seconds')

    total_train_loss = 0
    running_train_loss = 0
    count_iters = 0

    training_losses.append([])

    model.train(True)

    # print(len(train_data))

    for sequence in training_loader:
        optimizer.zero_grad()

        # train
        model.train()
        if torch.cuda.is_available():
            sequence = sequence[:][:min(sequence.shape[1],LIMIT_SEQ_LEN)].cuda()
        
        print(torch.cuda.memory_stats())

        # print(sequence, sequence.shape)
        loss = model(sequence)
        loss.backward()

        optimizer.step()
        
        running_train_loss += loss.item()
        total_train_loss += loss.item()

        total_seqs += len(sequence)
        count_iters += 1

        print(torch.cuda.memory_stats())

        if count_iters % BATCH_NUM_DATA_LOGGING == (BATCH_NUM_DATA_LOGGING - 1):
            # store and print losses
            train_loss = running_train_loss/BATCH_NUM_DATA_LOGGING
            running_train_loss = 0

            training_losses[epoch_num].append(train_loss)
            
            print(f"Batch: {count_iters}, train loss: {train_loss}")

            # log to TensorBoard
            log_writer.add_scalar("Loss/train", train_loss, total_seqs)
    
    # save model
    torch.save(model.state_dict(), os.path.join(model_dir, f"weights_{epoch_num+1}.pt"))

    # validate
    model.eval()
    with torch.no_grad():
        val_loss = model(next(enumerate(validation_loader))[1]).item()

    # compute and log average losses
    avg_train_loss = total_train_loss/count_iters

    validation_losses.append(val_loss)
    avg_train_losses.append(avg_train_loss)

    epoch_end_time = time.time()
    epoch_time = int(epoch_end_time-epoch_start_time)
    epoch_times.append(epoch_time)

    print(f"Epoch {epoch_num+1}: avg train loss: {avg_train_loss}, validation loss: {val_loss}, in time: {epoch_time}")

    log_writer.add_scalar("Loss/validate", val_loss, epoch_num)
    log_writer.add_scalar("Loss/avg_train", avg_train_loss, epoch_num)

    log_writer.flush()

finished_training_time = time.time()
print(f'Finished training in {int(finished_training_time-start_train_time)} seconds')

log_writer.close()


Training with cuda
93 257474 100
model directory: models/test_model_04_12_2029
Starting training
Training epoch 1 started at 0 seconds
OrderedDict([('active.all.allocated', 65), ('active.all.current', 65), ('active.all.freed', 0), ('active.all.peak', 65), ('active.large_pool.allocated', 1), ('active.large_pool.current', 1), ('active.large_pool.freed', 0), ('active.large_pool.peak', 1), ('active.small_pool.allocated', 64), ('active.small_pool.current', 64), ('active.small_pool.freed', 0), ('active.small_pool.peak', 64), ('active_bytes.all.allocated', 3828736), ('active_bytes.all.current', 3828736), ('active_bytes.all.freed', 0), ('active_bytes.all.peak', 3828736), ('active_bytes.large_pool.allocated', 2060288), ('active_bytes.large_pool.current', 2060288), ('active_bytes.large_pool.freed', 0), ('active_bytes.large_pool.peak', 2060288), ('active_bytes.small_pool.allocated', 1768448), ('active_bytes.small_pool.current', 1768448), ('active_bytes.small_pool.freed', 0), ('active_bytes.small_

AttributeError: 'tuple' object has no attribute 'shape'