In [13]:
import numpy as np
import pickle
import jax
import optax
import haiku as hk
import functools
import random

from neural_networks_solomonoff_induction.models import transformer
from neural_networks_solomonoff_induction.data import data_generator as dg_lib
from neural_networks_solomonoff_induction.data import utm_data_generator as utm_dg_lib
from neural_networks_solomonoff_induction.data import utms as utms_lib

utm = utms_lib.BrainPhoqueUTM(alphabet_size=2)
data_generator = utm_dg_lib.UTMDataGenerator(
    batch_size=1,
    seq_length=256,
    rng=1,
    utm=utm,
    memory_size=10,
    maximum_steps=100,
    tokenizer=utm_dg_lib.Tokenizer.SEQ_POSITION,#utm_dg_lib.Tokenizer.ASCII,
    maximum_program_length=20,
)

fname = "/home/cwyeth/Desktop/neural_networks_solomonoff_induction/params.npz"
with open(fname, "rb") as f:
    #params = pickle.load(f)
    loaded_params = np.load(f, allow_pickle=True)
    # print(loaded_params)
    params = dict(loaded_params)
    # print(params)
    # print(params.keys())
    for k in params.keys():
        params[k] = params[k].item()
    #params = hk.data_structures.to_immutable_dict(loaded_params)
    # print(params)
    #print(params)
    # for k in params.keys():
    #     print(k)
    #print(params["linear_1"])
    
config = transformer.TransformerConfig(vocab_size=data_generator.feature_size)
model = hk.transform(
    functools.partial(transformer.transformer_decoder, config=config)
)

# Initialize parameters.
dummy_batch, _ = data_generator.sample_dummy(1)
# Transform one-hots to integer tokens.
dummy_batch = np.argmax(dummy_batch, axis=-1)
rng = jax.random.PRNGKey(0)
params_init = model.init(rng, dummy_batch)

2024-03-10 17:09:39.467468: W external/xla/xla/service/gpu/nvptx_compiler.cc:744] The NVIDIA driver's CUDA version is 12.2 which is older than the ptxas CUDA version (12.3.107). Because the driver is older than the ptxas version, XLA is disabling parallel compilation, which may slow down compilation. You should update your NVIDIA driver or use the NVIDIA-provided CUDA forward compatibility packages.


In [3]:
# UTM Testing

print(f"feature size: {data_generator.feature_size}")

batch, log_dict = data_generator.sample()

print("Batch:")
print(batch.shape)
# print(batch)
# print(batch[0, :, :])
print("log dict")
print(log_dict.keys())
# print(log_dict["categorical_probs"]) # Apparently isomorphic to batch since batch is "1 hot" and gen is deterministic
print(log_dict["params"][0])
print("result keys")
print(log_dict["results"][0].keys())
# print(log_dict["results"][0]["alphabet_size"])
print(repr(log_dict["results"][0]["output"]))
# print(repr(log_dict["results"][0]["short_program"]))

# It seems that jax is so functional that parameters are carried around outside the model??

# Transform one-hots to integer tokens.
#batch = np.argmax(batch, axis=-1)

_, _, log_dict = data_generator.sample_from_params(['[,.[+]+]'])
print(repr(log_dict['results'][0]['short_program']))
print(repr(log_dict['results'][0]['output']))
print(log_dict['results'][0]['status'])
print(log_dict['results'][0]['input_symbols'])


feature size: 2
Batch:
(1, 256, 2)
log dict
dict_keys(['categorical_probs', 'params', 'results', 'loss_mask'])
+.+><<+,],+]>[,>]->,
result keys
dict_keys(['status', 'alphabet_size', 'num_steps', 'memory_index', 'output', 'output_length', 'short_program', 'short_program_length', 'input_symbols'])
'\x01'
'[,.[+]+]'
'\x01\x01\x01\x00\x00\x01\x00\x00\x00\x01\x00\x00'
TIMEOUT
[1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]


In [221]:

print("Simple Alternating Test")

batch = np.array([[1] + [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]*4])
print(batch.shape)
print(batch)

conditionals = model.apply(
    params=params,
    targets=batch,
    rng=None,
)

print(conditionals.shape)
print(conditionals)

preds = []
for i in range(conditionals.shape[1]):
    preds.append(int(conditionals[0,i,0] < conditionals[0,i,1]))
print(f"Padded true: {[0] + list(batch[0,:])}")
print(f"Predictions: {[0] + preds}")



Simple Alternating Test
(1, 105)
[[1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0
  1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0
  1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1]]
(1, 105, 2)
[[[-3.52796048e-01 -1.21308243e+00]
  [-6.94330692e-01 -6.91964984e-01]
  [-4.17537928e-01 -1.07489514e+00]
  [-6.84005439e-01 -7.02373326e-01]
  [-5.39967775e-01 -8.74110579e-01]
  [-4.69937921e-01 -9.80938733e-01]
  [-9.71428931e-01 -4.75686550e-01]
  [-1.52314529e-01 -1.95699883e+00]
  [-2.01736259e+00 -1.42722949e-01]
  [-7.62302130e-02 -2.61187005e+00]
  [-2.62831092e+00 -7.49394372e-02]
  [-4.46894392e-02 -3.13028026e+00]
  [-3.22934413e+00 -4.03882451e-02]
  [-2.81937644e-02 -3.58271718e+00]
  [-3.58815408e+00 -2.80388016e-02]
  [-2.51660571e-02 -3.69481468e+00]
  [-3.67177629e+00 -2.57602502e-02]
  [-4.62821051e-02 -3.09605241e+00]
  [-3.48591733e+00 -3.11044063e-02]
  [-1.39167771e-01 -2.04085207e+00]
  [-3.27231884e+0

In [222]:
print("More complicated deterministic sequence")
h = [1,1,1]
seq_reps = 50 
for i in range(seq_reps):
    h += [0, 1, 1, 1]
batch = np.array([h])

conditionals = model.apply(
    params=params,
    targets=batch,
    rng=None,
)

print(conditionals.shape)
print(conditionals[:, :, :2])

preds = []
for i in range(conditionals.shape[1]):
    preds.append(int(conditionals[0,i,0] < conditionals[0,i,1]))
true_seq = list(batch[0,:])
print(f"Padded true: {[0] + true_seq}")
print(f"Predictions: {[0] + preds}")
print(f"Accuracy without padding: {sum([pred == true_bit for pred, true_bit in zip(preds, true_seq)])/len(true_seq)}")

More complicated deterministic sequence
(1, 203, 2)
[[[-3.52796108e-01 -1.21308243e+00]
  [-6.94330752e-01 -6.91965044e-01]
  [-8.80929768e-01 -5.35113931e-01]
  [-1.21124673e+00 -3.53573710e-01]
  [-4.25086945e-01 -1.06048715e+00]
  [-7.44888484e-01 -6.43951833e-01]
  [-8.60913098e-01 -5.49524605e-01]
  [-1.09323859e+00 -4.08162922e-01]
  [-6.79253578e-01 -7.07236528e-01]
  [-1.26693654e+00 -3.30858499e-01]
  [-1.64442563e+00 -2.14584619e-01]
  [-2.27242902e-01 -1.59320664e+00]
  [-2.38927460e+00 -9.61763710e-02]
  [-2.78502941e+00 -6.37145713e-02]
  [-3.62492180e+00 -2.70127896e-02]
  [-6.24668822e-02 -2.80418921e+00]
  [-3.22328568e+00 -4.06387188e-02]
  [-3.53121901e+00 -2.97060963e-02]
  [-4.18122959e+00 -1.53976036e-02]
  [-3.33867371e-02 -3.41624498e+00]
  [-3.62507629e+00 -2.70084981e-02]
  [-4.20295858e+00 -1.50641669e-02]
  [-4.72602940e+00 -8.90108384e-03]
  [-2.74439882e-02 -3.60930014e+00]
  [-3.74780297e+00 -2.38517225e-02]
  [-4.31054020e+00 -1.35172801e-02]
  [-4.801161

In [226]:


print("Alternating with random bits interspersed")
# This seems arbitrary but builds up to the simple environment where o is irrelevant, a=r
h = []
seq_reps = 80 # Best possible expected accuracy is 2/3 + 1/6 = 5/6 ~0.83. Rises until about 20 then actually falls rapidly!
for i in range(seq_reps):
    h += [0, random.choice([0,1]), 1]
batch = np.array([h])

conditionals = model.apply(
    params=params,
    targets=batch,
    rng=None,
)

print(conditionals.shape)
print(conditionals[:, :, :2])

preds = []
for i in range(conditionals.shape[1]):
    preds.append(int(conditionals[0,i,0] < conditionals[0,i,1]))
true_seq = list(batch[0,:])
print(f"Padded true: {[0] + true_seq}")
print(f"Predictions: {[0] + preds}")
print(f"Accuracy without padding: {sum([pred == true_bit for pred, true_bit in zip(preds, true_seq)])/len(true_seq)}")



Alternating with random bits interspersed
(1, 240, 2)
[[[-3.52796108e-01 -1.21308243e+00]
  [-3.73434246e-01 -1.16592669e+00]
  [-7.15374649e-01 -6.71402931e-01]
  [-9.08050239e-01 -5.16357303e-01]
  [-4.42213506e-01 -1.02893448e+00]
  [-7.11763501e-01 -6.74871087e-01]
  [-7.95743823e-01 -6.00104153e-01]
  [-7.38170862e-01 -6.50063515e-01]
  [-4.51871932e-01 -1.01179886e+00]
  [-7.61074424e-01 -6.29542053e-01]
  [-4.92689669e-01 -9.44126666e-01]
  [-6.55106902e-01 -7.32692003e-01]
  [-6.37143433e-01 -7.52474427e-01]
  [-4.11303908e-01 -1.08703589e+00]
  [-8.76831830e-01 -5.38023949e-01]
  [-1.19366467e-01 -2.18464661e+00]
  [-1.82631862e+00 -1.75550789e-01]
  [-6.69377685e-01 -7.17495441e-01]
  [-4.07854706e-01 -1.09385014e+00]
  [-4.79407132e-01 -9.65350509e-01]
  [-8.94472003e-01 -5.25640488e-01]
  [-3.30585301e-01 -1.26763344e+00]
  [-2.61036277e-01 -1.47077632e+00]
  [-8.91735554e-01 -5.27537227e-01]
  [-3.62541288e-01 -1.19041717e+00]
  [-8.50100279e-01 -5.57519138e-01]
  [-1.6214

In [16]:

print("aor with a=r coin flips, o coin flip")
# Accuracy should level out around 2/3, but never gets close
h = []
seq_reps = 85
for i in range(seq_reps):
    a = random.choice([0,1])
    h += [a, random.choice([0,1]), a]
batch = np.array([h])

conditionals = model.apply(
    params=params,
    targets=batch,
    rng=None,
)

print(conditionals.shape)
print(conditionals[:, :, :2])

preds = []
for i in range(conditionals.shape[1]):
    preds.append(int(conditionals[0,i,0] < conditionals[0,i,1]))
true_seq = list(batch[0,:])
print(f"Padded true: {[0] + true_seq}")
print(f"Predictions: {[0] + preds}")
print(f"Accuracy without padding: {sum([pred == true_bit for pred, true_bit in zip(preds, true_seq)])/len(true_seq)}")




aor with a=r coin flips, o coin flip
[[1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1
  1 0 1 1 0 1 1 1 1 0 1 0 1 0 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1 1 0 1
  1 1 1 1 0 1 0 1 0 0 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
  0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1 0 1 0 1 0 0 1 0 0 0 0 1 1 1 1 0 1 1 0 1
  0 0 0 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0 1 1 1
  0 1 0 1 0 1 1 0 1 1 1 1 0 1 0 1 1 1 0 1 0 1 0 1 0 1 0 0 1 0 0 0 0 1 0 1
  0 1 0 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 1 0 1 1 0 1
  0 1 0]]
(1, 255, 2)
[[[-3.48715782e-01 -1.22279429e+00]
  [-6.96350098e-01 -6.89954460e-01]
  [-8.69686127e-01 -5.43147504e-01]
  [-1.19961905e+00 -3.58546585e-01]
  [-1.69872642e+00 -2.02013820e-01]
  [-2.41085982e+00 -9.40228850e-02]
  [-3.16458702e+00 -4.31492627e-02]
  [-3.98582292e+00 -1.87518466e-02]
  [-4.79692459e+00 -8.28933716e-03]
  [-5.61970758e+00 -3.63225886e-03]
  [-4.08360153e-01 -1.09284723e+00]
  [-9.34250653e-01 -4.99029

In [24]:
# Variable Order Markov Source Evaluation

from neural_networks_solomonoff_induction.data.ctw_data_generator import CTWGenerator

voms = CTWGenerator(
    batch_size = 3,
    seq_length = 50,
    rng = 111,
    max_depth = 32,
    with_contexts = False,
)
sequences, log_dict = voms.sample()
sequences = np.argmax(sequences, axis=-1)
print(batch)
# print(sequences)
# print(log_dict)
print(log_dict.keys())
print(log_dict["categorical_probs"][2])
print(batch)
conditionals = model.apply(
    params = params,
    targets = sequences,
    rng = None,
)
print(conditionals)
regrets = -np.take_along_axis(conditionals, sequences, 0) + np.log(np.take_along_axis(log_dict["categorical_probs"], sequences, 0))
print(regrets.shape)

[[1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1
  1 0 1 1 0 1 1 1 1 0 1 0 1 0 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1 1 0 1
  1 1 1 1 0 1 0 1 0 0 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
  0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1 0 1 0 1 0 0 1 0 0 0 0 1 1 1 1 0 1 1 0 1
  0 0 0 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 1 1 1 1 0 1 0 0 0 0 0 0 1 1 1
  0 1 0 1 0 1 1 0 1 1 1 1 0 1 0 1 1 1 0 1 0 1 0 1 0 1 0 0 1 0 0 0 0 1 0 1
  0 1 0 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 1 0 1 1 0 1
  0 1 0]]
dict_keys(['categorical_probs', 'params', 'context_lengths', 'tree_depths'])
[[0.9476607  0.05233933]
 [0.9476607  0.05233933]
 [0.9476607  0.05233933]
 [0.9476607  0.05233933]
 [0.9476607  0.05233933]
 [0.16239274 0.83760726]
 [0.13188893 0.8681111 ]
 [0.13188893 0.8681111 ]
 [0.13188893 0.8681111 ]
 [0.13188893 0.8681111 ]
 [0.13188893 0.8681111 ]
 [0.13188893 0.8681111 ]
 [0.13188893 0.8681111 ]
 [0.13188893 0.8681111 ]
 [0.13188893 0.8681111 ]
 [0.13188893 0.86811

ValueError: `indices` and `arr` must have the same number of dimensions