In [1]:
import os

In [2]:
# Runtime switches; they are set here so they are in place before TensorFlow
# is imported in the next cell.
# NOTE(review): the consumer of TF_KERAS is not visible in this notebook --
# confirm it is still needed.
os.environ['TF_KERAS'] = '2'
# cuBLAS workspace configuration (presumably for reproducible GPU results).
# The notebook used to assign ':16:8' and then immediately overwrite it with
# ':4096:2'; only the value that actually took effect is kept.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:2'
# Ask TensorFlow for deterministic op implementations.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
# Make TensorFlow use the legacy Keras 2 implementation (tf_keras).
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [3]:
import mmap
import os
import pathlib
import math
import random
import contextlib
import tensorflow as tf
import numpy as np
import tf_keras as keras
from time import perf_counter
from tf_keras import mixed_precision
from utils.UtilClass import BitOutputStream, ArithmeticEncoder

2024-07-28 20:15:39.179676: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-28 20:15:39.187811: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-28 20:15:39.197198: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-28 20:15:39.199948: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-28 20:15:39.206903: I tensorflow/core/platform/cpu_feature_guar

In [4]:
# Report how many GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices(device_type='GPU')))

Num GPUs Available:  1


I0000 00:00:1722190541.462428   98373 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1722190541.483790   98373 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1722190541.483911   98373 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [5]:
# Project root: the parent of the current working directory.
ROOT_DIR = pathlib.Path.cwd().parent

In [6]:
def mmap_io(filename: str) -> bytes:
    """Reads a whole file into memory through a read-only memory map.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete file contents as bytes; b"" when the file is empty.
    """
    with open(filename, mode="rb") as file_obj:
        # mmap refuses to map a zero-length file (it raises ValueError), so
        # the empty case is answered directly instead of crashing.
        if os.fstat(file_obj.fileno()).st_size == 0:
            return b""
        with mmap.mmap(file_obj.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
            return mmap_obj.read()

In [7]:
#@title System Info

def system_info():
  """Prints out system information."""
  gpu_info = !nvidia-smi
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
    print('and then re-execute this cell.')
  else:
    print(gpu_info)
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
#   print("Pytorch version: ", torch.__version__)
  print("Tensorflow version: ", tf.__version__)
  !lscpu |grep 'Model name'
  !cat /proc/meminfo | head -n 3

system_info()

Sun Jul 28 20:15:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4070 ...    On  | 00000000:01:00.0 Off |                  N/A |
| N/A   47C    P3              N/A /  55W |     11MiB /  8188MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [8]:
# Input file to compress: <ROOT_DIR>/data/ready4cmix.
path = os.path.join(os.fspath(ROOT_DIR), 'data', 'ready4cmix')

In [9]:
# Raw file contents as bytes; iterating over bytes yields ints in 0..255.
text = mmap_io(path)
# Sorted distinct byte values that occur in the file.
vocab = sorted(set(text))
vocab_size = len(vocab)
# Maps each byte value to its dense index in `vocab`.
char2idx = {u: i for i, u in enumerate(vocab)}
# The file re-encoded as vocabulary indices, one entry per input byte.
intlist = [char2idx[c] for c in text]

In [10]:
# Round the vocabulary size up to the next multiple of 8.
vocab_size = 8 * math.ceil(vocab_size / 8)
# Number of symbols in the input file.
file_len = len(intlist)
print(f'Length of file: {file_len} symbols')
print(f'Vocabulary size: {vocab_size}')

Length of file: 933340281 symbols
Vocabulary size: 208


In [11]:
# Number of parallel streams the file is split into (one per batch row).
batch_size = 256
# Number of previous symbols fed to the model for each prediction.
seq_length = 15
# Units in each LSTM layer.
rnn_units = 2000
# Number of stacked LSTM layers.
num_layers = 5
# Output dimension of the embedding layer.
embedding_size = 1024
# The learning rate decays linearly from the start value to the end value.
start_learning_rate = 5e-4
end_learning_rate = 2e-4

In [12]:

#@title Architecture

def build_model(vocab_size):
    """Builds the stacked-LSTM prediction model.

    The model takes 1 + 2 * num_layers inputs: the symbol window followed by an
    (h, c) state pair for every LSTM layer. It produces 1 + 2 * num_layers
    outputs: the softmax prediction for the last timestep followed by the new
    (h, c) state pair of every LSTM layer, in layer order.

    Side effect: sets the global Keras mixed-precision policy to
    'mixed_float16'.

    Args:
        vocab_size: Int, size of the vocabulary.

    Returns:
        A keras.Model wired as described above. It reads the notebook-level
        settings batch_size, seq_length, rnn_units, num_layers and
        embedding_size.
    """
    policy = mixed_precision.Policy('mixed_float16')
    mixed_precision.set_global_policy(policy)
    inputs = [
    keras.Input(batch_input_shape=[batch_size, seq_length])] # shape (batch_size, seq_length)
    # In addition to the primary input, there are also two "state" inputs for each
    # layer of the network.
    for i in range(num_layers):
        inputs.append(keras.Input(shape=(None,)))
        inputs.append(keras.Input(shape=(None,)))
    embedding = keras.layers.Embedding(vocab_size, embedding_size)(inputs[0])
    # Skip connections will be used to connect each LSTM layer output to the final
    # output layer. Each LSTM layer will get as input both the original input and
    # the output of the previous layer.
    skip_connections = []
    # In addition to the softmax output, there are also two "state" outputs for
    # each layer of the network.
    outputs = []
    # First LSTM layer: fed by the embedding only; its initial state comes from
    # inputs[1] (h) and inputs[2] (c), cast to float16.
    predictions, state_h, state_c = keras.layers.LSTM(rnn_units,
                            return_sequences=True,
                            return_state=True,
                            recurrent_initializer='glorot_uniform',
                            )(embedding, initial_state=[
                            tf.cast(inputs[1], tf.float16),
                            tf.cast(inputs[2], tf.float16)])
    skip_connections.append(predictions)
    outputs.append(state_h)
    outputs.append(state_c)
    # Remaining layers: each one sees the embedding concatenated with the
    # previous layer's sequence output. Layer i + 1 takes its initial state from
    # inputs[i*2+3] (h) and inputs[i*2+4] (c).
    for i in range(num_layers - 1):
        layer_input = keras.layers.concatenate(
            [embedding, skip_connections[-1]])
        predictions, state_h, state_c = keras.layers.LSTM(rnn_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform')(
                layer_input, initial_state=[tf.cast(inputs[i*2+3], tf.float16),
                                            tf.cast(inputs[i*2+4], tf.float16)])
        skip_connections.append(predictions)
        outputs.append(state_h)
        outputs.append(state_c)
    # The dense output layer only needs to be computed for the last timestep, so
    # we can discard the earlier outputs.
    last_timestep = []
    for i in range(num_layers):
        last_timestep.append(tf.slice(skip_connections[i], [0, seq_length - 1, 0],
                                    [batch_size, 1, rnn_units]))
    # Concatenate the last-timestep output of every layer (nothing to
    # concatenate when there is a single layer).
    if num_layers == 1:
        layer_input = last_timestep[0]
    else:
        layer_input = keras.layers.concatenate(last_timestep)
    dense = keras.layers.Dense(vocab_size, name='dense_logits')(layer_input)
    # The softmax is computed in float32 even though the policy is mixed_float16.
    output = keras.layers.Activation('softmax', dtype='float32',
                                        name='predictions')(dense)
    # The prediction goes first, ahead of the per-layer state outputs.
    outputs.insert(0, output)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

In [13]:
def get_symbol(index, length, freq, coder, compress, data):
    """Codes a single symbol with the arithmetic coder and returns it.

    Args:
    index: Int, position of the symbol in the file.
    length: Int, size limit of the file.
    freq: ndarray, predicted symbol probabilities.
    coder: this is the arithmetic coder.
    compress: Boolean, True if compressing, False if decompressing.
    data: List containing each symbol in the file. When decompressing, the
        decoded symbol is stored into it at "index".

    Returns:
    The symbol at "index", or 0 if "index" is over the file size limit (in
    which case the coder and "data" are left untouched).
    """
    # Positions past the end of the file are padding: nothing is coded.
    if index >= length:
        return 0
    if not compress:
        # Decompressing: pull the symbol out of the coder and record it.
        decoded = coder.read(freq)
        data[index] = decoded
        return decoded
    # Compressing: the symbol is already known, so just encode it.
    known = data[index]
    coder.write(freq, known)
    return known

In [14]:
def reset_seed():
    """Seeds Python, NumPy and TensorFlow with a fixed value for determinism.

    Also exports PYTHONHASHSEED. Python reads that variable at interpreter
    start-up, so it only matters for interpreters launched after this call.
    """
    seed = 1234
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same seeding order as before: random, numpy, tensorflow.
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

In [15]:
def train(pos, seq_input, length, vocab_size, coder, model, optimizer, compress,
          data, states):
    """Runs one combined predict / code / train step for every stream.

    The file is treated as batch_size parallel streams of "split" symbols each.
    For stream i this step codes the symbol at file position
    pos + 1 + i * split using the model's prediction, then updates the model
    weights on the symbols that were just coded.

    Side effects: consumes the oldest entry of "states" and appends the newly
    produced one, drives "coder", writes decoded symbols into "data" when
    decompressing, and updates the model weights through "optimizer".

    Args:
    pos: Int, offset within each stream of the most recently coded symbol; the
        symbol coded for stream i is at file position pos + 1 + i * split.
    seq_input: Tensor, containing the last seq_length inputs for the model.
    length: Int, size limit of the file.
    vocab_size: Int, size of the vocabulary.
    coder: this is the arithmetic coder.
    model: the model to generate predictions.
    optimizer: optimizer used to train the model. It must provide
        get_scaled_loss / get_unscaled_gradients (loss-scaling wrapper).
    compress: Boolean, True if compressing, False if decompressing.
    data: List containing each symbol in the file.
    states: List (used as a queue) of per-step state lists for the layers of
        the model.

    Returns:
    seq_input: Tensor, containing the last seq_length inputs for the model.
    cross_entropy: cross entropy numerator, i.e. the sum of log2(probability)
        over the symbols coded in this step (zero or negative).
    denom: cross entropy denominator, i.e. how many symbols were coded.
    """
    loss = cross_entropy = denom = 0
    # Number of symbols per stream.
    split = math.ceil(length / batch_size)
    # Keep track of operations while running the forward pass for automatic
    # differentiation.
    with tf.GradientTape() as tape:
        # The model inputs contain both seq_input and the states for each layer.
        # The state list taken here is the oldest one in the queue; it is
        # extended in place with seq_input at the front.
        inputs = states.pop(0)
        inputs.insert(0, seq_input)
        # Run the model (for all batches in parallel) to get predictions for the
        # next characters.
        outputs = model(inputs)
        # First output is the softmax prediction; the rest are the new layer
        # states, which go to the back of the queue.
        predictions = outputs.pop(0)
        states.append(outputs)
        # Plain NumPy copy of the probabilities for the arithmetic coder.
        p = predictions.numpy()
        symbols = []
        # When the last batch reaches the end of the file, we start giving it "0"
        # as input. We use a mask to prevent this from influencing the gradients.
        mask = []
        # Go over each batch to run the arithmetic coding and prepare the next
        # input.
        for i in range(batch_size):
            # Scale the probabilities by 10000000 and add 1 so that every symbol
            # keeps a strictly positive frequency, then accumulate them into a
            # cumulative frequency table for the arithmetic coder.
            freq = np.cumsum(p[i][0] * 10000000 + 1)
            index = pos + 1 + i * split
            symbol = get_symbol(index, length, freq, coder, compress, data)
            symbols.append(symbol)
            if index < length:
                prob = p[i][0][symbol]
                if prob <= 0:
                    # Set a small value to avoid error with log2.
                    prob = 0.000001
                cross_entropy += math.log2(prob)
                denom += 1
                mask.append(1.0)
            else:
                mask.append(0.0)
        # "input_one_hot" is the one-hot target used by the loss function (the
        # next model input is built from "symbols" further down).
        input_one_hot = tf.expand_dims(tf.one_hot(symbols, vocab_size), 1)
        # Per-stream cross entropy, zeroed by the mask for streams that are
        # already past the end of the file.
        loss = keras.losses.categorical_crossentropy(
            input_one_hot, predictions, from_logits=False) * tf.expand_dims(
                tf.convert_to_tensor(mask), 1)
        # Loss scaling for mixed precision; undone on the gradients below.
        scaled_loss = optimizer.get_scaled_loss(loss)
        # Remove the oldest input and append the new one.
        seq_input = tf.slice(seq_input, [0, 1],
                                [batch_size, seq_length - 1])
        seq_input = tf.concat([seq_input, tf.expand_dims(symbols, 1)], 1)
    # Run the backwards pass to update model weights.
    scaled_gradients = tape.gradient(scaled_loss, model.trainable_variables)
    grads = optimizer.get_unscaled_gradients(scaled_gradients)
    # Gradient clipping to make training more robust. Each gradient tensor is
    # clipped on its own to a norm of at most 4.
    capped_grads = [tf.clip_by_norm(grad, 4) for grad in grads]
    optimizer.apply_gradients(zip(capped_grads, model.trainable_variables))
    return (seq_input, cross_entropy, denom)

In [16]:
def process(compress, length, vocab_size, coder, data):
    """This runs compression/decompression.

    Builds the model (loading the latest checkpoint from
    <ROOT_DIR>/data/checkpoint when one exists), codes the first symbol of
    every stream with a uniform distribution, then repeats the training step
    until every stream has been processed. Progress is printed roughly every
    20 seconds. Afterwards the coder is finished (when compressing), system
    information is printed and the model weights are saved.

    Args:
    compress: Boolean, True if compressing, False if decompressing.
    length: Int, size limit of the file.
    vocab_size: Int, size of the vocabulary.
    coder: this is the arithmetic coder.
    data: List containing each symbol in the file.
    """
    start = perf_counter()
    reset_seed()
    model = build_model(vocab_size = vocab_size)
    checkpoint_path = tf.train.latest_checkpoint(
        os.path.join(ROOT_DIR, 'data', 'checkpoint'))
    if checkpoint_path:
        model.load_weights(checkpoint_path)
    model.summary()

    # Try to split the file into equal size pieces for the different batches. The
    # last batch may have fewer characters if the file can't be split equally.
    split = math.ceil(length / batch_size)

    # Linear decay (power=1.0) from the start to the end learning rate over
    # "split" steps, i.e. over the whole run.
    learning_rate_fn = keras.optimizers.schedules.PolynomialDecay(
        start_learning_rate,
        split,
        end_learning_rate,
        power=1.0)
    optimizer = keras.optimizers.Adam(
        learning_rate=learning_rate_fn,
        beta_1=0,
        beta_2=0.9999,
        epsilon=1e-5)
    # Wrap the optimizer so the training step can scale the loss and unscale
    # the gradients under mixed precision.
    optimizer = mixed_precision.LossScaleOptimizer(optimizer)
    model.reset_states()
    # Use a uniform distribution for predicting the first batch of symbols. The
    # "10000000" is used to convert floats into large integers (since the
    # arithmetic coder works on integers).
    freq = np.cumsum(np.full(vocab_size, (1.0 / vocab_size)) * 10000000 + 1)
    # Construct the first set of input characters for training: stream i starts
    # at file position i * split.
    symbols = []
    for i in range(batch_size):
        symbols.append(get_symbol(i*split, length, freq, coder, compress, data))
    # Replicate the input tensor seq_length times, to match the input format.
    seq_input = tf.tile(tf.expand_dims(symbols, 1), [1, seq_length])
    pos = cross_entropy = denom = last_output = 0
    template = '{:0.2f}%\tcross entropy: {:0.2f}\ttime: {:0.2f}'
    # This will keep track of layer states. Initialize them to zeros. The queue
    # holds seq_length entries; each training step consumes the oldest entry
    # and appends a new one.
    states = []
    for _ in range(seq_length):
        states.append([tf.zeros([batch_size, rnn_units])] * (num_layers * 2))
    # Keep repeating the training step until we get to the end of the file.
    while pos < split:
        seq_input, ce, d = train(pos, seq_input, length, vocab_size, coder, model,
                                optimizer, compress, data, states)
        cross_entropy += ce
        denom += d
        pos += 1
        time_diff = perf_counter() - start
        # If it has been over 20 seconds since the last status message, display a
        # new one.
        if time_diff - last_output > 20:
            last_output = time_diff
            percentage = 100 * pos / split
            if percentage >= 100: continue
            print(template.format(percentage, -cross_entropy / denom, time_diff))
    if compress:
        coder.finish()
    # "start" came from perf_counter(), so the elapsed time must be measured
    # with the same clock (the "time" module is not imported in this notebook).
    print(template.format(100, -cross_entropy / length, perf_counter() - start))
    system_info()
    # "mode" is an optional notebook-level setting that is not defined in this
    # notebook; when it is missing it is treated as anything other than "both",
    # so the weights are saved. With mode == "both" the weights are only saved
    # after the decompression pass.
    # NOTE(review): weights are loaded from data/checkpoint but saved to
    # data/models -- confirm this asymmetry is intended.
    if globals().get('mode') != "both" or not compress:
        model.save_weights(os.path.join(ROOT_DIR, 'data', 'models'))

In [17]:
def compress(path):
    """Compresses the loaded symbol list into the file at "path".

    File layout: a 5-byte big-endian symbol count, then 256 presence bits (one
    per possible byte value, 1 when that value occurs in the vocabulary),
    followed by whatever the arithmetic coder emits while the data is being
    processed. The size of the resulting file is printed at the end.

    Args:
    path: Destination of the compressed file; it is overwritten.
    """
    with open(path, "wb") as out, contextlib.closing(BitOutputStream(out)) as bitout:
        symbol_count = len(intlist)
        out.write(symbol_count.to_bytes(5, byteorder='big', signed=False))
        # Vocabulary bitmap: one flag per possible byte value.
        for byte_value in range(256):
            bitout.write(1 if byte_value in char2idx else 0)
        # NOTE(review): 32 is presumably the coder's state width in bits --
        # confirm against utils.UtilClass.
        encoder = ArithmeticEncoder(32, bitout)
        process(True, symbol_count, vocab_size, encoder, intlist)
    print("Compressed size:", os.path.getsize(path))

In [18]:
# Compress the loaded input into <ROOT_DIR>/data/compressed.dat.
path_to_file = os.path.join(os.fspath(ROOT_DIR), "data", "compressed.dat")
compress(path_to_file)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 4070 Laptop GPU, compute capability 8.9


I0000 00:00:1722190600.299626   98373 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1722190600.312218   98373 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1722190600.312357   98373 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1722190600.312409   98373 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(256, 15)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_3 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (256, 15, 1024)              212992    ['input_1[0][0]']             
                                                                                              

2024-07-28 20:16:42.911950: E tensorflow/core/util/util.cc:131] oneDNN supports DT_HALF only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.
2024-07-28 20:16:42.977086: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907
I0000 00:00:1722190606.557915   98373 service.cc:146] XLA service 0x56441db84360 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1722190606.557930   98373 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Laptop GPU, Compute Capability 8.9
2024-07-28 20:16:46.561688: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1722190606.709978   98373 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


0.00%	cross entropy: 8.23	time: 20.99
0.00%	cross entropy: 7.14	time: 41.58
0.00%	cross entropy: 6.62	time: 61.86
0.00%	cross entropy: 6.51	time: 96.11
0.00%	cross entropy: 6.45	time: 122.80
0.00%	cross entropy: 6.25	time: 144.84
0.00%	cross entropy: 6.10	time: 165.54
0.00%	cross entropy: 5.99	time: 189.58
0.00%	cross entropy: 5.91	time: 221.95
0.00%	cross entropy: 5.90	time: 250.36
0.00%	cross entropy: 5.83	time: 271.53
0.00%	cross entropy: 5.75	time: 293.38
0.00%	cross entropy: 5.69	time: 318.38
0.00%	cross entropy: 5.66	time: 346.78
0.00%	cross entropy: 5.63	time: 373.84
0.00%	cross entropy: 5.58	time: 395.70
0.00%	cross entropy: 5.52	time: 422.55
0.00%	cross entropy: 5.50	time: 443.66
0.00%	cross entropy: 5.49	time: 479.11
0.00%	cross entropy: 5.46	time: 500.60
0.00%	cross entropy: 5.43	time: 521.39
0.00%	cross entropy: 5.39	time: 550.03
0.00%	cross entropy: 5.38	time: 580.77
0.00%	cross entropy: 5.36	time: 616.12
0.00%	cross entropy: 5.34	time: 637.48
0.00%	cross entropy: 5.31	tim

KeyboardInterrupt: 