In [4]:
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as K
import tensorflow_addons as tfa
import wandb

import settransformer as sf
import tf_utils as tfu

import common

In [5]:
# NOTE(review): no visible cell reads this flag. It presumably was meant to gate
# the `model.load_weights(...)` call that is currently commented out after the
# model definition — TODO wire it up or remove it.
LOAD_PRETRAINED_WEIGHTS = True

In [6]:
# Distribution strategy obtained from the project helper module `tf_utils`.
# Model construction, the optimizer and the training loop all run under its scope.
# NOTE(review): the argument 0 presumably selects GPU index 0 — confirm in tf_utils.
strategy = tfu.strategy.gpu(0)

2022-01-01 23:43:07.773832: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-01 23:43:07.774646: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-01 23:43:07.779346: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-01 23:43:07.780129: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-01 23:43:07.780882: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

In [7]:
# Start a Weights & Biases run for this notebook and keep the returned handle.
wandb_run = wandb.init(project="dna-embeddings", entity="sirdavidludwig")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msirdavidludwig[0m (use `wandb login --relogin` to force relogin)


In [17]:
# Hyperparameters for this experiment. Every later cell reads them through the
# plain `config` dict.
config = {
    "learning_rate": 1e-3,            # initial rate of the optimizer used by the training loop
    "learning_rate_schedule": 0.95,   # multiplicative factor applied on each decay event
    "batch_size": 256,
    "sequence_length": 150,           # length requested from the sequence generators
    "kmers": 3,                       # symbols folded into one token
    "embed_dim": 64,
    "num_heads": 8,
    "enc_stack": 2,                   # transformer blocks in the encoder
    "dec_stack": 2,                   # transformer blocks in the decoder
    "latent_dim": 64,
    "prenorm": True
}

# Record the hyperparameters on the active run. Assigning a dict to
# `wandb.config` only rebinds the module attribute and never reaches the run;
# `update` on the run's own config does. `allow_val_change=True` lets this cell
# be re-executed after a value has been edited.
wandb_run.config.update(config, allow_val_change=True)

## Data Generation

In [18]:
# Collect the train and test sample sets from ./datasets via the project helper.
# NOTE(review): the second argument looks like a split name used to filter the
# results — confirm against common.find_shelves.
train_samples = common.find_shelves("./datasets", "train")
test_samples = common.find_shelves("./datasets", "test")

In [19]:
# Keyword arguments shared by the train and test sequence generators.
args = {
    "length": config["sequence_length"],
    "batch_size": config["batch_size"],
    "include_quality_scores": False,
}

# One generator per split, built from the same settings.
train_gen = common.DnaSequenceGenerator(train_samples, **args)
test_gen = common.DnaSequenceGenerator(test_samples, **args)

# Both generators are started only after both have been constructed.
train_gen.start()
test_gen.start()

In [20]:
# Wrap each generator as a dataset bound to the distribution strategy; the
# training loop iterates these with iter()/next().
train = train_gen.as_dist_dataset(strategy)
test = test_gen.as_dist_dataset(strategy)

2022-01-01 23:45:17.632356: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:695] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: "FlatMapDataset/_2"
op: "FlatMapDataset"
input: "TensorDataset/_1"
attr {
  key: "Targuments"
  value {
    list {
    }
  }
}
attr {
  key: "f"
  value {
    func {
      name: "__inference_Dataset_flat_map_flat_map_fn_182"
    }
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
        dim {
          size: 256
        }
        dim {
          size: 150
        }
      }
    }
  }
}
attr {
  key: "output_types"
  value {
    list {
      type: DT_INT32
    }
  }
}
. Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. You can do this by creating a new `tf.data.Options()` object then setting `options.experimental_di

---
## Model

In [21]:
# Unpack hyperparameters into module-level names; the helper functions defined
# in later cells read `kmers` and `sequence_length` from here.
kmers = config["kmers"]
# Number of k-mer tokens per sequence (raw length divided by the k-mer size).
sequence_length = config["sequence_length"]//config["kmers"]
# Vocabulary size: every combination of `kmers` symbols over a 5-symbol alphabet.
# NOTE(review): presumably 4 nucleotides plus one extra symbol — TODO confirm.
num_tokens = 5**config["kmers"]
embed_dim = config["embed_dim"]
latent_dim = config["latent_dim"]
enc_stack = config["enc_stack"]
dec_stack = config["dec_stack"]
num_heads = config["num_heads"]
prenorm = config["prenorm"]

with strategy.scope():
    # Encoder: token ids -> embedding -> position embedding -> transformer
    # blocks -> flatten -> dense projection to the latent size -> layer norm.
    y = x = keras.layers.Input((sequence_length,))
    
    # Disabled in-graph k-mer encoding; the conversion is done by
    # encode_kmer_sequence_batch before the model is called.
    # NOTE(review): the disabled line raises to a power (**) where the live
    # helper multiplies (*) — fix before re-enabling.
    # kmer_powers = np.full(kmers, 5)**np.arange(kmers - 1, -1, -1)
    # y = keras.layers.Lambda(lambda x: tf.reduce_sum(tf.reshape(x, (-1, sequence_length, kmers))**kmer_powers, axis=2))(y)
    
    y = keras.layers.Embedding(input_dim=num_tokens, output_dim=embed_dim)(y)
    y = common.FixedPositionEmbedding(length=sequence_length, embed_dim=embed_dim)(y)
    for _ in range(enc_stack):
        y = common.TransformerBlock(embed_dim, num_heads, ff_dim=embed_dim, prenorm=prenorm)(y)
    y = keras.layers.Flatten()(y)
    y = keras.layers.Dense(latent_dim)(y)
    y = keras.layers.LayerNormalization()(y)
    encoder = keras.Model(x, y, name="Encoder")

    # Decoder: latent vector -> dense expansion -> reshape back to
    # (tokens, embed_dim) -> position embedding -> transformer blocks ->
    # per-token softmax over the k-mer vocabulary.
    y = x = keras.layers.Input((encoder.output.shape[1:]))
    y = keras.layers.Dense(sequence_length*embed_dim)(y)
    y = keras.layers.Reshape((-1, embed_dim))(y)
    # `embed` is an extra alias for this tensor; no visible cell uses it.
    y = embed = common.FixedPositionEmbedding(length=sequence_length, embed_dim=embed_dim)(y)
    for _ in range(dec_stack):
        y = common.TransformerBlock(embed_dim, num_heads, ff_dim=embed_dim, prenorm=prenorm)(y)
    y = keras.layers.Dense(num_tokens, activation="softmax")(y)
    decoder = keras.Model(x, y, name="Decoder")

    # Coupled model: encoder followed by decoder, with both sub-models also
    # attached as attributes for convenient access.
    y = x = keras.layers.Input(encoder.input.shape[1:])
    y = encoder(y)
    y = decoder(y)
    model = keras.Model(x, y, name="Autoencoder")
    model.encoder = encoder
    model.decoder = decoder
    # NOTE(review): the optimizer passed here uses a hard-coded 1e-3 rather than
    # config["learning_rate"], and the custom training loop applies gradients
    # with a separately created Adam optimizer. The compiled optimizer/loss/
    # metrics would only matter if model.fit/evaluate were called.
    # from_logits=False matches the softmax activation on the decoder output.
    model.compile(
#         optimizer=tfa.optimizers.AdamW(learning_rate=1e-3, weight_decay=0.0001),
        keras.optimizers.Nadam(1e-3),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        metrics=[keras.metrics.SparseCategoricalAccuracy()])
    model.summary()

Model: "Autoencoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
Encoder (Functional)         (None, 64)                498688    
_________________________________________________________________
Decoder (Functional)         (None, 50, 125)           501821    
Total params: 1,000,509
Trainable params: 1,000,509
Non-trainable params: 0
_________________________________________________________________


In [29]:
# model.load_weights("./models/embed_3mer_64.h5")

In [22]:
with strategy.scope():
    # reduction="none" keeps the loss unreduced so that loss_fn can do its own
    # averaging with tf.nn.compute_average_loss.
    loss_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction="none")
    # One metric object shared by the train and validation steps; the training
    # loop reads and resets it after each of them.
    metric = keras.metrics.SparseCategoricalAccuracy()
    # Optimizer actually applied in train_step (separate from the one given to
    # model.compile).
    optimizer = keras.optimizers.Adam(config["learning_rate"])

In [23]:
def learning_rate_schedule(optimizer, scale = 0.95):
    """Decay the optimizer's learning rate in place.

    Args:
        optimizer: object whose ``learning_rate`` supports ``* scale`` and
            exposes ``assign(new_value)``.
        scale: multiplicative factor applied to the current learning rate.

    Returns:
        None. The learning rate is updated through ``assign``.
    """
    decayed = optimizer.learning_rate*scale
    optimizer.learning_rate.assign(decayed)

In [24]:
def loss_fn(real, pred):
    """Compute the scalar training/validation loss for one replica batch.

    The unreduced values from ``loss_obj`` are summed and divided by the
    configured global batch size via ``tf.nn.compute_average_loss``.

    NOTE(review): if ``loss_obj`` yields one value per sequence position, the
    result is a per-sequence sum rather than a per-token mean — confirm that
    this scaling is intended.

    Args:
        real: target token ids.
        pred: predicted token probabilities.

    Returns:
        Scalar loss tensor.
    """
    unreduced = loss_obj(real, pred)
    global_batch = config["batch_size"]
    return tf.nn.compute_average_loss(unreduced, global_batch_size=global_batch)

In [25]:
def encode_kmer_sequence_batch(batch):
    """Fold every run of ``kmers`` consecutive symbols into one k-mer token id.

    Each row is split into ``sequence_length`` groups of ``kmers`` symbols and
    every group is read as a base-5 number, most significant symbol first. The
    resulting ids therefore lie in ``[0, 5**kmers)``, matching ``num_tokens``.

    The batch dimension is inferred (``-1``) instead of being read from
    ``batch.shape[0]``, so the function also works when the batch size is not
    statically known or differs from the configured one.

    Args:
        batch: integer array/tensor with ``sequence_length * kmers`` symbols per row.

    Returns:
        Integer tensor of k-mer ids with ``sequence_length`` entries per row.
    """
    # Positional weights, e.g. [25, 5, 1] for kmers == 3.
    kmer_powers = np.full(kmers, 5)**np.arange(kmers - 1, -1, -1)
    grouped = tf.reshape(batch, (-1, sequence_length, kmers))
    return tf.reduce_sum(grouped*kmer_powers, axis=2)

In [26]:
def train_step(batch):
    """Run one optimisation step on a single replica.

    The raw batch is converted to k-mer token ids, which serve as both the
    model input and the reconstruction target (autoencoder).

    Args:
        batch: raw symbol batch as accepted by ``encode_kmer_sequence_batch``.

    Returns:
        Scalar loss tensor for this replica.
    """
    kmer_batch = encode_kmer_sequence_batch(batch)
    with tf.GradientTape() as tape:
        # training=True so that any training-only layer behaviour (e.g. dropout,
        # if the project blocks use it) is active during optimisation; a bare
        # model(x) call in a custom loop runs the layers in inference mode.
        pred = model(kmer_batch, training=True)
        loss = loss_fn(kmer_batch, pred)
    # Use the same variable list for differentiation and for the update.
    variables = model.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

    # Accumulate accuracy; the caller reads and resets the metric.
    metric.update_state(kmer_batch, pred)

    return loss

@tf.function
def dist_train_step(batch):
    """Execute ``train_step`` through the strategy and return the mean loss.

    ``batch`` is forwarded as the positional-argument sequence of
    ``strategy.run``.
    NOTE(review): this assumes each dataset element is a tuple whose single
    entry is the sequence tensor — confirm against the generator.
    """
    per_replica_loss = strategy.run(train_step, args=batch)
    mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, axis=None)
    return mean_loss

In [27]:
def validate_step(batch):
    """Evaluate the model on one replica batch without updating any weights.

    Args:
        batch: raw symbol batch as accepted by ``encode_kmer_sequence_batch``.

    Returns:
        Scalar loss tensor for this replica.
    """
    targets = encode_kmer_sequence_batch(batch)
    reconstruction = model(targets)
    step_loss = loss_fn(targets, reconstruction)
    # Same shared metric as in training; the caller reads and resets it.
    metric.update_state(targets, reconstruction)
    return step_loss

@tf.function
def dist_validate_step(batch):
    """Execute ``validate_step`` through the strategy and return the mean loss.

    ``batch`` is forwarded as the positional-argument sequence of
    ``strategy.run``.
    NOTE(review): this assumes each dataset element is a tuple whose single
    entry is the sequence tensor — confirm against the generator.
    """
    per_replica_loss = strategy.run(validate_step, args=batch)
    mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, axis=None)
    return mean_loss

In [28]:
# Training history, appended to by the training loop cell. Re-running this cell
# discards the history and restarts the step counter.
steps = []            # 1-based step numbers, one per training step
losses = []           # training loss per step
val_losses = []       # validation loss, one entry every `log_frequency` steps
accuracies = []       # training accuracy per step
val_accuracies = []   # validation accuracy, one entry every `log_frequency` steps
step = 0              # index at which the training loop starts
log_frequency = 100   # validate (and later subsample the plots) every this many steps

In [16]:
with strategy.scope():
    x_train = iter(train)
    x_test = iter(test)

    # Number of optimisation steps performed by one execution of this cell.
    num_steps = 10000
    # Capture the starting index before the loop variable overwrites `step`.
    first_step = step

    for step in range(first_step, first_step + num_steps):
        batch = next(x_train)
        loss = dist_train_step(batch)

        steps.append(step+1)
        losses.append(loss.numpy())
        accuracies.append(metric.result().numpy())

        # The metric is shared with validation, so clear it after every read.
        metric.reset_states()

        # After a 2000-step warm period, decay the learning rate every 100 steps.
        if step > 2000 and step % 100 == 0:
            learning_rate_schedule(optimizer, config["learning_rate_schedule"])

        print(f"\r{steps[-1]} completed. loss={losses[-1]}; accuracy={accuracies[-1]}; learning rate={optimizer.learning_rate.numpy()}", end="")

        # Periodic validation on a single batch from the test stream.
        if step % log_frequency == 0:
            batch = next(x_test)
            loss = dist_validate_step(batch)

            val_losses.append(loss.numpy())
            val_accuracies.append(metric.result().numpy())
            metric.reset_states()

    # Move past the last executed index. Previously `step` was left at that
    # index, so re-running the cell repeated it and shifted the train/validation
    # alignment used for plotting by one.
    step = first_step + num_steps

10000 completed. loss=2.994931936264038; accuracy=0.8650173544883728; learning rate=1.7384587408741936e-05

In [17]:
# Upload loss and accuracy curves. The per-step training series are subsampled
# every `log_frequency` entries so they line up with the validation series,
# which only has one entry per `log_frequency` steps.
wandb.log({
    "Loss": wandb.plot.line_series(
        xs=steps[::log_frequency],
        ys=[losses[::log_frequency], val_losses],
        keys=["Training Loss", "Validation Loss"],
        title="Loss",
        xname="Step"),
    "Accuracy": wandb.plot.line_series(
        xs=steps[::log_frequency],
        ys=[accuracies[::log_frequency], val_accuracies],
        keys=["Training Accuracy", "Validation Accuracy"],
        title="Accuracy",  # was misspelled "Accuarcy" in the chart title
        xname="Step")
})

In [19]:
# Persist the autoencoder weights; this is the same path referenced by the
# commented-out load_weights call after the model definition.
model.save_weights("./models/embed_3mer_64.h5")

In [18]:
# Sanity check: encode one batch pulled directly from the test generator
# (element 0 of what it yields) and inspect the latent statistics.
# The encoder ends in Dense(latent_dim) + LayerNormalization, so the expected
# shape is (batch_size, latent_dim); a saved output showing a different trailing
# dimension comes from an earlier configuration.
latent = encoder(encode_kmer_sequence_batch(next(test_gen)[0]))
print("mean:", np.mean(latent))
print("std:", np.std(latent))
print("shape:", latent.shape)

mean: 0.011043923
std: 1.029386
shape: (256, 4)


In [30]:
# Close the Weights & Biases run started at the top of the notebook.
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…