# DNABERT

## Dependencies

First, it's important to bootstrap the notebook in order for local imports to work correctly.

In [1]:
import bootstrap

Installed dependencies

In [2]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import os
import shelve
import time
import tf_utils as tfu
import wandb

Local dependencies

In [3]:
from common.data import find_shelves, DnaKmerSequenceGenerator
from common.models import dnabert
from common.utils import load_model

---
## Strategy

In [4]:
strategy = tfu.strategy.gpu(0)

2022-05-02 04:03:48.673930: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 04:03:48.674145: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 04:03:48.679567: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 04:03:48.679767: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 04:03:48.679931: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from S

## Wandb API

In [5]:
api = wandb.Api()

Here, we connect to the latest run of pretraining DNABERT on W&B that we're interested in analyzing.

In [6]:
# run = api.runs(
#     path="sirdavidludwig/deep-learning-dna",
#     filters={"group": {"$regex": "dnabert:pretrain"}})[0]
# run.name

In [7]:
run = api.run(path="sirdavidludwig/deep-learning-dna/3tllv9ep")
run.name

'dnabert-1651398131'

## Dataset

Next we can fetch the dataset artifact.

In [8]:
temp_path = "./tmp"
os.makedirs(temp_path, exist_ok=True)

### Fetch from Artifact

In [9]:
dataset_artifact = run.used_artifacts()[0]
dataset_artifact.name

'dnasamples:v5'

In [10]:
dataset_dir = dataset_artifact.download(temp_path)
dataset_dir

[34m[1mwandb[0m: Downloading large artifact dnasamples:v5, 328.84MB. 63 files... Done. 0:0:0


'./tmp'

### Data Generator

In [11]:
sample_files = find_shelves(os.path.join(dataset_dir, "test"), prepend_path=True)
sample_files

['./tmp/test/fall_2016-10-07',
 './tmp/test/fall_2017-10-13',
 './tmp/test/spring_2016-04-22',
 './tmp/test/spring_2017-05-02',
 './tmp/test/spring_2018-04-23',
 './tmp/test/spring_2019-05-14',
 './tmp/test/spring_2020-05-11']

In [12]:
dataset = DnaKmerSequenceGenerator(
    sample_files,
    length=run.config["length"],
    kmer=run.config["kmer"],
    batch_size=run.config["batch_size"],
    batches_per_epoch=run.config["val_batches_per_epoch"],
    augment=run.config["data_augment"],
    balance=run.config["data_balance"]
)

In [13]:
dataset[0]

array([[67, 86, 56, ..., 37, 63, 65],
       [67, 86, 56, ..., 13, 65, 77],
       [51,  5, 27, ..., 27, 10, 52],
       ...,
       [12, 62, 62, ..., 88, 65, 77],
       [56, 30, 27, ..., 77, 10, 52],
       [30, 27, 11, ..., 10, 52, 12]], dtype=int32)

In [14]:
dataset[0].shape

(512, 148)

## Model

In [22]:
model_path = run.file("model.h5").download(temp_path, replace=True)

In [23]:
model_path

<_io.TextIOWrapper name='./tmp/model.h5' mode='r' encoding='UTF-8'>

In [28]:
model = dnabert.DnaBertPretrainModel.load(os.path.join(temp_path, "model.h5"))

2022-05-02 03:10:15.135301: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 03:10:15.135513: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 03:10:15.140374: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 03:10:15.140578: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-02 03:10:15.140738: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S



In [33]:
model.compile(optimizer=keras.optimizers.Adam(1e-4), loss=keras.losses.SparseCategoricalCrossentropy())

In [37]:
model.optimizer.learning_rate.assign

<bound method BaseResourceVariable.assign of <tf.Variable 'learning_rate:0' shape=() dtype=float32, numpy=1e-04>>

In [16]:
model.summary()

Model: "DNABERT_pretrain"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 148)]             0         
_________________________________________________________________
dna_bert_base (DnaBertBase)  (None, 149, 128)          2547584   
_________________________________________________________________
lambda_1 (Lambda)            (None, 148, 128)          0         
_________________________________________________________________
dense_16 (Dense)             (None, 148, 125)          16125     
Total params: 2,563,709
Trainable params: 2,563,709
Non-trainable params: 0
_________________________________________________________________
