In [None]:
from __future__ import division, print_function

## Get the saved Keras 1.2 model

In [None]:
# Show the current working directory; the downloads below use relative
# file names, so this is where they will be written.
!pwd

In [None]:
# List the contents of the current working directory.
!ls

In [None]:
!rm tf_order_record_5_model_PQzyq_modelWeights.h5
!rm tf_order_record_5_model_PQzyq_modelJson.json
!wget https://raw.githubusercontent.com/AvantiShri/model_storage/4143cfce7e61611d4c42984578e420cd7556c4b6/deeplift/genomics/tf_order_record_5_model_PQzyq_modelWeights.h5
!wget https://raw.githubusercontent.com/AvantiShri/model_storage/4143cfce7e61611d4c42984578e420cd7556c4b6/deeplift/genomics/tf_order_record_5_model_PQzyq_modelJson.json

In [None]:
import h5py
import json

# Open the Keras 1.2 weights file read-only: this notebook only reads from it,
# and without an explicit mode older h5py releases open files read/write.
# The handle `f` is intentionally left open because the weight arrays are read
# from it when the Keras 2 model is populated.
f = h5py.File("tf_order_record_5_model_PQzyq_modelWeights.h5", "r")
print("keras version", f.attrs['keras_version'])

# Pretty-print the architecture JSON. The context manager closes the file
# handle as soon as the JSON has been parsed.
with open("tf_order_record_5_model_PQzyq_modelJson.json") as json_fh:
    print("keras json", json.dumps(json.load(json_fh), indent=4),"\n")

# List every stored weight as "<layer name>/<weight name>".
print("layer weight names:", [layer_name+"/"+x
                              for layer_name in f['model_weights'].keys()
                              for x in f['model_weights'][layer_name].attrs['weight_names']],"\n")

In [None]:
import keras
import numpy as np

print("keras version", keras.__version__)

# Recreate the architecture with the Keras 2 API. The trained parameters are
# copied in afterwards, layer by layer, from the open HDF5 weights file `f`.
model = keras.models.Sequential([
    keras.layers.Conv1D(filters=50, kernel_size=11, strides=1,
                        input_shape=(200, 4)),
    keras.layers.Activation("relu"),
    keras.layers.Conv1D(filters=50, kernel_size=11, strides=1),
    keras.layers.Activation("relu"),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(units=50),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(units=3),
    keras.layers.Activation("sigmoid"),
])
model.build()

# Copy the saved parameters into the layers that own weights. The indices
# below (0, 2, 5, 8) are the positions of the Conv1D/Dense layers in the
# Sequential stack defined above.
keras1_weights = f['model_weights']

# First convolution: drop the singleton axes of the saved 2-d kernel, then
# swap the first two remaining axes.
# NOTE(review): presumably this moves the one-hot axis behind the kernel-width
# axis, as Conv1D expects (width, channels, filters) - confirm against the
# saved array shapes.
conv1_kernel = np.array(keras1_weights['convolution2d_1/convolution2d_1_W'])
conv1_kernel = conv1_kernel.squeeze().transpose((1,0,2))
conv1_bias = np.array(keras1_weights['convolution2d_1/convolution2d_1_b'])
model.layers[0].set_weights([conv1_kernel, conv1_bias])

# Second convolution: only the singleton axes are dropped, no transpose.
conv2_kernel = np.array(keras1_weights['convolution2d_2/convolution2d_2_W']).squeeze()
conv2_bias = np.array(keras1_weights['convolution2d_2/convolution2d_2_b'])
model.layers[2].set_weights([conv2_kernel, conv2_bias])

# First dense layer: weights are copied over unchanged.
dense1_kernel = np.array(keras1_weights['dense_1/dense_1_W'])
dense1_bias = np.array(keras1_weights['dense_1/dense_1_b'])
model.layers[5].set_weights([dense1_kernel, dense1_bias])

# Second (output) dense layer: weights are copied over unchanged.
dense2_kernel = np.array(keras1_weights['dense_2/dense_2_W'])
dense2_bias = np.array(keras1_weights['dense_2/dense_2_b'])
model.layers[8].set_weights([dense2_kernel, dense2_bias])

## Verify the accuracy

In [None]:
!rm sequences.simdata.gz
!wget https://raw.githubusercontent.com/AvantiShri/model_storage/db919b12f750e5844402153233249bb3d24e9e9a/deeplift/genomics/sequences.simdata.gz
!rm test.txt.gz
!wget https://raw.githubusercontent.com/AvantiShri/model_storage/9aadb769735c60eb90f7d3d896632ac749a1bdd2/deeplift/genomics/test.txt.gz

In [None]:
try:
    import simdna
except ImportError:
    print("installing simdna package")
    !pip install -e "git://github.com/kundajelab/simdna.git@0.4.0#egg=simdna"
    print("\n**********************************************************")
    print("RESTART THE JUPYTER KERNEL TO PICK UP ON THE INSTALLATION!!!")
    print("************************************************************")

In [None]:
import gzip

import simdna.synthetic as synthetic

# `reload` is a builtin on Python 2 only; on Python 3 it lives in importlib.
try:
    from importlib import reload
except ImportError:
    pass  # Python 2: the builtin reload is already available
reload(synthetic)
reload(synthetic.core)

data_filename = "sequences.simdata.gz"
#read in the ids of the sequences in the testing set
# gzip yields bytes in "rb" mode, so decode each line before stripping the
# newline (a str argument to bytes.rstrip raises TypeError on Python 3).
# The context manager closes the gzip handle once the ids are read.
with gzip.open("test.txt.gz","rb") as test_ids_fh:
    ids_to_load = [x.decode("utf-8").rstrip("\n") for x in test_ids_fh]
#load only the testing-set sequences from the simulated data file
data = synthetic.read_simdata_file(data_filename, ids_to_load=ids_to_load)

In [None]:
import numpy as np

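#The original Keras 1.2 model was trained on data one-hot encoded as a 2d image,
#with the row-axis being the axis for one-hot encoding. The converted Conv1D
#model instead takes input of shape (sequence length, 4), so the encoding
#below places the one-hot dimension on the second axis.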
def one_hot_encode_along_row_axis(sequence):
    """One-hot encode ``sequence`` into a new int8 array.

    The result has shape ``(len(sequence), 4)``: sequence position runs along
    the first axis and the one-hot dimension along the second
    (``one_hot_axis=1``), despite what the function name suggests. The actual
    filling is delegated to ``seq_to_one_hot_fill_in_array``.

    Args:
        sequence: the sequence to encode; only its length and its characters
            are used.

    Returns:
        The filled ``numpy.int8`` array.
    """
    encoded = np.zeros((len(sequence), 4), dtype=np.int8)
    seq_to_one_hot_fill_in_array(encoded, sequence, 1)
    return encoded

def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis):
    """Write the one-hot encoding of ``sequence`` into ``zeros_array`` in place.

    Args:
        zeros_array: 2-d array to mutate; expected to be pre-filled with
            zeros. With ``one_hot_axis=0`` its second dimension must equal
            ``len(sequence)``; with ``one_hot_axis=1`` its first dimension must.
        sequence: iterable of characters. A/C/G/T (either case) map to
            indices 0/1/2/3 along the one-hot axis; N/n positions are left
            untouched.
        one_hot_axis: 0 or 1, the axis of ``zeros_array`` that holds the
            one-hot dimension.

    Raises:
        AssertionError: if ``one_hot_axis`` is not 0 or 1, or the array's
            length along the sequence axis does not match ``len(sequence)``.
        RuntimeError: if ``sequence`` contains any other character.
    """
    assert one_hot_axis==0 or one_hot_axis==1
    # The sequence runs along whichever axis is not the one-hot axis.
    position_axis = 1 if one_hot_axis==0 else 0
    assert zeros_array.shape[position_axis] == len(sequence)

    base_to_index = {"A": 0, "a": 0,
                     "C": 1, "c": 1,
                     "G": 2, "g": 2,
                     "T": 3, "t": 3}
    for position, base in enumerate(sequence):
        if base=="N" or base=="n":
            continue #leave that position as whatever the array already holds
        if base not in base_to_index:
            raise RuntimeError("Unsupported character: "+str(base))
        base_idx = base_to_index[base]
        if one_hot_axis==0:
            zeros_array[base_idx, position] = 1
        elif one_hot_axis==1:
            zeros_array[position, base_idx] = 1

# Encode every loaded sequence, then combine the per-sequence arrays into a
# single numpy array for the model.
encoded_sequences = [one_hot_encode_along_row_axis(seq) for seq in data.sequences]
onehot_data = np.array(encoded_sequences)

In [None]:
from sklearn.metrics import roc_auc_score

predictions = model.predict(onehot_data)

# Print the ROC AUC separately for each of the three output columns,
# comparing column i of the predictions with column i of the labels.
for task_idx in range(3):
    print(roc_auc_score(y_true=data.labels[:,task_idx],
                        y_score=predictions[:,task_idx]))

## Save the converted model

In [None]:
# Save the converted weights in Keras 2 HDF5 format.
model.save_weights("keras2_conv1d_record_5_model_PQzyq_modelWeights.h5")
# Save the architecture JSON. The context manager guarantees the file is
# flushed and closed instead of relying on garbage collection of the handle.
with open("keras2_conv1d_record_5_model_PQzyq_modelJson.json",'w') as json_fh:
    json_fh.write(model.to_json())