<a href="https://colab.research.google.com/github/AvantiShri/colab_notebooks/blob/master/misc_examples/SmallerFactorizedModelBetter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Select the TensorFlow 1.x runtime: the training cell in this notebook calls
# `tf.set_random_seed`, which is the TF1-style top-level seeding API.
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
# Pinned to the simdna release this notebook was last executed with, so a fresh
# runtime gets the same simulation scripts instead of whatever is newest.
!pip install simdna==0.4.3.2

Collecting simdna
[?25l  Downloading https://files.pythonhosted.org/packages/14/c6/dc6cc2e9ac09c85d5ec6d896c6c43c8dd5ef50bb9c14423e9290131dce27/simdna-0.4.3.2.tar.gz (634kB)
[K     |▌                               | 10kB 21.2MB/s eta 0:00:01[K     |█                               | 20kB 1.7MB/s eta 0:00:01[K     |█▌                              | 30kB 2.2MB/s eta 0:00:01[K     |██                              | 40kB 2.4MB/s eta 0:00:01[K     |██▋                             | 51kB 2.0MB/s eta 0:00:01[K     |███                             | 61kB 2.3MB/s eta 0:00:01[K     |███▋                            | 71kB 2.5MB/s eta 0:00:01[K     |████▏                           | 81kB 2.8MB/s eta 0:00:01[K     |████▋                           | 92kB 2.9MB/s eta 0:00:01[K     |█████▏                          | 102kB 2.9MB/s eta 0:00:01[K     |█████▊                          | 112kB 2.9MB/s eta 0:00:01[K     |██████▏                         | 122kB 2.9MB/s eta 0:00:01[K 

In [3]:
# Positive set: 1000 sequences of length 400 built around the GATA_disc1 motif,
# generated with a fixed --seed of 1234. The output FASTA is later located via
# the "DensityEmbedding_prefix-pos*.fa" glob.
# NOTE(review): the exact meaning of --mean-motifs/--min-motifs/--max-motifs,
# --rc-prob and --zero-prob is defined by the script (presumably shipped with
# simdna) — confirm against its documentation before changing them.
!densityMotifSimulation.py --prefix pos --motifNames GATA_disc1 --mean-motifs 1 --min-motifs 1 --max-motifs 3 --rc-prob 0.5 --zero-prob 0.2 --numSeqs 1000 --seqLength 400 --seed 1234
# Negative set: 1000 sequences of length 400, later located via the
# "EmptyBackground_prefix-neg*.fa" glob.
# NOTE(review): no --seed is passed here, unlike the positive set — confirm
# whether the negatives are reproducible across runs.
!emptyBackground.py --prefix neg --seqLength 400 --numSeqs 1000

In [4]:
from glob import glob
import numpy as np
import sklearn
import sklearn.model_selection

# Nucleotide -> one-hot row, with channels in the fixed order A, C, G, T.
ltr = {base: [1 if col == row else 0 for col in range(4)]
       for row, base in enumerate("ACGT")}

def onehot_encode(seqs):
  """One-hot encode an iterable of DNA strings.

  Each character is looked up in `ltr`, so only upper-case A/C/G/T are
  accepted; any other character raises KeyError.

  Returns:
    np.ndarray built from the nested per-sequence, per-base rows. When every
    sequence has the same length this has shape (num_seqs, seq_len, 4).
  """
  encoded = []
  for seq in seqs:
    encoded.append([ltr[base] for base in seq])
  return np.array(encoded)

def read_fasta(file):
  """Return the sequence lines of a FASTA file with two lines per record.

  The file is assumed to alternate header line / sequence line, so every
  second line (0-based odd index) is taken as a sequence. Multi-line
  sequences are NOT supported by this reader.

  Args:
    file: path of the FASTA file to read.

  Returns:
    List of sequence strings with trailing whitespace/newlines stripped.
  """
  # Use a context manager so the handle is closed deterministically instead of
  # being left open until garbage collection.
  with open(file) as fasta_handle:
    return [line.rstrip()
            for i, line in enumerate(fasta_handle) if i % 2 == 1]

# Read both simulated FASTA files; the first glob match is used for each.
pos_seqs, neg_seqs = (
    read_fasta(glob(pattern)[0])
    for pattern in ("DensityEmbedding_prefix-pos*.fa",
                    "EmptyBackground_prefix-neg*.fa"))

# Positives come first in both arrays; labels are a column of 1s then 0s.
all_onehot = onehot_encode(pos_seqs + neg_seqs)
all_labels = np.array([[1]] * len(pos_seqs) + [[0]] * len(neg_seqs))

# Fixed-seed 80/20 split so every model sees the same train/test partition.
(X_train, X_test,
 y_train, y_test) = sklearn.model_selection.train_test_split(
    all_onehot, all_labels, test_size=0.2, random_state=1234)

In [9]:
import tensorflow as tf
import numpy as np
import keras
from sklearn.metrics import roc_auc_score


def set_seed(seed):
  """Seed NumPy's global RNG and TensorFlow (via `tf.set_random_seed`).

  Args:
    seed: integer seed passed unchanged to both libraries.
  """
  np.random.seed(seed)
  tf.set_random_seed(seed)


def make_model(filter_width, filt_per_layer, num_layers):
  """Build an uncompiled Keras model: stacked Conv1D layers, then pooling.

  Every convolution uses the same filter count, kernel size, ReLU activation
  and he_normal initializer. The stack is followed by global average pooling
  and a single sigmoid output unit.

  Args:
    filter_width: kernel size of every Conv1D layer.
    filt_per_layer: number of filters in every Conv1D layer.
    num_layers: number of Conv1D layers; at least one is always added.

  Returns:
    A `keras.models.Sequential` instance (not compiled).
  """
  conv_kwargs = dict(filters=filt_per_layer,
                     kernel_size=filter_width,
                     activation="relu",
                     kernel_initializer="he_normal")
  model = keras.models.Sequential()
  for layer_idx in range(max(1, num_layers)):
    # Only the first layer declares the input shape, which is read from the
    # notebook-level X_train (everything but the batch axis).
    extra = {"input_shape": X_train.shape[1:]} if layer_idx == 0 else {}
    model.add(keras.layers.Conv1D(**conv_kwargs, **extra))
  model.add(keras.layers.GlobalAveragePooling1D())
  model.add(keras.layers.Dense(1, activation="sigmoid"))
  return model


def train_model(modelcreator, seed):
  """Train one freshly built model and return its ROC AUC on X_test/y_test.

  Args:
    modelcreator: zero-argument callable returning an uncompiled Keras model.
    seed: value handed to `set_seed` before the model is created.

  Returns:
    ROC AUC of the model's predictions on the notebook-level X_test/y_test.

  NOTE(review): X_test/y_test are used both as validation data for early
  stopping and for the reported AUC, so the checkpoint is selected on the same
  data it is scored on. Consider a separate validation split if these numbers
  are meant as unbiased estimates.
  """
  set_seed(seed)
  model = modelcreator()
  early_stopping_callback = keras.callbacks.EarlyStopping(
                              patience=5, restore_best_weights=True)
  model.compile(optimizer="adam", loss="binary_crossentropy",
                metrics=["accuracy"])
  model.fit(X_train, y_train,
            validation_data=(X_test, y_test), epochs=30,
            callbacks=[early_stopping_callback])
  # Explicitly load the best checkpoint: presumably needed because the callback
  # only restores weights itself when it actually stops training early, not
  # when all epochs run — confirm against the installed Keras version.
  best_weights = early_stopping_callback.best_weights
  # best_weights can still be unset if the monitored value never improved
  # (e.g. a NaN loss); keep the final weights rather than crash in set_weights.
  if best_weights is not None:
    model.set_weights(best_weights)
  y_test_preds = model.predict(X_test)
  return roc_auc_score(y_true=y_test, y_score=y_test_preds)


def train_many_models(modelcreator, seeds=(100, 200, 300, 400, 500)):
  """Train one model per seed and collect the resulting test AUCs.

  Args:
    modelcreator: zero-argument callable returning an uncompiled Keras model;
      forwarded to `train_model` for every seed.
    seeds: iterable of seeds to train with. Defaults to the five seeds the
      notebook originally hard-coded, so existing calls behave the same.

  Returns:
    List of AUC values, one per seed, in the order of `seeds`.
  """
  aucs = []
  for seed in seeds:
    print("Seed:",seed)
    auc = train_model(modelcreator, seed)
    aucs.append(auc)
    print("AUC:",auc)
  print("AUCS:",aucs)
  return aucs

In [10]:
def modarch1_factory():
  """Architecture 1: 3 conv layers, 10 filters each, kernel width 15."""
  return make_model(filter_width=15, filt_per_layer=10, num_layers=3)


def modarch2_factory():
  """Architecture 2: 5 conv layers, 10 filters each, kernel width 7."""
  return make_model(filter_width=7, filt_per_layer=10, num_layers=5)


# Train each architecture once per seed and keep the per-seed test AUCs.
print("Arch 1")
modarch1_aucs = train_many_models(modarch1_factory)

print("Arch 2")
modarch2_aucs = train_many_models(modarch2_factory)

Arch 1
Seed: 100
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 1600 samples, validate on 400 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
AUC: 0.5427542754275427
Seed: 200
Train on 1600 samples, validate on 400 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
AUC: 0.5072257225722572
Seed: 300
Train on 1600 samples, validate on 400 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
AUC: 0.6024352435243524
Seed: 400
Train on 1600 samples, validate on 400 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Ep