In [4]:
!nvidia-smi

Thu Aug  8 13:43:25 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P0    72W / 250W |  16112MiB / 16280MiB |     73%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
%matplotlib inline
%load_ext autoreload
%autoreload 2
import tensorflow as tf
from tensorflow import keras

import DataGenerator as DG
from DataGenerator import DataGenerator

from tensorflow.keras import layers
from tensorflow.keras.utils import multi_gpu_model
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras import backend as K
#from keras_contrib.layers import crf
#from keras_contrib.losses import crf_loss
#from keras_contrib.metrics import crf_viterbi_accuracy
#Progress bar fix: use callbacks=[Logger.JupyterProgbarLogger()] in fit method
#verbose=0 is also required
import JupyterProgbarLogger as Logger
from tqdm import tqdm_notebook as tqdm
#from kerastuner.tuners import RandomSearch


import sklearn.metrics as metrics

import numpy as np
import random
import math

from datetime import datetime
import os
from shutil import copy
from functools import partial

import matplotlib.pyplot as plt

import h5py

import multiprocessing as mp

# Worker processes are started with "spawn" (fresh interpreter per worker)
# instead of the platform default; force=True lets this cell be re-run.
mp.set_start_method("spawn",force=True)

###FIX NUMPY LOAD FOR DICTIONARIES
# Make np.load default to allow_pickle=True so pickled objects (dicts) stored in
# .npy/.npz files can be read back.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
# The guard keeps the patch idempotent: re-running this cell must not wrap the
# already patched function a second time.
if not getattr(np.load, "_allow_pickle_default", False):
    np_load_old = np.load

    def _np_load_allow_pickle(*a, **k):
        """Call the original np.load, defaulting allow_pickle to True."""
        # setdefault (rather than a hard-coded keyword) lets callers still pass
        # allow_pickle explicitly without a "multiple values" TypeError.
        k.setdefault("allow_pickle", True)
        return np_load_old(*a, **k)

    _np_load_allow_pickle._allow_pickle_default = True
    np.load = _np_load_allow_pickle

###Tensorflow session
# Register a TF1-compat session with the Keras backend whose GPU options have
# allow_growth enabled.
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(allow_growth=True)
)
tf.compat.v1.keras.backend.set_session(
    tf.compat.v1.Session(config=config)
)

KeyboardInterrupt: 

In [None]:
#Data Variables
#Batch size passed to both data generators
BATCH_SIZE=32
#data_amount passed to the training generator
DATA_AMOUNT=800000
#data_amount passed to the validation generator
VALIDATION_AMOUNT=30000
#offset passed to the training generator
OFFSET_AMOUNT=0
#offset passed to the validation generator
VAL_OFFSET = 800100
#Switches the model/training cells to the hyper-parameter tuner path
USE_TUNER = False
#Broken
USE_CRF = False
#frames per sample / 3rd dimension for 3D CNN
depth=10
#Data file
filepath = "images_synthetic_mouse_uncompressed.h5"
#If you want to copy the file to a scratch directory (set to "" to use the file in place)
scratch_dir = "/mnt/disks/sec/Amit/"#"/n/scratch2/ap487/"
#Copy file to scratch directory if it is specified, and change filepath to match.
if scratch_dir:
    # The copy lands at <scratch_dir>/<file name>, so check and use exactly that path.
    scratch_path = os.path.join(scratch_dir, os.path.basename(filepath))
    if not os.path.exists(scratch_path):
        # shutil.copy does not create missing destination directories.
        if not os.path.isdir(scratch_dir):
            os.makedirs(scratch_dir)
        copy(filepath,scratch_path)
        print("File Copied to scratch directory")
    filepath=scratch_path
print("Using path: ",filepath)

In [None]:
def conv_block(x,
               num_layers,
               num_filters,
               block_id,
               conv_parameters,
               kernel=(3,3),
               padding='same',
               activation=layers.Activation('relu')):
    """Append one convolutional block, applied per time step, to tensor ``x``.

    The block is ``num_layers`` stacked Conv2D layers (no activation between
    them), followed by one activation, one batch normalisation and one 2x2 max
    pooling. Every layer is wrapped in ``TimeDistributed``.

    Args:
        x: Input tensor the block is applied to.
        num_layers: Number of Conv2D layers to stack.
        num_filters: Filter count of each Conv2D layer.
        block_id: Identifier used in the layer names (converted with ``str``).
        conv_parameters: Extra keyword arguments forwarded to every Conv2D.
        kernel: Conv2D kernel size.
        padding: Padding mode of the max-pooling layer.
        activation: Layer applied after the convolutions.

    Returns:
        The output tensor of the max-pooling layer.
    """
    tag = str(block_id)
    # Convolutions are named <tag>a, <tag>b, ... in stacking order.
    for offset in range(num_layers):
        conv = layers.Conv2D(num_filters, kernel, **conv_parameters)
        conv_name = 'Conv2D-' + tag + chr(ord('a') + offset)
        x = layers.TimeDistributed(conv, name=conv_name)(x)
    tail = (
        ('Activation-', activation),
        ('BatchNormalization-', layers.BatchNormalization()),
        ('MaxPooling2D-', layers.MaxPooling2D((2, 2), padding=padding)),
    )
    for prefix, inner_layer in tail:
        x = layers.TimeDistributed(inner_layer, name=prefix + tag)(x)
    return x
def build_model(tuner=None,
                input_shape=(80, 80, 1),
                stride_length=(1, 1),
                kernel=(3,3),
                kernel_initializer='glorot_uniform',
                activation=layers.Activation('relu'),
                dense_activation=layers.Activation('relu'),
                output_activation=layers.Activation('softmax'),
                batch_momentum=.999,
                dropout_chance=0.1,
                combine=True,
                padding='same',
                batch_norm=False,
                gpus = 1
            ):
    """Build and compile the per-frame CNN + recurrent classifier.

    Five ``conv_block`` stages (32, 64, 128, 128, 256 filters) are applied to
    every frame, the result is flattened per frame and, unless the global
    ``USE_CRF`` is set, fed through a GRU, a 60-unit dense layer and
    ``output_activation``. Reads the module-level ``depth`` and ``USE_CRF``.

    Args:
        tuner: Unused; kept so the function can be called with one leading
            positional argument. Defaults to None.
        input_shape: Shape of a single frame. When ``depth > 1`` the model
            input becomes ``(depth,) + input_shape``.
        stride_length: Conv2D strides.
        kernel: Conv2D kernel size, forwarded to every conv block.
        kernel_initializer: Conv2D kernel initializer.
        activation: Activation layer applied at the end of every conv block.
        dense_activation: Currently unused.
        output_activation: Layer applied to the FC1 output.
        batch_momentum: Currently unused.
        dropout_chance: SpatialDropout1D rate; no dropout layer when <= 0.
        combine: Currently unused.
        padding: Padding mode for the convolutions and the pooling layers.
        batch_norm: Currently unused.
        gpus: Number of GPUs; values > 1 wrap the model with
            ``multi_gpu_model``.

    Returns:
        Tuple ``(model, name)`` of the compiled Keras model and its display name.
    """
    name = "LSTM CNN"
    # tuple() so a list input_shape does not break the concatenation.
    input_shape = tuple(input_shape)
    if depth > 1:
        input_shape = (depth,)+input_shape
    inputs = layers.Input(shape=input_shape)
    x = inputs
    conv_parameters = {
        'padding': padding,
        'strides': stride_length,
        'kernel_initializer': kernel_initializer
    }
    # encode net
    # kernel/padding/activation are forwarded so the arguments of this function
    # take effect; with the defaults this matches conv_block's own defaults.
    for block_id, num_filters in enumerate((32, 64, 128, 128, 256), start=1):
        x = conv_block(x,2,num_filters,block_id,conv_parameters,
                       kernel=kernel,
                       padding=padding,
                       activation=activation)
    #x = layers.TimeDistributed(layers.GlobalAveragePooling2D(),name='GlobalAveragePooling')(x)
    x = layers.TimeDistributed(layers.Flatten(),name='Flatten')(x)
    if dropout_chance>0:
        x = layers.SpatialDropout1D(dropout_chance,name='Dropout')(x)
    if USE_CRF:
        # NOTE(review): `crf` comes from the keras_contrib imports that are
        # commented out at the top of the notebook, so this branch raises
        # NameError until they are restored.
        x = layers.LSTM(512,return_sequences=True)(x)
        output = crf.CRF(64)(x)
    else:
        x = layers.GRU(512,return_sequences=False,name='GRU')(x)
        # NOTE(review): FC1 applies relu before output_activation (softmax by
        # default) - confirm this is intended.
        x = layers.Dense(60, activation='relu',name='FC1')(x)
        output = output_activation(x)
    model = keras.models.Model(inputs, output)
    if gpus > 1:
        single_model = model
        # Replicate over the requested number of GPUs (was hard-coded to 4).
        model = multi_gpu_model(model,gpus=gpus,cpu_relocation=True)
        # Keep a handle on the single-device template model.
        model.callback_model = single_model
    model.compile(
        optimizer=keras.optimizers.SGD(
            learning_rate=1e-4,
            momentum=.9,
            nesterov=True,
            decay=1e-6
        ),
        loss='sparse_categorical_crossentropy',#crf_loss,
        metrics=['accuracy']#crf_viterbi_accuracy]
    )
    return model,name

In [None]:
# Build either a tuner (USE_TUNER) or a single compiled model.
if not depth == 1:
    if USE_TUNER:
        # NOTE(review): RandomSearch comes from the kerastuner import that is
        # commented out at the top of the notebook; restore it before enabling
        # USE_TUNER.
        tuner = RandomSearch(
            build_model,
            objective='val_accuracy',
            max_trials=40,
            executions_per_trial=1,
            directory='actionrecog',
            project_name='Action Recognition',
            is_generator=True)
        print(tuner.search_space_summary())
    else:
        model,name = build_model(None,input_shape=(80,80,1),gpus=1,dropout_chance=0.25)
        model.summary()
else:
    # build_model's first parameter (tuner) is positional; pass None explicitly,
    # as the branch above does, instead of calling it with no arguments.
    model,name = build_model(None)
    model.summary()

In [24]:
def create_training_node(model,
                  outdir = 'training',
                  job_name='train',
                  cores=2,
                  mem='8GB',
                  max_time='120:00:00',
                  data_file = 'images_synthetic_mouse.h5',
                  data_amount = 0,
                  data_offset = 0,
                  validation_amount = 0,
                  validation_offset = 0,
                  batch_size = 8,
                  slide = 5,
                  verbosity = 1,
                  epochs = 40
                 ):
    """Save ``model`` and return the ``sbatch`` command line that trains it.

    The model is written to ``<outdir>/initial.h5`` (``outdir`` is created when
    missing). The returned string is not executed here; it wraps a
    ``train_model.py continue`` call with the given options.

    Args:
        model: Object with a ``save(path)`` method.
        outdir: Directory receiving ``initial.h5``; also the prefix of the
            output directory argument passed to the script.
        job_name: Value of ``--job-name``.
        cores: Value of ``-n``.
        mem: Value of ``--mem``.
        max_time: Value of ``--time``.
        data_file: Data file path passed to the script.
        data_amount, data_offset, validation_amount, validation_offset,
        batch_size, slide, verbosity, epochs: Forwarded as the script options
            of the same name.

    Returns:
        The complete ``sbatch`` command as a single string.
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    initial_model = outdir + '/initial.h5'
    model.save(initial_model)

    working_dir = os.getcwd()
    python_bin = os.path.expanduser("~") + '/miniconda3/envs/tf2_gpu/bin/python'

    def option(flag, value):
        # "<flag> <value> " including the trailing separator
        return flag + ' ' + value + ' '

    def quoted(text):
        # single-quoted positional argument including the trailing separator
        return "'" + text + "' "

    pieces = [
        'sbatch ',
        option('--job-name', job_name),
        '--output "' + working_dir + '/train-%j.log" ',
        option('--partition', 'small-gpu-b-preemptible'),
        option('-n', str(cores)),
        option('--mem', mem),
        option('--time', max_time),
        # everything from here up to the closing '"' is the wrapped command
        '--wrap="' + python_bin + ' train_model.py continue ',
        option('--data-amount', str(data_amount)),
        option('--data-offset', str(data_offset)),
        option('--validation-amount', str(validation_amount)),
        option('--validation-offset', str(validation_offset)),
        option('--batch-size', str(batch_size)),
        option('--slide', str(slide)),
        option('--verbosity', str(verbosity)),
        option('--epochs', str(epochs)),
        quoted(data_file),
        quoted(working_dir + '/' + outdir + '-%s/'),
        quoted(initial_model),
        '"',
    ]
    return ''.join(pieces)
# Print the sbatch command for the current model. The data settings come from
# the configuration constants so the cluster job uses the same values as the
# in-notebook generators.
print(create_training_node(model,
              data_file=filepath,
              outdir='training-1',
              data_amount = DATA_AMOUNT,
              data_offset = OFFSET_AMOUNT,
              validation_amount=VALIDATION_AMOUNT,
              validation_offset=VAL_OFFSET,
              batch_size=BATCH_SIZE,
              slide = 1,
              epochs=10,
              verbosity=0))

sbatch --job-name train --output "/home/g17amitprasad_gmail_com/2019-notebooks/Action Recognition/train-%j.log" --partition small-gpu-b-preemptible -n 2 --mem 8GB --time 120:00:00 --wrap="/home/g17amitprasad_gmail_com/miniconda3/envs/tf2_gpu/bin/python train_model.py continue --data-amount 800000 --data-offset 0 --validation-amount 30000 --validation-offset 800100 --batch-size 32 --slide 1 --verbosity 0 --epochs 10 '/mnt/disks/sec/Amit/images_synthetic_mouse_uncompressed.h5' '/home/g17amitprasad_gmail_com/2019-notebooks/Action Recognition/training-1-%s/' 'training-1/initial.h5' "


In [13]:
#Initialize generators
# Both generators read the same file with the same batch_size, frames_per_sample
# and sliding_window; only data_amount and offset differ.
make_generator = partial(
    DataGenerator,
    filepath,
    batch_size=BATCH_SIZE,
    frames_per_sample=depth,
    sliding_window=1,
)
data_gen = make_generator(data_amount=DATA_AMOUNT, offset=OFFSET_AMOUNT)
validation_gen = make_generator(data_amount=VALIDATION_AMOUNT, offset=VAL_OFFSET)

In [20]:
#model = keras.models.load_model('model-2019-08-01-01:13:21.h5')
if USE_TUNER:
    tuner.search(generator=data_gen, validation_data=validation_gen,
                use_multiprocessing=True, workers=10,
                epochs = 8)
    # Only the best model is used, so only one is requested.
    model = tuner.get_best_models(num_models=1)[0]
else:
    start_time = datetime.today().strftime('%Y-%m-%d-%H:%M:%S')
    # ModelCheckpoint writes into models/; create it up front so the first
    # checkpoint cannot fail on a missing directory.
    if not os.path.isdir('models'):
        os.makedirs('models')
    # verbose=0 plus the JupyterProgbarLogger callback is the progress bar
    # workaround described in the imports cell.
    history = model.fit_generator(generator=data_gen,
                    validation_data=validation_gen,
                    epochs=100,
                    verbose=0,
                    use_multiprocessing=True,
                    workers=16,
                    max_queue_size=10,
                    callbacks=[Logger.JupyterProgbarLogger(count_mode='steps',measure_gpu=True),
                              keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0, patience=6,verbose=1, mode='auto',restore_best_weights=True),
                              keras.callbacks.ModelCheckpoint(filepath='models/model-'+start_time+'-progress-{epoch:02d}.h5')]
                   )

HBox(children=(IntProgress(value=0, description='Epoch 1/100[CPU:  3%]', max=24997, style=ProgressStyle(descri…

KeyboardInterrupt: 

In [None]:
# Save the current model under a name stamped with the current date and time.
filename = "".join((
    "model-",
    datetime.today().strftime('%Y-%m-%d-%H:%M:%S'),
    ".h5",
))
model.save(filename)
print("Model saved to ",filename)

In [1]:
!nvidia-smi

Wed Aug  7 20:37:54 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0   141W / 250W |   8809MiB / 16280MiB |     63%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    