<a href="https://colab.research.google.com/github/Deepu1992/VideoClassification/blob/master/Model_v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#SETUP AND CONFIGURATION

##TURN SECTIONS ON/OFF

In [1]:
# Section toggles: each flag gates one `if FLAG:` cell further down the notebook.
MOUNT_GDRIVE         = True     # mount Google Drive at /content/gdrive
CHECK_GPU_INFO       = False    # install GPU/RAM helper packages and print memory usage
DOWNLOAD_VIDEOS      = False    # download the videos listed in the metadata file from S3
FOLDER               = 'mini'   # data sub-folder under the project path; also part of the metadata file name

#ROI EXTRACTION
EXTRACT_ROI          = False    # run the ROI extraction cell
ROI_ROT              = False    # True: get_ROI_inclined_rect (rotated crop); False: get_roi (axis-aligned crop)
ERASE_ROI_COLOUR     = False    # True: zero the colour-thresholded pixels in each frame; adds "_ERASED" to the ROI folder name

#FEATURE EXTRACTION
EXTRACT_FEATURES     = True     # run the VGG16 feature extraction cell

#MODEL TRAINING
TRAIN_MODEL          = True     # build the model and run the training cells
CONTINUE_TRAINING    = False    # True: load the latest saved model instead of building a new one

##PARAMETER DEFINITION

In [2]:
#FEATURE EXTRACTION
# Index of the last VGG16 layer kept for feature extraction
# (the extraction cell uses conv_base.layers[:DEPTH_OF_BASE+1]).
DEPTH_OF_BASE        = 14 
# Shape passed to VGG16 as input_shape and used as the per-frame resize target.
RESIZE_SHAPE         = (50, 50, 3)
# Passed as batch_size to the ROI/feature writers: number of videos stored per .h5 file.
TRAIN_STREAM_SIZE    = 1000
TEST_STREAM_SIZE     = 500

#MODEL TRAINING
# File-name prefix for saved models and result files; the trailing "_" is stripped
# to form the model sub-folder name.
MODEL_VERSION        =  "MODEL_V4_"
MINI_BATCH_SIZE      =  64      # NOTE(review): not referenced in the visible cells - presumably used by the training loop
LEARNING_RATE        =  .0001   # learning rate of the Adam optimizer
#KEEP_POOLING_LAYERS  =  True
WEIGHT_BY_FRAME      =  False   # NOTE(review): not referenced in the visible cells - confirm usage
CLASS_WTS            =  {0: 5, 1: 95}   # NOTE(review): not referenced in the visible cells - presumably class weights for fitting
# Default frame-subsetting strategy handed to process_batch via get_mcc.
FRAME_SUBSET_TYPE    =  'last_n' #['last_n_subset','last_n','random',
                                        #'even_spaced', 'random_subsection']
MIN_FRAME_COUNT      =  40 #Training videos need MORE than this many frames (strict ">"); shorter ones are not used for training.
EPOCH_PER_SET        =  20      # NOTE(review): not referenced in the visible cells
# Passed to load_data as test_prop: fraction of the chosen test file's videos that is sampled.
VALIDATION_SIZE      =  .7
GLOBAL_ITER          =  40      # NOTE(review): not referenced in the visible cells
# True: the LSTM accepts variable-length sequences and predictions are made one sample at a time.
FIT_1_SAMPLE         =  False
MINI_BATCH_ITERATION =  False   # NOTE(review): not referenced in the visible cells
# L1/L2 penalties for the LSTM kernel ...
LSTM_L1_REGULARIZATION    =  .01
LSTM_L2_REGULARIZATION    =  .001
# ... and for the kernels of the dense layers.
L1_REGULARIZATION         = .00001
L2_REGULARIZATION         = .00001
# The two fitting modes are mutually exclusive.
assert not(MINI_BATCH_ITERATION and FIT_1_SAMPLE)

##CHECK GPU USAGE

In [3]:
if CHECK_GPU_INFO:
    # memory footprint support libraries/code
    # (installs the helper packages on the fly and exposes nvidia-smi on the PATH)
    !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
    !pip install gputil
    !pip install psutil
    !pip install humanize
    import psutil
    import humanize
    import os
    import GPUtil as GPU
    GPUs = GPU.getGPUs()
    # XXX: only one GPU on Colab and it isn't guaranteed
    # NOTE(review): GPUs[0] raises IndexError if GPUtil reports no GPU - confirm this is acceptable
    gpu = GPUs[0]
    def printm():
      """Print free system RAM, this process's resident size and the GPU memory statistics."""
      process = psutil.Process(os.getpid())
      print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
      print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
    printm() 

    #!cat /proc/meminfo
    #!cat /proc/cpuinfo

##MOUNT GOOGLE DRIVE

In [4]:
# Mount Google Drive so the project folders under /content/gdrive are reachable.
# The paths defined later in the notebook all live under this mount point.
if MOUNT_GDRIVE:
  from google.colab import drive 
  drive.mount('/content/gdrive') 

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


##DEFINE PATH VARIABLES


In [5]:
from os import makedirs, mkdir
from os.path import isdir


def _ensure_dir(path, subdirs=(), note="created"):
    """Create `path` (including missing parents) and each of `subdirs` below it.

    Prints "Folder <path> <note>" only when `path` itself did not exist before.
    Sub-folders are created whenever they are missing, so a folder tree that was
    only partially created by an interrupted earlier run is repaired.
    """
    newly_created = not isdir(path)
    makedirs(path, exist_ok=True)
    for sub in subdirs:
        makedirs(path + sub, exist_ok=True)
    if newly_created:
        print("Folder", path, note)


#define paths
PATH_PROJ = "/content/gdrive/My Drive/Google Colab Data/StallCatcher/"
PATH_ROOT = PATH_PROJ + "{}/".format(FOLDER)

# Fail fast when the data root is missing (e.g. Drive not mounted): makedirs below
# would otherwise silently build the whole tree on the local disk.
if not isdir(PATH_ROOT):
  raise FileNotFoundError("Data root not found (is Google Drive mounted?): " + PATH_ROOT)

VIDEO_FILES_PATH =  PATH_ROOT + "VIDEO_FOLDER/"                    #PATH TO VIDEO FILES IN GDRIVE

#TRAIN_TEST_LABEL_INFO
PATH_TRAIN_TEST_LABEL = PATH_ROOT + "TRAIN_TEST_LABELS/" +  'TRAIN_TEST_SPLIT.csv'

#ROI
# Folder name encodes the ROI variant: "ROI/", "ROI_ERASED/", "ROI_rotated/" or "ROI_ERASED_rotated/".
roi_rot   = ("ROI{}_rotated/"  if ROI_ROT else "ROI{}/")
roi_erase = roi_rot.format('_ERASED' if ERASE_ROI_COLOUR else '')
ROI_PATH = PATH_ROOT + "ROI_FOLDER/" + roi_erase                #PATH TO ROI DATA
_ensure_dir(ROI_PATH, subdirs=("train/", "test/"), note="and subfolders created")

# Feature folder name encodes base depth, ROI variant and resize edge length.
FEATURE_USED = "features_depth_{}_{}_{}/".format(DEPTH_OF_BASE,roi_erase[:-1],RESIZE_SHAPE[0])
FEATURE_PATH = PATH_ROOT  + "FEATURE_FOLDER/" + FEATURE_USED #PATH TO FEATURES EXTRACTED FROM ROI
#FEATURE_PATH = PATH_ROOT + "FEATURE_FOLDER/" + 'extracted_features_full_depth/'
_ensure_dir(FEATURE_PATH, subdirs=("train/", "test/"))

MODEL_FOLDER = PATH_ROOT + "MODEL_FOLDER/"
UTILS_PATH = "/content/gdrive/My Drive/Colab Notebooks/StallCatcher/"
LOGS_DIR = MODEL_FOLDER + "logs/fit/"


#define filenames
METADATA_FILENAME = PATH_ROOT + 'train_metadata_{}.csv'.format(FOLDER) #FILES LISTED WILL BE USED FOR TRAINING AND VALIDATION
LABELS_FILENAME   = PATH_PROJ + 'train_labels.csv'

# NOTE: the misspelt name MODEL_SUB_FODLER is kept because later cells reference it.
MODEL_SUB_FODLER  = MODEL_FOLDER + MODEL_VERSION[:-1] + "/"
MODEL_FILENAME    = MODEL_SUB_FODLER + MODEL_VERSION
BEST_MODEL_FILE   = MODEL_SUB_FODLER + 'MODEL_BEST'

_ensure_dir(MODEL_SUB_FODLER)

MODEL_RESULTS     = MODEL_SUB_FODLER + "MODEL_RESULTS/"
HYP_PARAM_FILE    = MODEL_RESULTS + 'MODEL_SETTING_HYPER_PARAM.csv'

_ensure_dir(MODEL_RESULTS)

# One sub-folder per kind of result artefact.
for i in ['PERFORMANCE', 'MCC', 'LOSS_GRAPH', 'ACC_GRAPH', 'MCC_GRAPH','OUT_HIST']:
      _ensure_dir(MODEL_RESULTS + i)

#RESULTS FORMATS
# Templates with a "{}" placeholder that callers fill in via .format(...).
PERF_FILE_FORMAT  = MODEL_RESULTS + "PERFORMANCE/" + "Perf_"
MODEL_PERF_FILE   = PERF_FILE_FORMAT + MODEL_VERSION + "{}.csv"

MODEL_MCC_FORMAT     = MODEL_RESULTS + "MCC/" + MODEL_VERSION + "{}.csv"
LOSS_IMAGE_NAME      = MODEL_RESULTS + "LOSS_GRAPH/" + MODEL_VERSION + "{}.jpeg"
ACC_IMAGE_NAME       = MODEL_RESULTS + "ACC_GRAPH/"  + MODEL_VERSION + "{}.jpeg"
MCC_IMAGE_NAME       = MODEL_RESULTS + "MCC_GRAPH/"  + MODEL_VERSION + "{}.jpeg"
OUT_DIST_IMAGE_NAME  = MODEL_RESULTS + "OUT_HIST/"   + MODEL_VERSION + "{}.jpeg"

##IMPORT LIBRARIES

In [6]:
import pandas as pd
import os
import h5py
import cv2
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import datetime
import warnings
from math import ceil

#warnings.filterwarnings('ignore') #u

from tensorflow.keras import models,layers,optimizers,losses
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import LSTM, Input, Flatten
from tensorflow.keras.regularizers import L1L2
from random import shuffle
from sklearn.metrics import confusion_matrix,matthews_corrcoef
from tqdm.notebook import tqdm
from time import sleep
from skimage.transform import resize   # for resizing images
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split
from random import sample,choice,randint

#check if running on GPU
# Prints a confirmation only when TensorFlow reports a GPU under the name
# '/device:GPU:0'; nothing is printed (and nothing fails) otherwise.
if tf.test.gpu_device_name() == '/device:GPU:0':
  print("Tensorflow GPU Loaded") 

Tensorflow GPU Loaded


##LOAD USER DEFINED FUNCTIONS

In [7]:
# Disabled cell (`if False`): would import extract_roi_and_store_tensors from a
# `utilities` module located in UTILS_PATH. The function is defined inline in
# this notebook instead.
if False:
  import sys
  sys.path.append(UTILS_PATH)
  from utilities import extract_roi_and_store_tensors

##LOAD METADATA

In [8]:
# Read the metadata CSV; its 'filename' column lists the videos to work with.
train_metadata = pd.read_csv( METADATA_FILENAME)
# Keep the plain list of file names (used by the download cell) ...
list_of_files = list(train_metadata.filename)
# ... then index the frame by file name so rows can be looked up with .loc[filename].
train_metadata.set_index('filename',inplace = True)

#DOWNLOADING AND PREPROCESSING

##DOWNLOAD VIDEOS

In [9]:
if DOWNLOAD_VIDEOS :
  #import libraries to interact with s3
  import os
  import boto3

  # SECURITY: the AWS key pair used to be hardcoded here. Credentials are now read
  # from the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables; when
  # they are unset, None is passed and boto3 falls back to its default credential
  # chain. The key pair that was committed with this notebook must be revoked.
  s3r = boto3.resource('s3',
                      aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
                      aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'))
  buck = s3r.Bucket('drivendata-competition-clog-loss')

  failed = []

  # Only fetch the videos from the metadata list that are not on disk yet.
  already_downloaded_files = [f for f in listdir(VIDEO_FILES_PATH) if isfile(join(VIDEO_FILES_PATH, f))]
  videos_to_download = set.difference(set(list_of_files),set(already_downloaded_files)) 

  for f in tqdm(videos_to_download, position=0, leave=True):
      try:
        s3_path_of_video = "train/" + f
        buck.download_file(s3_path_of_video,VIDEO_FILES_PATH + f) 
      except Exception as err:
        # Best effort: report the reason, remember the file and carry on.
        # (Exception rather than a bare except, so Ctrl-C still interrupts the loop.)
        print("Failed - ", f, "-", err)
        failed.append(f)
  if len(failed) > 0:
    print(str(len(failed)) + " files failed to load." )

##EXTRACT ROI 

In [10]:
def get_roi(frame, viz = False):
    """Crop `frame` to the axis-aligned bounding box of the colour-thresholded pixels.

    Pixels inside the fixed range (0, 13, 104)-(98, 143, 255) are located with
    cv2.inRange (the "orange" outline, per the original notebook comments).
    When the global ERASE_ROI_COLOUR is set, those pixels are zeroed in `frame`
    itself (in place) before cropping.

    Parameters
    ----------
    frame : 3-channel image array as returned by cv2.VideoCapture.read().
    viz   : when True, show the crop in an OpenCV window for one second.

    Returns
    -------
    The cropped region, obtained by slicing `frame`.
    """
    # 255 where the pixel is inside the colour range, 0 elsewhere
    mask = cv2.inRange(frame, (0, 13, 104), (98, 143, 255))

    # row / column indices of the matching pixels
    hit_rows, hit_cols = np.where(mask > 0)
    if ERASE_ROI_COLOUR:
        frame[hit_rows, hit_cols] = 0

    # Points are handed to OpenCV as (row, col) pairs, so the first component of
    # the returned rectangle refers to rows and the second to columns.
    coords = np.float32(list(zip(hit_rows, hit_cols)))
    row_start, col_start, row_extent, col_extent = cv2.boundingRect(coords)

    roi = frame[row_start:row_start + row_extent,
                col_start:col_start + col_extent, :]

    if not viz:
        return roi

    cv2.imshow("short", roi)
    cv2.waitKey(1000)
    cv2.destroyAllWindows()
    return roi

In [11]:
def get_ROI_inclined_rect(frame, viz = False):
    """Crop `frame` to the minimum-area (rotated) rectangle around the thresholded pixels.

    The frame is rotated about its centre by the rectangle's angle so that the
    rectangle becomes axis-aligned, and the rotated frame is then cropped to it.
    When the global ERASE_ROI_COLOUR is set, the thresholded pixels are zeroed
    in `frame` (in place) first.

    Parameters
    ----------
    frame : 3-channel image array as returned by cv2.VideoCapture.read().
    viz   : when True, show the crop in an OpenCV window for one second.

    Returns
    -------
    The cropped region of the rotated frame.
    """
    #get orange coloured cells as 255 and rest as 0
    th = cv2.inRange(frame, (0, 13, 104), (98, 143, 255))

    #return row and column indices of orange cells
    points = np.where(th>0)

    if ERASE_ROI_COLOUR:
        frame[points] = 0

    #get coordinates as (x, y) = (column, row) pairs
    p2 = list(zip(points[1], points[0]))

    # find rotated rectangle: ((centre_x, centre_y), (width, height), angle)
    rect = cv2.minAreaRect(np.float32(p2))

    # rotate img about its centre by the rectangle's angle
    angle = rect[2]
    rows, cols = frame.shape[0], frame.shape[1]
    M = cv2.getRotationMatrix2D((cols / 2, rows / 2), angle, 1)
    img_rot = cv2.warpAffine(frame, M, (cols, rows))

    # rotate bounding box corners with the same transform
    #rect0 = (rect[0], rect[1], 0.0)
    box = cv2.boxPoints(rect)
    # np.intp replaces np.int0: it is the same type, but the np.int0 alias was
    # removed in NumPy 2.0.
    pts = np.intp(cv2.transform(np.array([box]), M))[0]
    pts[pts < 0] = 0   # clamp corners that fall outside the image

    # crop
    # NOTE(review): the slice relies on the corner order returned by cv2.boxPoints
    # (pts[1] top-left, pts[0] bottom-left, pts[2] top-right) - confirm for the
    # OpenCV version in use.
    img_crop = img_rot[pts[1][1]:pts[0][1],
               pts[1][0]:pts[2][0]]

    if viz:
        cv2.imshow('xyz', img_crop)
        cv2.waitKey(1000)
        cv2.destroyAllWindows()
    return img_crop

In [12]:
def extract_roi_and_store_tensors(filenames_list, sample_type,\
                                  roi_path, video_path, batch_size = 1000):
    """Crop the ROI out of every frame of every video and store the crops in HDF5 files.

    Each video becomes one dataset (keyed by its file name) of shape
    (n_decoded_frames, h, w, c), where (h, w) is the ROI size of the video's first
    frame; later frames whose ROI has a different size are resized to match.
    Output files are ``<roi_path><sample_type>/<sample_type><k>.h5`` and a new
    file is started after every `batch_size` videos.

    Fixes over the previous version:
      * the per-video buffer was re-allocated on every frame (the `do_once` flag
        was never cleared), so all frames but the last were stored as zeros;
      * the buffer length came from the container's frame-count metadata, which
        overflowed or left zero padding when that count was wrong - frames are
        now collected as they are decoded;
      * a video that yields no frames is skipped with a message instead of
        failing or re-storing the previous video's tensor under a new name;
      * the HDF5 file and the capture handle are released even when an error occurs.

    Parameters
    ----------
    filenames_list : non-empty sequence of video file names (relative to `video_path`).
    sample_type    : sub-folder and file prefix, e.g. 'train' or 'test'.
    roi_path       : root folder for the ROI files (with trailing slash).
    video_path     : folder containing the videos (with trailing slash).
    batch_size     : number of videos per HDF5 file.

    Reads the global ROI_ROT to choose between get_ROI_inclined_rect and get_roi.
    """
    assert len(filenames_list) > 0

    print("Extracting ROI and loading to ", sample_type, "\n")
    sleep(1)

    #path prefix of the roi files
    temp_path = roi_path + sample_type + "/" + sample_type

    #h5 file id
    h5_file_count = 0
    h5f = h5py.File(temp_path + str(h5_file_count) + '.h5', 'w')

    try:
        for ind, videoFile in enumerate(tqdm(filenames_list)):

            #initiate next batch
            if ind % batch_size == 0 and ind != 0:
                print(ind)
                h5f.close()
                h5_file_count += 1
                h5f = h5py.File(temp_path + str(h5_file_count) + '.h5', 'w')

            # read video
            cap = cv2.VideoCapture(video_path + videoFile)
            roi_frames = []
            try:
                while cap.isOpened():
                    ret, frame = cap.read()
                    if not ret:
                        break

                    #subset roi from image
                    roi = get_ROI_inclined_rect(frame) if ROI_ROT else get_roi(frame)

                    # the first frame fixes the (h, w) of the whole video
                    if roi_frames and roi.shape[:2] != roi_frames[0].shape[:2]:
                        h, w = roi_frames[0].shape[:2]
                        roi = resize(roi, (h, w), preserve_range = True,
                                     anti_aliasing=False)

                    roi_frames.append(roi)
            finally:
                #release cap object
                cap.release()

            if not roi_frames:
                print("Skipped - no frames decoded from", videoFile)
                continue

            # float64, as produced by the former np.zeros buffer
            h5f.create_dataset(videoFile, data=np.array(roi_frames, dtype=np.float64))
    finally:
        h5f.close()
    sleep(1)

In [13]:
# Split the available videos into train/test and write their ROI tensors to HDF5.
if EXTRACT_ROI:

    #get list of video files available
    video_filenames  = [f for f in listdir(VIDEO_FILES_PATH) if isfile(join(VIDEO_FILES_PATH, f))]

    ##split data into train and test
    # (disabled alternative: reuse a stored split from PATH_TRAIN_TEST_LABEL)
    #temp_label_df = pd.read_csv(PATH_TRAIN_TEST_LABEL)
    #gb            = temp_label_df.groupby('sample_type')
    #train         = gb.get_group('train').loc[:,'filename'].tolist()
    #test          = gb.get_group('test').loc[:,'filename'].tolist()
    #del gb, temp_label_df
    # NOTE(review): no random_state is passed, so every run produces a different
    # 80/20 split - confirm this cell is only ever run once per project.
    train,test = train_test_split(video_filenames,test_size = 0.2) #-- run only during project setup
    
    #extract roi and save for train and test   
    # (one .h5 file per TEST_STREAM_SIZE / TRAIN_STREAM_SIZE videos)
    extract_roi_and_store_tensors(filenames_list = test,  sample_type = 'test', 
                                  roi_path = ROI_PATH, 
                                  video_path = VIDEO_FILES_PATH,
                                  batch_size = TEST_STREAM_SIZE)
    extract_roi_and_store_tensors(filenames_list = train, sample_type = 'train', 
                                  roi_path = ROI_PATH, video_path = VIDEO_FILES_PATH,
                                  batch_size = TRAIN_STREAM_SIZE)

##EXTRACT FEATURES 

In [14]:
def extract_features(sample_type, roi_path, feature_path, conv_base,\
                     flatten_features = False,\
                     resize_to_shape =  (100,100,3),\
                     batch_size = 200):
    """Run `conv_base` over every stored ROI tensor and write the features to HDF5.

    Every ROI file in ``<roi_path><sample_type>/`` is read; each video's frames are
    resized to `resize_to_shape`, passed through `conv_base.predict`, optionally
    flattened to (n_frames, features) and stored under the video's name in
    ``<feature_path><sample_type>/<sample_type><k>.h5``. A new output file is
    started once the current one holds `batch_size` videos.

    Fixes over the previous version:
      * the rollover test used the global success counter, so a failed video right
        after a rollover opened yet another file and left an empty .h5 behind; the
        number of videos in the *current* file is tracked instead;
      * bare `except:` clauses are narrowed to `Exception` and the reason is printed;
      * input/output files are closed even when an error occurs;
      * np.product (removed in NumPy 2.0) is replaced by np.prod.

    Parameters
    ----------
    sample_type      : sub-folder and file prefix, e.g. 'train' or 'test'.
    roi_path         : root folder of the ROI files (with trailing slash).
    feature_path     : root folder for the feature files (with trailing slash).
    conv_base        : model exposing `.predict(batch_of_frames)`.
    flatten_features : flatten each frame's feature map to a vector.
    resize_to_shape  : per-frame shape expected by `conv_base`.
    batch_size       : number of videos per output file.

    Returns
    -------
    (omitted_files, unread_roi, OUT_SIZE): names of videos that failed, names of
    ROI files that could not be processed, and the flattened feature size of the
    last processed video (None when `flatten_features` is False or nothing was
    processed).
    """
    input_folder_path = roi_path + sample_type + "/"
    roi_files = [f for f in listdir(input_folder_path) if isfile(join(input_folder_path, f))]

    unread_roi    = []
    omitted_files = []
    OUT_SIZE      = None

    print("\n\nRunning pretrained model on {} ROI tensors".format(sample_type))
    sleep(1)

    print_format = "Processing batch {}/" + str(len(roi_files))

    #path prefix of the feature files
    temp_path = feature_path + sample_type + "/" + sample_type

    #h5 file id
    write_file_count = 0
    feature_write    = h5py.File(temp_path + str(write_file_count) + '.h5', 'w')

    #number of videos stored in the currently open output file
    samples_in_file = 0

    try:
        for batch, roi_file_name in enumerate(roi_files):

            print(print_format.format(batch+1))

            try:
                with h5py.File(input_folder_path + roi_file_name, 'r') as roi_read:

                    for file_ in tqdm(list(roi_read.keys())):

                        #initiate next output file once the current one is full
                        if samples_in_file >= batch_size:
                            print("Wrote file number " , str(write_file_count) , "to drive.")
                            feature_write.close()
                            write_file_count += 1
                            feature_write = h5py.File(temp_path + str(write_file_count) + '.h5', 'w')
                            samples_in_file = 0

                        try:
                            #read roi
                            roi_tensor = roi_read[file_][:]

                            #resized_tensor holds the resized frames of the roi tensor
                            resized_tensor = np.zeros((roi_tensor.shape[0], resize_to_shape[0],
                                                       resize_to_shape[1], resize_to_shape[2]))

                            for frame in range(roi_tensor.shape[0]):
                                resized_tensor[frame] = resize(roi_tensor[frame], resize_to_shape,
                                                               preserve_range = True,
                                                               anti_aliasing=False)

                            #apply pre trained model on the frames of this video
                            feature = conv_base.predict(resized_tensor)

                            #reshape to (n_frames, features_per_frame)
                            if flatten_features:
                                OUT_SIZE = np.prod(feature.shape[1:])
                                feature = feature.reshape(feature.shape[0], OUT_SIZE)

                            feature_write.create_dataset(file_, data= feature)
                            samples_in_file += 1
                        except Exception as err:
                            # best effort: remember the video and continue with the next one
                            print("Omitted", file_, "-", err)
                            omitted_files.append(file_)
            except Exception as err:
                # the ROI file could not be opened/iterated, or starting the next
                # output file failed
                print("Could not process ROI file", roi_file_name, "-", err)
                unread_roi.append(roi_file_name)
    finally:
        feature_write.close()
    return omitted_files,unread_roi, OUT_SIZE

In [None]:
# Build a truncated VGG16 and use it to extract features for the test and train ROI files.
if EXTRACT_FEATURES:

    #load model (ImageNet weights, no classifier head, input sized to RESIZE_SHAPE)
    conv_base = VGG16(weights='imagenet', include_top=False,input_shape=RESIZE_SHAPE)

    #conv_base.summary()
    #ith layer is the last layer of an output shape
    # (indices of the MaxPooling2D layers, i.e. the sensible values for DEPTH_OF_BASE)
    base_depth_options = [i for i,str_ in enumerate(conv_base.layers) if 'MaxPooling2D' in str(str_)]
    print("Depth of base options: ", base_depth_options)
    sleep(1)

    #extract first few layers: layers 0..DEPTH_OF_BASE inclusive
    shallow_base = models.Sequential(conv_base.layers[:DEPTH_OF_BASE+1])

    #extract feature for train,test   
    # The feature size returned by the test run is discarded (`________`);
    # the one from the train run is kept as OUT_SIZE for the model-building cell.
    omitted_test_files,unread_roi_test,________  = extract_features(sample_type = 'test',  roi_path = ROI_PATH, 
                                           feature_path = FEATURE_PATH,
                                           flatten_features = True, 
                                           conv_base = shallow_base,
                                           resize_to_shape =  RESIZE_SHAPE,
                                           batch_size = TEST_STREAM_SIZE)
    sleep(1)
    # NOTE(review): `unread_roi_trian` looks like a typo for `unread_roi_train`;
    # left unchanged because later cells may reference the name.
    omitted_train_files,unread_roi_trian,OUT_SIZE = extract_features(sample_type = 'train',  roi_path = ROI_PATH, 
                                           feature_path = FEATURE_PATH,
                                           flatten_features = True, 
                                           conv_base = shallow_base,
                                           resize_to_shape =  RESIZE_SHAPE,
                                           batch_size = TRAIN_STREAM_SIZE)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Depth of base options:  [3, 6, 10, 14, 18]


Running pretrained model on test ROI tensors
Processing batch 1/8


HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))


Processing batch 2/8


HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))

Wrote file number  0 to drive.

Processing batch 3/8
Processing batch 4/8


HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))

Wrote file number  1 to drive.

Processing batch 5/8


HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))

Wrote file number  2 to drive.

Processing batch 6/8


HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))

Wrote file number  3 to drive.

Processing batch 7/8


HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))

Wrote file number  4 to drive.

Processing batch 8/8


HBox(children=(FloatProgress(value=0.0, max=264.0), HTML(value='')))

Wrote file number  5 to drive.



Running pretrained model on train ROI tensors
Processing batch 1/16


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


Processing batch 2/16


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Wrote file number  0 to drive.

Processing batch 3/16


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Wrote file number  1 to drive.

Processing batch 4/16


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Wrote file number  2 to drive.

Processing batch 5/16


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Wrote file number  3 to drive.

Processing batch 6/16


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Wrote file number  4 to drive.


#MODEL TRAINING

##SETUP TENSORBOARD

In [None]:
# Load the TensorBoard notebook extension
#%load_ext tensorboard

In [None]:
# Clear any logs from previous runs
#!rm -rf ./logs/ 

##MODEL ARCHITECTURE

In [None]:
def get_OUT_SIZE():
    """Return the per-frame feature size of the stored test features.

    Opens the first file listed in ``FEATURE_PATH + 'test/'`` and returns the
    product of all dimensions after the first of its first dataset.

    Changes over the previous version: only the dataset's shape is read (not its
    contents), the file is closed even on error, np.product (removed in
    NumPy 2.0) is replaced by np.prod, and missing input produces a descriptive
    error instead of an IndexError / UnboundLocalError.

    Raises
    ------
    FileNotFoundError : the test feature folder contains no files.
    ValueError        : the chosen feature file contains no datasets.
    """
    sub_path = FEATURE_PATH + 'test' + "/"
    options  = [f for f in listdir(sub_path) if isfile(join(sub_path, f))]
    if not options:
        raise FileNotFoundError("No feature files found in " + sub_path)

    with h5py.File(sub_path + options[0], 'r') as feature_read:
        for value in feature_read.values():
            # the first dataset is enough to read the feature size from
            return np.prod(value.shape[1:])

    raise ValueError("Feature file contains no datasets: " + sub_path + options[0])

In [None]:
if TRAIN_MODEL:
    # OUT_SIZE is normally set by the feature-extraction cell. Recompute it from
    # the stored features when that cell did not run, or when it left OUT_SIZE as
    # None because no video was processed.
    if globals().get('OUT_SIZE') is None:
        OUT_SIZE = get_OUT_SIZE() #or manually enter value
        print("OUT_SIZE of conv base" , OUT_SIZE)
    if CONTINUE_TRAINING:
        # Resume from the saved model with the highest numeric suffix.
        # NOTE(review): assumes saved names look like MODEL_VERSION + "<a>_<b>_<c>"
        # with three integer parts and no file extension - confirm against the saver.
        models_available = [f for f in listdir(MODEL_SUB_FODLER) if f.startswith(MODEL_VERSION)]
        models_keys      = [s.split('_')[2:] for s in models_available]
        models_keys      = sorted(models_keys, key = lambda k: (int(k[0]),int(k[1]),int(k[2]))) 
        latest_model     = models_keys[-1]
        model_name       = MODEL_VERSION + '_'.join(latest_model)
        model            = models.load_model(MODEL_SUB_FODLER +model_name)
        print("Successfully loaded model ", model_name)
    else:
        #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' #turn off warnings  
        tf.get_logger().setLevel('INFO')

        #load pretrained keras model
        # (not added to `model` in this cell, which trains on pre-extracted features)
        conv_base = VGG16(weights='imagenet', include_top=False,input_shape=RESIZE_SHAPE)

        # Freeze the pretrained layers. The previous code assigned to
        # `layers.trainable` (the keras `layers` module), which froze nothing.
        for layer in conv_base.layers:
          layer.trainable = False

        #define model architecture
        #model = models.Sequential(conv_base.layers[DEPTH_OF_BASE:])
        #model.add(Flatten())

        model = models.Sequential()

        # Variable-length sequences when fitting one sample at a time, otherwise a
        # fixed window of 50 frames.
        # NOTE(review): 50 is hardcoded and is not derived from MIN_FRAME_COUNT or
        # the data - confirm it matches the frame count produced by process_batch.
        timesteps = None if FIT_1_SAMPLE else 50
        model.add(LSTM(1000, input_shape = (timesteps,OUT_SIZE),
                            kernel_regularizer=L1L2(l1=LSTM_L1_REGULARIZATION,
                                                    l2=LSTM_L2_REGULARIZATION),
                            name = "1_LSTM"))
        
        model.add(layers.Dense(128, activation='relu',
                               kernel_regularizer=L1L2(l1=L1_REGULARIZATION,
                                                      l2=L2_REGULARIZATION),
                               name = "2_Dense"))
        
        model.add(layers.Dropout(0.5, name = "3_Dropout"))

        model.add(layers.Dense(32, activation='relu',
                               kernel_regularizer=L1L2(l1=L1_REGULARIZATION,
                                                      l2=L2_REGULARIZATION),
                               name = "4_Dense"))
        
        model.add(layers.Dropout(0.5, name = "5_Dropout"))

        # single sigmoid unit: binary classification output
        model.add(layers.Dense(1, activation='sigmoid',
                               kernel_regularizer=L1L2(l1=L1_REGULARIZATION,
                                                      l2=L2_REGULARIZATION),
                               name = "6_Dense"))
        
        model.compile(optimizer=optimizers.Adam(learning_rate=LEARNING_RATE),
                                                      loss='binary_crossentropy',
                                                      metrics=['acc'])
    model.summary()

##FUNCTION TO LOAD DATA

In [None]:
if TRAIN_MODEL:    
    
    sub_path = FEATURE_PATH + 'train' + "/"
    train_options = [f for f in listdir(sub_path) if isfile(join(sub_path, f))]

    #load label dataframe, indexed by video file name
    DF_LABEL = pd.read_csv(LABELS_FILENAME)      
    DF_LABEL.set_index('filename', inplace = True)

    def _as_feature_array(per_video_features):
        """Convert a list of per-video feature arrays into one numpy array.

        Equal-shaped inputs are stacked into a regular array. Inputs of differing
        shapes (videos with different frame counts) become a 1-D object array with
        one entry per video. Plain np.array(...) did this implicitly on old NumPy
        but raises ValueError on NumPy >= 1.24.
        """
        if len({f.shape for f in per_video_features}) <= 1:
            return np.array(per_video_features)
        ragged = np.empty(len(per_video_features), dtype=object)
        for k, f in enumerate(per_video_features):
            ragged[k] = f
        return ragged
     
    def load_data(choice_train_set, test_prop = .3):
      """Load one training feature file plus a random sample of one test feature file.

      Parameters
      ----------
      choice_train_set : file name (inside FEATURE_PATH + 'train/') to load for training.
      test_prop        : fraction of the videos in a randomly chosen test file to sample.

      Only videos present in `train_metadata` are used; training videos must also
      have more than MIN_FRAME_COUNT frames.

      Returns
      -------
      train_features, train_labels, test_features, test_labels,
      train_wts, train_min_rec, test_wts, test_min_rec
      where *_min_rec is the smallest frame count in the set and *_wts is
      *_min_rec divided by each video's `num_frames` from the metadata.
      """

      ######TEST##########
      sample_type  = 'test'
      sub_path     = FEATURE_PATH + sample_type + "/"
      options      = [f for f in listdir(sub_path) if isfile(join(sub_path, f))]
      with h5py.File(sub_path + choice(options),'r') as feature_read:
          test = [i for i in feature_read.keys() if i in train_metadata.index]
          test = sample(test, int(test_prop*len(test))) #sample down
          test_features = [feature_read[key][:] for key in test]

      #get wts
      test_min_rec = min([i.shape[0] for i in test_features])
      test_wts     = (test_min_rec/train_metadata.loc[test,'num_frames']).values
      print("Test features loaded")


      ######TRAIN##########
      sample_type = 'train'
      sub_path    = FEATURE_PATH + sample_type + "/"
      with h5py.File(sub_path + choice_train_set,'r') as feature_read:
          # v.shape reads only the dataset header; the data itself is loaded once below
          train = [i for i,v in feature_read.items() if v.shape[0]>MIN_FRAME_COUNT]
          train = [i for i in train if i in train_metadata.index]
          train_features = [feature_read[key][:] for key in train]

      #get wts
      train_min_rec = min([i.shape[0] for i in train_features])
      train_wts     = (train_min_rec/train_metadata.loc[train,'num_frames']).values
      print("Train features loaded")

      #convert to arrays (object arrays when the frame counts differ)
      train_features = _as_feature_array(train_features)
      test_features  = _as_feature_array(test_features)

      #fetch train,test labels
      test_labels  = np.array( list(DF_LABEL.loc[[i  for i in test]].stalled) )
      print("Test Labels loaded")
      train_labels = np.array( list(DF_LABEL.loc[[i  for i in train]].stalled))
      print("Train Labels loaded")

      return train_features, train_labels, test_features,\
            test_labels, train_wts, train_min_rec, test_wts, test_min_rec

##FUNCTIONS FOR FITTING

In [None]:
#log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
#tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
def plot_tuple(d, path, save = False, ylim_ = (-1,-1)):
  """Plot several named series on one figure, one line per series.

  d     : iterable of (name, values) pairs. Names other than 'test_mcc' /
          'train_mcc' that do not start with 'v' get a 'train_' prefix in the legend.
  path  : file to save the figure to (only used when `save` is True).
  save  : write the figure to `path` before showing it.
  ylim_ : (low, high) y-axis limits; applied only when `high` is positive.
  """
  import matplotlib.pyplot as plt

  mcc_names = ('test_mcc', 'train_mcc')
  for name, values in d:
      if name in mcc_names or name[0] == 'v':
          legend_label = name
      else:
          legend_label = 'train_' + name
      # x axis counts from 1 and matches the actual number of values
      x_positions = range(1, len(values) + 1)
      plt.plot(x_positions, values, '.-', label=legend_label)

  plt.legend()  # draw the legend

  if ylim_[1] > 0:
      plt.ylim(ylim_[0], ylim_[1])

  if save:
      plt.savefig(path)

  plt.show()

def takespread(sequence, num):
    """Return `num` roughly evenly spaced indices out of range(sequence).

    sequence : number of available positions (an int, despite the name).
    num      : how many indices to pick.

    The i-th index is ceil(i * sequence / num); the result is a numpy array.
    """
    positions = range(sequence)
    span = float(len(positions))
    picked = []
    for step in range(num):
        picked.append(positions[int(ceil(step * span / num))])
    return np.array(picked)

def get_mcc(features,labels,type_ = FRAME_SUBSET_TYPE,
            FIT_1_SAMPLE = FIT_1_SAMPLE):
  """Matthews correlation coefficient of the global `model` on the given samples.

  features     : per-video feature arrays, subset to a common length by
                 process_batch using strategy `type_`.
  labels       : true binary labels, one per video.
  type_        : frame-subsetting strategy passed to process_batch.
  FIT_1_SAMPLE : predict one video at a time instead of as a single batch.

  Predictions above 0.5 count as class 1. Returns the coefficient rounded to two
  decimals, or the sentinel -2 when anything fails. The failure reason is now
  printed, and only `Exception` is caught, so KeyboardInterrupt is no longer
  swallowed.
  """
  try:
    #bring every video to the same number of frames
    features,labels = process_batch(features,labels,type_)

    if FIT_1_SAMPLE:
      pred = [model.predict([i]) for i in features]
    else:
      pred = model.predict(features)

    #flatten one level, then threshold at 0.5
    pred = [item for sublist in pred for item in sublist]
    pred = [1 if i>.5 else 0 for i in pred]

    labels = np.array(labels)

    return np.round(matthews_corrcoef(labels, pred),2)
  except Exception as err:
    print("get_mcc failed:", err)
    return -2

def plot_output_hist(train_features,test_features,path, 
                     save = False, FIT_1_SAMPLE = FIT_1_SAMPLE):
    """Overlay normalised histograms of the global `model`'s outputs on train and test data.

    train_features, test_features : inputs accepted by model.predict.
    path         : file to save the figure to (only used when `save` is True).
    save         : write the figure to `path` before showing it.
    FIT_1_SAMPLE : predict one sample at a time instead of as a single batch.
    """
    import matplotlib.pyplot as plt

    def _flat_scores(batch):
        # model outputs for `batch`, flattened by one level into a plain list
        if FIT_1_SAMPLE:
            raw = [model.predict([clip]) for clip in batch]
        else:
            raw = model.predict(batch)
        flat = []
        for row in raw:
            flat.extend(row)
        return flat

    # 20 bin edges spanning the [0, 1] output range
    bins = np.linspace(0, 1, 20)

    pred_train = _flat_scores(train_features)
    pred_test  = _flat_scores(test_features)

    plt.hist(pred_train, bins, alpha=0.5, label='train', density=True, color = "skyblue", ec="skyblue")
    plt.hist(pred_test, bins, alpha=0.5, label='test', density=True,color = "red", ec="red")

    plt.legend(loc='upper right')
    plt.title("Output Distribution")

    if save:
        plt.savefig(path)

    plt.show()

In [None]:
def process_batch(features,labels, type_ = 'even_spaced'):
    """Trim every sample in `features` to a common number of frames.

    The common length (`min_rec`) is the frame count of the shortest sample.
    `type_` selects which frames of each sample are kept:

      'even_spaced'       - min_rec indices chosen by takespread().
      'random'            - min_rec distinct indices drawn at random, kept in order.
      'last_n'            - the final min_rec frames.
      'last_n_subset'     - every second frame of the final min_rec frames.
      'random_subsection' - one contiguous run of min_rec frames at a random offset.

    Any other `type_` leaves `features` untouched.

    features : sequence of arrays indexed by frame along axis 0.
    labels   : returned unchanged, so callers can unpack both together.

    Returns (features, labels); for a recognised `type_`, features is the
    result of np.array() over the trimmed samples.
    Raises ValueError when `features` is empty (there is no shortest sample).
    """
    # The shortest sample dictates the frame count for the whole batch.
    min_rec        = min([i.shape[0] for i in features])
    n_samples      = len(features)

    if type_ == 'even_spaced':
        chosen_indexes = [takespread(i.shape[0],min_rec) for i in features]
        features       = np.array([features[i][chosen_indexes[i]] for i in range(n_samples)])

    elif type_ == 'random':
        chosen_indexes = [sorted(sample(list(range(i.shape[0])),min_rec)) for i in features]
        features       = np.array([features[i][chosen_indexes[i]] for i in range(n_samples)])

    elif type_ == 'last_n':
        # Slice from an explicit start offset: `x[-min_rec:]` would return the
        # WHOLE sample when min_rec == 0, producing ragged output.
        features       = np.array([f[f.shape[0] - min_rec:] for f in features])

    elif type_ == 'last_n_subset':
        # Same explicit-offset form as 'last_n', keeping every second frame.
        features       = np.array([f[f.shape[0] - min_rec::2] for f in features])

    elif type_ == 'random_subsection':
        # randint is inclusive on both ends, so the window always fits.
        chosen_indexes = [randint(0,i.shape[0] - min_rec) for i in features]
        features = [f[chosen_indexes[i]: chosen_indexes[i] + min_rec] for i,f in enumerate(features)]
        features = np.array(features)
    return features,labels



In [None]:
# Disabled scratch cell: for every training set, prints the train/test MCC
# obtained with each frame-subsetting strategy. Flip the condition to True to
# run it by hand.
# NOTE(review): when enabled this rebinds the global train_features /
# train_labels / test_features / test_labels, and it passes a hard-coded .7 to
# load_data where the training cells pass VALIDATION_SIZE.
if False: #only for testing

  for i in range(len(train_options)):
      # The last four values returned by load_data are not needed here.
      train_features, train_labels, test_features, test_labels,_,_,_,_ = load_data(train_options[i],
                                                                      .7)
      print("\n\nSet: ", i)
      # 'last_n_subset' is the only strategy handled by process_batch that is not compared here.
      for t in ['last_n','random', 'even_spaced','random_subsection']:
        test_mcc  = get_mcc(test_features,test_labels,t) #random, even_spaced
        train_mcc = get_mcc(train_features,train_labels,t)
        print(t,"\t\ttest:",test_mcc,"\ttrain:",train_mcc)

##TRAINING SETUP

In [None]:
if TRAIN_MODEL:

  n_sets = len(train_options)
  test_mcc_list  = []
  train_mcc_list = []
  # Load the first stream; its sizes are recorded in the hyper-parameter table below.
  train_features, train_labels, test_features,\
    test_labels, train_wts, train_min_rec, test_wts, test_min_rec = load_data(train_options[0],
                                                                      VALIDATION_SIZE)
  SAMPLES_IN_TRAIN_STREAM = train_features.shape[0]

  # Snapshot of the run configuration, written out as one row per parameter.
  hyper_param = pd.DataFrame({
      'EPOCH_PER_SET'         : EPOCH_PER_SET,
      'GLOBAL_ITER'           : GLOBAL_ITER,
      'FRAME_SUBSET_TYPE'     : FRAME_SUBSET_TYPE,
      'MIN_FRAME_COUNT'       : MIN_FRAME_COUNT,
      'LSTM_L1_REGULARIZATION': LSTM_L1_REGULARIZATION,
      'LSTM_L2_REGULARIZATION': LSTM_L2_REGULARIZATION,
      'L1_REGULARIZATION'     : L1_REGULARIZATION,
      'L2_REGULARIZATION'     : L2_REGULARIZATION,
      'MINI_BATCH_SIZE'       : MINI_BATCH_SIZE,
      'CLASS_WTS'             : str(CLASS_WTS),
      'FEATURE_USED'          : FEATURE_USED,
      'ROI_ROT'               : ROI_ROT,
      'LEARNING_RATE'         : LEARNING_RATE,
      'WEIGHT_BY_FRAME'       : WEIGHT_BY_FRAME,
      'DEPTH_OF_BASE'         : DEPTH_OF_BASE,
      'RESIZE_SHAPE'          : str(RESIZE_SHAPE),
      'TEST_STREAM_SIZE'      : int(test_features.shape[0]/VALIDATION_SIZE),
      'TRAIN_STREAM_SIZE'     : train_features.shape[0],
      'OUT_SIZE'              : OUT_SIZE,
      'FIT_1_SAMPLE'          : FIT_1_SAMPLE,
      'MINI_BATCH_ITERATION'  : MINI_BATCH_ITERATION
                              },\
                              index = ['Value'])
  hyper_param.T.to_csv(HYP_PARAM_FILE)

  # Resume the metric history of a previously saved model when one is available;
  # any failure (no model_name defined, no file, unreadable CSV) starts a fresh
  # history instead. `except Exception` keeps kernel interrupts propagating.
  try:
    # orient='list' gives {column: [values...]} directly.
    HISTORY = pd.read_csv(PERF_FILE_FORMAT + model_name + '.csv').to_dict(orient='list')

    print("Loaded performance history of ", model_name)
  except Exception:
    HISTORY = {'loss':[], 'acc':[], 'val_loss':[], 'val_acc':[]}

  # Recover the resume position from the model name: the third-from-last
  # '_'-separated field is the global iteration, the second-from-last the
  # index of the last finished set (hence the +1).
  try:
    iters_completed = int(model_name.split("_")[-3])
    sets_completed  = int(model_name.split("_")[-2]) + 1
    print("Already completed-  Iterations: {} Set_Number: {}".format(iters_completed,
                                                               sets_completed))
  except Exception:
    iters_completed = 0
    sets_completed  = 0

  # True until the first global iteration of the training cell has started.
  do_once_ = True
  fname_list = []

In [None]:

def maxper(validation_labels, pred):
    """Sweep decision thresholds 0.01 .. 0.99 and report the best MCC.

    validation_labels : ground-truth labels.
    pred              : model scores; flattened to 1-D before thresholding.

    For each threshold t/100 (t = 1..99) scores strictly above it become
    class 1, the rest class 0, and the Matthews correlation coefficient is
    computed. Thresholds for which the MCC computation raises are skipped.

    Prints the best MCC and returns a 3-tuple:
      - the best MCC rounded to two decimals,
      - the threshold (as an integer percent) that achieved it, or -1 if no
        threshold scored above -1,
      - the list of MCC values for the thresholds that could be scored.

    Raises ValueError if no threshold could be scored at all.
    """
    # Flatten once; the scores do not change between thresholds.
    scores = np.array(pred).flatten()
    l = []
    max_ = -1
    best = -1
    for thres in range(1,100):
        pred_ = (scores > thres/100).astype(int)
        try:
            m = matthews_corrcoef(validation_labels, pred_)
        except Exception:
            # Skip thresholds that cannot be scored, but let interrupts through.
            continue
        if m > max_:
            max_ = m
            best = thres
        l.append(m)
    if not l:
        raise ValueError("maxper: MCC could not be computed for any threshold")
    best_mcc = np.round(max(l),2)
    print("MCC: " , best_mcc)
    return best_mcc,best,l

##FITTING MODEL

In [None]:
# Main training loop: GLOBAL_ITER passes over every training set, fitting the
# global `model` for EPOCH_PER_SET epochs per set, then saving the model, the
# accumulated metric history, MCC scores and diagnostic plots after each set.
if TRAIN_MODEL:
  for iteration in range(iters_completed,GLOBAL_ITER):
    # Only the first pass after a resume skips the sets that were already
    # trained; every later global iteration starts again from set 0.
    if not do_once_:
      sets_completed = 0
    do_once_ = False
    failed_iterations = []  # never filled in this cell; name kept from the original
    for set_num_, choice_train_set in enumerate(train_options[sets_completed:]):
      set_num = set_num_ + sets_completed
      do_once = False
      print("\n\n\nGlobal Iteration: ({}/{}) Set Number: ({}/{}) Epochs: {}  File: {} Version: {}".format(iteration,
                                                GLOBAL_ITER,
                                                set_num,
                                                n_sets,
                                                EPOCH_PER_SET,
                                                choice_train_set,
                                                MODEL_VERSION[:-1]) )

      train_features, train_labels, test_features,\
        test_labels, train_wts, train_min_rec, test_wts, test_min_rec = load_data(choice_train_set,
                                                                          VALIDATION_SIZE)

      print("Avg Train Wt: {} Avg Test Wt: {}  Train Prop: {}  Test Prop: {}".format(\
                    np.round(np.mean(train_wts),2), np.round(np.mean(test_wts),2),
                    np.round(np.mean(train_labels),2), np.round(np.mean(test_labels),2)))
      #shuffle train
      index = list(range(train_features.shape[0]))
      shuffle(index)
      train_features = train_features[index]
      train_labels   = train_labels[index]
      train_wts      = train_wts[index]

      #shuffle/sample test
      index = list(range(test_features.shape[0]))
      shuffle(index)
      test_features = test_features[index]
      test_labels   = test_labels[index]
      test_wts      = test_wts[index]

      if not FIT_1_SAMPLE:
        #subset frames for train
        train_features,train_labels = process_batch(train_features,
                                                    train_labels,
                                                    type_ = FRAME_SUBSET_TYPE)
        #subset frames for test
        test_features,test_labels  = process_batch(test_features,
                                                   test_labels,
                                                   type_ = FRAME_SUBSET_TYPE)

      # Per-sample weights: the class weight, optionally scaled by the
      # per-sample weight returned by load_data.
      if WEIGHT_BY_FRAME:
        test_wts        = np.array([CLASS_WTS[i]*test_wts[ind] for ind,i in enumerate(test_labels)])
        train_wts       = np.array([CLASS_WTS[i]*train_wts[ind] for ind,i in enumerate(train_labels)])
      else:
        test_wts    = np.array([CLASS_WTS[i] for i in test_labels])
        train_wts   = np.array([CLASS_WTS[i] for i in train_labels])

      if FIT_1_SAMPLE:
        # One fit call per sample. No validation data is passed here, so this
        # path produces no 'val_*' metrics.
        # NOTE(review): sample weights are not passed in this mode - confirm intended.
        warnings.filterwarnings("ignore")
        history = {}
        for e_ in tqdm(list(range(EPOCH_PER_SET))):
          for i,s_ in enumerate(train_features):
            history_ = model.fit(np.array([s_]),
                                 np.array([train_labels[i]]),
                                 batch_size       = 1,
                                 epochs           = 1,
                                 verbose = False)
            history_ = history_.history
            # Each value is a one-epoch list; extend keeps the history flat
            # and includes the very first fit call.
            for k,v in history_.items():
              history.setdefault(k, []).extend(v)

      elif MINI_BATCH_ITERATION:
        # Order samples by frame count before cutting them into mini-batches.
        ind = [i[0] for i in sorted(enumerate(train_features), key=lambda x:x[1].shape[0])]
        train_features = train_features[ind]
        train_labels   = train_labels[ind]
        train_wts      = train_wts[ind]  # keep weights aligned with the reordered samples

        # Consecutive index chunks of at most MINI_BATCH_SIZE samples; also
        # valid when there are fewer samples than one full mini-batch.
        sample_count =  train_features.shape[0]
        index_list   =  [np.arange(start, min(start + MINI_BATCH_SIZE, sample_count))
                         for start in range(0, sample_count, MINI_BATCH_SIZE)]

        assert sum([i.shape[0] for i in index_list]) == sample_count

        history = {}
        # NOTE(review): test_features_/test_labels_ are computed but the fit
        # call below validates on test_features/test_labels - confirm intended.
        test_features_,test_labels_   = process_batch(test_features,
                                                      test_labels,
                                                      type_ = FRAME_SUBSET_TYPE)
        for e_ in tqdm(list(range(EPOCH_PER_SET))):
          for index_set in index_list:
            train_features_,train_labels_ = process_batch(train_features[index_set],
                                                          train_labels[index_set],
                                                          type_ = FRAME_SUBSET_TYPE)

            batch_size_ = min(MINI_BATCH_SIZE,train_features_.shape[0])
            history_ = model.fit(train_features_,
                                 train_labels_,
                                 batch_size       = batch_size_,
                                 epochs           = 1,
                                 sample_weight    = train_wts[index_set],
                                 validation_data  = (test_features,
                                                     test_labels,
                                                     test_wts),
                                 verbose = False)
            history_ = history_.history
            # Flat accumulation, first mini-batch included.
            for k,v in history_.items():
              history.setdefault(k, []).extend(v)
      else:
        # Standard path: a single fit call over the whole stream.
        # (No TensorBoard callback is attached.)
        history = model.fit(train_features,
                            train_labels,
                            batch_size       = MINI_BATCH_SIZE,
                            epochs           = EPOCH_PER_SET,
                            sample_weight    = train_wts,
                            validation_data  = (test_features,
                                                test_labels,
                                                test_wts))
        history = history.history

      #filename
      fname =  "{}_{}_{}".format(iteration,
                                 set_num,
                                 EPOCH_PER_SET)
      fname_list.append(fname)

      # Merge this set's metrics into the running history; setdefault copes
      # with metric names that were not present when HISTORY was created.
      for key in history.keys():
        HISTORY.setdefault(key, []).extend(history[key])

      #plot and save loss - train,test
      # ylim_ is currently not passed to plot_tuple.
      ylim_ =  (np.min(HISTORY['loss'][-20:]), np.max(HISTORY['loss'][-20:]))
      plot_tuple(tuple((k, HISTORY[k][-50:]) for k in ('loss', 'val_loss')),\
                 path= LOSS_IMAGE_NAME.format(fname),\
                 save = True)

      #save model
      model.save(MODEL_FILENAME + fname)
      print("Model saved to : ", MODEL_FILENAME + fname)
      model_name = MODEL_VERSION + fname
      # Metrics can have different lengths (e.g. no 'val_*' values in
      # FIT_1_SAMPLE mode); pad the shorter ones with NaN so the table can be
      # built. With equal lengths this is the same table as before.
      longest = max(len(v) for v in HISTORY.values())
      perf_df = pd.DataFrame({k: list(v) + [np.nan] * (longest - len(v))
                              for k, v in HISTORY.items()})
      perf_df.to_csv(MODEL_PERF_FILE.format(fname), index = False)

      #mcc calculations (done after every set)
      test_mcc  = get_mcc(test_features,test_labels,
                          type_   = FRAME_SUBSET_TYPE)
      train_mcc = get_mcc(train_features,train_labels,
                          type_   = FRAME_SUBSET_TYPE )

      #append values to lists
      test_mcc_list.append(test_mcc)
      train_mcc_list.append(train_mcc)
      # Label rows with file names only when both lists line up.
      check = len(fname_list) == len(train_mcc_list)
      MCC_df = pd.DataFrame({'Test MCC':test_mcc_list,'Train MCC':train_mcc_list},
                            index = fname_list if check else None)
      MCC_df.to_csv(MODEL_MCC_FORMAT.format(fname), index = check)

      #plot graphs
      plot_tuple((('train_mcc',train_mcc_list),('test_mcc', test_mcc_list)),
                 path= MCC_IMAGE_NAME.format(fname),
                 save = True)

      #plot output histogram
      plot_output_hist(train_features,test_features,
                       path= OUT_DIST_IMAGE_NAME.format(fname),
                       save = True)

In [None]:
# Open TensorBoard inside the notebook, reading event files from logs/fit.
# NOTE(review): the model.fit calls in the training cell do not pass a
# TensorBoard callback - confirm that something actually writes to logs/fit.
%tensorboard --logdir logs/fit