In [65]:
import pandas as pd
import pickle
import os
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import yaml
import cv2
import numpy as np
import matplotlib.pyplot as plt
import joblib

class DataSet():
    def __init__(self , config):
        
        self.config= config
        self.vid_path = os.path.join('..',config['dataset']['path'])
        
        
        
        with open ('../../../data/temp_balanced_dataset_pd.pandas','rb') as file:
            self.df_dataset  = pickle.load(file)
            
#         vids = "../../../data/video"
#         vids =  os.listdir(vids)
#         self.df_dataset  =self.df_dataset .loc[self.df_dataset ['filename'].isin(vids)].reset_index(drop=True)
        
#     def split_train_test(self):
#         return train_test_split(self.df_dataset, self.df_dataset.stalled, test_size=0.15, random_state=42,
#                                 stratify=self.df_dataset)
    
    def split_train_test(self):
        return train_test_split(self.df_dataset, self.df_dataset.stalled, test_size=0.15, random_state=42)    
    
    def preprocess_standard(self, x):
        return x / 255.  
    
#     def prepare_image(self, img, size, preprocessing_function, aug=False):
#         img = scipy.misc.imresize(img, size)
#         img = np.array(img).astype(np.float64)
#         if aug: img = augment(img, np.random.randint(7))
#         img = preprocessing_function(img)
#         return img
    @staticmethod
    def getFrame( vidcap , sec , image_name ):
        vidcap.set(cv2.CAP_PROP_POS_MSEC, sec * 1000)
        hasFrames,image = vidcap.read()
        if(hasFrames):
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        return image ,hasFrames
    
    
    @staticmethod
    def extract_location_area_from_highlighted_curve(image , size,  preprocessing_function):
        # convert to hsv to detect the outlined orange area
        hsv = cv2.cvtColor(image,cv2.COLOR_BGR2HSV)
        lower_red = np.array([100,120,150])
        upper_red = np.array([110,255,255])
        # create a mask
        mask1 = cv2.inRange(hsv, lower_red, upper_red)
        mask1 = cv2.dilate(mask1, None, iterations=2)
        mask_ind = np.where(mask1>0)
        xmin , xmax = min(mask_ind[1]) , max(mask_ind[1])
        ymin , ymax = min(mask_ind[0]) , max(mask_ind[0])
        # remove orange line from the image
        image[mask_ind ]=0,0,0
        # fill the area to skip the data outside of this area
        ret,mask1 = cv2.threshold(mask1,10,255,cv2.THRESH_BINARY_INV)
        contours,hierarchy = cv2.findContours(mask1, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
        contours = [ctr for ctr in contours if cv2.contourArea(ctr) < 5*(mask1.shape[0]*mask1.shape[1])/6]
        contours = sorted(contours, key=cv2.contourArea, reverse=True)
        cv2.drawContours(mask1, [contours[-1]], -1, (0, 0, 0), -1)
        # remove data out of the outlined area
        image[mask1>0] = (0,0,0)

        image = image[ ymin:ymax , xmin:xmax ]


        mask2 = cv2.cvtColor(image , cv2.COLOR_RGB2GRAY)
        ret,mask2 = cv2.threshold(mask2,90,255,cv2.THRESH_BINARY)

        contours,hierarchy = cv2.findContours(mask2, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
        # contours = [ctr for ctr in contours if cv2.contourArea(ctr) < 5*(mask1.shape[0]*mask1.shape[1])/6]
        contours = sorted(contours, key=cv2.contourArea, reverse=True)
        

        vessels_image  = np.zeros_like(image)


        areas = []
        for ctr in contours:
            if cv2.contourArea(ctr) > 50:

                cv2.drawContours(image, [ctr], -1, (255, 0, 0), -1)
                cv2.drawContours(vessels_image, [ctr], -1, (255, 255, 255), -1)

                xxmin , xxmax = min(ctr[:,:,0])[0] , max(ctr[:,:,0])[0]
                yymin , yymax = min(ctr[:,:,1])[0] , max(ctr[:,:,1])[0]
    #             image = cv2.rectangle(image , (xxmin ,yymin) ,(xxmax , yymax),(0,255,0),1,1)
                areas.append([xxmin ,yymin ,xxmax ,yymax ,cv2.contourArea(ctr) ])
    #             print(xxmin ,yymin ,xxmax ,yymax)
#         plt.figure()
#         plt.imshow(np.hstack((image,vessels_image)))
        vessels_image = cv2.resize(vessels_image ,(size[0], size[1]))
        vessels_image = cv2.cvtColor(vessels_image , cv2.COLOR_RGB2GRAY)
        vessels_image = preprocessing_function(vessels_image)
    #     area
        return vessels_image.astype(np.float64) 


    
    def process_video(self, foldername ,filename, size, preprocessing_function):
        
        path = os.path.join(self.vid_path, foldername ,filename.split('.')[0])
        data__ = joblib.load(f"{path}.lzma")
        vessels_tensor = data__[0][:,:,:,1]
#         print(filename)
#         print(vessels_tensor.shape)

#         total_frames = vidcap.get(cv2.CAP_PROP_FRAME_COUNT)
#         frame_size = (int(vidcap.get(cv2.CAP_PROP_FRAME_WIDTH)) , int(vidcap.get(cv2.CAP_PROP_FRAME_HEIGHT )))
#         fps = vidcap.get(cv2.CAP_PROP_FPS)
#         Video_len = total_frames / fps
#         from_sec = 0
#         step  = 1.
#         time_stamp = np.linspace(from_sec , Video_len , int(total_frames / step) )

#         vessels_tensor = np.zeros([1,size[0],size[1]])

#         for frame in range(int(total_frames)):
#             image , hasframe = self.getFrame(vidcap ,time_stamp[frame] , frame)
#             if hasframe:
#                 vessels_image = self.extract_location_area_from_highlighted_curve(image ,size, preprocessing_function)
#                 vessels_tensor =np.append(vessels_tensor ,vessels_image[np.newaxis,...], axis=0)
                
        if vessels_tensor.shape[0]<100:
            vessels_tensor = np.append(vessels_tensor , np.zeros((100 - len(vessels_tensor),size[0], size[1])),axis=0)
        if vessels_tensor.shape[0]>100:
            vessels_tensor = vessels_tensor[:100]


        return vessels_tensor[...,np.newaxis]
    
    def get_class_one_hot(self, tag , min_value= 0):
        
        onHotTarget = np.ones((2), dtype=np.float64 )* min_value
       
        onHotTarget[int(tag)] = 1
        return onHotTarget
 
    def data_generator(self, data, which_net='standard', size=(150,150), batch_size=2): 
        if which_net == 'resnet50': 
            preprocessing_function=self.preprocess_input_resnet50
        elif which_net == 'densenet': 
            preprocessing_function=self.preprocess_input_densenet
        elif which_net == 'inception': 
            preprocessing_function=self.preprocess_input_inception
        elif which_net == 'vgg': 
            preprocessing_function=self.preprocess_input_vgg16
        elif which_net == 'standard': 
            preprocessing_function=self.preprocess_standard
            
#         filename, tag = data.loc[0,['filename','stalled']].values
#         processed_video = self.process_video(filename, size, preprocessing_function)
#         return self.process_video(filename, size, preprocessing_function)
        
        while True:
            for start in range(0, len(data), batch_size):
                x_batch = []
                y_batch = []
                end = min(start + batch_size, len(data))
                data_batch = data[start:end]
                for foldername , filename, tag in data_batch.loc[:,['folder_name','filename','stalled']].values:
                    processed_video = self.process_video(foldername ,filename, size, preprocessing_function)
                    x_batch.append(processed_video)
                    y_batch.append(self.get_class_one_hot(tag))
                x_batch = np.array(x_batch)
                y_batch = np.array(y_batch)
                yield x_batch, y_batch
#                 return x_batch, y_batch
                
                
config = '../../script/config.yml'   

with open (config , 'rb') as file:
    config = yaml.safe_load(file)
    
print(config)
dataset = DataSet(config)

{'train': {'resultDirectoryName': 'result', 'experimentFolder': 'dir1', 'experimentName': 'exp1', 'trainBatchSize': 1, 'valBatchSize': 1, 'num_Workers': 0, 'startEpoch': 0, 'endEpoch': 50, 'lr': {'init': 1e-05, 'lr_decay': 0.5, 'lr_decay_epoch': 10}}, 'options': {'logger': {'flag': True}}, 'model': {'load': {'flag': False, 'path': './'}}, 'dataset': {'path': '..\\..\\data', 'num_frames': 65, 'credentials_path': '..\\..\\data\\credentials.yml', 'remove_donloaded_video': True, 'save_dir': '..\\..\\data\\generated_Tensors', 'Multiprocessing_num_cores': 4, 'K': 10, 'filter': {'limit': {'flag': False, 'min': 0, 'max': 100}, 'tier1': [True, False], 'project_id': ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'M'], 'stalled': [0], 'crowd_score': [0.0, 0.0]}}}


In [59]:
from tqdm.notebook import tqdm

for data in tqdm(dataset.data_generator(dataset.df_dataset,batch_size=1)):
    print(data[0].shape ,data[1].shape)
# k[0].shape ,k[1].shape
# dataset.df_dataset.loc[0,['filename','stalled']].values

# dataset.df_dataset.stalled

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

100000.mp4
(1, 100, 150, 150, 1) (1, 2)
100289.mp4
(1, 100, 150, 150, 1) (1, 2)
100002.mp4
(1, 121, 150, 150, 1) (1, 2)
102952.mp4
(1, 100, 150, 150, 1) (1, 2)
100003.mp4
(1, 100, 150, 150, 1) (1, 2)
103390.mp4
(1, 100, 150, 150, 1) (1, 2)
100004.mp4
(1, 100, 150, 150, 1) (1, 2)
104360.mp4
(1, 100, 150, 150, 1) (1, 2)
100005.mp4
(1, 100, 150, 150, 1) (1, 2)
105028.mp4
(1, 100, 150, 150, 1) (1, 2)
100006.mp4
(1, 100, 150, 150, 1) (1, 2)
105159.mp4
(1, 100, 150, 150, 1) (1, 2)
100007.mp4
(1, 100, 150, 150, 1) (1, 2)
105668.mp4
(1, 100, 150, 150, 1) (1, 2)
100008.mp4
(1, 100, 150, 150, 1) (1, 2)
106076.mp4
(1, 100, 150, 150, 1) (1, 2)
100010.mp4
(1, 100, 150, 150, 1) (1, 2)
106540.mp4
(1, 100, 150, 150, 1) (1, 2)
100011.mp4
(1, 100, 150, 150, 1) (1, 2)
109827.mp4
(1, 100, 150, 150, 1) (1, 2)
100012.mp4
(1, 100, 150, 150, 1) (1, 2)
110488.mp4
(1, 100, 150, 150, 1) (1, 2)
100013.mp4
(1, 100, 150, 150, 1) (1, 2)
110787.mp4
(1, 100, 150, 150, 1) (1, 2)
100014.mp4
(1, 100, 150, 150, 1) (1, 2)


100111.mp4
(1, 100, 150, 150, 1) (1, 2)
229432.mp4
(1, 100, 150, 150, 1) (1, 2)
100112.mp4
(1, 100, 150, 150, 1) (1, 2)
230036.mp4
(1, 100, 150, 150, 1) (1, 2)
100113.mp4
(1, 100, 150, 150, 1) (1, 2)
230913.mp4
(1, 100, 150, 150, 1) (1, 2)
100114.mp4
(1, 100, 150, 150, 1) (1, 2)
231046.mp4
(1, 100, 150, 150, 1) (1, 2)
100115.mp4
(1, 100, 150, 150, 1) (1, 2)
231285.mp4
(1, 100, 150, 150, 1) (1, 2)
100116.mp4
(1, 100, 150, 150, 1) (1, 2)
232105.mp4
(1, 100, 150, 150, 1) (1, 2)
100117.mp4
(1, 100, 150, 150, 1) (1, 2)
232341.mp4
(1, 100, 150, 150, 1) (1, 2)
100118.mp4
(1, 100, 150, 150, 1) (1, 2)
232606.mp4
(1, 100, 150, 150, 1) (1, 2)
100119.mp4
(1, 100, 150, 150, 1) (1, 2)
232767.mp4
(1, 100, 150, 150, 1) (1, 2)
100121.mp4
(1, 100, 150, 150, 1) (1, 2)
238800.mp4
(1, 100, 150, 150, 1) (1, 2)
100122.mp4
(1, 100, 150, 150, 1) (1, 2)
239613.mp4
(1, 100, 150, 150, 1) (1, 2)
100123.mp4
(1, 100, 150, 150, 1) (1, 2)
240586.mp4
(1, 100, 150, 150, 1) (1, 2)
100124.mp4
(1, 100, 150, 150, 1) (1, 2)


KeyboardInterrupt: 

In [142]:
# with open ('../../../data/whole_train_dataset.pandas','rb') as file:
#             df_dataset  = pickle.load(file)
# df_dataset[1:10].loc[:,['filename','vid_id']].values
# df_dataset

In [46]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import *
from keras.layers import LSTM
from keras.datasets import imdb


model = Sequential()
model.add(TimeDistributed(Conv2D(32, (7, 7), strides=(2, 2), activation='relu', padding='same'), input_shape=(100, 150, 150, 1)))
model.add(TimeDistributed(Conv2D(32, (3,3), kernel_initializer="he_normal", activation='relu')))
model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))
 
model.add(TimeDistributed(Conv2D(64, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(Conv2D(64, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))
 
model.add(TimeDistributed(Conv2D(128, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(Conv2D(128, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))
 
model.add(TimeDistributed(Conv2D(256, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(Conv2D(256, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))
 
model.add(TimeDistributed(Conv2D(512, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(Conv2D(512, (3,3), padding='same', activation='relu')))
model.add(TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2))))
 
model.add(TimeDistributed(Flatten()))
 
model.add(Dropout(0.5))
model.add(LSTM(256, return_sequences=False, dropout=0.5))
model.add(Dense(2, activation='softmax'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_33 (TimeDis (None, 100, 75, 75, 32)   1600      
_________________________________________________________________
time_distributed_34 (TimeDis (None, 100, 73, 73, 32)   9248      
_________________________________________________________________
time_distributed_35 (TimeDis (None, 100, 36, 36, 32)   0         
_________________________________________________________________
time_distributed_36 (TimeDis (None, 100, 36, 36, 64)   18496     
_________________________________________________________________
time_distributed_37 (TimeDis (None, 100, 36, 36, 64)   36928     
_________________________________________________________________
time_distributed_38 (TimeDis (None, 100, 18, 18, 64)   0         
_________________________________________________________________
time_distributed_39 (TimeDis (None, 100, 18, 18, 128) 

In [66]:
data = DataSet(config)
X_train, X_test, y_train, y_test = data.split_train_test()

In [67]:
from keras.utils import multi_gpu_model
from keras.optimizers import Adam, SGD

# parallel_model = multi_gpu_model(model, gpus=4)
# parallel_model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', metrics = ['accuracy'])

batch_size = 1
epochs = 1
size = (150, 150)

train_steps = len(X_train) / batch_size
valid_steps = len(X_test) / batch_size
        
# parallel_model.fit_generator(data.data_generator(X_train, 'standard', size=size, batch_size=batch_size), 
#                     train_steps, epochs=epochs, verbose=5, 
#                     validation_data=data.data_generator(X_test, 'standard', size=size, batch_size=batch_size), 
#                     validation_steps=valid_steps)

model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', metrics = ['accuracy'])

model.fit_generator(data.data_generator(X_train, 'standard', size=size, batch_size=batch_size), 
                    train_steps, epochs=epochs, verbose=1, 
                    validation_data=data.data_generator(X_test, 'standard', size=size, batch_size=batch_size), 
                    validation_steps=valid_steps)

Epoch 1/1
 120/8500 [..............................] - ETA: 8:56:54 - loss: 0.6856 - accuracy: 0.5417

ValueError: Error when checking input: expected time_distributed_33_input to have shape (100, 150, 150, 1) but got array with shape (99, 150, 150, 1)

In [5]:
import pickle
with open ('../../../data/temp_balanced_dataset_pd.pandas','rb') as file:
            df_dataset  = pickle.load(file)
df_dataset

Unnamed: 0,crowd_score,filename,folder_name,micro,nano,num_frames,project_id,stalled,tier1,url,vid_id
0,0.000000,100000.mp4,flowing_Tensors,0.0,0.0,54.0,M,0.0,1.0,s3://drivendata-competition-clog-loss/train/10...,0.0
1,0.765824,100289.mp4,stall_Tensors,1.0,1.0,59.0,G,1.0,1.0,s3://drivendata-competition-clog-loss/train/10...,280.0
2,0.000000,100002.mp4,flowing_Tensors,0.0,0.0,122.0,H,0.0,1.0,s3://drivendata-competition-clog-loss/train/10...,2.0
3,1.000000,102952.mp4,stall_Tensors,1.0,1.0,54.0,G,1.0,1.0,s3://drivendata-competition-clog-loss/train/10...,2871.0
4,0.000000,100003.mp4,flowing_Tensors,0.0,0.0,55.0,E,0.0,1.0,s3://drivendata-competition-clog-loss/train/10...,3.0
...,...,...,...,...,...,...,...,...,...,...,...
9995,1.000000,308238.mp4,stall_Tensors,1.0,1.0,59.0,K,1.0,1.0,s3://drivendata-competition-clog-loss/train/30...,203211.0
9996,0.000000,105588.mp4,flowing_Tensors,0.0,0.0,51.0,H,0.0,1.0,s3://drivendata-competition-clog-loss/train/10...,5433.0
9997,1.000000,308413.mp4,stall_Tensors,1.0,1.0,79.0,C,1.0,1.0,s3://drivendata-competition-clog-loss/train/30...,203383.0
9998,0.000000,105589.mp4,flowing_Tensors,0.0,0.0,47.0,H,0.0,1.0,s3://drivendata-competition-clog-loss/train/10...,5434.0
