In [1]:
import os
import numpy as np


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler



In [8]:
main_path = "C:/HolisticVideoUnderstanding/sampled_train"
classes = os.listdir(main_path)

# build the scaler model
scaler = MinMaxScaler()

x_v = []
x_t = []
y = []


y_categorical = None

for idx, name in enumerate(classes):
    
    """
    if idx == 5:
        break
    """
    
    class_path = os.path.join(main_path, name)
    class_videos = os.listdir(class_path)
    
    class_set_v = []
    class_set_t = []
    class_id = []
    
    for idx_sub, class_video in enumerate(class_videos):
                        
        
        class_video_path = os.path.join(class_path, class_video)
        # Elmo_Mean.npz, Elmo
        # InceptionResnetV2_MaxPooling.npz, InceptionResnetV2
        class_video_visual_feature = np.load(os.path.join(class_video_path, "DenseNet_201.npz"))['DenseNet_201']
        class_video_textual_feature = np.load(os.path.join(class_video_path, "Elmo_Mean.npz"))['Elmo']
        
        class_video_visual_feature = scaler.fit_transform(class_video_visual_feature.reshape(-1, 1))
        class_video_textual_feature = scaler.fit_transform(class_video_textual_feature.reshape(-1, 1))
        
        class_set_v.append(class_video_visual_feature)
        class_set_t.append(class_video_textual_feature)
        class_id.append(idx)
        
    
    x_v.extend(class_set_v)
    x_t.extend(class_set_t)
    y.extend(class_id)
    
y_categorical = to_categorical(y)

In [9]:
len(x_v), len(x_t), len(y), len(y_categorical)

(6400, 6400, 6400, 6400)

In [10]:
for i in range(len(x_v)):
    print(np.min(x_v[i]), np.max(x_v[i]), np.mean(x_v[i]))

0.0 1.0 0.045157
0.0 1.0 0.04532809
0.0 0.9999999 0.05905854
0.0 1.0 0.036080893
0.0 1.0 0.05544835
0.0 1.0 0.044050418
0.0 1.0 0.048093364
0.0 0.99999994 0.0459336
0.0 1.0 0.05554258
0.0 1.0 0.066049784
0.0 0.99999994 0.060846638
0.0 1.0 0.050224375
0.0 0.9999999 0.058129504
0.0 0.99999994 0.056826457
0.0 1.0 0.06632224
0.0 1.0 0.04629124
0.0 1.0 0.05559886
0.0 0.99999994 0.074452095
0.0 1.0 0.052044515
0.0 0.99999994 0.06079121
0.0 1.0 0.053066473
0.0 1.0 0.043033566
0.0 1.0 0.051231325
0.0 1.0 0.037751026
0.0 0.9999999 0.041781712
0.0 1.0 0.10983076
0.0 1.0000001 0.053909462
0.0 1.0 0.0482887
0.0 1.0 0.08336263
0.0 1.0 0.040716745
0.0 1.0 0.03428728
0.0 1.0 0.03650201
0.0 1.0 0.04562941
0.0 0.99999994 0.06830027
0.0 1.0 0.060613804
0.0 1.0 0.03674289
0.0 0.99999994 0.067678116
0.0 1.0 0.05172567
0.0 1.0 0.042584635
0.0 1.0 0.052599523
0.0 1.0 0.05241932
0.0 1.0 0.026068829
0.0 1.0 0.07257236
0.0 1.0 0.060806055
0.0 1.0 0.02766833
0.0 0.9999999 0.05648254
0.0 1.0 0.06042107
0.0 1.0 0

In [11]:
x_train_v, x_test_v, x_train_t, x_test_t, y_train, y_test = train_test_split(x_v, x_t, y_categorical, test_size = 0.20, random_state = 12345)
x_train_v, x_test_v, x_train_t, x_test_t, y_train, y_test = np.array(x_train_v), np.array(x_test_v), np.array(x_train_t), np.array(x_test_t),  np.array(y_train), np.array(y_test)

In [16]:
inputs_v = keras.Input(shape=(1920,))
dense_v = layers.Dense(960, activation="relu")(inputs_v)
dense_v = layers.Dense(480, activation="relu")(dense_v)
model_v = keras.Model(inputs=inputs_v, outputs=dense_v, name="feature_v")

inputs_t = keras.Input(shape=(1024,))
dense_t = layers.Dense(512, activation="relu")(inputs_t)
dense_t = layers.Dense(128, activation="relu")(dense_t)
model_t = keras.Model(inputs=inputs_t, outputs=dense_t, name="feature_t")


# combine the output of the two branches
combined = layers.concatenate([model_v.output, model_t.output])

# apply a FC layer and then a regression prediction on the
# combined outputs
z = layers.Dense(128, activation="relu")(combined)
z = layers.Dense(64, activation="sigmoid")(z)

model = keras.Model(inputs=[model_v.input, model_t.input], outputs=z)

model.summary()

model.compile(
    loss=keras.losses.CategoricalCrossentropy(),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"]
)

history = model.fit([x_train_v, x_train_t], y_train, batch_size=32, epochs=10, validation_split=0.1)

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 1920)]       0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 1024)]       0                                            
__________________________________________________________________________________________________
dense_18 (Dense)                (None, 960)          1844160     input_7[0][0]                    
__________________________________________________________________________________________________
dense_20 (Dense)                (None, 512)          524800      input_8[0][0]                    
____________________________________________________________________________________________

In [17]:
test_scores = model.evaluate([x_test_v, x_test_t], y_test, verbose=2)

40/40 - 0s - loss: 1.4817 - accuracy: 0.6156
