# TRAINING AUTOENCODER MODEL

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import os
import random
import pandas as pd
import cv2
import time

from google.colab import drive
drive.mount('/content/drive')

# Geometry of one input sample: frames per window, people per frame,
# coordinate columns per person, and channels.
WINDOW_SIZE = 5
MAX_PERSONS = 6
# One x and one y column for each tracked upper-body keypoint.
NO_COLS = len((
    "neck_x", "neck_y",
    "right shoulder_x", "right shoulder_y",
    "right elbow_x", "right elbow_y",
    "right wrist_x", "right wrist_y",
    "left shoulder_x", "left shoulder_y",
    "left elbow_x", "left elbow_y",
    "left wrist_x", "left wrist_y",
))
CHANNELS = 1

# Training hyper-parameters.
epochs = 25
batch_size = 22
lr = 0.001
# Fractions of the file list at which the train/validation and
# validation/test boundaries are placed.
split1 = 0.7
split2 = 0.9

Mounted at /content/drive


In [None]:
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# listing is sorted before the seeded shuffle; without this, seed 42 does not
# yield the same train/val/test split from one run to the next.
files = sorted(os.listdir("/content/drive/MyDrive/GSoC/npy_files/"))
random.seed(42)
random.shuffle(files)
files = ["/content/drive/MyDrive/GSoC/npy_files/" + fil for fil in files]
samples = len(files)
l1 = int(samples * split1)
l2 = int(samples * split2)
files_train, files_val, files_test = files[:l1], files[l1:l2], files[l2:]


def _load_windows(paths):
    """Load every record from the given .npy files.

    Each file is expected to hold records that unpack into
    (frame, window, label). The frame is ignored here.

    Args:
        paths: iterable of .npy file paths.

    Returns:
        A pair (x, y): x is a float32 array of windows with a trailing
        channel axis appended, y is an int array of the matching labels.
    """
    windows, labels = [], []
    for fil in paths:
        with open(fil, "rb") as npf:
            # NOTE: allow_pickle=True unpickles arbitrary objects; only use it
            # on files you produced yourself.
            data = np.load(npf, allow_pickle=True)
        for _frame, d, lb in data:
            # Channel-last, matching the model input
            # (WINDOW_SIZE, MAX_PERSONS, NO_COLS, CHANNELS) and the later cells.
            windows.append(np.expand_dims(d, axis=-1))
            labels.append(lb)
    return np.array(windows, dtype=np.float32), np.array(labels, dtype=int)


# Only the first 14 training files are loaded (presumably a memory cap - confirm).
x_train, y_train = _load_windows(files_train[:14])
x_val, y_val = _load_windows(files_val)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import os
import random
import cv2
import time


# Input window geometry: frames x persons x coordinate columns x channels.
WINDOW_SIZE, MAX_PERSONS = 5, 6
NO_COLS = 14  # coordinate columns per person; adjust to match the data
CHANNELS = 1
# Width of the latent vector produced by the encoder.
LATENT_DIM = 64


def get_encoder():
    """Build the encoder half of the autoencoder.

    The input has shape (WINDOW_SIZE, MAX_PERSONS, NO_COLS, CHANNELS). It is
    passed through one ConvLSTM2D layer, two Conv3D layers with 64 filters,
    a stride-1 max pool, and two Conv3D layers with 128 filters; every
    convolutional layer is followed by batch normalisation. The globally
    averaged features are then projected onto a LATENT_DIM-wide dense layer.

    Returns:
        A tf.keras Model mapping an input window to its latent vector.
    """

    def conv_bn(tensor, n_filters):
        # One 3x3x3 "same"-padded ReLU convolution followed by batch norm.
        tensor = layers.Conv3D(
            filters=n_filters,
            kernel_size=(3, 3, 3),
            padding="same",
            activation="relu",
        )(tensor)
        return layers.BatchNormalization()(tensor)

    window = layers.Input(shape=(WINDOW_SIZE, MAX_PERSONS, NO_COLS, CHANNELS))

    features = layers.ConvLSTM2D(
        filters=64,
        kernel_size=(3, 3),
        padding="same",
        return_sequences=True,
        activation="relu",
    )(window)
    features = layers.BatchNormalization()(features)

    for _ in range(2):
        features = conv_bn(features, 64)
    features = layers.MaxPool3D(pool_size=(3, 3, 3), strides=(1, 1, 1))(features)

    for _ in range(2):
        features = conv_bn(features, 128)
    features = layers.GlobalAveragePooling3D()(features)

    code = layers.Dense(units=LATENT_DIM, activation="relu")(features)

    return tf.keras.models.Model(inputs=window, outputs=code)


def get_decoder():
    """Build the decoder half of the autoencoder.

    A LATENT_DIM-wide vector is expanded through dense layers of 128, 256 and
    512 units, then through a dense layer with one unit per element of the
    input window, and finally reshaped to
    (WINDOW_SIZE, MAX_PERSONS, NO_COLS, CHANNELS). All layers, including the
    output layer, use ReLU, so reconstructed values are never negative.

    Returns:
        A tf.keras Model mapping a latent vector to a reconstructed window.
    """
    code = layers.Input(shape=(LATENT_DIM,))

    hidden = code
    for width in (128, 256, 512):
        hidden = layers.Dense(units=width, activation="relu")(hidden)

    flat_size = WINDOW_SIZE * MAX_PERSONS * NO_COLS * CHANNELS
    hidden = layers.Dense(units=flat_size, activation="relu")(hidden)
    window = layers.Reshape((WINDOW_SIZE, MAX_PERSONS, NO_COLS, CHANNELS))(hidden)

    return tf.keras.models.Model(inputs=code, outputs=window)


# Build the two halves of the autoencoder.
encoder = get_encoder()
decoder = get_decoder()

# Chain them: input window -> latent vector -> reconstructed window.
encoder_input = layers.Input(shape=(WINDOW_SIZE, MAX_PERSONS, NO_COLS, CHANNELS))
encoded = encoder(encoder_input)
decoded = decoder(encoded)

# Combined model trained end to end.
autoencoder = tf.keras.models.Model(inputs=encoder_input, outputs=decoded)

# Mean squared error between each window and its reconstruction.
autoencoder.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr)
)

# summary() prints the table itself and returns None, so it is not wrapped in
# print(); doing so only appended a stray "None" line to the output.
autoencoder.summary()

# Rebuild the model inputs with a trailing channel axis. Labels are not kept
# here: the autoencoder is fitted with each window as its own target.
x_train = []
x_val = []

# Only the first 14 training files are read; all validation files are read.
for paths, bucket in ((files_train[:14], x_train), (files_val, x_val)):
    for path in paths:
        with open(path, "rb") as handle:
            records = np.load(handle, allow_pickle=True)
        # Each record unpacks into three items; only the middle one is used.
        for _first, window, _last in records:
            bucket.append(np.expand_dims(window, axis=-1))

x_train = np.array(x_train, dtype=np.float32)
x_val = np.array(x_val, dtype=np.float32)


# TensorBoard event files are written here; create the directory up front.
logs_dir = "/content/models_dir/logs/"
os.makedirs(logs_dir, exist_ok=True)

# Halve the learning rate after 4 epochs without val_loss improvement.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=4,
    verbose=1,
)

tensorboard = tf.keras.callbacks.TensorBoard(log_dir=logs_dir)

# Stop after 8 epochs without val_loss improvement, with
# restore_best_weights enabled.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=8,
    restore_best_weights=True,
    verbose=1,
)

# Write the model to disk only when val_loss reaches a new minimum.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="/content/models_dir/best_autoencoder_model_2.h5",
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
# Learning rate scheduler to adjust learning rate schedule if needed
def lr_schedule(epoch):
    """Return the learning rate to use for the given epoch index.

    Step decay: 0.001 while epoch < 10, 0.0001 while epoch < 20,
    and 0.00001 from then on.
    """
    steps = ((10, 0.001), (20, 0.0001))
    for boundary, rate in steps:
        if epoch < boundary:
            return rate
    return 0.00001

lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_schedule)

# reduce_lr (ReduceLROnPlateau) is deliberately left out of the list:
# LearningRateScheduler sets the optimizer's learning rate from lr_schedule at
# the start of every epoch, so a plateau-based reduction made at the end of an
# epoch is overwritten before it can affect a single training step. Listing
# both only produced misleading "reducing learning rate" log lines.
callbacks = [tensorboard, early_stopping, model_checkpoint, lr_scheduler]

# Train the autoencoder: each window is both the input and the target.
# NOTE(review): epochs is hard-coded to 30 here although an `epochs = 25`
# constant is defined in the first cell - confirm which value is intended.
autoencoder.fit(
    x_train, x_train,
    batch_size=batch_size,
    epochs=30,
    validation_data=(x_val, x_val),
    callbacks=callbacks,
)



Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 5, 6, 14, 1)]     0         
                                                                 
 model (Functional)          (None, 64)                1045184   
                                                                 
 model_1 (Functional)        (None, 5, 6, 14, 1)       388388    
                                                                 
Total params: 1,433,572
Trainable params: 1,432,676
Non-trainable params: 896
_________________________________________________________________
None
Epoch 1/30
Epoch 1: val_loss improved from inf to 2809.87866, saving model to /content/models_dir/best_autoencoder_model_2.h5
Epoch 2/30
Epoch 2: val_loss improved from 2809.87866 to 2381.36499, saving model to /content/models_dir/best_autoencoder_model_2.h5
Epoch 3/30
Epoch 3: val_loss improved from 2381

<keras.callbacks.History at 0x78a87e002170>

In [None]:
# Save the end-of-training model under its own name. The previous target,
# best_autoencoder_model_2.h5, is the file ModelCheckpoint(save_best_only=True)
# writes during fit(); saving there again can replace the best-val_loss
# checkpoint with whatever weights the model holds when training ends.
autoencoder.save("/content/models_dir/final_autoencoder_model_2.h5")


In [None]:
# Reconstruction loss (the compiled 'mse') on the validation windows,
# computed silently and then reported.
mse = autoencoder.evaluate(
    x=x_val,
    y=x_val,
    verbose=0,
)
print("Validation Mean Squared Error:", mse)

Validation Mean Squared Error: 1683.8480224609375


# VALIDATION

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import os
import random
import cv2
import time

# Same window geometry and latent width as used for the autoencoder.
WINDOW_SIZE, MAX_PERSONS, NO_COLS, CHANNELS = 5, 6, 14, 1
LATENT_DIM = 64
# Number of label classes the classifier predicts.
num_classes = 6

# Reload the windows (middle item of every record) with a trailing channel
# axis; the other two items of each record are discarded.
x_train = []
for path in files_train[:14]:
    with open(path, "rb") as handle:
        records = np.load(handle, allow_pickle=True)
    x_train.extend(np.expand_dims(window, axis=-1) for _, window, _ in records)

x_val = []
for path in files_val:
    with open(path, "rb") as handle:
        records = np.load(handle, allow_pickle=True)
    x_val.extend(np.expand_dims(window, axis=-1) for _, window, _ in records)

x_train = np.array(x_train, dtype=np.float32)
x_val = np.array(x_val, dtype=np.float32)



# The .h5 file holds the complete autoencoder (input -> encoder -> decoder),
# not just the encoder.
autoencoder_full = tf.keras.models.load_model('/content/drive/MyDrive/GSoC/best_autoencoder_model_2.h5')

# Layer 0 is the InputLayer and layer 1 the nested encoder model (see the
# model summary printed after training), so layer 1 alone yields the latent
# vectors. Assumes the file was saved from the autoencoder built above.
encoder_model = autoencoder_full.layers[1]

# Previously x_train_encoded / x_val_encoded were used without ever being
# computed, which raises NameError on a fresh run.
x_train_encoded = encoder_model.predict(x_train, verbose=0)
x_val_encoded = encoder_model.predict(x_val, verbose=0)

# y_train / y_val come from the first data-loading cell, which reads the same
# files in the same order as the x_train / x_val rebuilt in this cell.
y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes)
y_val_encoded = tf.keras.utils.to_categorical(y_val, num_classes)

encoded_shape = x_train_encoded.shape[1:]  # Excluding the sample dimension

# Flatten each sample to one feature vector for the dense classifier.
x_train_encoded_flattened = x_train_encoded.reshape(-1, np.prod(encoded_shape))
x_val_encoded_flattened = x_val_encoded.reshape(-1, np.prod(encoded_shape))

def get_improved_classifier(input_dim, num_classes):
    """Build and compile a dense softmax classifier.

    Args:
        input_dim: length of each flattened input feature vector.
        num_classes: number of output classes.

    Returns:
        A compiled tf.keras Sequential model: three ReLU dense layers
        (256, 128, 64 units), each followed by 50% dropout, and a softmax
        output layer, trained with categorical cross-entropy and Adam.
    """
    model = tf.keras.Sequential()
    model.add(layers.Input(shape=(input_dim,)))

    for width in (256, 128, 64):
        model.add(layers.Dense(units=width, activation='relu'))
        model.add(layers.Dropout(0.5))

    model.add(layers.Dense(units=num_classes, activation='softmax'))

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

# get_improved_classifier() already compiles the model, so the second
# compile() call that used to follow was redundant and has been dropped.
improved_classifier_model = get_improved_classifier(
    np.prod(encoded_shape), num_classes
)

# Train the classifier on the flattened encoded representations and one-hot
# labels. validation_split holds out the last 20% of the training arrays for
# monitoring; the separate validation set is used only for the final score.
history = improved_classifier_model.fit(
    x_train_encoded_flattened, y_train_encoded,
    batch_size=batch_size,
    epochs=30,
    validation_split=0.2
)

# evaluate() returns [loss, accuracy] for this model; index 1 is the accuracy.
improved_classifier_accuracy = improved_classifier_model.evaluate(x_val_encoded_flattened, y_val_encoded)[1]
print("Improved Classifier Accuracy:", improved_classifier_accuracy)









Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Improved Classifier Accuracy: 0.4726839065551758


# PREDICTION

In [None]:
# Load the saved model from Drive for inference.
# NOTE(review): presumably a copy of the checkpoint written to
# /content/models_dir/ during training - verify the file is up to date.
autoencoder_model = tf.keras.models.load_model('/content/drive/MyDrive/GSoC/best_autoencoder_model_2.h5')


In [None]:
import numpy as np

x_val = []
y_val = []

with open("/content/drive/MyDrive/GSoC/2014-11-11_0000_US_KNBC_The_Ellen_DeGeneres_Show_1930-2276_npy-detect_w5_p6_r0.025.npy", "rb") as npf:
    data = np.load(npf, allow_pickle=True)

skipped = 0
for item in data:
    # Rows of a loaded object array are ndarrays, not tuples, so accept any
    # sequence-like record; a tuple-only check would drop every such row.
    is_sequence = isinstance(item, (tuple, list)) or (
        isinstance(item, np.ndarray) and item.ndim >= 1
    )
    if is_sequence and len(item) >= 2:
        frame = item[0]  # Frame identifier
        d = item[1]      # Window data
        # Channel-last, matching the model input
        # (WINDOW_SIZE, MAX_PERSONS, NO_COLS, CHANNELS).
        x_val.append(np.expand_dims(d, axis=-1))
        y_val.append(frame)
    else:
        # Count malformed records instead of dropping them silently.
        skipped += 1

if skipped:
    print(f"Skipped {skipped} record(s) with an unexpected structure out of {len(data)}")

x_val = np.array(x_val, dtype=np.float32)
y_val = np.array(y_val, dtype=int)

In [None]:

with open("/content/drive/MyDrive/GSoC/npy_files/2014-11-11_0000_US_KNBC_The_Ellen_DeGeneres_Show_1930-2276_npy-train_w5_p6_r0.025.npy", "rb") as handle:
    data = np.load(handle, allow_pickle=True)

# Keep only the middle item of each three-item record and append a trailing
# channel axis to it; `data` itself is reused later to recover the frames.
x_test = np.array(
    [np.expand_dims(window, axis=-1) for _frame, window, _label in data],
    dtype=np.float32,
)


In [None]:
# Make sure the model receives float32 input, then run inference with a
# progress bar.
test_windows = np.asarray(x_test, dtype=np.float32)
predictions = autoencoder_model.predict(test_windows, verbose=1)




In [None]:
# Pair every loaded record with the model output at the same position and
# store [frame, boolean mask] rows.
results = []
for frame_data, prediction in zip(data, predictions):
    frame = frame_data[0]  # Assuming frame data is a sequence with frame at index 0
    d = frame_data[1]      # Assuming d is at index 1 in frame_data (not used below)

    # NOTE(review): autoencoder_model is loaded from the autoencoder file, so
    # `prediction` is presumably a reconstructed window rather than a class
    # probability; comparing its first slice with 0.35 may not mean
    # "gesture present" - confirm the intended model and threshold.
    results.append([frame, prediction[0] > .35])  # Use your desired threshold


In [None]:
import pandas as pd
import numpy as np

df = pd.DataFrame(results, columns=["frame", "gesture"])

# A row is kept when any element of its boolean "gesture" mask is True.
has_gesture = df["gesture"].apply(np.any)
flagged_rows = df[has_gesture]
frames_with_gesture = flagged_rows["frame"].to_numpy()

# Persist the selected frame values to Drive.
np.save('/content/drive/MyDrive/GSoC/frames_with_gesture-autoencoder.npy', frames_with_gesture)



In [None]:
# Number of frames selected by the threshold above.
len(frames_with_gesture)


9838