# Train model based on VGGish features

__VGGish__: 128-dimensional audio features extracted at 1Hz. The audio features were extracted using a VGG-inspired acoustic model described in Hershey et. al., trained on a preliminary version of YouTube-8M. The features were PCA-ed and quantized to be compatible with the audio features provided with YouTube-8M. They are stored as TensorFlow Record files.

To my understanding: VGGish model turns audio classification problems into image classification problems. They create 2D image-like patches by computing log-mel spectrograms of multiple frames, and feed that into models. The input thus become some transformations of visual representation of spectrum of frequencies of the signal as it changes with time.

Reference: https://arxiv.org/pdf/1609.09430.pdf

__PCA__: Two primary reasons for use
- Data reduction: condense the information contained in a large number of original variables into a smaller set of new composite dimensions, with a minimum loss of information.
- Interpretation: discover important features of a large data set that often reveals relationships that were previously unsuspected, thereby allowing interpretations that would not ordinarily result.

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
from tensorflow import keras
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
from shutil import *

In [3]:
# The directory should be arranged in form:
# .
# ├── audioset_v1_embeddings
# ├── class_labels_indices.csv
# └── Model_on_VGG.ipynb

path = "audioset_v1_embeddings/"
eva = "eval/"
bal = "bal_train/"
unbal = "unbal_train/"

-------------------------------
-------------------------------
-------------------------------
-------------------------------

## 1: convert .tfrecord info into .csv 

In [4]:
index_label = pd.read_csv("class_labels_indices.csv")

In [5]:
def map_index_to_label(index_label, index):
    # Maps index to readable labels
    # Return label is a list, that could contain more than 1 item, but refer to the same label
    # Return labels are all in lower form, no CAP
    # e.g. ['male speech', 'man speaking']
    #
    # input: pandas.DataFrame index_to_label_df, int index
    # output: list readable_label
    
    # Get the real index of df, in case of mismatch
    index = index_label.loc[index_label['index'] == index].index[0] 
    labels = [label.strip().lower() for label in index_label.iloc[index]['display_name'].split(",")]
    return labels

In [6]:
def map_label_to_index(index_label, label):
    # Maps readable labels to index
    #
    # input: pandas.DataFrame index_to_label_df, str label
    # output: int index
    label = label.lower()
    labelCap = label.capitalize()
    for index, row in index_label.iterrows():
        labels = [label.strip() for label in row['display_name'].split(",")]
        if label in labels or labelCap in labels:
            return index_label.iloc[index]['index']
    return -1

In [34]:
def read_tfrecord(filename):
    # Read in a tfrecord file
    # Store information in list of lists
    # Audio_embedding is a list of 10 embeddings, each represents 1 second feature
    #
    # input: str filename
    # output: pandas dataframe with columns:
    #        [str video_id, float start_time, float end_time, list label_index, list embed]
    
    if not filename.endswith('.tfrecord'):
        print("This file is not a .tfrecord file.")
        return
    raw_dataset = tf.data.TFRecordDataset(filename)
    return_df = pd.DataFrame(columns=
                             ['video_id', 'start_time_seconds', 'end_time_seconds', 'labels', 'audio_embedding'])
    for raw_record in raw_dataset:
        cur_record_list = []
        example = tf.train.SequenceExample()
        example.ParseFromString(raw_record.numpy())
        
        cur_record_list.append(example.context.feature['video_id'].bytes_list.value[0].decode("utf-8"))
        cur_record_list.append(example.context.feature['start_time_seconds'].float_list.value[0])
        cur_record_list.append(example.context.feature['end_time_seconds'].float_list.value[0])
        cur_record_list.append(example.context.feature['labels'].int64_list.value)
        
        # Original embeddings are stored in hex format, now convert them to readable int
        embeds = []
        for i in range(len(example.feature_lists.feature_list['audio_embedding'].feature)):
            hexembed = example.feature_lists.feature_list['audio_embedding'].feature[i].bytes_list.value[0].hex()
            arrayembed = [int(hexembed[i:i+2], 16) for i in range(0, len(hexembed), 2)]
            embeds.append(arrayembed)
        cur_record_list.append(embeds)
        
        return_df.loc[len(return_df)] = cur_record_list
    return return_df

In [35]:
def convert_csv_tfrecord_dir(dir_path, dest):
    # Read and concat all tfrecord files in a directory
    # and save to csv in appending mode
    #
    # input: str directory_path, (must contain '/' in the end, e.g. 'unbal_train/')
    # output: int count_tfrecord_files
    cnt = 0
    for file in os.listdir(dir_path):
        if file.endswith(".tfrecord"):
            df = read_tfrecord(dir_path+file)
            # Original method, that read all data into a df, then store it as csv
            # takes too much RAM, and causes crashes when reach limit
            # Now update to batch save to avoid exploding uses of RAM
            # But still, CPU usage is too high, hope to upgrade later
            if cnt == 0:
                df.to_csv(dest, index=False, header=True)
            else:
                df.to_csv(dest, mode='a', index=False, header=False)
            cnt += 1
    return cnt

In [36]:
convert_csv_tfrecord_dir(path+eva, path+'eval.csv')

4062

In [9]:
# convert_csv_tfrecord_dir(path+bal, path+'bal_train.csv')

In [10]:
# Takes a looooooooooong time!
# convert_csv_tfrecord_dir(path+unbal, path+'unbal_train.csv')

-------------------------------
-------------------------------
-------------------------------
-------------------------------

## 2: prepare data for model training

In [37]:
eval_info = pd.read_csv(path+"eval.csv")
# bal_info = pd.read_csv(path+"bal_train.csv")
# unbal_info = pd.read_csv(path+"unbal_train.csv")

In [38]:
eval_info.head()

Unnamed: 0,video_id,start_time_seconds,end_time_seconds,labels,audio_embedding
0,W85w938G5ZM,30.0,40.0,"[137, 260, 266]","[[162, 68, 162, 174, 183, 49, 234, 95, 108, 10..."
1,W8KC0GO8VvY,240.0,250.0,"[195, 210]","[[117, 255, 128, 77, 77, 255, 23, 221, 255, 25..."
2,W8vOwbkWZfQ,30.0,40.0,"[0, 16, 357]","[[126, 147, 202, 181, 207, 168, 110, 35, 205, ..."
3,W8XlZWqklbY,200.0,210.0,"[286, 287, 289, 290]","[[108, 143, 103, 82, 182, 13, 188, 93, 225, 11..."
4,W8yvnhWdyAs,30.0,40.0,"[0, 5, 137]","[[42, 91, 176, 145, 33, 235, 255, 241, 8, 0, 2..."


In [39]:
def get_df_with_label(df, label):
    # Create a sub-df from df where labeled as <label>
    #
    # input: df df, label string
    # output: df df_elements_with_<label>
    index = map_label_to_index(index_label, label)
    # labels read from csv file become str instead of list
    mask = df.labels.apply(lambda x: True if index in [int(l) for l in x[1:-1].split(',')] else False)
    df1 = df[mask]
    return df1

In [40]:
def drop_sub_df(df, df_subset):
    # (df - df_subset), where df_subset is a subset of df
    # 
    # input: df df, df subset_of_df
    # output: df 
    df_new = df.merge(df_subset, how='left', indicator=True)
    df_new = df_new[df_new['_merge'] == 'left_only']
    del df_new['_merge']
    return df_new

In [41]:
def df_sample(df, size):
    # Get a subset from df, where contains n elemets, randomly sampled from df
    #
    # input: df df, size number_of_elements_in_subset
    # output: df
    return df.sample(n = size) 

In [102]:
def df_add_col_with_value(df, col_name, value):
    # Add a column to given df with specified value
    #
    # input: df df, col_name column_name, value default_value_of_new_column
    # output: df df
    df[col_name] = value
    return df

def get_xy_from_df(df, norm=False):
    # Return the value of specific column as list
    # Do something different to 'audio_embedding', turn list of strs to list of ints
    #
    # input: df df, col_name column_name
    # output: list column_values
    total = df['audio_embedding'].tolist()
    total_y = df['y'].tolist()
    ret = []
    ret_y = []
    for index, embeds_10s in enumerate(total):
        tmp = embeds_10s[1:-1]
        tmp = tmp.replace(',', ' ').split(']')
        for embed_1s in tmp:
            embed_1s = embed_1s.replace('[', '').strip().split()
            if norm:
                ret.append([int(num)/255 for num in embed_1s])
            else:
                ret.append([int(num) for num in embed_1s])
            ret_y.append(total_y[index])
    return ret, ret_y

In [89]:
def df_get_labelled_unlabelled(df, label, ratio_of_label_unlabel):
    # Create a new df, that contains all rows with <label>
    # and n * len(<label>) rows without <label>
    # The new df contains a new columns that indicate whether contain <label>
    #
    # input: df df, str label, int ratio_of_label:unlabel
    # output: df new_df
    labelled = get_df_with_label(df, label)
    unlabelled = drop_sub_df(df, labelled)
    unlabelled = df_sample(unlabelled, len(labelled) * ratio_of_label_unlabel)
    labelled = df_add_col_with_value(labelled, 'y', 1)
    unlabelled = df_add_col_with_value(unlabelled, 'y', 0)
    new_df = pd.concat([labelled, unlabelled],ignore_index=True)
    # Shuffle df
    new_df = new_df.iloc[np.random.permutation(len(new_df))]
    return new_df

In [109]:
def balance_df_size(dfs, a, b, c):
    # Rebalance the size of three dataframes train, val, test
    # Reset the ratio of train:val:test to a:b:c
    # Keep the original ratio of label:unlabel in new dfs
    #
    # input: list[df] dfs, int a, int b, int c
    # output: list[df] rebalanced_dfs
    total = pd.concat(dfs, ignore_index=True)
    labeled = total.loc[total['y'] == 1]
    unlabeled = drop_sub_df(total, labeled)
    
    train_labeled = labeled.sample(frac=(a/(a+b+c)))
    tmp_labeled = drop_sub_df(labeled, train_labeled)
    val_labeled = tmp_labeled.sample(frac=(b/(b+c)))
    test_labeled = drop_sub_df(tmp_labeled, val_labeled)
    
    train_unlabeled = unlabeled.sample(frac=(a/(a+b+c)))
    tmp_unlabeled = drop_sub_df(unlabeled, train_unlabeled)
    val_unlabeled = tmp_unlabeled.sample(frac=(b/(b+c)))
    test_unlabeled = drop_sub_df(tmp_unlabeled, val_unlabeled)
    
    train = pd.concat([train_labeled, train_unlabeled], ignore_index=True)
    val = pd.concat([val_labeled, val_unlabeled], ignore_index=True)
    test = pd.concat([test_labeled, test_unlabeled], ignore_index=True)
    return [train, val, test]

def data_for_model(dfs, label, ratio_of_label_unlabel, a=0, b=0, c=0, norm=False):
    # Combine previous functions and form x, y value lists for model
    # Return train_x/y, val_x/y, test_x/y, in one call
    # ratio_of_label_unlabel is used for df_get_labelled_unlablled, as the ratio of label:unlabel
    # a, b, c is ratio of train:val:test, if they are not specified, then do not modify
    # norm decide whether normalize the embeddings.
    #
    # input: list[df] dfs, str label, int ratio_of_label:unlabel, int a, int b, int c, bool norm
    # output: lists train/val/test_x/y
    train = df_get_labelled_unlabelled(dfs[0], label, ratio_of_label_unlabel)
    val = df_get_labelled_unlabelled(dfs[1], label, ratio_of_label_unlabel)
    test = df_get_labelled_unlabelled(dfs[2], label, ratio_of_label_unlabel)
    
    if a!=0 or b!=0 or c!=0:
        train, val, test = balance_df_size([train, val, test], a, b, c)
        
    train_x, train_y = get_xy_from_df(train, norm=norm)
    val_x, val_y = get_xy_from_df(val, norm=norm)
    test_x, test_y = get_xy_from_df(test, norm=norm)
    return train_x, train_y, val_x, val_y, test_x, test_y

In [110]:
def visualize_training(history, filename):
    # Visulize training result
    #
    # input: history output_of_model.fit
    # output: None
    history_dict = history.history
    history_dict.keys()
    acc = history_dict['accuracy']
    val_acc = history_dict['val_accuracy']
    loss = history_dict['loss']
    val_loss = history_dict['val_loss']

    epochs = range(1, len(acc) + 1)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,7))
    
    # "bo" is for "blue dot"
    ax1.plot(epochs, loss, 'bo', label='Training loss')
    # b is for "solid blue line"
    ax1.plot(epochs, val_loss, 'b', label='Validation loss')
    ax1.set_title('Training and validation loss')
    ax1.set_xlabel('Epochs')
    ax1.set_ylabel('Loss')
    ax1.legend()
    
    ax2.plot(epochs, acc, 'bo', label='Training acc')
    ax2.plot(epochs, val_acc, 'b', label='Validation acc')
    ax2.set_title('Training and validation accuracy')
    ax2.set_xlabel('Epochs')
    ax2.set_ylabel('Accuracy')
    ax2.legend(loc='lower right')
    
#     plt.show()
    plt.savefig(filename, bbox_inches='tight')

def print_test_result(model, test_x, text_y):
    results = model.evaluate(test_x, test_y, verbose=2)
    for name, value in zip(model.metrics_names, results):
        print("%s: %.3f" % (name, value))

In [111]:
def find_pic_with_keyword(src_dirs, dst_dir, keywords):
    # Copy the targeted files from src_dirs to dst_dir
    # Eg. find_pic_with_keyword(['results1/', 'results3/'], 'tmp/', ['1_elu_'])
    # dirs must contain '/' in the end
    # 
    # input: list[str] src_dirs, str dst_dir, list[str] keywords
    # output: None
    rmtree(dst_dir)
    os.mkdir(dst_dir)
    for src_dir in src_dirs:
        files = [f for f in listdir(src_dir) if isfile(join(src_dir, f))]
        mov_files = [f for f in files if all(k in f for k in keywords)]
        for f in mov_files:
            copyfile(src_dir+f, dst_dir+f)

-------------------------------
-------------------------------
-------------------------------
-------------------------------

## 3.1: build and train model [exploration]
### First trial: 
Focus on balanced data input, where ratio is set to 1.  
Check results1/README.md for detail information:  
https://github.com/googleinterns/activity-recognition/blob/snore-AudioPrep/snore/VGG/results1/README.md

In [19]:
def model_config_train_1(data, activation, optimizer, metrics, epochs):
    # Get train, val, test sets, and config to train model
    #
    # input: list[list] data, str activation, str optimizer, list[str] metrics, int epochs
    # output: history output_of_model.fit
    train_x, train_y, val_x, val_y, test_x, test_y = data
    
    model = keras.Sequential([
        keras.Input(shape=(128,)),
        keras.layers.Dense(128, activation=activation),
        keras.layers.Dense(1)
    ])
    model.compile(optimizer=optimizer,
             loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=metrics)
    history = model.fit(train_x, train_y,
                   epochs=epochs,
                   validation_data=(val_x, val_y),
                    verbose=1)
    return history

In [None]:
ratio_list = [1, 20, 40]
activation_list = ['elu', 'exponential', 'relu', 'selu', 'sigmoid', 'softmax', 'softplus', 'softsign', 'tanh']
optimizer_list = ['adadelta', 'adagrad', 'adam', 'adamax', 'ftrl', 'nadam', 'rmsprop', 'sgd']
metrics_list = [['accuracy']]
epochs_list = [20, 30, 40, 50]

for ratio in ratio_list:
    data = data_for_model([unbal_info, bal_info, eval_info], 'snoring', ratio)
    for activation in activation_list:
        for optimizer in optimizer_list:
            for metrics in metrics_list:
                for epochs in epochs_list:
                    history = model_config_train_1(data, activation, optimizer, metrics, epochs)
                    visualize_training(history, 
                                       'results1/'+str(ratio)+'_'+activation+'_'+
                                       optimizer+'_'+'_'.join(metrics)+'_'+str(epochs)+'.png')

-------------------------------
-------------------------------
-------------------------------
-------------------------------

## 3.2: build and train model [increase val size]
### Second trial: 
__Set train : val : test to 8 : 1 : 1__  
__Give unbalanced data for training, in order to get larger training size.__  
Check results2/README.md for detail information:  
https://github.com/googleinterns/activity-recognition/blob/snore-AudioPrep/snore/VGG/results2/README.md

In [28]:
def model_config_train_2(data, activation, optimizer, metrics, epochs):
    # Get train, val, test sets, and config to train model
    #
    # input: list[list] data, str activation, str optimizer, list[str] metrics, int epochs
    # output: history output_of_model.fit
    train_x, train_y, val_x, val_y, test_x, test_y = data
    
    model = keras.Sequential([
        keras.Input(shape=(128,)),
        keras.layers.Dense(128, activation=activation),
        keras.layers.Dense(1)
    ])
    model.compile(optimizer=optimizer,
             loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=metrics)
    history = model.fit(train_x, train_y,
                   epochs=epochs,
                   validation_data=(val_x, val_y),
                    verbose=1)
    return history

In [None]:
ratio_list = [1, 10]
activation_list = ['elu', 'relu', 'selu', 'sigmoid', 'softsign', 'tanh']
optimizer_list = ['adagrad', 'adam', 'adamax', 'ftrl', 'nadam', 'rmsprop']
metrics_list = [['accuracy']]
epochs_list = [20, 40, 60, 80, 100]

for ratio in ratio_list:
    data = data_for_model([unbal_info, bal_info, eval_info], 'snoring', ratio, a=8, b=1, c=1)
    for activation in activation_list:
        for optimizer in optimizer_list:
            for metrics in metrics_list:
                for epochs in epochs_list:
                    history = model_config_train_2(data, activation, optimizer, metrics, epochs)
                    visualize_training(history, 
                                       'results2/'+str(ratio)+'_'+activation+'_'+
                                       optimizer+'_'+'_'.join(metrics)+'_'+str(epochs)+'.png')

-------------------------------
-------------------------------
-------------------------------
-------------------------------

## 3.3: build and train model [normalization]
### Third trial: 
__Since VGG turns audio problem to image problem, the 128-dimension embeddings are at range [0, 255]. Should be better if we normalize them.__  
__Train : val : test = 8 : 1 : 1__  
Check results3/README.md for detail information:  
https://github.com/googleinterns/activity-recognition/blob/snore-AudioPrep/snore/VGG/results3/README.md

In [24]:
def model_config_train_3(data, activation, optimizer, metrics, epochs):
    # Get train, val, test sets, and config to train model
    #
    # input: list[list] data, str activation, str optimizer, list[str] metrics, int epochs
    # output: history output_of_model.fit
    train_x, train_y, val_x, val_y, test_x, test_y = data
    
    model = keras.Sequential([
        keras.Input(shape=(128,)),
        keras.layers.Dense(128, activation=activation),
        keras.layers.Dense(1)
    ])
    model.compile(optimizer=optimizer,
             loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=metrics)
    history = model.fit(train_x, train_y,
                   epochs=epochs,
                   validation_data=(val_x, val_y),
                    verbose=1)
    return history

In [None]:
ratio_list = [1]
activation_list = ['elu', 'exponential', 'relu', 'selu', 'sigmoid', 'softmax', 'softplus', 'softsign', 'tanh']
optimizer_list = ['adadelta', 'adagrad', 'adam', 'adamax', 'ftrl', 'nadam', 'rmsprop', 'sgd']
metrics_list = [['accuracy']]
epochs_list = [30, 50]

for ratio in ratio_list:
    data = data_for_model([unbal_info, bal_info, eval_info], 
                          'snoring', 1, a=8, b=1, c=1, norm=True)
    for activation in activation_list:
        for optimizer in optimizer_list:
            for metrics in metrics_list:
                for epochs in epochs_list:
                    history = model_config_train_3(data, activation, optimizer, metrics, epochs)
                    visualize_training(history, 
                                       'results3/trail3_'+str(ratio)+'_'+activation+'_'+
                                       optimizer+'_'+'_'.join(metrics)+'_'+str(epochs)+'.png')

In [51]:
find_pic_with_keyword(['results1/', 'results3/'], 'tmp/', ['_softsign_'])

-------------------------------
-------------------------------
-------------------------------
-------------------------------

## 3.4: fixed a bug in importing tfrecord, get 10 times more data!!!
### Fourth trial: 
__Found a bug in importing .tfrecord, now we get 10 TIMES MORE data for training, validating, and testing!!!!__  
__Eval.csv, bal.csv, and unbal.csv have been changed directly, so previous trials could not be repeated with original configuration after this commit.__  
__Train : val : test = 7 : 2 : 1__  
Check results4/README.md for detail information:  