In [None]:
#Install missing dependencies

!pip install pydub

In [1]:
#Import necessary libraries

import joblib
import pandas as pd
import matplotlib.pyplot as plt
from pydub import AudioSegment
from pydub.utils import make_chunks
import numpy as np
import glob
import librosa
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Input, Activation, Flatten, Dense
from tensorflow.keras.models import Model
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import compute_sample_weight
from imblearn.over_sampling import SMOTE

In [2]:
#Load the label tables of the training and development splits

train, dev = (
  pd.read_csv(labels_csv)
  for labels_csv in (
    '/content/drive/MyDrive/AVEC_Challenge_Packed/labels/train_split.csv',
    '/content/drive/MyDrive/AVEC_Challenge_Packed/labels/dev_split.csv',
  )
)

In [3]:
#Pull the participant ids and both PHQ label columns out of the training table

train_ids, train_targets, train_binary = (
  train[column] for column in ('Participant_ID', 'PHQ_Score', 'PHQ_Binary')
)

In [4]:
#Pull the participant ids and both PHQ label columns out of the validation table

dev_ids, dev_targets, dev_binary = (
  dev[column] for column in ('Participant_ID', 'PHQ_Score', 'PHQ_Binary')
)

In [7]:
#Collect the training recordings and transcripts, each list in sorted path order

train_audio = glob.glob('/content/drive/MyDrive/avec/train/*.wav')
train_audio.sort()

train_transcripts = glob.glob('/content/drive/MyDrive/avec/train/*.csv')
train_transcripts.sort()

In [8]:
#Collect the validation recordings and transcripts, each list in sorted path order

dev_audio = glob.glob('/content/drive/MyDrive/avec/dev/*.wav')
dev_audio.sort()

dev_transcripts = glob.glob('/content/drive/MyDrive/avec/dev/*.csv')
dev_transcripts.sort()

In [9]:
def audio_feature_extraction(audio_file, transcript_file, chunk_ms=15000, n_mels=80):
  """
  Preprocesses and creates mel-spectrograms from audio data

  Every (Start_Time, End_Time) row of the transcript selects one span of the
  recording. The spans are concatenated, split into fixed-length slices and
  each slice is turned into a decibel-scaled mel-spectrogram.

  Parameters
  ----------
  audio_file : str
    Path of the WAV recording.
  transcript_file : str
    Path of a CSV file with 'Start_Time' and 'End_Time' columns. The values
    are multiplied by 1000 to obtain the millisecond offsets pydub slices by.
  chunk_ms : int, optional
    Slice length in milliseconds (default 15000).
  n_mels : int, optional
    Number of mel bands per spectrogram (default 80).

  Returns
  -------
  list of numpy.ndarray
    One 2-D array with n_mels rows per slice. The list is empty when the
    selected audio produces at most one slice, because the final slice is
    always discarded.
  """

  transcript = pd.read_csv(transcript_file)

  #Get timestamps for patient responses
  timestamps = list(
    transcript.loc[:, ['Start_Time', 'End_Time']].itertuples(index=False, name=None)
  )

  #Read audio data
  audio = AudioSegment.from_wav(audio_file)

  #Empty segment that inherits the recording's audio format
  newAudio = audio[0:0]

  #Patient audio segmentation: use every transcript row. The previous
  #range(len(timestamps)-1) loop silently skipped the last response.
  for start_time, end_time in timestamps:
    newAudio = newAudio + audio[start_time * 1000:end_time * 1000]

  #Split audio data into fixed-length audio slices
  audio_chunks = make_chunks(newAudio, chunk_ms)
  features = []

  #The final slice is dropped so a shorter trailing slice never yields a
  #spectrogram with a different number of frames
  for chunk in audio_chunks[:-1]:
    samples = np.array(chunk.get_array_of_samples(), dtype='float64')
    spectrogram = librosa.stft(samples)

    #Only the magnitude is needed; the phase is discarded
    spectrogram_magnitude, _ = librosa.magphase(spectrogram)
    mel_scale = librosa.feature.melspectrogram(S=spectrogram_magnitude,
                                               sr=chunk.frame_rate,
                                               n_mels=n_mels)

    #Decibels relative to the smallest mel value of this slice
    mel_spectrogram = librosa.amplitude_to_db(mel_scale, ref=np.min)

    features.append(mel_spectrogram)

  return features


In [11]:
#Per-participant feature arrays, filled by the extraction cells
train_features, dev_features = [], []

In [12]:
#Create training features

#Build the list from scratch instead of appending to a list created in another
#cell, so re-running this cell cannot duplicate participants and shift the
#pairing with the participant ids
train_features = [
  np.asarray(audio_feature_extraction(audio_file, transcript_file)).astype('float64')
  for audio_file, transcript_file in zip(train_audio, train_transcripts)
]

In [13]:
#Create validation features

#Build the list from scratch instead of appending to a list created in another
#cell, so re-running this cell cannot duplicate participants and shift the
#pairing with the participant ids
dev_features = [
  np.asarray(audio_feature_extraction(audio_file, transcript_file)).astype('float64')
  for audio_file, transcript_file in zip(dev_audio, dev_transcripts)
]

In [6]:
#Map participant ids to features and targets

#zip() stops at the shorter input without any warning, which would attach
#features to the wrong participant; fail loudly on a count mismatch instead
if len(train_ids) != len(train_features):
  raise ValueError('train: %d label rows but %d feature arrays'
                   % (len(train_ids), len(train_features)))
if len(dev_ids) != len(dev_features):
  raise ValueError('dev: %d label rows but %d feature arrays'
                   % (len(dev_ids), len(dev_features)))

#NOTE(review): the pairing is purely positional - it assumes the rows of each
#split CSV are in the same order as the sorted audio file names; confirm
train_feature_id_map = dict(zip(train_ids, train_features))
dev_feature_id_map = dict(zip(dev_ids, dev_features))

train_target_id_map = dict(zip(train_ids, train_targets))
dev_target_id_map = dict(zip(dev_ids, dev_targets))

train_binary_id_map = dict(zip(train_ids, train_binary))
dev_binary_id_map = dict(zip(dev_ids, dev_binary))

In [7]:
#Get features that match the required shape

#469 is the number of spectrogram frames per slice expected by the model input.
#A participant without any complete slice yields an empty 1-D array, for which
#shape[2] would raise IndexError, so the rank is checked first.
train_features_ = [feature for feature in train_features
                   if feature.ndim == 3 and feature.shape[2] == 469]
dev_features_ = [feature for feature in dev_features
                 if feature.ndim == 3 and feature.shape[2] == 469]

In [8]:
#Start with four independent, empty int64 Series for the per-slice labels

train_targets_, dev_targets_, train_binary_, dev_binary_ = (
  pd.Series(dtype='int64') for _ in range(4)
)

In [9]:
#Assign new training target variables to account for audio slices

#Series.append was removed in pandas 2.0 and copied the whole Series on every
#call, so one label run per participant is collected and concatenated once.
#Building the result from scratch also keeps this cell safe to re-run.
train_target_runs = []
train_binary_runs = []

for participant_id, feature in train_feature_id_map.items():
  #Same filter as the feature selection; empty feature arrays are 1-D
  if feature.ndim == 3 and feature.shape[2] == 469:
    n_slices = feature.shape[0]
    train_target_runs.append(pd.Series([train_target_id_map[participant_id]] * n_slices))
    train_binary_runs.append(pd.Series([train_binary_id_map[participant_id]] * n_slices))

train_targets_ = (pd.concat(train_target_runs, ignore_index=True)
                  if train_target_runs else pd.Series(dtype='int64'))
train_binary_ = (pd.concat(train_binary_runs, ignore_index=True)
                 if train_binary_runs else pd.Series(dtype='int64'))

In [10]:
#Assign new validation target variables to account for audio slices

#Series.append was removed in pandas 2.0 and copied the whole Series on every
#call, so one label run per participant is collected and concatenated once.
#Building the result from scratch also keeps this cell safe to re-run.
dev_target_runs = []
dev_binary_runs = []

for participant_id, feature in dev_feature_id_map.items():
  #Same filter as the feature selection; empty feature arrays are 1-D
  if feature.ndim == 3 and feature.shape[2] == 469:
    n_slices = feature.shape[0]
    dev_target_runs.append(pd.Series([dev_target_id_map[participant_id]] * n_slices))
    dev_binary_runs.append(pd.Series([dev_binary_id_map[participant_id]] * n_slices))

dev_targets_ = (pd.concat(dev_target_runs, ignore_index=True)
                if dev_target_runs else pd.Series(dtype='int64'))
dev_binary_ = (pd.concat(dev_binary_runs, ignore_index=True)
               if dev_binary_runs else pd.Series(dtype='int64'))

In [11]:
#Stack the per-participant arrays along the slice axis so each row is one audio slice
#NOTE: the names are rebound in place, so this cell must run exactly once per kernel session

train_features_ = np.concatenate(train_features_, axis=0)
dev_features_ = np.concatenate(dev_features_, axis=0)

In [None]:
#Training features and both label vectors should agree on the first dimension

print(train_features_.shape, train_targets_.shape, train_binary_.shape, sep='\n')

In [None]:
#Validation features and both label vectors should agree on the first dimension
print(dev_features_.shape, dev_targets_.shape, dev_binary_.shape, sep='\n')

In [14]:
def resample(features, targets, binary):
  """
  Oversamples the minority class to create a balanced dataset

  The regression target is appended to the flattened features as one extra
  column and SMOTE is run once on the joint matrix. Each synthetic sample
  therefore gets a target interpolated between the same two samples as its
  features. Previously features and targets went through two separate SMOTE
  passes, which pick neighbours in different spaces.

  Parameters
  ----------
  features : numpy.ndarray
    Array whose first axis indexes samples; the remaining axes are flattened
    for resampling and restored afterwards.
  targets : array-like
    One regression target per sample.
  binary : array-like
    One class label per sample; SMOTE balances these classes.

  Returns
  -------
  features_res : numpy.ndarray
    Resampled features with the per-sample shape of `features`.
  target_res : numpy.ndarray
    1-D array of resampled targets. Integer input targets are rounded back
    to their original integer dtype.
  """
  sm = SMOTE(random_state=42)

  features = np.asarray(features)
  targets_arr = np.asarray(targets)
  labels = np.asarray(binary).ravel()

  sample_shape = features.shape[1:]
  flat_features = features.reshape(features.shape[0], -1)
  target_column = targets_arr.astype('float64').reshape(-1, 1)

  #Resample features and target together so they stay aligned
  joint = np.hstack([flat_features, target_column])
  joint_res, _ = sm.fit_resample(joint, labels)

  features_res = joint_res[:, :-1].reshape((-1,) + sample_shape)
  target_res = joint_res[:, -1]

  #Keep integer scores integral so downstream code sees the same kind of values
  if np.issubdtype(targets_arr.dtype, np.integer):
    target_res = np.rint(target_res).astype(targets_arr.dtype)

  return features_res, target_res

In [15]:
#Balance the training set; the binary labels decide which class is oversampled

train_features_, train_targets_ = resample(features=train_features_,
                                           targets=train_targets_,
                                           binary=train_binary_)

In [None]:
#Resampled features and targets should agree on the first dimension

print(train_features_.shape, train_targets_.shape, sep='\n')

In [17]:
#Min-max normalization: scaler that maps each feature column onto [0, 1]

scaler = MinMaxScaler(feature_range=(0, 1))

In [18]:
#X_train aliases train_features_, so the normalization below happens in place
X_train = train_features_

#Fit once on the rows of every training spectrogram. Calling fit_transform
#inside the loop re-fitted the scaler for each sample, so the validation data
#and the saved scaler only reflected the last training sample.
scaler.fit(X_train.reshape(-1, X_train.shape[-1]))

for sample in range(X_train.shape[0]):
  X_train[sample] = scaler.transform(X_train[sample])

In [19]:
#X_test aliases dev_features_; each spectrogram is rescaled in place with the
#scaler fitted by the training cell (no re-fitting on validation data)
X_test = dev_features_

for position, spectrogram in enumerate(X_test):
  X_test[position] = scaler.transform(spectrogram)

In [20]:
#Save the scaler so the same normalization can be reused later

joblib.dump(value=scaler, filename='scaler.pkl')

In [21]:
#Regression targets: resampled training scores and per-slice validation scores
y_train, y_test = train_targets_, dev_targets_

In [22]:
#Per-sample weights, inversely proportional to how often each score value occurs

train_weights = compute_sample_weight(class_weight='balanced', y=train_targets_)
test_weights = compute_sample_weight(class_weight='balanced', y=dev_targets_)

In [24]:
def create_model(input_shape=(80,469,1)):
  """
  Returns a compiled convolution neural network for a regression task

  The network uses 2-D convolution layers with 1x7 kernels, i.e. each
  convolution only spans the second input axis, followed by two dense layers
  and a single linear output unit.

  Parameters
  ----------
  input_shape : tuple of int, optional
    Shape of one input sample, defaults to (80, 469, 1).

  Returns
  -------
  tensorflow.keras.Model
    Model compiled with the Adam optimizer, Huber loss and the MAE metric.
  """
  inputs = Input(shape=input_shape)

  x = Conv2D(32, (1,7))(inputs)
  x = Activation('relu')(x)
  x = MaxPooling2D(pool_size=(4,3), strides=(1,3))(x)
  x = Conv2D(32, (1,7), strides=2)(x)
  x = Activation('relu')(x)
  x = MaxPooling2D(pool_size=(1,3), strides=(1,3))(x)
  x = Flatten()(x)
  x = Dense(128)(x)
  x = Activation('relu')(x)
  x = Dense(128)(x)
  x = Activation('relu')(x)
  x = Dropout(0.5)(x)

  outputs = Dense(1, activation='linear')(x)

  model = Model(inputs=inputs, outputs=outputs)

  #The optimizer's `decay` argument is rejected by the newer Keras optimizers.
  #This schedule reproduces it exactly: lr = 0.0001 / (1 + 0.0001 * step)
  learning_rate = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.0001,
    decay_steps=1,
    decay_rate=0.0001)

  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                loss='huber',
                metrics=['mae'])

  return model

In [36]:
#Create model

#Builds and compiles a fresh network with create_model() defined above
model = create_model()

In [37]:
#Early stopping against overfitting: stop once val_mae has not improved for
#10 epochs and roll the model back to its best weights

callback = tf.keras.callbacks.EarlyStopping(
  monitor='val_mae',
  mode='min',
  patience=10,
  restore_best_weights=True,
)

In [None]:
#Fit the network; training and validation samples are both weighted

model_history = model.fit(
  X_train,
  y_train,
  sample_weight=train_weights,
  validation_data=(X_test, y_test, test_weights),
  epochs=30,
  batch_size=4,
  callbacks=[callback],
)

In [29]:
def plot_metric(name, title, history=None):
  '''
  Plots a training metric together with its validation counterpart

  Parameters
  ----------
  name : str
    Metric key in the history; 'val_' + name must be present as well.
  title : str
    Title of the figure.
  history : optional
    Either the object returned by model.fit (its `.history` dict is used) or
    a plain dict of metric lists. Defaults to the notebook-level
    `model_history`, which keeps existing two-argument calls working.
  '''
  if history is None:
    history = model_history

  #Accept both a History-like object and a bare dict of metric lists
  records = getattr(history, 'history', history)

  plt.plot(records[name], color='blue', label=name)
  plt.plot(records['val_'+name], color='green', label='val_'+name)
  plt.xlabel('epochs')
  plt.ylabel(name)
  plt.title(title)
  plt.legend()
  plt.show()

In [None]:
#Draw the learning curves, one figure per tracked metric

metric_plots = [
  ('loss', 'Training loss vs. Validation loss'),
  ('mae', 'Training mae vs. Validation mae'),
]

for metric_name, plot_title in metric_plots:
  plot_metric(metric_name, plot_title)

In [68]:
#model.save('model.h5')