##AudioVisualMerge
####Combine the predictions from audio2emotion and visual2emotion modules and generate one single output vector for music generation.

###import libraries

In [None]:
from keras.models import load_model
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

# modules for visual data parsing
import cv2
import imutils
import dlib
from imutils import face_utils

# modules for audio data parsing
import librosa
import subprocess
import librosa.display
import IPython.display as ipd
from IPython.core.display import display
import pandas as pd
from sklearn.preprocessing import StandardScaler
from joblib import load

###Predict emotion using audio and visual models separately.

####Load video

#####**Demo video from Divit**

In [None]:
!gdown --id 1NFqhQls1CQhN5jHlYiikB86AJT-mY9YY
demo_path = "/content/demo.mp4"

Downloading...
From: https://drive.google.com/uc?id=1NFqhQls1CQhN5jHlYiikB86AJT-mY9YY
To: /content/demo.mp4
100% 23.5M/23.5M [00:00<00:00, 80.0MB/s]


#####Angry demo video

In [None]:
!gdown --id 1a7Ggt0vYCvG4LyUT1FccxRUhd7xcdDwt
angry_demo_path = "/content/angry.mov"

Downloading...
From: https://drive.google.com/uc?id=1a7Ggt0vYCvG4LyUT1FccxRUhd7xcdDwt
To: /content/angry.mov
100% 3.29M/3.29M [00:00<00:00, 27.5MB/s]


#####Calm demo video

In [None]:
!gdown --id 1Cb8rA59g9RvUZFZ8ho5LBc0QmOP9aXob
calm_demo_path = "/content/calm_demo.mov"

Downloading...
From: https://drive.google.com/uc?id=1Cb8rA59g9RvUZFZ8ho5LBc0QmOP9aXob
To: /content/calm_demo.mov
100% 17.4M/17.4M [00:00<00:00, 36.9MB/s]


#####Disgust demo video

In [None]:
!gdown --id 1TFy7uvQyPIprvrHTJqC9hl-kBzn7Nw7h
disgust_demo_path = "/content/disgust.mov"

Downloading...
From: https://drive.google.com/uc?id=1TFy7uvQyPIprvrHTJqC9hl-kBzn7Nw7h
To: /content/disgust.mov
100% 24.3M/24.3M [00:00<00:00, 155MB/s]


#####Happy demo video


In [None]:
!gdown --id 1fmEy4bPjUvXIM9ZMCMUEgpbXWciICddD
# Absolute path: gdown saves the file to /content/happy.mov (see the download log below).
happy_demo_path = "/content/happy.mov"

Downloading...
From: https://drive.google.com/uc?id=1fmEy4bPjUvXIM9ZMCMUEgpbXWciICddD
To: /content/happy.mov
100% 2.65M/2.65M [00:00<00:00, 238MB/s]


In [None]:
!gdown --id 1MU3nBa3nLHK1qiY7jCRG29P5BC6jGELx
# Absolute path: gdown saves the file to /content/happy.mov (see the download log below).
# NOTE(review): this download uses the same file name as the first happy demo,
# so it overwrites that file and both variables refer to this second video.
happy_demo_path_2 = "/content/happy.mov"

Downloading...
From: https://drive.google.com/uc?id=1MU3nBa3nLHK1qiY7jCRG29P5BC6jGELx
To: /content/happy.mov
100% 5.11M/5.11M [00:00<00:00, 54.0MB/s]


####**audio2emotion**

In [None]:
def get_audio_path(path):
  """Extract the audio track of a video into a 16 kHz stereo WAV file.

  Args:
    path: path of the input video file.

  Returns:
    The path of the generated WAV file ("/content/demo.wav").
  """
  audio_path = "/content/demo.wav"
  # Pass the arguments as a list (no shell) so paths containing spaces or shell
  # metacharacters are handled safely.
  # "-y" overwrites an existing demo.wav; without it ffmpeg refuses to
  # overwrite and a previously extracted track would be silently reused.
  # The output is written to the same absolute path that is returned, so the
  # result does not depend on the current working directory.
  command = [
      "ffmpeg", "-y", "-i", path,
      "-ab", "160k", "-ac", "2", "-ar", "16000", "-vn",
      audio_path,
  ]
  return_code = subprocess.call(command)
  if return_code != 0:
    print(f"ffmpeg exited with code {return_code} while converting {path}")
  return audio_path

In [None]:
a = get_audio_path(demo_path)

convert video (.mov) file to audio (.wav)

#####Audio Preprocessing
Audio clips have different lengths, and applying the FFT to a whole clip produces distortions, so we cut a long audio file into several short segments; in other words, we divide the signal into frames. Consecutive frames overlap by 50% because we lose information at the edges of each frame after applying a window function. Each frame is about 3 seconds long, so with the 50% overlap an audio prediction is produced roughly every 1.5 seconds.

In [None]:
def extract_features(data, sample_rate=16000):
    """Build the 1-D feature vector for one audio frame.

    The vector is the concatenation, in this order, of the time-averaged
    13 MFCCs, chroma STFT, RMS energy and spectral centroid.

    Args:
        data: 1-D array of audio samples.
        sample_rate: sampling rate of `data` in Hz (defaults to the 16 kHz
            used throughout this notebook).

    Returns:
        A 1-D float array of features.
    """
    # 13 MFCCs, averaged over time
    mfcc = np.mean(
        librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=13).T, axis=0)

    # Chroma computed from the magnitude spectrogram, averaged over time
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(
        librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # Root mean square energy, averaged over time
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)

    # Mean spectral centroid (a scalar)
    spec = np.mean(
        librosa.feature.spectral_centroid(
            y=data, sr=sample_rate, n_fft=1024, hop_length=512)[0])

    # Starting from an empty float array keeps the result dtype identical to
    # stacking the pieces one at a time.
    return np.hstack((np.array([]), mfcc, chroma_stft, rms, spec))

In [None]:
def noise(data):
    """Return `data` with Gaussian noise added.

    The noise amplitude is a random fraction (at most 3.5%) of the largest
    sample value in `data`.
    """
    amplitude = 0.035 * np.random.uniform() * np.amax(data)
    jitter = np.random.normal(size=data.shape[0])
    return data + amplitude * jitter

def stretch(data, rate=0.8):
    """Time-stretch `data` by `rate` (values below 1 slow the audio down)."""
    # `rate` is keyword-only in recent librosa releases; the keyword form also
    # works with older versions.
    return librosa.effects.time_stretch(data, rate=rate)

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Shift the pitch of `data` by `pitch_factor` steps at `sampling_rate` Hz."""
    # `sr` and `n_steps` are keyword-only in recent librosa releases; the
    # keyword form also works with older versions.
    return librosa.effects.pitch_shift(
        data, sr=sampling_rate, n_steps=pitch_factor)

In [None]:
'''def slice_frame(file, sample_rate = 16000):
  data_per_frame = []

  signal, sr = librosa.load(file, sr = sample_rate)
  duration = librosa.get_duration(y = signal, sr = sample_rate)

  #slice the audio into every 1 seconds
  num_frames = int(duration)
  #print("num_frames:", num_frames)
  window_hop = sample_rate // 2

  for n in range(num_frames * 2 - 1):
    #print("n:", n)
    start = window_hop * n
    finish = start + sample_rate
    sliced_data = signal[start:finish]
    #print("sliced_data:", sliced_data)

    data_per_frame.append(sliced_data)
    #print("length of data_per_frame:", len(data_per_frame))
    #print("data_per_frame:", data_per_frame)

  return data_per_frame

    #calc_features_per_frame = extract_features(sliced_data)
    #result = np.array(calc_features_per_frame)
    #result = np.append(result)
    #print(result)
    #print("/n")

    #result_per_frame.append(result)
    #print(result_per_frame)
    #print("/s")
'''

'def slice_frame(file, sample_rate = 16000):\n  data_per_frame = []\n\n  signal, sr = librosa.load(file, sr = sample_rate)\n  duration = librosa.get_duration(y = signal, sr = sample_rate)\n\n  #slice the audio into every 1 seconds\n  num_frames = int(duration)\n  #print("num_frames:", num_frames)\n  window_hop = sample_rate // 2\n\n  for n in range(num_frames * 2 - 1):\n    #print("n:", n)\n    start = window_hop * n\n    finish = start + sample_rate\n    sliced_data = signal[start:finish]\n    #print("sliced_data:", sliced_data)\n    \n    data_per_frame.append(sliced_data)\n    #print("length of data_per_frame:", len(data_per_frame))\n    #print("data_per_frame:", data_per_frame)\n  \n  return data_per_frame\n\n    #calc_features_per_frame = extract_features(sliced_data)\n    #result = np.array(calc_features_per_frame)\n    #result = np.append(result)\n    #print(result)\n    #print("/n")\n    \n    #result_per_frame.append(result)\n    #print(result_per_frame)\n    #print("/s")\n'

In [None]:
def slice_frame(file, sample_rate = 16000):
  """Cut an audio file into ~3-second frames with 50% overlap.

  Args:
    file: path of the audio file to load.
    sample_rate: rate (Hz) the audio is resampled to when loading.

  Returns:
    A list of 1-D sample arrays, one per frame. Audio shorter than 3 seconds
    yields a single frame containing the whole signal; empty audio yields an
    empty list.
  """
  signal, _ = librosa.load(file, sr = sample_rate)
  if len(signal) == 0:
    return []

  duration = librosa.get_duration(y = signal, sr = sample_rate)
  samples_per_file = sample_rate * duration

  # At least one frame, so clips shorter than 3 s no longer divide by zero.
  num_frames = max(1, int(duration / 3))
  sample_per_frame = int(samples_per_file / num_frames)
  window_hop = sample_per_frame // 2  # 50% overlap between neighbours

  # num_frames full-length frames plus the half-shifted ones in between.
  return [
      signal[window_hop * n : window_hop * n + sample_per_frame]
      for n in range(num_frames * 2 - 1)
  ]


In [None]:
#load scaler from the audio2emotion source code
!gdown --id 1fKfZSgw7bcm7v_4teVYeWgOoPr0z60-i

Downloading...
From: https://drive.google.com/uc?id=1fKfZSgw7bcm7v_4teVYeWgOoPr0z60-i
To: /content/scaler.pkl
100% 1.25k/1.25k [00:00<00:00, 1.94MB/s]


In [None]:
# NOTE(review): joblib.load unpickles arbitrary objects -- only load scaler
# files that come from a trusted source.
# The context manager closes the file handle once the scaler is loaded.
with open('scaler.pkl', 'rb') as scaler_file:
  scaler = load(scaler_file)

In [None]:
def frame_features(file):
  """Compute scaled feature matrices for every frame of an audio file.

  For each frame produced by `slice_frame`, three feature vectors are
  extracted -- from the original samples, from a noisy copy, and from a
  time-stretched and pitch-shifted copy -- stacked row-wise in that order and
  normalised with the pre-fitted `scaler`.

  Args:
    file: path of the audio file.

  Returns:
    A list with one scaled 2-D array (3 rows) per frame.
  """
  calc_features = []
  # `frame` instead of `list`, so the builtin is no longer shadowed.
  for frame in slice_frame(file):
    # original data
    original = extract_features(frame)

    # data with noise
    noisy = extract_features(noise(frame))

    # data with stretching and pitching
    stretched_pitched = extract_features(pitch(stretch(frame), 16000))

    # stack the three variants and apply the scaler
    stacked = np.vstack((original, noisy, stretched_pitched))
    calc_features.append(scaler.transform(stacked))

  return calc_features

In [None]:
feature_lists = frame_features(a)

#####Use pretrained model to predict emotions
Note: {0: 'angry',
 1: 'calm',
 2: 'disgust',
 3: 'fear',
 4: 'happy',
 5: 'neutral',
 6: 'sad',
 7: 'surprise'}

In [None]:
!gdown --id 1tZwJn00OjoT3painOq9t9BvsbDGD5a0n

Downloading...
From: https://drive.google.com/uc?id=1tZwJn00OjoT3painOq9t9BvsbDGD5a0n
To: /content/audio_model.h5
100% 6.53M/6.53M [00:00<00:00, 82.8MB/s]


In [None]:
audio_model = load_model('/content/audio_model.h5')
audio_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 27, 256)           1536      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 14, 256)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 14, 256)           327936    
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 7, 256)           0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1D)           (None, 7, 128)            163968    
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 4, 128)           0

In [None]:
def audio_predict(path):
  """Predict an emotion for every audio frame of a WAV file.

  Args:
    path: path of the audio file.

  Returns:
    A tuple `(vectors, labels)`: the per-frame prediction vectors and the
    matching emotion names. The first and last predictions are duplicated so
    the number of entries lines up with the visual predictions. Both lists are
    empty when the audio yields no frames.
  """
  audio_prediction_vectors = []
  audio_prediction_labels = []

  # NOTE: `predictions_dict` is defined in a later cell (visual2emotion
  # section); that cell must have been run before calling this function.
  for features in frame_features(path):
    # reshape the features to fit the model
    expand_features = np.expand_dims(features, axis = 2)

    # feed the features into the model for prediction
    frame_predict = audio_model.predict(expand_features)

    # Row 0 holds the prediction for the original (un-augmented) samples.
    frame_predict_output = frame_predict[0]
    frame_predict_label = predictions_dict[np.argmax(frame_predict_output)]

    audio_prediction_vectors.append(frame_predict_output)
    audio_prediction_labels.append(frame_predict_label)

  # Nothing to pad when no frame was produced (avoids an IndexError below).
  if not audio_prediction_vectors:
    return audio_prediction_vectors, audio_prediction_labels

  # Duplicate the first and last prediction so length match
  audio_prediction_vectors.insert(0, audio_prediction_vectors[0])
  audio_prediction_vectors.append(audio_prediction_vectors[-1])

  audio_prediction_labels.insert(0, audio_prediction_labels[0])
  audio_prediction_labels.append(audio_prediction_labels[-1])

  return audio_prediction_vectors, audio_prediction_labels

In [None]:
audio_predict(a)



([array([1.3140957e-09, 5.7198145e-06, 2.0345067e-10, 1.3066876e-09,
         1.4270027e-10, 1.4440648e-08, 9.9999428e-01, 3.2274528e-16],
        dtype=float32),
  array([1.3140957e-09, 5.7198145e-06, 2.0345067e-10, 1.3066876e-09,
         1.4270027e-10, 1.4440648e-08, 9.9999428e-01, 3.2274528e-16],
        dtype=float32),
  array([2.4631059e-09, 9.9684203e-01, 1.9211394e-07, 4.0048440e-10,
         1.8781188e-09, 2.9459214e-04, 2.8631915e-03, 1.6728092e-12],
        dtype=float32),
  array([1.5063922e-11, 9.9946028e-01, 1.7257202e-08, 3.3309117e-13,
         2.6988366e-11, 2.6456502e-05, 5.1321933e-04, 6.3468518e-16],
        dtype=float32),
  array([1.8731585e-07, 3.6519300e-06, 9.9994409e-01, 4.1358261e-14,
         3.6641993e-08, 1.0048250e-06, 5.0961506e-05, 7.7407915e-12],
        dtype=float32),
  array([3.6091135e-09, 9.9152970e-01, 1.5805977e-03, 1.4986117e-12,
         1.3454746e-09, 4.2967231e-04, 6.4600967e-03, 2.5820386e-13],
        dtype=float32),
  array([2.1928912e-05

####**visual2emotion**
*Note*: The visual2emotion model detects emotions based on key frame. Essentially, we can make a decision whenever we want.

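To coordinate better with the audio data, we make a decision every 45 video frames (about 1.5 seconds, assuming a 30 fps video), which matches the hop between consecutive 3-second audio frames.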


#####Visual Preprocessing
Our visual model takes landmarks features as input, so we need to parse the video, extract the key frames and corresponding landmarks, and then feed into the pre-trained model.

In [None]:
# download dlib pretrained library
!gdown --id 1XqF2ec7KdVrrxrnahWeKCJrHOR_ucX2o

Downloading...
From: https://drive.google.com/uc?id=1XqF2ec7KdVrrxrnahWeKCJrHOR_ucX2o
To: /content/shape_predictor_68_face_landmarks.dat
100% 99.7M/99.7M [00:01<00:00, 52.0MB/s]


In [None]:
# Helper functions for video parsing

#dlib model setup
# initialize dlib's face detector (HOG-based) and create facial landmark predictor
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor("/content/shape_predictor_68_face_landmarks.dat")

# dlib helper functions
# take a bounding predicted by dlib and convert it to the format (x, y, w, h) as
# we would normally do with OpenCV
def rect_to_bb(rect):
  """Convert a rectangle exposing left/top/right/bottom() into (x, y, w, h)."""
  left, top = rect.left(), rect.top()
  # width and height are measured from the top-left corner
  return (left, top, rect.right() - left, rect.bottom() - top)

# initialize the list of (x, y)-coordinates
def shape_to_np(shape, dtype="int"):
  """Copy the 68 landmark points of `shape` into a (68, 2) array of (x, y)."""
  landmark_total = 68
  coords = np.zeros((landmark_total, 2), dtype=dtype)
  # fill one row per landmark: column 0 is x, column 1 is y
  for idx in range(landmark_total):
    coords[idx, 0] = shape.part(idx).x
    coords[idx, 1] = shape.part(idx).y
  return coords

In [None]:
# Helper function for data parsing
# Use dlib to get landmark
# Store the latest landmarks roughly every 45 frames as the output array
# return [landmarks1, landmarks2, landmarks3...]
def get_landmark(path):
  """Sample facial landmarks from a video at regular frame intervals.

  Every frame is read and run through the dlib face detector; the landmarks of
  the most recently detected face are remembered. The first two entries of the
  result are taken from the first detections after frame 45 and after frame
  90; from then on the latest landmarks are appended each time the 45-frame
  counter wraps around.

  Args:
    path: path of the video file.

  Returns:
    A list of landmark arrays, each being `shape[17:]` of the detected
    68-point shape (i.e. 51 points).
  """
  print(f"Parsing file <path: {path}> ")
  cap = cv2.VideoCapture(path)
  total_count = 0
  capture_count = 0

  result = [] # a list of landmarks
  latest_capture = None # latest landmarks captured
  first_capture_flag = False # flag to mark whether we've got the first capture
  second_capture_flag = False # flag to mark whether we've got the second capture
  # Read every frame; landmarks are stored once per 45-frame interval
  # (1.5 s if the video runs at 30 fps -- assumed, TODO confirm)
  while True:
    # Both counters are incremented before the read, so total_count also
    # counts the final failed read.
    total_count += 1
    capture_count += 1
    # use cap to read key frame
    ret, image = cap.read()
    print(f"Trying to capture frame {total_count}...")

    if ret is not True:
      # End of the video (or a read failure): report and stop.
      print(f"Total frame count: {total_count}")
      print(f"{len(result)} frame of landmarks captured")
      break

    else:
      # Append latest capture to result every 45 frames, but only once the
      # first two captures below have been stored
      if capture_count >= 45:
        if first_capture_flag and second_capture_flag:
          result.append(latest_capture)
        capture_count = 0 # reset capture_count

      # resize the input image, and convert it to grayscale
      image = imutils.resize(image, width=500)
      gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

      # detect faces in the grayscale image
      rects = detector(gray, 1)
      try:
        # rects[0] raises IndexError when no face was detected in this frame
        rect = rects[0] # only one face
        shape = predictor(gray, rect)
        shape = face_utils.shape_to_np(shape)

        # convert dlib's rectangle to a OpenCV-style bounding box
        # [i.e., (x, y, w, h)], then draw the face bounding box
        (x, y, w, h) = face_utils.rect_to_bb(rect)
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

        print("Captured!")
        # Drop the first 17 points (presumably the jawline -- TODO confirm)
        latest_capture = shape[17:] # updated latest capture

        # First stored capture: first detection after frame 45
        if total_count > 45 and not first_capture_flag:
          result.append(latest_capture)
          first_capture_flag = True

        # Second stored capture: first detection after frame 90
        # (may be the same frame as the first one)
        if total_count > 90 and not second_capture_flag:
          result.append(latest_capture)
          second_capture_flag = True

      except IndexError:
        # No face in this frame: keep the previous landmarks and move on.
        continue


  # Release all space and windows once done
  cap.release()
  cv2.destroyAllWindows()

  print(f"Total frame count: {total_count}")
  print(f"{len(result)} frame of landmarks captured")
  return result

In [None]:
# Helper functions for feature extraction
# Calculate distance between two landmarks
def calc_distance(landmark1,landmark2):
  """Return the Euclidean distance between two (x, y) points."""
  x1, y1 = landmark1
  x2, y2 = landmark2
  dx = x1 - x2
  dy = y1 - y2
  return (dx**2 + dy**2)**(0.5)

# Turn a set of landmarks into a flat vector of pairwise distances
def featurize(landmarks):
  """Return a 1-D array with the distance of every landmark pair (i < j)."""
  distances = [
      calc_distance(first, second)
      for position, first in enumerate(landmarks)
      for second in landmarks[position + 1:]
  ]
  return np.array(distances)

In [None]:
def parse_visual(path):
  """Return one pairwise-distance feature vector per sampled landmark set.

  Args:
    path: path of the video file.

  Returns:
    A list of 1-D feature arrays, in the order the landmarks were captured.
  """
  # A comprehension avoids the local named `input`, which shadowed the builtin.
  return [featurize(landmarks) for landmarks in get_landmark(path)]

#####Use pretrained model to predict emotions
Note: Output indices 0-7 correspond to `['angry' 'calm' 'disgust' 'fear' 'happy' 'neutral' 'sad' 'surprise']`.

In [None]:
predictions_dict = ['angry', 'calm', 'disgust', 'fear', 'happy', 'neutral', 'sad','surprise']

In [None]:
# load pretrained model from google drive
!gdown --id 1-GFtUsUYzXHXTWluajl7PpEwDj6yfuxx

Downloading...
From: https://drive.google.com/uc?id=1-GFtUsUYzXHXTWluajl7PpEwDj6yfuxx
To: /content/visual_model_full.h5
100% 12.8M/12.8M [00:00<00:00, 83.8MB/s]


In [None]:
visual_model = load_model("/content/visual_model_full.h5")
visual_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization_4 (Batc  (None, 1275)             5100      
 hNormalization)                                                 
                                                                 
 dropout_4 (Dropout)         (None, 1275)              0         
                                                                 
 dense_4 (Dense)             (None, 512)               653312    
                                                                 
 batch_normalization_5 (Batc  (None, 512)              2048      
 hNormalization)                                                 
                                                                 
 dropout_5 (Dropout)         (None, 512)               0         
                                                                 
 dense_5 (Dense)             (None, 512)              

In [None]:
def visual_predict(path):
  """Predict an emotion for every sampled key frame of a video.

  Args:
    path: path of the video file.

  Returns:
    A tuple `(vectors, labels)`: the raw model outputs (one array with a
    leading batch axis of size 1 per key frame) and the matching emotion
    names.
  """
  visual_prediction_vectors = []
  labels = []
  for frame_feature in parse_visual(path):
    # add the batch axis expected by the model
    frame_feature_batch = np.expand_dims(frame_feature, axis = 0)

    # use pre-trained model to predict emotion and store prediction matrix
    predictions = visual_model.predict(frame_feature_batch)
    visual_prediction_vectors.append(predictions)

    # the most probable class gives the emotion label
    labels.append(predictions_dict[np.argmax(predictions)])

  return visual_prediction_vectors, labels

In [None]:
visual_predict(demo_path)

Parsing file <path: /content/demo.mp4> 
Trying to capture frame 1...
Trying to capture frame 2...
Trying to capture frame 3...
Trying to capture frame 4...
Trying to capture frame 5...
Trying to capture frame 6...
Trying to capture frame 7...
Trying to capture frame 8...
Trying to capture frame 9...
Trying to capture frame 10...
Trying to capture frame 11...
Trying to capture frame 12...
Trying to capture frame 13...
Trying to capture frame 14...
Trying to capture frame 15...
Trying to capture frame 16...
Trying to capture frame 17...
Trying to capture frame 18...
Trying to capture frame 19...
Trying to capture frame 20...
Trying to capture frame 21...
Trying to capture frame 22...
Trying to capture frame 23...
Trying to capture frame 24...
Trying to capture frame 25...
Trying to capture frame 26...
Trying to capture frame 27...
Trying to capture frame 28...
Trying to capture frame 29...
Trying to capture frame 30...
Trying to capture frame 31...
Trying to capture frame 32...
Trying to

([array([[4.48430093e-09, 1.01707956e-22, 1.08314504e-17, 2.36827484e-03,
          3.69704470e-17, 7.79899319e-23, 5.22663268e-17, 9.97631669e-01]],
        dtype=float32),
  array([[4.48430093e-09, 1.01707956e-22, 1.08314504e-17, 2.36827484e-03,
          3.69704470e-17, 7.79899319e-23, 5.22663268e-17, 9.97631669e-01]],
        dtype=float32),
  array([[7.9598508e-21, 1.9515305e-29, 2.3106909e-24, 6.6140702e-13,
          3.6523025e-28, 0.0000000e+00, 7.7323056e-36, 1.0000000e+00]],
        dtype=float32),
  array([[4.8657107e-06, 2.3555293e-22, 2.6039717e-16, 2.4188936e-03,
          6.9179973e-17, 5.9586970e-20, 5.5619154e-10, 9.9757630e-01]],
        dtype=float32),
  array([[3.3228054e-15, 1.3063956e-32, 2.1026787e-33, 2.4097884e-16,
          3.5941585e-27, 8.4379505e-31, 8.9746447e-25, 1.0000000e+00]],
        dtype=float32),
  array([[1.1883124e-06, 1.8832536e-20, 3.8105299e-16, 7.1000628e-05,
          2.0429261e-21, 3.4668767e-26, 1.5389230e-17, 9.9992776e-01]],
        dtyp

###Combine the outputs from two models

`predictions_dict = ['angry', 'calm', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']`

In [None]:
audio_prediction_vectors, audio_prediction_labels = audio_predict("/content/demo.wav")
visual_prediction_vectors, visual_prediction_labels = visual_predict(demo_path)

Parsing file <path: /content/demo.mp4> 
Trying to capture frame 1...
Trying to capture frame 2...
Trying to capture frame 3...
Trying to capture frame 4...
Trying to capture frame 5...
Trying to capture frame 6...
Trying to capture frame 7...
Trying to capture frame 8...
Trying to capture frame 9...
Trying to capture frame 10...
Trying to capture frame 11...
Trying to capture frame 12...
Trying to capture frame 13...
Trying to capture frame 14...
Trying to capture frame 15...
Trying to capture frame 16...
Trying to capture frame 17...
Trying to capture frame 18...
Trying to capture frame 19...
Trying to capture frame 20...
Trying to capture frame 21...
Trying to capture frame 22...
Trying to capture frame 23...
Trying to capture frame 24...
Trying to capture frame 25...
Trying to capture frame 26...
Trying to capture frame 27...
Trying to capture frame 28...
Trying to capture frame 29...
Trying to capture frame 30...
Trying to capture frame 31...
Trying to capture frame 32...
Trying to

In [None]:
def predict(path, visual_weight=3):
  """Predict emotions for a video from its audio, its frames, and both combined.

  Args:
    path: path of the video file.
    visual_weight: factor applied to the visual prediction vector before it
      is added to the audio prediction vector.

  Returns:
    A tuple `(audio_labels, visual_labels, combined_labels)` of per-interval
    emotion names.
  """
  # Compute the predictions for *this* video. Previously these calls were
  # commented out, so the function silently reused whatever globals an
  # earlier cell had produced, regardless of `path`.
  audio_path = get_audio_path(path)
  audio_prediction_vectors, audio_prediction_labels = audio_predict(audio_path)
  visual_prediction_vectors, visual_prediction_labels = visual_predict(path)

  print(f"audio_prediction {audio_prediction_labels}")
  print(f"visual_prediction {visual_prediction_labels}")
  print(f"Length check: audio {len(audio_prediction_labels)}, visual: {len(visual_prediction_labels)}")

  combined_labels = []
  # zip stops at the shorter sequence, so a length mismatch between the two
  # modalities no longer raises an IndexError.
  for audio_vector, visual_vector in zip(audio_prediction_vectors,
                                         visual_prediction_vectors):
    # visual vectors carry a leading batch axis of size 1
    combined_vector = (
        np.asarray(audio_vector) + visual_weight * np.asarray(visual_vector[0]))

    max_index = int(np.argmax(combined_vector))
    combined_labels.append(predictions_dict[max_index])

  print(f"Combined labels: {combined_labels}")
  return audio_prediction_labels,visual_prediction_labels, combined_labels

In [None]:
audio_prediction_labels,visual_prediction_labels, combined_labels = predict(demo_path)

audio_prediction ['sad', 'sad', 'calm', 'calm', 'disgust', 'calm', 'disgust', 'calm', 'calm', 'sad', 'disgust', 'disgust', 'calm', 'disgust', 'angry', 'sad', 'calm', 'disgust', 'calm', 'disgust', 'disgust', 'disgust', 'sad', 'calm', 'calm', 'calm', 'disgust', 'calm', 'calm', 'calm', 'happy', 'happy', 'happy']
visual_prediction ['surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'disgust', 'disgust', 'disgust', 'fear', 'surprise', 'surprise', 'surprise', 'surprise', 'sad', 'happy', 'happy', 'surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'sad', 'surprise', 'surprise', 'surprise', 'surprise']
Length check: audio 33, visual: 33
Combined labels: ['surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'surprise', 'disgust', 'disgust', 'disgust', 'fear', 'surprise', 'surprise', 'surprise', 'surprise', 'sad', 'happy', 'happy', 'surprise', 'surprise'

###Display emotion prediction result
Using our own demo video, we can display the prediction from audio and visual predictions, as well as the combined prediction, along with the demo video using OpenCV.

Note: we already have `audio_prediction_labels`, `visual_prediction_labels`, `combined_labels` computed from previous parts.

In [None]:
from google.colab.patches import cv2_imshow

# Replay the demo video with the audio / visual / combined predictions drawn
# on every frame.
cap = cv2.VideoCapture(demo_path)

# describe the type of font to be used
font = cv2.FONT_HERSHEY_SIMPLEX

# Highest interval index available in all three label lists; frames beyond it
# reuse the last prediction instead of raising an IndexError.
last_interval = min(len(visual_prediction_labels),
                    len(audio_prediction_labels),
                    len(combined_labels)) - 1

frame_count = 0
while True:
    frame_count += 1
    # Capture frames in the video
    ret, frame = cap.read()

    # Stop at the end of the video (frame is None then) or when there is
    # nothing to display.
    if not ret or last_interval < 0:
        break

    # one prediction per 45 frames
    interval_count = min(frame_count // 45, last_interval)
    overlay_lines = [
        "Visual: " + visual_prediction_labels[interval_count],
        "Audio: " + audio_prediction_labels[interval_count],
        "Combined " + combined_labels[interval_count],
    ]

    # putText does not render line breaks, so each line is drawn separately,
    # 40 pixels apart.
    for line_number, line in enumerate(overlay_lines):
        cv2.putText(frame,
                    line,
                    (50, 50 + 40 * line_number),
                    font, 1,
                    (0, 255, 255),
                    2,
                    cv2.LINE_4)

    # Display the resulting frame (cv2_imshow takes only the image)
    cv2_imshow(frame)

    # creating 'q' as the quit
    # button for the video
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# release the cap object
cap.release()
# close all windows
cv2.destroyAllWindows()

###Music generation

####Import libraries and helpers

In [None]:
from google.colab import drive
import os
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!apt-get update -qq && apt-get install -qq libfluidsynth1 fluid-soundfont-gm build-essential libasound2-dev libjack-dev

Selecting previously unselected package fluid-soundfont-gm.
(Reading database ... 124013 files and directories currently installed.)
Preparing to unpack .../fluid-soundfont-gm_3.1-5.1_all.deb ...
Unpacking fluid-soundfont-gm (3.1-5.1) ...
Selecting previously unselected package libfluidsynth1:amd64.
Preparing to unpack .../libfluidsynth1_1.1.9-1_amd64.deb ...
Unpacking libfluidsynth1:amd64 (1.1.9-1) ...
Setting up fluid-soundfont-gm (3.1-5.1) ...
Setting up libfluidsynth1:amd64 (1.1.9-1) ...
Processing triggers for libc-bin (2.27-3ubuntu1.6) ...


In [None]:
!pip install -qU pyfluidsynth pretty_midi

[K     |████████████████████████████████| 5.6 MB 13.7 MB/s 
[K     |████████████████████████████████| 51 kB 7.7 MB/s 
[?25h  Building wheel for pretty-midi (setup.py) ... [?25l[?25hdone


In [None]:
import collections
import datetime
import fluidsynth
from fluidsynth import *
import glob
import numpy as np
import pathlib
import pandas as pd
import pretty_midi
import seaborn as sns
import tensorflow as tf
from google.colab import files
import random
from IPython import display
from matplotlib import pyplot as plt
from typing import Dict, List, Optional, Sequence, Tuple

In [None]:
def midi_to_notes(midi_file: str):
  """Read the first instrument of a MIDI file into a note table.

  Returns a tuple `(notes, instrument_name)` where `notes` is a DataFrame with
  the columns pitch, start, end, step (gap to the previous note's start) and
  duration, ordered by start time.
  """
  midi = pretty_midi.PrettyMIDI(midi_file)
  track = midi.instruments[0]
  track_name = pretty_midi.program_to_instrument_name(track.program)

  # Notes ordered by the moment they begin
  ordered = sorted(track.notes, key=lambda item: item.start)
  previous_start = ordered[0].start

  columns = {'pitch': [], 'start': [], 'end': [], 'step': [], 'duration': []}
  for current in ordered:
    columns['pitch'].append(current.pitch)
    columns['start'].append(current.start)
    columns['end'].append(current.end)
    columns['step'].append(current.start - previous_start)
    columns['duration'].append(current.end - current.start)
    previous_start = current.start

  table = pd.DataFrame(
      {label: np.array(values) for label, values in columns.items()})
  return table, track_name

In [None]:
def notes_to_midi(
  notes: pd.DataFrame,
  out_file: str,
  instrument_name: str,
  velocity: int = 100,  # note loudness
) -> pretty_midi.PrettyMIDI:
  """Write a note table to `out_file` as a single-instrument MIDI file.

  Each row supplies pitch, step (offset from the previous note's start) and
  duration. The resulting PrettyMIDI object is returned.
  """
  midi = pretty_midi.PrettyMIDI()
  program = pretty_midi.instrument_name_to_program(instrument_name)
  track = pretty_midi.Instrument(program=program)

  cursor = 0  # start time of the previously written note
  for _, row in notes.iterrows():
    begin = float(cursor + row['step'])
    finish = float(begin + row['duration'])
    track.notes.append(
        pretty_midi.Note(
            velocity=velocity,
            pitch=int(row['pitch']),
            start=begin,
            end=finish,
        ))
    cursor = begin

  midi.instruments.append(track)
  midi.write(out_file)
  return midi

In [None]:
def predict_next_note(notes, model, temperature) -> Tuple[int, float, float]:
  """Generate the next note with a trained sequence model.

  Args:
    notes: the input sequence of notes (without batch dimension).
    model: model whose `predict` returns a dict with the keys 'pitch'
      (logits), 'step' and 'duration'.
    temperature: positive divisor applied to the pitch logits before
      sampling; larger values give more random pitches.

  Returns:
    A tuple `(pitch, step, duration)`.

  Raises:
    ValueError: if `temperature` is not positive.
  """
  # Explicit check instead of `assert`, which is stripped under `python -O`.
  if not temperature > 0:
    raise ValueError(f"temperature must be positive, got {temperature!r}")

  # Add batch dimension
  inputs = tf.expand_dims(notes, 0)
  predictions = model.predict(inputs)
  pitch_logits = predictions['pitch']
  step = predictions['step']
  duration = predictions['duration']

  # Sample the pitch from the temperature-scaled logits
  pitch_logits /= temperature
  pitch = tf.random.categorical(pitch_logits, num_samples=1)
  pitch = tf.squeeze(pitch, axis=-1)
  duration = tf.squeeze(duration, axis=-1)
  step = tf.squeeze(step, axis=-1)

  # `step` and `duration` values should be non-negative
  step = tf.maximum(0, step)
  duration = tf.maximum(0, duration)
  return int(pitch), float(step), float(duration)

In [None]:
def mse_with_positive_pressure(y_true: tf.Tensor, y_pred: tf.Tensor):
  """Mean squared error plus a penalty of 10x the magnitude of negative predictions."""
  residual = y_true - y_pred
  # zero for non-negative predictions, 10 * |y_pred| for negative ones
  negative_penalty = 10 * tf.maximum(-y_pred, 0.0)
  return tf.reduce_mean(residual ** 2 + negative_penalty)

In [None]:
def music_generation(raw_notes, model, num_predictions, instrument_name,
                     temperature=2.0, out_file='output.mid'):
  """Generate notes with `model`, save them as MIDI and download the file.

  Args:
    raw_notes: note table with the columns 'pitch', 'step' and 'duration';
      its first 25 rows seed the generation.
    model: trained model passed to `predict_next_note`.
    num_predictions: number of notes to generate.
    instrument_name: instrument used for the generated MIDI track.
    temperature: sampling temperature passed to `predict_next_note`.
    out_file: name of the MIDI file that is written and downloaded.
  """
  key_order = ['pitch', 'step', 'duration']
  seq_length = 25
  vocab_size = 128
  sample_notes = np.stack([raw_notes[key] for key in key_order], axis=1)

  # The initial sequence of notes; pitch is normalized similar to training
  # sequences
  input_notes = (
    sample_notes[:seq_length] / np.array([vocab_size, 1, 1]))

  generated_notes = []
  prev_start = 0
  for _ in range(num_predictions):
    pitch, step, duration = predict_next_note(input_notes, model, temperature)
    start = prev_start + step
    end = start + duration
    input_note = (pitch, step, duration)
    generated_notes.append((*input_note, start, end))
    # Slide the window: drop the oldest note and append the new one.
    # NOTE(review): the appended pitch is not divided by vocab_size, unlike
    # the seed notes above -- confirm whether this is intended.
    input_notes = np.delete(input_notes, 0, axis=0)
    input_notes = np.append(input_notes, np.expand_dims(input_note, 0), axis=0)
    prev_start = start

  generated_notes = pd.DataFrame(generated_notes, columns=(*key_order, 'start', 'end'))
  notes_to_midi(generated_notes, out_file=out_file, instrument_name=instrument_name)
  files.download(out_file)
In [None]:
def create_model():
  """Build and compile the LSTM note model.

  The network reads sequences of 25 notes with 3 values each and has three
  output heads: 'pitch' (128 logits), 'step' and 'duration'.
  """
  sequence_length = 25
  note_input = tf.keras.Input((sequence_length, 3))
  hidden = tf.keras.layers.LSTM(128)(note_input)

  # one dense head per predicted quantity
  heads = {
      'pitch': tf.keras.layers.Dense(128, name='pitch')(hidden),
      'step': tf.keras.layers.Dense(1, name='step')(hidden),
      'duration': tf.keras.layers.Dense(1, name='duration')(hidden),
  }
  network = tf.keras.Model(note_input, heads)

  head_losses = {
      'pitch': tf.keras.losses.SparseCategoricalCrossentropy(
          from_logits=True),
      'step': mse_with_positive_pressure,
      'duration': mse_with_positive_pressure,
  }
  head_weights = {'pitch': 0.05, 'step': 1.0, 'duration': 1.0}

  network.compile(
      loss=head_losses,
      loss_weights=head_weights,
      optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
  )
  return network

####Generate music

In [None]:
emotion = predict(disgust_demo_path)

Parsing file <path: /content/disgust.mov> 
Trying to capture...
Captured!
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Captured!
Trying to capture...
Captured!
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Captured!
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Captured!
Trying to capture...
Trying to capture...
Captured!
Trying to capture...
Trying to captu

In [None]:
def generate_music(path):
  """Generate music matching the dominant emotion predicted for a video.

  The most frequent combined emotion label selects a quadrant folder (Q1-Q4);
  a random MIDI file from that folder seeds the quadrant's trained model,
  which then generates 240 notes.

  Args:
    path: path of the video file.

  Raises:
    ValueError: if no emotion could be predicted for the video.
    FileNotFoundError: if the quadrant folder holds no ".midi" file or no
      model checkpoint is found.
  """
  # predict() returns (audio_labels, visual_labels, combined_labels); a single
  # emotion is needed here, so take the most frequent combined label.
  _, _, combined_labels = predict(path)
  if not combined_labels:
    raise ValueError(f"No emotion could be predicted for {path}")
  emotion = collections.Counter(combined_labels).most_common(1)[0][0]

  ROOT_PATH = '/content/gdrive/My Drive/MLS/MLS Group Project/database/'
  os.chdir(ROOT_PATH)

  # mapping from emotion to quadrant folder; "fear" is the label the
  # prediction models emit, "fearful" is kept for backward compatibility
  emotion_types = {"happy": "Q1", "surprise": "Q1", "angry": "Q2", "fear": "Q2", "fearful": "Q2", "disgust": "Q2", "sad": "Q3", "calm": "Q4", "neutral": "Q4"}
  quadrant = emotion_types[emotion]
  folder_path = os.path.join(ROOT_PATH, "MER_audio/" + quadrant)

  files1 = [folder_path + "/" + i for i in os.listdir(folder_path) if i.endswith(".midi")]
  if not files1:
    raise FileNotFoundError(f"No .midi files found in {folder_path}")

  # pick a random seed file for the generation
  sample_file = random.choice(files1)
  raw_notes, instrument_name = midi_to_notes(sample_file)

  model_checkpoint_path = "./training_checkpoints/" + quadrant + "/"
  print(f"model_checkpoint_path: {model_checkpoint_path}")

  ckpt_path = tf.train.latest_checkpoint(model_checkpoint_path)
  print(f"ckpt_path: {ckpt_path}")
  if ckpt_path is None:
    raise FileNotFoundError(
        f"No checkpoint found in {model_checkpoint_path}")

  model = create_model()
  model.load_weights(ckpt_path)

  num_predictions = 240

  music_generation(raw_notes, model, num_predictions, instrument_name)


In [None]:
generate_music(disgust_demo_path)

Parsing file <path: /content/disgust.mov> 
Trying to capture...
Captured!
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Captured!
Trying to capture...
Captured!
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Captured!
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Trying to capture...
Captured!
Trying to capture...
Trying to capture...
Captured!
Trying to capture...
Trying to captu