##AudioVisualMerge
####Combine the predictions from audio2emotion and visual2emotion modules and generate one single output vector for music generation.

###import libraries

In [1]:
from keras.models import load_model
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

# modules for visual data parsing
import cv2
import imutils
import dlib
from imutils import face_utils

# modules for audio data parsing
import librosa
import subprocess
import librosa.display
import IPython.display as ipd
from IPython.core.display import display


###Predict emotion using audio and visual models seperately.

####Load video

#####Angry demo video

In [2]:
!gdown --id 1a7Ggt0vYCvG4LyUT1FccxRUhd7xcdDwt
angry_demo_path = "/content/angry.mov"

Downloading...
From: https://drive.google.com/uc?id=1a7Ggt0vYCvG4LyUT1FccxRUhd7xcdDwt
To: /content/angry.mov
100% 3.29M/3.29M [00:00<00:00, 19.0MB/s]


#####Calm demo video

In [3]:
!gdown --id 1Cb8rA59g9RvUZFZ8ho5LBc0QmOP9aXob
calm_demo_path = "/content/calm_demo.mov"

Downloading...
From: https://drive.google.com/uc?id=1Cb8rA59g9RvUZFZ8ho5LBc0QmOP9aXob
To: /content/calm_demo.mov
100% 17.4M/17.4M [00:00<00:00, 73.9MB/s]


#####Disgust demo video

In [4]:
!gdown --id 1TFy7uvQyPIprvrHTJqC9hl-kBzn7Nw7h
disgust_demo_path = "/content/disgust.mov"

Downloading...
From: https://drive.google.com/uc?id=1TFy7uvQyPIprvrHTJqC9hl-kBzn7Nw7h
To: /content/disgust.mov
100% 24.3M/24.3M [00:00<00:00, 75.0MB/s]


#####Happy demo video

In [5]:
!gdown --id 1worrFlWI0maGNUzC_7yC8nxUUaVz3x7k
happy_demo_path = "content/happy2.mov"

Downloading...
From: https://drive.google.com/uc?id=1worrFlWI0maGNUzC_7yC8nxUUaVz3x7k
To: /content/happy2.mov
100% 29.7M/29.7M [00:00<00:00, 81.6MB/s]


In [6]:
!gdown --id 1MU3nBa3nLHK1qiY7jCRG29P5BC6jGELx
happy_demo_path_2 = "content/happy.mov"

Downloading...
From: https://drive.google.com/uc?id=1MU3nBa3nLHK1qiY7jCRG29P5BC6jGELx
To: /content/happy.mov
100% 5.11M/5.11M [00:00<00:00, 31.7MB/s]


####**audio2emotion**

In [7]:
def get_audio_path(path):
  command = "ffmpeg -i " + path + " -ab 160k -ac 2 -ar 16000 -vn demo.wav"
  subprocess.call(command, shell=True)
  audio_path = "/content/demo.wav"
  return audio_path


convert video (.mov) file to audio (.wav)

In [8]:
#command = "ffmpeg -i /content/calm.mov -ab 160k -ac 2 -ar 16000 -vn calm.wav"
#subprocess.call(command, shell=True)

In [9]:
#audio_path = "/content/calm.wav"

In [10]:
#signal, sr = librosa.load(audio_path, sr = 16000)
#ipd.display(ipd.Audio(signal, rate = 16000))

#####Audio Preprocessing
As audio has different length and the FFT will produce distortions. We need to cut a long audio file into several short segments. In other word, we will divide the signal inot short frames. Each audio frame will be the same size as the FTT. Each audio frame will have 50% overlap since we loose infomration on the edge of each frame after using a window function.

In [11]:
SAMPLE_RATE = 16000
FFT_SIZE = 1024
NUM_MFCC = 27
HOP_SIZE = 512

Make prediction in every 3 seconds and calculate MFCC(n = 30)  for each frames and use the mean MFCC as features.

In [12]:
'''def frame_calc_mfcc(file, fft = FFT_SIZE, hop = HOP_SIZE, sample_rate = SAMPLE_RATE):

  feature_data = []
  signal, sr = librosa.load(file, sr = sample_rate)

  mfcc = librosa.feature.mfcc(y = signal, sr = sample_rate, n_mfcc= 30, n_fft = 1024, hop_length = 512 )
  mean_mfcc = np.mean(mfcc.T, axis = 0)

  spec = librosa.feature.spectral_centroid(y = signal, sr = sample_rate, n_fft = 1024, hop_length = 512)[0]
  mean_spec = np.mean(spec.T, axis = 0)

  feature_data = np.concatenate(mean_mfcc, mean_spec)

  return feature_data'''

'def frame_calc_mfcc(file, fft = FFT_SIZE, hop = HOP_SIZE, sample_rate = SAMPLE_RATE):\n\n  feature_data = []\n  signal, sr = librosa.load(file, sr = sample_rate)\n\n  mfcc = librosa.feature.mfcc(y = signal, sr = sample_rate, n_mfcc= 30, n_fft = 1024, hop_length = 512 )\n  mean_mfcc = np.mean(mfcc.T, axis = 0)\n\n  spec = librosa.feature.spectral_centroid(y = signal, sr = sample_rate, n_fft = 1024, hop_length = 512)[0]\n  mean_spec = np.mean(spec.T, axis = 0)\n\n  feature_data = np.concatenate(mean_mfcc, mean_spec)\n\n  return feature_data'

In [13]:
def frame_calc_mfcc(file, fft = FFT_SIZE, hop = HOP_SIZE, sample_rate = SAMPLE_RATE):

  mfcc_per_frame = []
  signal, sr = librosa.load(file, sr = sample_rate)
  #print("signal:", len(signal))

  duration = librosa.get_duration(y = signal, sr = sample_rate)
  #print("duration:", duration)
  samples_per_file = sample_rate * duration
  num_frames = int(duration / 3) #make prediction in every 3 seconds
  #print("num_frame:", num_frames)
  sample_per_frame = int(samples_per_file / num_frames)
  window_hop = sample_per_frame // 2

  for n in range(num_frames*2 - 1):
    #print("num_frames:", n)
    start = window_hop * n
    finish = start + sample_per_frame

    mfcc = librosa.feature.mfcc(y=signal[start:finish], sr = SAMPLE_RATE, n_mfcc = NUM_MFCC, n_fft = FFT_SIZE, hop_length = HOP_SIZE)
    mean_mfcc = np.mean(mfcc.T, axis = 0)

    mfcc_per_frame.append([mean_mfcc])

  return mfcc_per_frame

In [14]:
#add the file path of your test file, and remove the comment
#filepath = 'enter/your/test/file/path/here'

In [15]:
#feature_input = frame_calc_mfcc(fearful_demo_path)
#feature_input = np.concatenate (feature_input, axis = 0)
#feature_input.shape

#####Use pretrained model to predict emotions
Note: {0: 'angry',
 1: 'calm',
 2: 'disgust',
 3: 'fear',
 4: 'happy',
 5: 'neutral',
 6: 'sad',
 7: 'surprise'}

In [16]:
!gdown --id 1tZwJn00OjoT3painOq9t9BvsbDGD5a0n

Downloading...
From: https://drive.google.com/uc?id=1tZwJn00OjoT3painOq9t9BvsbDGD5a0n
To: /content/audio_model.h5
100% 6.53M/6.53M [00:00<00:00, 37.0MB/s]


In [17]:
audio_model = load_model('/content/audio_model.h5')
audio_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 27, 256)           1536      
                                                                 
 max_pooling1d (MaxPooling1  (None, 14, 256)           0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 14, 256)           327936    
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 7, 256)            0         
 g1D)                                                            
                                                                 
 conv1d_2 (Conv1D)           (None, 7, 128)            163968    
                                                                 
 max_pooling1d_2 (MaxPoolin  (None, 4, 128)            0

In [18]:
def audio_predict(path):
  feature_input = frame_calc_mfcc(path)
  print(np.shape(feature_input))
  feature_input = np.concatenate(feature_input, axis = 0)
  print(feature_input)
  print(np.shape(feature_input.shape))

  #use pre-trained model to predict emotion and store prediction matrix
  demo_predict = audio_model.predict(feature_input)
  demo_predict = np.array(demo_predict)
  combined_prediction = demo_predict.mean(axis=0)
  return combined_prediction

In [19]:
#y_test_predictions = np.argmax(audio_model.predict(feature_input), axis = -1)
#y_test_predictions

####**visual2emotion**
*Note*: The visual2emotion model detects emotions based on key frame. Essentially, we can make a decision whenever we want.

To coordinate better with audio data, we choose to make a decision every xxxx second.


#####Preprocessing
Our visual model takes landmarks features as input, so we need to parse the video, extract the key frames and corresponding landmarks, and then feed into the pre-trained model.

In [20]:
# download dlib pretrained library
!gdown --id 1XqF2ec7KdVrrxrnahWeKCJrHOR_ucX2o

Downloading...
From: https://drive.google.com/uc?id=1XqF2ec7KdVrrxrnahWeKCJrHOR_ucX2o
To: /content/shape_predictor_68_face_landmarks.dat
100% 99.7M/99.7M [00:04<00:00, 22.7MB/s]


In [21]:
# Helper functions for video parsing

#dlib model setup
# initialize dlib's face detector (HOG-based) and create facial landmark predictor
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor("/content/shape_predictor_68_face_landmarks.dat")

# dlib helper functions
# take a bounding predicted by dlib and convert it to the format (x, y, w, h) as
# we would normally do with OpenCV
def rect_to_bb(rect):
	x = rect.left()
	y = rect.top()
	w = rect.right() - x
	h = rect.bottom() - y
	# return a tuple of (x, y, w, h)
	return (x, y, w, h)

# initialize the list of (x, y)-coordinates
def shape_to_np(shape, dtype="int"):
	coords = np.zeros((68, 2), dtype=dtype)
	# loop over the 68 facial landmarks and convert them
	# to a 2-tuple of (x, y)-coordinates
	for i in range(0, 68):
		coords[i] = (shape.part(i).x, shape.part(i).y)
	# return the list of (x, y)-coordinates
	return coords

In [22]:
# Helper function for data parsing
# Use dlib to get landmark
def get_landmark(path):
  print(f"Parsing file <path: {path}> ")
  cap = cv2.VideoCapture(path)
  count = 0

  result = [] # a list of landmarks
  # Read the image and parse the facial landmarks every 1000ms
  while True:
    # use cap to read key frame
    ret, image = cap.read()
    print("Trying to capture...")

    if ret is not True:
      print(f"{int(count/30)} frames extracted")
      break

    else:
      # resize the input image, and convert it to grayscale
      image = imutils.resize(image, width=500)
      gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

      # detect faces in the grayscale image
      rects = detector(gray, 1)
      try:
        rect = rects[0] # only one face
        shape = predictor(gray, rect)
        shape = face_utils.shape_to_np(shape)

        # convert dlib's rectangle to a OpenCV-style bounding box
        # [i.e., (x, y, w, h)], then draw the face bounding box
        (x, y, w, h) = face_utils.rect_to_bb(rect)
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

        # set the next frame to capture
        count += 30 # Note that at 30 fps, this advances one second
        cap.set(cv2.CAP_PROP_POS_FRAMES, count)

        print("Captured!")
        result.append(shape[17:])

      except IndexError:
        continue


  # Release all space and windows once done
  cap.release()
  cv2.destroyAllWindows()

  return result

In [23]:
# Helper functions for feature extraction
# Calculate distance between two landmarks
def calc_distance(landmark1,landmark2):
  (x1, y1) = landmark1
  (x2, y2) = landmark2
  return ((x1-x2)**2+ (y1-y2)**2)**(0.5)

# Featurize and turn the facial landmarks into a 1-d array
def featurize(landmarks):
  res = []
  total_count = len(landmarks)
  for i in range(total_count - 1):
    for j in range(i + 1, total_count):
      res.append(calc_distance(landmarks[i], landmarks[j]))
  return np.array(res)

In [24]:
def parse_visual(path):
  landmarks_list = get_landmark(path)
  input = []
  for landmarks in landmarks_list:
    input.append(featurize(landmarks))
  return input

#####Use pretrained model to predict emotions
#####Use pretrained model to predict emotions
Note: Output 0-7 is corresponded to `['angry' 'calm' 'disgust' 'fear' 'happy' 'neutral' 'sad' 'surprise']`.

In [25]:
predictions_dict = ['angry', 'calm', 'disgust', 'fear', 'happy', 'neutral', 'sad','surprise']

In [26]:
# load pretrained model from google drive
!gdown --id 1-GFtUsUYzXHXTWluajl7PpEwDj6yfuxx

Downloading...
From: https://drive.google.com/uc?id=1-GFtUsUYzXHXTWluajl7PpEwDj6yfuxx
To: /content/visual_model_full.h5
100% 12.8M/12.8M [00:00<00:00, 61.0MB/s]


In [27]:
visual_model = load_model("/content/visual_model_full.h5")
visual_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization_4 (Bat  (None, 1275)              5100      
 chNormalization)                                                
                                                                 
 dropout_4 (Dropout)         (None, 1275)              0         
                                                                 
 dense_4 (Dense)             (None, 512)               653312    
                                                                 
 batch_normalization_5 (Bat  (None, 512)               2048      
 chNormalization)                                                
                                                                 
 dropout_5 (Dropout)         (None, 512)               0         
                                                                 
 dense_5 (Dense)             (None, 512)              

In [28]:
def visual_predict(path):
  features = parse_visual(path)

  visual_prediction_vectors = []
  visual_predictions = []
  for frame_feature in features:
    frame_feature_cnn = np.expand_dims(frame_feature, axis = 0)

    # use pre-trained model to predict emotion and store prediction matrix
    predictions = visual_model.predict(frame_feature_cnn)
    visual_prediction_vectors.append(predictions)

  visual_prediction_vectors = np.array(visual_prediction_vectors)
  combined_prediction = visual_prediction_vectors.mean(axis=0)
  return combined_prediction[0]

In [29]:
visual_predict(calm_demo_path)

Parsing file <path: /content/calm_demo.mov> 
Trying to capture...
Captured!
Trying to capture...
Captured!
Trying to capture...
Captured!
Trying to capture...
Captured!
Trying to capture...
Captured!
Trying to capture...
Captured!
Trying to capture...
Captured!
Trying to capture...
Captured!
Trying to capture...
Captured!
Trying to capture...
9 frames extracted


array([1.3287886e-08, 2.1767798e-01, 9.9677406e-02, 5.4787469e-01,
       1.3473958e-01, 2.3933451e-12, 3.8795656e-07, 2.9921044e-05],
      dtype=float32)

###Combine the outputs from two models

`predictions_dict = ['angry', 'calm', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']`

In [30]:
def predict(path):
  audio_path = get_audio_path(path)
  audio_prediction_vector = audio_predict(audio_path)
  visual_prediction_vector = visual_predict(path)

  print(f"audio_prediction {audio_prediction_vector}")
  print(f"visual_prediction {visual_prediction_vector}")

  combined_vector = []
  for i in range(8):
    combined_vector.append(audio_prediction_vector[i] + 3*visual_prediction_vector[i])

  max_index = np.argmax(combined_vector)
  return predictions_dict[max_index]

In [31]:
predict(disgust_demo_path)

(9, 1, 27)
[[-3.46657562e+02  1.17551661e+01 -9.96286201e+00  1.42358208e+01
  -3.57775116e+01  1.12323418e+01 -1.98925762e+01 -1.19342613e+01
  -2.33161335e+01 -1.15734396e+01 -2.21743603e+01 -8.24034882e+00
  -2.09810772e+01 -1.55811138e+01 -1.37108145e+01 -4.39613104e+00
  -4.64449120e+00  6.95200872e+00  7.67198467e+00  3.60151339e+00
   4.41700649e+00 -1.90438583e-01 -8.73893023e-01  2.53818011e+00
  -1.93713903e+00 -2.23162532e-01 -5.71530342e-01]
 [-3.38022919e+02  3.75268459e+00 -6.18941355e+00  7.45384598e+00
  -3.06222363e+01  1.24524288e+01 -1.95443077e+01 -1.32581539e+01
  -2.30427113e+01 -1.34435568e+01 -1.90897446e+01 -1.01045313e+01
  -2.01415501e+01 -1.34943047e+01 -1.20203714e+01 -1.97235775e+00
  -3.92620850e+00  6.08094692e+00  3.89558506e+00  3.08981991e+00
   4.24008989e+00  2.47107792e+00  1.29197609e+00  1.55646265e+00
  -3.96131253e+00  1.14531076e+00  4.71282788e-02]
 [-3.53149353e+02  3.12716103e+01 -1.63190384e+01  1.04913330e+01
  -3.43415108e+01  1.13303347

'disgust'

###Music generation

####Import libraries and helpers

In [32]:
from google.colab import drive
import os
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [33]:
!apt-get update -qq && apt-get install -qq fluidsynth fluid-soundfont-gm build-essential libasound2-dev libjack-dev

Extracting templates from packages: 100%
Selecting previously unselected package libqt5core5a:amd64.
(Reading database ... 120874 files and directories currently installed.)
Preparing to unpack .../00-libqt5core5a_5.15.3+dfsg-2ubuntu0.2_amd64.deb ...
Unpacking libqt5core5a:amd64 (5.15.3+dfsg-2ubuntu0.2) ...
Selecting previously unselected package libevdev2:amd64.
Preparing to unpack .../01-libevdev2_1.12.1+dfsg-1_amd64.deb ...
Unpacking libevdev2:amd64 (1.12.1+dfsg-1) ...
Selecting previously unselected package libmtdev1:amd64.
Preparing to unpack .../02-libmtdev1_1.1.6-1build4_amd64.deb ...
Unpacking libmtdev1:amd64 (1.1.6-1build4) ...
Selecting previously unselected package libgudev-1.0-0:amd64.
Preparing to unpack .../03-libgudev-1.0-0_1%3a237-2build1_amd64.deb ...
Unpacking libgudev-1.0-0:amd64 (1:237-2build1) ...
Selecting previously unselected package libwacom-common.
Preparing to unpack .../04-libwacom-common_2.2.0-1_all.deb ...
Unpacking libwacom-common (2.2.0-1) ...
Selecting 

In [34]:
!pip install -qU pyfluidsynth pretty_midi

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.3/50.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pretty_midi (setup.py) ... [?25l[?25hdone


In [35]:
import collections
import datetime
import fluidsynth
from fluidsynth import *
import glob
import numpy as np
import pathlib
import pandas as pd
import pretty_midi
import seaborn as sns
import tensorflow as tf
from google.colab import files
import random
from IPython import display
from matplotlib import pyplot as plt
from typing import Dict, List, Optional, Sequence, Tuple

In [36]:
def midi_to_notes(midi_file: str):
  pm = pretty_midi.PrettyMIDI(midi_file)
  instrument = pm.instruments[0]
  instrument_name = pretty_midi.program_to_instrument_name(instrument.program)
  notes = collections.defaultdict(list)

  # Sort the notes by start time
  sorted_notes = sorted(instrument.notes, key=lambda note: note.start)
  prev_start = sorted_notes[0].start

  for note in sorted_notes:
    start = note.start
    end = note.end
    notes['pitch'].append(note.pitch)
    notes['start'].append(start)
    notes['end'].append(end)
    notes['step'].append(start - prev_start)
    notes['duration'].append(end - start)
    prev_start = start

  return pd.DataFrame({name: np.array(value) for name, value in notes.items()}), instrument_name

In [37]:
def notes_to_midi(
  notes: pd.DataFrame,
  out_file: str,
  instrument_name: str,
  velocity: int = 100,  # note loudness
) -> pretty_midi.PrettyMIDI:
  pm = pretty_midi.PrettyMIDI()
  instrument = pretty_midi.Instrument(
      program=pretty_midi.instrument_name_to_program(
          instrument_name))
  prev_start = 0
  for i, note in notes.iterrows():
    start = float(prev_start + note['step'])
    end = float(start + note['duration'])
    note = pretty_midi.Note(
        velocity=velocity,
        pitch=int(note['pitch']),
        start=start,
        end=end,
    )
    instrument.notes.append(note)
    prev_start = start
  pm.instruments.append(instrument)
  pm.write(out_file)
  return pm

In [38]:
def predict_next_note(notes, model, temperature) -> int:
  """Generates a note IDs using a trained sequence model."""
  assert temperature > 0
  # Add batch dimension
  inputs = tf.expand_dims(notes, 0)
  predictions = model.predict(inputs)
  pitch_logits = predictions['pitch']
  step = predictions['step']
  duration = predictions['duration']
  pitch_logits /= temperature
  pitch = tf.random.categorical(pitch_logits, num_samples=1)
  pitch = tf.squeeze(pitch, axis=-1)
  duration = tf.squeeze(duration, axis=-1)
  step = tf.squeeze(step, axis=-1)
  # `step` and `duration` values should be non-negative
  step = tf.maximum(0, step)
  duration = tf.maximum(0, duration)
  return int(pitch), float(step), float(duration)

In [39]:
def mse_with_positive_pressure(y_true: tf.Tensor, y_pred: tf.Tensor):
  mse = (y_true - y_pred) ** 2
  positive_pressure = 10 * tf.maximum(-y_pred, 0.0)
  return tf.reduce_mean(mse + positive_pressure)

In [40]:
def music_generation(raw_notes, model, num_predictions, instrument_name):
  temperature = 2.0
  #num_predictions = 120
  key_order = ['pitch', 'step', 'duration']
  seq_length = 25
  vocab_size = 128
  sample_notes = np.stack([raw_notes[key] for key in key_order], axis=1)
  # The initial sequence of notes; pitch is normalized similar to training
  # sequences
  input_notes = (
    sample_notes[:seq_length] / np.array([vocab_size, 1, 1]))
  generated_notes = []
  prev_start = 0
  for _ in range(num_predictions):
    pitch, step, duration = predict_next_note(input_notes, model, temperature)
    start = prev_start + step
    end = start + duration + 12
    input_note = (pitch, step, duration)
    generated_notes.append((*input_note, start, end))
    input_notes = np.delete(input_notes, 0, axis=0)
    input_notes = np.append(input_notes, np.expand_dims(input_note, 0), axis=0)
    prev_start = start
  generated_notes = pd.DataFrame(generated_notes, columns=(*key_order, 'start', 'end'))
  out_file = 'output.mid'
  out_pm = notes_to_midi(generated_notes, out_file=out_file, instrument_name=instrument_name)
  files.download(out_file)

In [41]:
def create_model():
  seq_length = 25
  input_shape = (seq_length, 3)
  learning_rate = 0.005
  inputs = tf.keras.Input(input_shape)
  x = tf.keras.layers.LSTM(128)(inputs)
  outputs = {
  'pitch': tf.keras.layers.Dense(128, name='pitch')(x),
  'step': tf.keras.layers.Dense(1, name='step')(x),
  'duration': tf.keras.layers.Dense(1, name='duration')(x),
  }
  model = tf.keras.Model(inputs, outputs)
  loss = {
      'pitch': tf.keras.losses.SparseCategoricalCrossentropy(
          from_logits=True),
      'step': mse_with_positive_pressure,
      'duration': mse_with_positive_pressure,
  }
  optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate)
  model.compile(
    loss=loss,
    loss_weights={
        'pitch': 0.05,
        'step': 1.0,
        'duration':1.0,
    },
    optimizer=optimizer,
  )
  return model

####Generate music

In [42]:
emotion = predict(disgust_demo_path)

(9, 1, 27)
[[-3.46657562e+02  1.17551661e+01 -9.96286201e+00  1.42358208e+01
  -3.57775116e+01  1.12323418e+01 -1.98925762e+01 -1.19342613e+01
  -2.33161335e+01 -1.15734396e+01 -2.21743603e+01 -8.24034882e+00
  -2.09810772e+01 -1.55811138e+01 -1.37108145e+01 -4.39613104e+00
  -4.64449120e+00  6.95200872e+00  7.67198467e+00  3.60151339e+00
   4.41700649e+00 -1.90438583e-01 -8.73893023e-01  2.53818011e+00
  -1.93713903e+00 -2.23162532e-01 -5.71530342e-01]
 [-3.38022919e+02  3.75268459e+00 -6.18941355e+00  7.45384598e+00
  -3.06222363e+01  1.24524288e+01 -1.95443077e+01 -1.32581539e+01
  -2.30427113e+01 -1.34435568e+01 -1.90897446e+01 -1.01045313e+01
  -2.01415501e+01 -1.34943047e+01 -1.20203714e+01 -1.97235775e+00
  -3.92620850e+00  6.08094692e+00  3.89558506e+00  3.08981991e+00
   4.24008989e+00  2.47107792e+00  1.29197609e+00  1.55646265e+00
  -3.96131253e+00  1.14531076e+00  4.71282788e-02]
 [-3.53149353e+02  3.12716103e+01 -1.63190384e+01  1.04913330e+01
  -3.43415108e+01  1.13303347

In [43]:
def generate_music(path):
  # Globals

  emotion = predict(path)
  ROOT_PATH = '/content/gdrive/My Drive/MLS/MLS Group Project/source/music_generation'

  os.chdir(ROOT_PATH)
  emotion_types = {"happy": "Q1", "surprise": "Q1", "angry": "Q2", "fearful": "Q2", "disgust": "Q2", "sad": "Q3", "calm": "Q4", "neutral": "Q4"}
  folder_name = "MER_audio/" + emotion_types[emotion]

  # mapping
  emotion_types = {"happy": "Q1", "surprise": "Q1", "angry": "Q2", "fearful": "Q2", "disgust": "Q2", "sad": "Q3", "calm": "Q4", "neutral": "Q4"}
  folder_name = "MER_audio/" + emotion_types[emotion]

  files1=[os.path.join(ROOT_PATH, folder_name) + "/" + i for i in os.listdir(os.path.join(ROOT_PATH, folder_name)) if i.endswith(".midi")]

  random_number = random.randint(0, len(files1)-1)
  sample_file = files1[random_number]

  raw_notes, instrument_name = midi_to_notes(sample_file)
  raw_notes.head()

  model_checkpoint_path = "./training_checkpoints/" + emotion_types[emotion] + "/"
  print(f"model_checkpoint_path: {model_checkpoint_path}")

  ckpt_path = tf.train.latest_checkpoint(model_checkpoint_path)
  print(f"ckpt_path: {ckpt_path}")

  model = create_model()
  model.load_weights(ckpt_path)

  num_predictions = 240

  music_generation(raw_notes, model, num_predictions, instrument_name)


In [44]:
generate_music(disgust_demo_path)

(9, 1, 27)
[[-3.46657562e+02  1.17551661e+01 -9.96286201e+00  1.42358208e+01
  -3.57775116e+01  1.12323418e+01 -1.98925762e+01 -1.19342613e+01
  -2.33161335e+01 -1.15734396e+01 -2.21743603e+01 -8.24034882e+00
  -2.09810772e+01 -1.55811138e+01 -1.37108145e+01 -4.39613104e+00
  -4.64449120e+00  6.95200872e+00  7.67198467e+00  3.60151339e+00
   4.41700649e+00 -1.90438583e-01 -8.73893023e-01  2.53818011e+00
  -1.93713903e+00 -2.23162532e-01 -5.71530342e-01]
 [-3.38022919e+02  3.75268459e+00 -6.18941355e+00  7.45384598e+00
  -3.06222363e+01  1.24524288e+01 -1.95443077e+01 -1.32581539e+01
  -2.30427113e+01 -1.34435568e+01 -1.90897446e+01 -1.01045313e+01
  -2.01415501e+01 -1.34943047e+01 -1.20203714e+01 -1.97235775e+00
  -3.92620850e+00  6.08094692e+00  3.89558506e+00  3.08981991e+00
   4.24008989e+00  2.47107792e+00  1.29197609e+00  1.55646265e+00
  -3.96131253e+00  1.14531076e+00  4.71282788e-02]
 [-3.53149353e+02  3.12716103e+01 -1.63190384e+01  1.04913330e+01
  -3.43415108e+01  1.13303347

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>