In [1]:
# Imports
import librosa, librosa.display
import matplotlib.pyplot as plt
import os
import numpy as np
import math
import json
import cv2

In [2]:
# --- Dataset locations (relative to the notebook's working directory) ---
DATASET_AUDIO_TRAIN = "EmotionDataset/Train/Audio"
DATASET_IMAGE_TRAIN = "EmotionDataset/Train/Image"
JSON_TRAIN = "json_storage/data.json"

# --- Audio settings ---
SAMPLE_RATE = 22050   # rate handed to librosa.load when reading audio
DURATION = 2          # length of audio kept per track, in seconds
SAMPLES_PER_TRACK = DURATION * SAMPLE_RATE   # samples kept per track

# --- Image settings ---
IMG_SIZE = 48         # images are resized to IMG_SIZE x IMG_SIZE

In [3]:
# Process audio and image data - store in data.json file
def save_data(audio_path, image_path, json_path, n_mfcc=13, n_fft=2048, hop_length=512, num_segments=1):
    """Extract MFCC features and resized grayscale images, then dump them to JSON.

    Every sub-directory found under ``audio_path`` and ``image_path`` is treated
    as one class; its directory name is appended to ``data["mapping"]``.

    Args:
        audio_path: Root directory walked for audio files.
        image_path: Root directory walked for image files.
        json_path: Destination JSON file. Its parent directory is created
            when it does not exist yet.
        n_mfcc: Number of MFCC coefficients requested from librosa.
        n_fft: FFT window size passed to librosa.
        hop_length: Hop length (in samples) passed to librosa.
        num_segments: Number of equal slices taken from the first
            ``SAMPLES_PER_TRACK`` samples of every audio file.

    The written JSON object has four keys:
        "mapping": class directory names, in the order they were visited
            (audio directories first, then image directories).
        "mfcc":    one ``frames x n_mfcc`` nested list per stored audio segment.
        "labels":  one class index per entry of "mfcc".
        "image":   one ``IMG_SIZE x IMG_SIZE`` nested list per readable image.

    Files that cannot be loaded are reported and skipped. Audio segments whose
    MFCC frame count differs from the expected one (e.g. clips shorter than a
    full segment) are skipped as well.
    """
    data = {
        "mapping": [],
        "mfcc": [],
        "image": [],
        "labels": []
    }

    # how many samples for each audio segment
    num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    # librosa centres its analysis frames by default, which produces
    # 1 + n // hop_length frames for n input samples. math.ceil(n / hop_length)
    # only matches that when hop_length does not divide n, and would otherwise
    # cause every segment to be silently discarded.
    expected_num_mfcc_vectors_per_segment = 1 + num_samples_per_segment // hop_length

    # walk through audio directories
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(audio_path)):
        # sort in place so os.walk visits classes in a reproducible order
        dirnames.sort()
        # compare by value: `is not` tests object identity, not string equality
        if dirpath == audio_path:
            continue

        # basename handles both "/" and the platform separator (e.g. "\\")
        semantic_label = os.path.basename(dirpath)
        # store the directories opened
        data["mapping"].append(semantic_label)

        print("\nProcessing {}".format(semantic_label))

        for f in sorted(filenames):
            file_path = os.path.join(dirpath, f)
            try:
                signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
            except Exception as e:
                print("Audio failed to process: {}: {}".format(file_path, e))
                # without this, the loop below would reuse the previous
                # file's signal (or hit a NameError on the first file)
                continue

            for s in range(num_segments):
                start_sample = num_samples_per_segment * s
                finish_sample = start_sample + num_samples_per_segment

                # `y` must be passed by keyword in recent librosa releases
                mfcc = librosa.feature.mfcc(y=signal[start_sample:finish_sample],
                                            sr=sr,
                                            n_fft=n_fft,
                                            n_mfcc=n_mfcc,
                                            hop_length=hop_length)

                # frames become rows, coefficients become columns
                mfcc = mfcc.T

                if len(mfcc) == expected_num_mfcc_vectors_per_segment:
                    # store mfcc data
                    data["mfcc"].append(mfcc.tolist())
                    # the root directory is index 0 of the walk, so i - 1 is
                    # this class's position in data["mapping"]
                    data["labels"].append(i - 1)
                    print("{}, segment:{}".format(file_path, s + 1))

    # walk through image directories
    # NOTE(review): images are stored without a per-image label and their class
    # names are appended to "mapping" after the audio ones - confirm how the
    # consumer of this JSON associates images with classes.
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(image_path)):
        dirnames.sort()
        if dirpath == image_path:
            continue

        semantic_label = os.path.basename(dirpath)
        # store the directories opened
        data["mapping"].append(semantic_label)

        print("\nProcessing {}".format(semantic_label))

        for f in sorted(filenames):
            file_path = os.path.join(dirpath, f)
            try:
                # cv2.imread signals failure by returning None, not by raising
                img_array = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
                if img_array is None:
                    print("Image failed to process: {}: could not be read".format(file_path))
                    continue
                sized_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
                # store image data
                data["image"].append(sized_array.tolist())
                print("{}".format(file_path))
            except Exception as e:
                print("Image failed to process: {}: {}".format(file_path, e))

    # make sure the target directory exists before writing
    json_dir = os.path.dirname(json_path)
    if json_dir:
        os.makedirs(json_dir, exist_ok=True)

    # dump stored data into json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)
        

# Build the training JSON: one MFCC segment per audio file plus the resized images.
save_data(
    audio_path=DATASET_AUDIO_TRAIN,
    image_path=DATASET_IMAGE_TRAIN,
    json_path=JSON_TRAIN,
    num_segments=1,
)


Processing Audio\Happy
EmotionDataset/Train/Audio\Happy\0_ant.wav, segment:1
EmotionDataset/Train/Audio\Happy\0_bar.wav, segment:1
EmotionDataset/Train/Audio\Happy\0_bath.wav, segment:1
EmotionDataset/Train/Audio\Happy\0_car.wav, segment:1
EmotionDataset/Train/Audio\Happy\0_cause.wav, segment:1
EmotionDataset/Train/Audio\Happy\0_date.wav, segment:1
EmotionDataset/Train/Audio\Happy\0_dream.wav, segment:1
EmotionDataset/Train/Audio\Happy\0_egg.wav, segment:1
EmotionDataset/Train/Audio\Happy\0_fall.wav, segment:1
EmotionDataset/Train/Audio\Happy\0_far.wav, segment:1
EmotionDataset/Train/Audio\Happy\1_ant.wav, segment:1
EmotionDataset/Train/Audio\Happy\1_bar.wav, segment:1
EmotionDataset/Train/Audio\Happy\1_bath.wav, segment:1
EmotionDataset/Train/Audio\Happy\1_car.wav, segment:1
EmotionDataset/Train/Audio\Happy\1_cause.wav, segment:1
EmotionDataset/Train/Audio\Happy\1_date.wav, segment:1
EmotionDataset/Train/Audio\Happy\1_dream.wav, segment:1
EmotionDataset/Train/Audio\Happy\1_egg.wav, s

EmotionDataset/Train/Audio\Neutral\4_far.wav, segment:1
EmotionDataset/Train/Audio\Neutral\5_ant.wav, segment:1
EmotionDataset/Train/Audio\Neutral\5_bar.wav, segment:1
EmotionDataset/Train/Audio\Neutral\5_bath.wav, segment:1
EmotionDataset/Train/Audio\Neutral\5_car.wav, segment:1
EmotionDataset/Train/Audio\Neutral\5_cause.wav, segment:1
EmotionDataset/Train/Audio\Neutral\5_date.wav, segment:1
EmotionDataset/Train/Audio\Neutral\5_dream.wav, segment:1
EmotionDataset/Train/Audio\Neutral\5_egg.wav, segment:1
EmotionDataset/Train/Audio\Neutral\5_fall.wav, segment:1
EmotionDataset/Train/Audio\Neutral\5_far.wav, segment:1
EmotionDataset/Train/Audio\Neutral\6_ant.wav, segment:1
EmotionDataset/Train/Audio\Neutral\6_bar.wav, segment:1
EmotionDataset/Train/Audio\Neutral\6_bath.wav, segment:1
EmotionDataset/Train/Audio\Neutral\6_car.wav, segment:1
EmotionDataset/Train/Audio\Neutral\6_cause.wav, segment:1
EmotionDataset/Train/Audio\Neutral\6_date.wav, segment:1
EmotionDataset/Train/Audio\Neutral\6_

EmotionDataset/Train/Image\Neutral\69.jpg
EmotionDataset/Train/Image\Neutral\7.jpg
EmotionDataset/Train/Image\Neutral\71.jpg
EmotionDataset/Train/Image\Neutral\72.jpg
EmotionDataset/Train/Image\Neutral\73.jpg
EmotionDataset/Train/Image\Neutral\74.jpg
EmotionDataset/Train/Image\Neutral\75.jpg
EmotionDataset/Train/Image\Neutral\76.jpg
EmotionDataset/Train/Image\Neutral\77.jpg
EmotionDataset/Train/Image\Neutral\78.jpg
EmotionDataset/Train/Image\Neutral\79.jpg
EmotionDataset/Train/Image\Neutral\8.jpg
EmotionDataset/Train/Image\Neutral\80.jpg
EmotionDataset/Train/Image\Neutral\81.jpg
EmotionDataset/Train/Image\Neutral\82.jpg
EmotionDataset/Train/Image\Neutral\83.jpg
EmotionDataset/Train/Image\Neutral\84.jpg
EmotionDataset/Train/Image\Neutral\85.jpg
EmotionDataset/Train/Image\Neutral\86.jpg
EmotionDataset/Train/Image\Neutral\87.jpg
EmotionDataset/Train/Image\Neutral\88.jpg
EmotionDataset/Train/Image\Neutral\89.jpg
EmotionDataset/Train/Image\Neutral\9.jpg
EmotionDataset/Train/Image\Neutral\90