In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import gc
warnings.filterwarnings('ignore')
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
import os
# Preview at most two files from every directory under the input mount so the
# listing stays short even for folders holding thousands of audio clips.
for root_dir, _subdirs, names in os.walk('/kaggle/input'):
    for name in names[:2]:
        print(os.path.join(root_dir, name))

/kaggle/input/predict-human-emotions-from-audio/dataset/sample_submission.csv
/kaggle/input/predict-human-emotions-from-audio/dataset/train.csv
/kaggle/input/predict-human-emotions-from-audio/dataset/TestAudioFiles/13738.mp3
/kaggle/input/predict-human-emotions-from-audio/dataset/TestAudioFiles/28919.mp3
/kaggle/input/predict-human-emotions-from-audio/dataset/TrainAudioFiles/23694.mp3
/kaggle/input/predict-human-emotions-from-audio/dataset/TrainAudioFiles/11507.mp3


In [2]:
!pip -q install tensorflow.io==0.17



In [3]:
# CSV manifests and audio directories of the competition dataset.
train_csv = "/kaggle/input/predict-human-emotions-from-audio/dataset/train.csv"
test_csv = "/kaggle/input/predict-human-emotions-from-audio/dataset/test.csv"
TRAIN_fp = "/kaggle/input/predict-human-emotions-from-audio/dataset/TrainAudioFiles/"
TEST_fp = "/kaggle/input/predict-human-emotions-from-audio/dataset/TestAudioFiles/"

# Keep only directory entries whose last dot-separated component is an audio
# extension we know how to decode.
train_files = [
    entry for entry in os.listdir(TRAIN_fp)
    if entry.rsplit('.', 1)[-1] in ('mp3', 'wav')
]
test_files = [
    entry for entry in os.listdir(TEST_fp)
    if entry.rsplit('.', 1)[-1] in ('mp3', 'wav')
]
(len(train_files), len(test_files))

(5816, 2492)

In [4]:
# Training manifest: one row per clip; show its size, columns and class balance.
train_df = pd.read_csv(train_csv)
(
    train_df.shape,
    train_df.columns,
    train_df['emotion'].value_counts(),
)

((5816, 2),
 Index(['filename', 'emotion'], dtype='object'),
 neutral     2630
 joy          967
 surprise     640
 anger        596
 sadness      344
 fear         328
 disgust      311
 Name: emotion, dtype: int64)

In [5]:
# Test manifest: file names only, no labels.
test_df = pd.read_csv(test_csv)
(
    test_df.shape,
    test_df.columns,
)

((2492, 1), Index(['filename'], dtype='object'))

In [6]:
import librosa.util.utils as util
import librosa
import librosa.display
import audioread

def audioread_load(path, offset=0.0,
                   duration=None, dtype=np.float32):
    """
    Decode an audio file with audioread, one buffer at a time.

    Only the buffers overlapping the requested time window are kept; they are
    concatenated once reading is finished.

    Parameters
    ----------
    path : file handed to ``audioread.audio_open``.
    offset : start of the window, in seconds.
    duration : window length in seconds, or ``None`` to read to the end.
    dtype : dtype passed to ``util.buf_to_float`` for every buffer.

    Returns
    -------
    (y, sr_native) : the decoded samples, passed through ``librosa.to_mono``
        when the file reports more than one channel, and the sample rate
        reported by the file.
    """
    chunks = []
    with audioread.audio_open(path) as reader:
        sr_native = reader.samplerate
        n_channels = reader.channels

        # Window bounds are counted in interleaved samples (all channels).
        first = int(np.round(sr_native * offset)) * n_channels
        if duration is not None:
            last = first + (int(np.round(sr_native * duration)) * n_channels)
        else:
            last = np.inf

        seen = 0
        for block in reader:
            block = util.buf_to_float(block, dtype=dtype)
            block_start, seen = seen, seen + len(block)

            # Window starts after this buffer: skip it.
            if seen < first:
                continue

            # Window ended before this buffer: nothing more to read.
            if last < block_start:
                break

            # Window ends inside this buffer: drop the tail.
            if last < seen:
                block = block[: last - block_start]

            # Window starts inside this buffer: drop the head.
            if block_start <= first <= seen:
                block = block[(first - block_start):]

            chunks.append(block)

    if not chunks:
        y = np.empty(0, dtype=dtype)
    else:
        y = np.concatenate(chunks)
        if n_channels > 1:
            # De-interleave into (channels, samples).
            y = y.reshape((-1, n_channels)).T

    if n_channels > 1:
        y = librosa.to_mono(y)

    return y, sr_native


# Sanity check: decode one training clip at its native rate and see how much
# librosa's silence trimming removes.
# os.path.join avoids the doubled '/' (TRAIN_fp already ends with one) and
# matches how paths are built in the rest of the notebook.
sample_fp = os.path.join(TRAIN_fp, "28967.mp3")
# Alternative decoder (audioread_load takes no `sr` argument):
# y, sr = audioread_load(sample_fp, duration=5)
y, sr = librosa.load(sample_fp, sr=None)
yt, _ = librosa.effects.trim(y)
y.shape, sr, yt.shape

((44982,), 44100, (44982,))

In [7]:
# Emotion names in alphabetical order; the position in that order is the id.
classes = train_df['emotion'].unique().tolist()
classes.sort()
map_class_to_id = {emotion: idx for idx, emotion in enumerate(classes)}
map_class_to_id

{'anger': 0,
 'disgust': 1,
 'fear': 2,
 'joy': 3,
 'neutral': 4,
 'sadness': 5,
 'surprise': 6}

In [8]:
# Absolute path of every training clip, and the integer id of its emotion.
filepaths = train_df['filename'].apply(
    lambda name: os.path.join(TRAIN_fp, name)
)
class_ids = train_df['emotion'].apply(
    lambda emotion: map_class_to_id[emotion]
)
(filepaths.head(2), class_ids.head(2))

(0    /kaggle/input/predict-human-emotions-from-audi...
 1    /kaggle/input/predict-human-emotions-from-audi...
 Name: filename, dtype: object,
 0    4
 1    4
 Name: emotion, dtype: int64)

In [9]:
# import tensorflow as tf
# main_ds = tf.data.Dataset.from_tensor_slices((filepaths, class_ids))
# main_ds.element_spec

In [10]:
from tensorflow_io import experimental as tfio_exp
import tensorflow_io as tfio
import tensorflow as tf

def load_audio(fp, target_sr=22050, duration=10):
    """
    Decode a clip, bring it to a common sample rate and trim silent edges.

    Parameters
    ----------
    fp : path of the audio file, forwarded to ``audioread_load``.
    target_sr : sample rate (Hz) the signal is resampled to when the file's
        native rate differs. Defaults to 22050.
    duration : maximum number of seconds decoded from the start of the file.
        Defaults to 10.

    Returns
    -------
    (yt, sr) : the signal after ``librosa.effects.trim`` and its sample rate
        (always ``target_sr``).
    """
    y, sr = audioread_load(fp, duration=duration)
    if sr != target_sr:
        # Rates are passed by keyword: librosa >= 0.10 made them keyword-only,
        # and the keyword form is accepted by older releases as well.
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        sr = target_sr
    yt, _ = librosa.effects.trim(y)
    return yt, sr

#@tf.function
def load_tf_audio(fp, target_sr = 22050, duration = 5):
    """
    Read a clip with tensorflow-io and return a fixed-length mono waveform.

    The first ``duration`` seconds are read, resampled to ``target_sr``,
    averaged over the last (channel) axis, zero-padded symmetrically if too
    short and finally cropped to exactly ``duration * target_sr`` samples.

    Parameters
    ----------
    fp : path of the audio file, given to ``tfio.audio.AudioIOTensor``.
    target_sr : output sample rate in Hz (integer). Defaults to 22050.
    duration : clip length in whole seconds. Defaults to 5.

    Returns
    -------
    A float32 tensor of ``duration * target_sr`` samples.
    """
    audio = tfio.audio.AudioIOTensor(fp, dtype=tf.float32)
    sr = tf.cast(audio.rate, dtype=tf.int64)

    # Number of samples covering `duration` seconds at the file's native rate.
    native_samples = tf.cast(duration * sr, dtype=tf.int32)
    waveform = audio[:native_samples]
    waveform = tfio.audio.resample(waveform, sr, target_sr)
    waveform = tf.reduce_mean(waveform, axis=-1)
    waveform = tf.cast(waveform, tf.float32)

    # After resampling, lengths must be measured at the target rate; using the
    # native-rate count here would pad/crop to the wrong length whenever the
    # file's rate differs from `target_sr`.
    target_samples = tf.cast(duration * target_sr, dtype=tf.int32)
    # pos = tfio_exp.audio.trim(waveform, axis=0, epsilon=0.1)
    # waveform = waveform[pos[0]:pos[1]]
    missing = tf.math.maximum(target_samples - tf.shape(waveform)[0], 0)
    paddings = [[missing // 2, missing // 2 + missing % 2]]
    # pad if audio is too short
    pad_audio = tf.pad(waveform, paddings=paddings, mode='CONSTANT')
    # random_crop leaves an already exact-length signal unchanged and picks a
    # random window when the signal is longer than requested.
    crop_audio = tf.image.random_crop(pad_audio, [target_samples])

    return crop_audio

In [11]:
# def get_waveform_and_label(fp, label):
#     return load_tf_audio(fp), label

# AUTOTUNE = tf.data.AUTOTUNE
# waveform_ds = main_ds.map(get_waveform_and_label, num_parallel_calls=1)
# waveform_ds.element_spec

In [12]:
def get_mel_spectogram(audio, sr):
    """
    Compute a dB-scaled mel spectrogram of a waveform.

    Parameters
    ----------
    audio : waveform samples; cast to float32 before processing.
    sr : sample rate of ``audio`` in Hz.

    Returns
    -------
    NumPy array holding the dB-scaled mel spectrogram (128 mel bands).
    Calling ``.numpy()`` requires TensorFlow eager execution.
    """
    waveform = tf.cast(audio, tf.float32)
    # Magnitude spectrogram.
    spectrogram = tfio_exp.audio.spectrogram(
        waveform, nfft=2048, window=512, stride=256)
    # The upper mel edge must not exceed the Nyquist frequency. The previous
    # max(9000, sr/2) equals sr/2 for every sr >= 18 kHz but produced 9000 Hz,
    # above Nyquist, for lower rates; using sr/2 directly keeps the former
    # and fixes the latter.
    fmax = sr / 2
    mel_spectrogram = tfio_exp.audio.melscale(
        spectrogram, rate=sr, mels=128, fmin=0, fmax=fmax)
    db_mel_spect = tfio_exp.audio.dbscale(
        mel_spectrogram, top_db=80)
    # db_mel =  tf.expand_dims(db_mel_spect, axis=-1)
    # resized = tf.image.resize(db_mel, [224, 224])
    Xdb = db_mel_spect.numpy()
    return Xdb

def save_image(inp, path):
    """
    Render a 2-D array as an axis-free image and write it to ``path``.

    Parameters
    ----------
    inp : array drawn with ``plt.imshow``.
    path : destination file passed to ``plt.savefig``.

    Returns
    -------
    None
    """
    f = plt.figure(frameon=False, facecolor="w")
    try:
        plt.imshow(inp, interpolation='nearest', aspect="auto")
        plt.axis('off')
        # Tight bounding box without padding so only the image is saved.
        plt.savefig(path, bbox_inches='tight',
                    pad_inches=0)
    finally:
        # Release the figure even if drawing or saving fails; this function
        # runs once per clip, so leaked figures would pile up in memory.
        plt.close(f)

#     librosa.display.specshow(inp, y_axis='mel',
#                              x_axis='time',
#                              fmax=9000)

In [13]:
from tqdm.notebook import tqdm

# Render every training clip as a mel-spectrogram PNG and record the
# image-name -> label mapping in train_data.csv.
save_path = "/kaggle/working/TrainImages"
# exist_ok makes the cell safely re-runnable without a separate existence
# check, and makedirs also creates missing parent directories.
os.makedirs(save_path, exist_ok=True)

train_data = []
for fn, label in tqdm(train_df.values):
    path_load = os.path.join(TRAIN_fp, fn)
    waveform, sr = load_audio(path_load)
    mel_spect = get_mel_spectogram(waveform, sr)
    # The image keeps the full audio file name, e.g. "692.mp3.png".
    img_name = f"{fn}.png"
    save_image(mel_spect, os.path.join(save_path, img_name))
    train_data.append([img_name, label])
    # Drop the large arrays right away to keep peak memory low over the loop.
    del waveform, mel_spect
    gc.collect()

train_data = pd.DataFrame(train_data, columns=['filename', 'emotion'])
train_data.to_csv("train_data.csv", index=False)
print("Train Images Done...")

  0%|          | 0/5816 [00:00<?, ?it/s]

Train Images Done...


In [14]:
from tqdm.notebook import tqdm

# Render every test clip as a mel-spectrogram PNG and record the image names
# (with an empty label column) in test_data.csv.
save_path = "/kaggle/working/TestImages"
# exist_ok makes the cell safely re-runnable without a separate existence
# check, and makedirs also creates missing parent directories.
os.makedirs(save_path, exist_ok=True)

test_data = []
for fn in tqdm(test_df['filename'].values):
    path_load = os.path.join(TEST_fp, fn)
    waveform, sr = load_audio(path_load)
    mel_spect = get_mel_spectogram(waveform, sr)
    # The image keeps the full audio file name, e.g. "692.mp3.png".
    img_name = f"{fn}.png"
    save_image(mel_spect, os.path.join(save_path, img_name))
    test_data.append([img_name, ""])
    # Drop the large arrays right away to keep peak memory low over the loop.
    del waveform, mel_spect
    gc.collect()

test_data = pd.DataFrame(test_data, columns=['filename', 'emotion'])
test_data.to_csv("test_data.csv", index=False)
print("Test Images Done...")

  0%|          | 0/2492 [00:00<?, ?it/s]

Test Images Done...


In [15]:
import cv2
# Read one exported spectrogram back to check the size of the saved PNGs.
sample_png = "/kaggle/working/TestImages/692.mp3.png"
im = cv2.imread(sample_png)
# cv2.imread returns None instead of raising when the file cannot be read;
# fail with a clear message rather than an AttributeError on `.shape`.
assert im is not None, f"could not read {sample_png}"
im.shape

(217, 334, 3)

In [16]:
from IPython.display import FileLinks
# FileLinks('.')

In [17]:
# mel_spect = librosa.feature.melspectrogram(y=yt, sr=sr, n_fft=2048//2, hop_length=512//2)
# mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
# librosa.display.specshow(mel_spect, y_axis='mel', fmax=9000, x_axis='time');
#plt.title('Mel Spectrogram');
# plt.savefig('x.jpeg')

In [18]:

# def read_audio(fp):
#     with audioread.audio_open(fp) as f:
#         # totalsec contains the length in float
#         totalsec = f.duration
#     return totalsec

# train_durations = []
# for fn, label in tqdm(train_df.values):
#     path_load = os.path.join(TRAIN_fp,fn)
#     train_durations.append(read_audio(path_load))
    
    
# test_durations = []
# for fn in tqdm(os.listdir(TEST_fp)):
#     path_load = os.path.join(TEST_fp,fn)
#     test_durations.append(read_audio(path_load))

# sum(test_durations)/len(test_durations),min(test_durations), max(test_durations)

In [19]:
# from tqdm.notebook import tqdm

# save_path = "/kaggle/working/TrainImgFiles"
# if not os.path.exists(save_path):
#     os.mkdir(save_path)
    
# train_data = []
# for fn, label in tqdm(train_df.values):
#     path_load = os.path.join(TRAIN_fp,fn)
#     y, sr = audioread_load(path_load, duration=5)
#     yt, _ = librosa.effects.trim(y)
#     mel_spect = librosa.feature.melspectrogram(y=yt, sr=sr, n_fft=1024, hop_length=256)
#     mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
#     print(mel_spect.shape)
#     librosa.display.specshow(mel_spect, y_axis='mel', fmax=9000, x_axis='time')
# #     save_dir = os.path.join(save_path,label)
# #     if not os.path.exists(save_dir):
# #         os.mkdir(save_dir)
#     img_fn = f"{fn.split('.')[0]}.png"
#     plt.savefig(os.path.join(save_path, img_fn))
#     train_data.append([img_fn, label])

In [20]:
# import librosa as lb
# import soundfile as sf


# def audio_features(file_title, mfcc, chroma, mel):
#     with sf.SoundFile(file_title) as audio_recording:
#         audio = audio_recording.read(dtype="float32")
#         sample_rate = audio_recording.samplerate
        
#         if chroma:
#             stft=np.abs(lb.stft(audio))
#             result=np.array([])
#         if mfcc:
#             mfccs=np.mean(lb.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40).T, axis=0)
#             result=np.hstack((result, mfccs))
#         if chroma:
#             chroma=np.mean(lb.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
#             result=np.hstack((result, chroma))
#         if mel:
#             mel=np.mean(lb.feature.melspectrogram(audio, sr=sample_rate).T,axis=0)
#             result=np.hstack((result, mel))
#         return result



# def loading_audio_data():
#     x = []
#     y = []
#     for file in glob.glob("data//Actor_*//*.wav"):
#         file_path=os.path.basename(file)
#         emotion = emotion_labels[file_path.split("-")[2]]
#         if emotion not in focused_emotion_labels:
#             continue
#         feature = audio_features(file, mfcc=True, chroma=True, mel=True)
        
#         x.append(feature)
#         y.append(emotion)
#     final_dataset = train_test_split(np.array(x), y, test_size=0.1, random_state=9)
#     return final_dataset
