<a href="https://colab.research.google.com/github/Basel-byte/Speech-Emotion-Recognition/blob/master/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Importing Packages**

In [None]:
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

### **Mounting Drive**

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# dataset paths under /content/drive/MyDrive used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **Extracting Audio Files**

In [None]:
# Directory the audio files are read from; it is also the unzip target below.
dir_path = "/content/drive/MyDrive/Crema"
# Extract the archive only when the target directory does not exist yet,
# so re-running the cell does not unzip again.
if not os.path.exists(dir_path):
  !unzip /content/drive/MyDrive/Crema.zip -d /content/drive/MyDrive/Crema

### **Reading Audio Files**

In [None]:
# Load every file in `dir_path` and derive its label from the file name.
# `labels[k]` and `audio_waves[k]` always describe the same file.
labels = []
audio_waves = []
# os.listdir returns entries in arbitrary order; sorting makes the sample order
# (and therefore the seeded train/validation/test splits) reproducible across runs.
for file_name in sorted(os.listdir(dir_path)):
  # Characters 9..11 of the file name are used as the class label
  # (presumably the emotion code of the Crema naming scheme -- confirm against the dataset).
  labels.append(file_name[9:12])
  # librosa.load returns a (signal, sampling_rate) pair; the pair is stored as-is.
  audio_waves.append(librosa.load(os.path.join(dir_path, file_name)))
# Binarize the string labels into one column per class.
lb = LabelBinarizer()
encoded_labels = lb.fit_transform(labels)
print(encoded_labels.shape)
lb.classes_

### **Audio signal augmentation**

In [None]:
def noise(data):
    """Return a copy of `data` with random Gaussian noise added.

    The noise amplitude is a random fraction (uniform in [0, 0.035)) of the
    largest value in `data`. The input array is not modified.
    """
    peak = np.amax(data)
    amplitude = 0.035 * np.random.uniform() * peak
    jitter = np.random.normal(size=data.shape[0])
    return data + amplitude * jitter

def stretch(data, rate=0.8):
    """Time-stretch `data` by `rate` using librosa.effects.time_stretch.

    Parameters
    ----------
    data : audio signal passed through to librosa.
    rate : stretch factor forwarded to librosa (default 0.8).

    Returns
    -------
    The time-stretched signal returned by librosa.
    """
    # `rate` is keyword-only in librosa >= 0.10; passing it by keyword works
    # on both older and newer releases, whereas the positional form raises TypeError.
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    """Circularly roll `data` by a random number of positions.

    The offset is drawn uniformly from [-5, 5), scaled by 1000 and truncated
    to an integer; samples pushed past one end re-enter at the other (np.roll).
    """
    draw = np.random.uniform(low=-5, high=5)
    offset = int(draw * 1000)
    return np.roll(data, offset)

def pitch(data, sampling_rate=22050, pitch_factor=0.7):
    """Pitch-shift `data` using librosa.effects.pitch_shift.

    Parameters
    ----------
    data : audio signal passed through to librosa.
    sampling_rate : sampling rate of `data`, forwarded as `sr` (default 22050).
    pitch_factor : forwarded as librosa's `n_steps` argument (default 0.7).

    Returns
    -------
    The pitch-shifted signal returned by librosa.
    """
    # `sr` and `n_steps` are keyword-only in librosa >= 0.10; keywords work on
    # older releases too, whereas the positional form raises TypeError on new ones.
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

### **Extraction of frequency and time-domain features**

In [None]:
def extract_features(signal):
  """Build a flattened, fixed-length feature column for one audio signal.

  MFCCs, zero-crossing rate and RMS energy are stacked row-wise into one
  matrix, whose frame axis is zero-padded or truncated to `max_frames`.
  The matrix is then flattened row by row into a single column.

  Reads the notebook globals `sr`, `n_mfcc` and `max_frames`, which must be
  defined before this function is called.

  Parameters
  ----------
  signal : audio signal passed to the librosa feature extractors.

  Returns
  -------
  numpy.ndarray of shape (n_rows * max_frames, 1), where n_rows is the number
  of stacked feature rows.
  """
  mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
  zcr = librosa.feature.zero_crossing_rate(y=signal)
  rms = librosa.feature.rms(y=signal)
  sample = np.vstack((mfccs, zcr, rms))

  n_frames = sample.shape[1]
  if n_frames < max_frames:
    # Zero-pad on the right of the frame axis up to max_frames.
    sample = np.pad(sample, ((0, 0), (0, max_frames - n_frames)), mode='constant')
  else:
    sample = sample[:, :max_frames]

  # Return the reshaped array (the original assigned it to a misspelled name
  # and returned the 2-D matrix). -1 derives the length from the actual row
  # count instead of a hard-coded 42.
  return sample.reshape((-1, 1))

In [None]:
def get_augmented_features(signal):
  """Extract features from augmented copies of `signal`.

  Two augmented signals are produced, `noise(signal)` and `shift(signal)`,
  and `extract_features` is applied to each one.

  Parameters
  ----------
  signal : audio signal to augment.

  Returns
  -------
  numpy.ndarray whose first axis has length 2: index 0 holds the features of
  the noisy copy, index 1 those of the shifted copy.
  """
  # Apply the extractor per signal: np.vectorize would call it on individual
  # scalar samples, and np.vstack needs a single sequence argument.
  augmented_signals = (noise(signal), shift(signal))
  return np.stack([extract_features(augmented) for augmented in augmented_signals])

### **Preparing feature space for model 1**

In [None]:
# Dimensions of the model-1 feature space. `max_frames` and `n_mfcc` are also
# read as globals by extract_features.
max_frames, n_mfcc = 200, 40
# Two extra feature rows are stacked on top of the MFCCs by extract_features
# (zero-crossing rate and RMS).
n_features = n_mfcc + 2

# One flattened feature column per audio file.
data = np.empty((len(labels), n_features * max_frames, 1))
for row, (signal, sr) in enumerate(audio_waves):
  # `sr` is rebound on every iteration because extract_features reads it as a global.
  data[row] = extract_features(signal)

### **Data split: 70% training (5% of it held out for validation), 30% testing**

In [None]:
# First split: hold out 30% of all samples for testing, stratified by label.
train_data, test_data, train_labels, test_labels = train_test_split(
    data,
    encoded_labels,
    test_size=0.3,
    stratify=encoded_labels,
    random_state=42,
)
# Second split: carve 5% of the remaining training samples off for validation.
train_data, valid_data, train_labels, valid_labels = train_test_split(
    train_data,
    train_labels,
    test_size=0.05,
    stratify=train_labels,
    random_state=42,
)

### **Circular Padding of Spectrogram**

In [None]:
def get_spectogram_circulary_padded(mel_spec, max_frames):
  """Fit a spectrogram to exactly `max_frames` frames along axis 1.

  Longer inputs are truncated to their first `max_frames` frames. Shorter
  inputs are extended by wrapping around to their own beginning, so frame k
  of the result is frame `k % n_frames` of the input.

  Parameters
  ----------
  mel_spec : 2-D array whose second axis is the frame axis.
  max_frames : number of frames in the result.

  Returns
  -------
  Array with the same number of rows as `mel_spec` and `max_frames` columns.

  Raises
  ------
  ValueError
      If `mel_spec` has zero frames but `max_frames` is positive (there is
      nothing to repeat).
  """
  n_frames = mel_spec.shape[1]
  if n_frames >= max_frames:
    return mel_spec[:, :max_frames]
  if n_frames == 0:
    raise ValueError("cannot circularly pad a spectrogram with zero frames")
  # Index modulo the original length: full repeats come first and the partial
  # repeat last, which keeps the sequence a true circular continuation.
  frame_idx = np.arange(max_frames) % n_frames
  return mel_spec[:, frame_idx]

# get_spectogram_circulary_padded(librosa.feature.melspectrogram(y=audio_waves[20][0]), max_frames)

### **Preparing feature space for model 2**

In [None]:
# Number of mel bands. Passed explicitly to librosa so the spectrogram height
# always matches the second axis of `data2`, rather than relying on the library default.
n_mels = 128
# One single-channel (n_mels x max_frames) mel spectrogram per audio file.
data2 = np.empty((len(labels), n_mels, max_frames, 1))
for i, audio in enumerate(audio_waves):
  signal, sr = audio
  # Forward the file's own sampling rate instead of relying on librosa's default.
  mel_spec = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=n_mels)
  # Truncate or circularly pad to exactly max_frames frames.
  mel_spec = get_spectogram_circulary_padded(mel_spec, max_frames)
  # delta1 = librosa.feature.delta(mel_spec, delta=1)
  # delta2 = librosa.feature.delta(mel_spec, delta=2)
  # data2[i] = np.dstack((mel_spec, delta1, delta2))
  data2[i] = mel_spec.reshape((n_mels, max_frames, 1))

In [None]:
# Same two-stage stratified split as for model 1, applied to the spectrogram data:
# 30% held out for testing, then 5% of the remainder held out for validation.
train_data2, test_data2, train_labels2, test_labels2 = train_test_split(
    data2,
    encoded_labels,
    test_size=0.3,
    stratify=encoded_labels,
    random_state=42,
)
train_data2, valid_data2, train_labels2, valid_labels2 = train_test_split(
    train_data2,
    train_labels2,
    test_size=0.05,
    stratify=train_labels2,
    random_state=42,
)

In [None]:
def visualize_audio_signal(audio_wave):
  """Plot the MFCCs, the mel spectrogram (dB) and the waveform of one recording.

  Parameters
  ----------
  audio_wave : (signal, sampling_rate) pair, as stored in `audio_waves`.

  Returns
  -------
  pandas.DataFrame holding the MFCC matrix, so that a notebook cell ending
  with this call displays it.
  """
  signal, sr = audio_wave

  # MFCCs
  mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=40)
  librosa.display.specshow(mfccs, sr=sr, x_axis='time')
  plt.title('MFCCs')
  plt.show()

  # Mel spectrogram on a dB scale; forward the recording's own sampling rate.
  mel_spec = librosa.feature.melspectrogram(y=signal, sr=sr)
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
  librosa.display.specshow(mel_spec_db, sr=sr, x_axis='time', y_axis='mel')
  plt.title('Mel spectrogram (dB)')
  plt.show()

  # Raw waveform against time in seconds.
  time = np.arange(0, len(signal)) / sr
  fig, ax = plt.subplots()
  ax.plot(time, signal)
  ax.set(xlabel='Time(s)', ylabel='Amplitude', title='Waveform')
  plt.show()

  # A bare expression inside a function is discarded, so the frame is returned.
  return pd.DataFrame(mfccs)

visualize_audio_signal(audio_waves[0])

In [None]:
# Drop the references to the raw audio and the full feature arrays; the
# train/validation/test splits created earlier are kept.
del audio_waves, data, data2