<a href="https://colab.research.google.com/github/DorAzaria/Sentiment-Analysis-Deep-Learning-Methods-For-Speech-Recognition/blob/main/preprocess/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/data/')

Mounted at /content/data/


# **IMPORTS**

---

In [2]:
import numpy as np
import pandas as pd
import os
import librosa
import sys
import IPython
import matplotlib
import matplotlib.pyplot as plt
import requests
import random
import torch
import pickle
import torchaudio
from pathlib import Path
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# **STATICS**
---



In [3]:
torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
model = bundle.get_model().to(device)
EMOTIONS = {0: 'surprise', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fear', 6: 'disgust'}  # surprise has been changed from 8 to 0
RAVDESS_PATH = '/content/data/MyDrive/dl/ravdess'
TESS_PATH = '/content/data/MyDrive/dl/tess'
URDU_PATH = '/content/data/MyDrive/dl/urdu'
EMOVO_PATH = '/content/data/MyDrive/dl/EMOVO'
AMERICAN_PATH = '/content/data/MyDrive/dl/american'
SHEMO_PATH = '/content/data/MyDrive/dl/ShEMO'
CREMA_PATH = '/content/data/MyDrive/dl/CREMA-D'
SAMPLE_RATE = 16000
count_calm = 0
data = pd.DataFrame(columns=['Emotion', 'Path'])
POSITIVE = 0
NEUTRAL = 1
NEGATIVE = 2

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960_asr_ls960.pth


  0%|          | 0.00/360M [00:00<?, ?B/s]

In [4]:
def distributeEmotion(emotion):

    if isinstance(emotion, str):
      emotion = emotion.lower()

    if emotion in {'ang', 'dis', 'fea', 'sad','angry' , 'anger', 'disgust', 'fear', 'fearful', 'sad', 'sadness', 4, 5, 6, 7, 'negative', 's', 'a', 'f'}:
      return NEGATIVE

    if emotion in {'neu','neutral', 'calm', 1, 2, 'n'}:
      return NEUTRAL

    if emotion in {'hap', 'happy', 'hapiness', 'ps', 'surprised', 'excited', 'encouraging', 3, 8, 'positive', 'h', 'w'}:
      return POSITIVE

    return -1
        

# **IMPORT RAVDESS**
---

In [5]:
for dirname, _, filenames in os.walk(RAVDESS_PATH):
    for filename in filenames:
        file_path = os.path.join('\\', dirname, filename)
        identifiers = filename.split('.')[0].split('-')
        emotion = distributeEmotion(int(identifiers[2]))

        if emotion != -1:
            data = data.append({"Emotion": emotion,
                                "Path": file_path
                                },
                              ignore_index=True
                              )

# **IMPORT TESS**
---

In [6]:
for dirname, _, filenames in os.walk(TESS_PATH):
    for filename in filenames:
        file_path = os.path.join('\\', dirname, filename)
        identifiers = filename.split('.')[0].split('_')
        emotion = distributeEmotion(identifiers[2])

        if emotion != -1:
            data = data.append({"Emotion": emotion,
                                "Path": file_path
                                },
                              ignore_index=True
                              )

# **IMPORT URDU**
---

In [7]:
for dirname, _, filenames in os.walk(URDU_PATH):
    emo_name = dirname[10:]
    for filename in filenames:
        file_path = os.path.join('\\', dirname, filename)
        identifiers = filename.split('.')[0].split('_')
        emotion = distributeEmotion(emo_name)

        if emotion != -1:
            data = data.append({"Emotion": emotion,
                                "Path": file_path
                                },
                              ignore_index=True
                              )

# **AMERICAN**
---

In [None]:
for dirname, _, filenames in os.walk(AMERICAN_PATH):
    for filename in filenames:
        file_path = os.path.join('\\', dirname, filename)
        identifiers = filename.split('.')[0].split('_')
        emotion = distributeEmotion(identifiers[1])

        if emotion != -1:
            data = data.append({"Emotion": emotion,
                                "Path": file_path
                                },
                              ignore_index=True
                              )

# **EMOVO - ITALIAN**
---

*   disgusto =  disgust
*   gioia = joy happiness
*   paura = fear
*   rabbia = anger
*   sorpresa = surprised
*   (stato emotivo neutro) = neutral
*   tristezza = sad

In [None]:
for dirname, _, filenames in os.walk(EMOVO_PATH):
    for filename in filenames:
        file_path = os.path.join('\\', dirname, filename)
        identifiers = filename.split('.')[0].split('_')
        emotion = distributeEmotion(dirname)

        if emotion != -1:
            data = data.append({"Emotion": emotion,
                                "Path": file_path
                                },
                              ignore_index=True
                              )

# **ShEMO**
---

In [None]:
for dirname, _, filenames in os.walk(SHEMO_PATH):
    for filename in filenames:
        file_path = os.path.join('\\', dirname, filename)
        emotion = distributeEmotion(filename[3])
        if emotion != -1:
            data = data.append({"Emotion": emotion,
                                "Path": file_path
                                },
                              ignore_index=True
                              )

# **CREMA-D**
---

In [None]:
for dirname, _, filenames in os.walk(CREMA_PATH):
    for filename in filenames:
        file_path = os.path.join('\\', dirname, filename)
        identifiers = filename.split('.')[0].split('_')
        emotion = distributeEmotion(identifiers[2])

        if emotion != -1:
            data = data.append({"Emotion": emotion,
                                "Path": file_path
                                },
                              ignore_index=True
                              )

In [8]:
data['Emotion'].value_counts()

2    2368
0    1184
1     688
Name: Emotion, dtype: int64

# IMPORT SUMMRARY
---
*   TOTAL - 16,122 
*   0) POSITIVE - 3601
*   1) NEUTRAL - 3043
*   2) NEGATIVE - 9478





# **SAMPLE & NORMALIZATION**
---

In [9]:
def speech_file_to_array_fn(path):
    signal = np.zeros((int(SAMPLE_RATE*3 ,)))
  
    waveform, sampling_rate = torchaudio.load(filepath=path, num_frames=SAMPLE_RATE * 3)

    waveform = waveform.to(device)
    waveform = waveform.detach().cpu().numpy()[0]

    if len(waveform) <= 48000 and len(waveform) >= 32000:
        signal[:len(waveform)] = waveform

        if sampling_rate < 48000: # if there is more to fill
          rest = len(signal) - len(waveform) # get the "rest length"
          reversed_waveform = np.flipud(waveform) # flip the array reverse
          reversed_waveform = reversed_waveform[:rest] # cut the array to the rest length
          signal[len(waveform):] = reversed_waveform # place the rest in the signal
          
        signal_final = np.array([np.array(signal)])
        signal_final = torch.from_numpy(signal_final).to(device)
        signal_final = signal_final.type(torch.cuda.FloatTensor).to(device)

        return signal_final

    return -1


def normalize_features(features):
    for i in range(len(features[0])):
        mlist = features[0][i]
        features[0][i] = 2 * (mlist - np.max(mlist)) / (np.max(mlist) - np.min(mlist)) + 1

In [10]:
def diff_speech_file_to_array_fn(path):
    signal = np.zeros((int(SAMPLE_RATE*3 ,)))
  
    waveform, sampling_rate = torchaudio.load(filepath=path, num_frames=SAMPLE_RATE * 3)

    waveform = waveform.to(device)
    waveform = waveform.detach().cpu().numpy()[0]

    if len(waveform) <= 48000 and len(waveform) >= 32000:
        signal[:len(waveform)] = waveform

        if sampling_rate < 48000: # if there is more to fill
          rest = len(signal) - len(waveform) # get the "rest length"
          filled_list = signal[:len(waveform)] # we don't want to choose zero values, so this list contains non-zero values only.
          signal[len(waveform):] = random.choices(filled_list, k=rest) # choose k values from the filled_list
          
        signal_final = np.array([np.array(signal)])
        signal_final = torch.from_numpy(signal_final).to(device)
        signal_final = signal_final.type(torch.cuda.FloatTensor).to(device)

        return signal_final

    return -1

# **SAMPLE DATA**
---
EACH SAMPLE SHAPE IS (1, 149, 32)

15203

In [11]:
signals = []

total_data = len(data)
with torch.inference_mode():
    for i, file_path in enumerate(data.Path):
        tor = diff_speech_file_to_array_fn(file_path)

        if isinstance(tor, torch.Tensor):
        
            emission, _ = model(tor)
            features = emission.detach().cpu().numpy()
            normalize_features(features)
            check = 0
            max = np.max(features)
            min = np.min(features)

            if max > 1:
                print(f'\n{j} max is not 1, current max: {max}')
                check += 1
            if min < -1:
                print(f'\n{j} min is not -1, current min: {min}')
                check += 1

            if check == 0:
                row = (file_path, features, data.iloc[i]['Emotion'])
                signals.append(row)
            else:
                total_data -= 1

        percent = (len(signals) / total_data) * 100
        print("\r Processed {}/{} files. ({}%) ".format(len(signals), total_data, int(percent)), end='')


 Processed 4235/4240 files. (99%) 

# **SAVE DATA**

---



In [None]:
file_pth = open('/content/dataset.pth', 'wb')
pickle.dump(signals, file_pth)

In [12]:
file_pth = open('/content/data/MyDrive/dl/dataset5.pth', 'wb')
pickle.dump(signals, file_pth)

In [13]:
counter = [0, 0, 0]

for tup in signals:
  counter[tup[2]] += 1

print(counter)

[1184, 688, 2363]
