<a href="https://colab.research.google.com/github/DorAzaria/Sentiment-Analysis-Deep-Learning-Methods-For-Speech-Recognition/blob/main/SER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Google Drive**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Install packages**

In [None]:
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install jiwer
!pip install torchaudio
!pip install librosa
!pip install datasets

**Terminal commands**

In [16]:
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
%env TRANSFORMERS_CACHE=/content/cache
%env HF_DATASETS_CACHE=/content/cache
%env CUDA_LAUNCH_BLOCKING=1

env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
env: TRANSFORMERS_CACHE=/content/cache
env: HF_DATASETS_CACHE=/content/cache
env: CUDA_LAUNCH_BLOCKING=1


**Import packages**

In [17]:
import numpy as np
import pandas as pd
import os
import librosa
import sys
import IPython
import matplotlib
import matplotlib.pyplot as plt
import requests
import torch
import torchaudio
from pathlib import Path
from tqdm import tqdm
from sklearn.model_selection import train_test_split

**Activate device**

In [18]:
torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

WAV2VEC2_ASR_BASE_960H - Build “base” wav2vec2 model with an extra linear module. 

Pre-trained on 960 hours of unlabeled audio from LibriSpeech dataset

In [19]:
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
model = bundle.get_model().to(device)

**Sampling sound method**

In [20]:
EMOTIONS = {1:'neutral', 2:'calm', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 0:'surprise'} # surprise has been changed from 8 to 0
DATA_PATH = '../content/drive/MyDrive/audio_speech_actors_01-24'
SAMPLE_RATE = 16000
signals = []

In [None]:
data = pd.DataFrame(columns=['Emotion', 'Emotion intensity', 'Gender','Path'])
for dirname, _, filenames in os.walk(DATA_PATH):
    for filename in filenames:
        file_path = os.path.join('/',dirname, filename)
        identifiers = filename.split('.')[0].split('-')
        emotion = (int(identifiers[2]))
        if emotion == 8: # surprise has been changed from 8 to 0
            emotion = 0
        if int(identifiers[3]) == 1: # intensity (1 = normal, 2 = strong)
            emotion_intensity = 'normal' 
        else: 
            emotion_intensity = 'strong'
        if int(identifiers[6])%2 == 0: # actor id. (even = female, odd = male)
            gender = 'female'
        else:
            gender = 'male'
        
        data = data.append({"Emotion": emotion,
                            "Emotion intensity": emotion_intensity,
                            "Gender": gender,
                            "Path": file_path
                             },
                             ignore_index = True
                          )

In [22]:
print("number of files is {}".format(len(data)))
data.head(3)

number of files is 1440


Unnamed: 0,Emotion,Emotion intensity,Gender,Path
0,4,strong,male,/../content/drive/MyDrive/audio_speech_actors_...
1,4,normal,male,/../content/drive/MyDrive/audio_speech_actors_...
2,4,normal,male,/../content/drive/MyDrive/audio_speech_actors_...


In [None]:
def speech_file_to_array_fn(path):
    waveform, sample_rate = librosa.load(file_path, duration=3, offset=0.5, sr=SAMPLE_RATE)
    waveform = waveform.to(device)
    signal = np.zeros((int(SAMPLE_RATE*3 + 1,)))
    signal[:len(waveform)] = waveform
    return signal

* **InferenceMode** is a new context manager analogous to no_grad to be used when you are certain your operations will have no interactions with autograd (e.g., model training). Code run under this mode gets better performance by disabling view tracking and version counter bumps.



In [None]:
with torch.inference_mode():
    for i, file_path in enumerate(data.Path):
        emission, _ = model(speech_file_to_array_fn(file_path))
        signals.append(emission)
        print("\r Processed {}/{} files".format(i,len(data)),end='')

In [None]:
class GreedyCTCDecoder(torch.nn.Module):
    def __init__(self, labels, blank=0):
        super().__init__()
        self.labels = labels
        self.blank = blank

    def forward(self, emission: torch.Tensor) -> str:
        """Given a sequence emission over labels, get the best path string
        Args:
          emission (Tensor): Logit tensors. Shape `[num_seq, num_label]`.

        Returns:
          str: The resulting transcript
        """
        indices = torch.argmax(emission, dim=-1)  # [num_seq,]
        indices = torch.unique_consecutive(indices, dim=-1)
        indices = [i for i in indices if i != self.blank]
        return "".join([self.labels[i] for i in indices])

In [None]:
decoder = GreedyCTCDecoder(labels=bundle.get_labels())
transcript = decoder(emission[0])

In [None]:
print(transcript)
IPython.display.Audio(("/content/drive/MyDrive/simon.wav"))