# Setup

## sanity check

In [1]:
import sys 
print(sys.executable)

/home/amar/projects/audino/env/bin/python


In [2]:
! which python

/home/amar/projects/audino/env/bin/python


In [3]:
! which pip

/home/amar/projects/audino/env/bin/pip


## imports

In [1]:
import os
from pprint import pprint

import IPython.display as ipd
from IPython.core.display import display
import pandas as pd

from pkgs.utils import config_system

## load configs

In [2]:
paths_dict = config_system.load_paths_config()
pprint(paths_dict)

{'audio_config_file': '/home/amar/projects/audino/configs/audio.yml',
 'books_config_file': '/home/amar/projects/audino/configs/books.yml',
 'configs_dir': '/home/amar/projects/audino/configs',
 'data_dir': '/home/amar/projects/audino/data',
 'datasets_dir': '/home/amar/projects/audino/datasets',
 'firebase_config_file': '/home/amar/projects/audino/configs/firebase_config.yml',
 'paths_config_file': '/home/amar/projects/audino/configs/paths.yml',
 'project_root': '/home/amar/projects/audino',
 'summary_config_file': '/home/amar/projects/audino/configs/summary.yml'}


# Organizing datasets

In [3]:
datasets_dir = paths_dict['datasets_dir']
datasets_dir

'/home/amar/projects/audino/datasets'

In [4]:
for dirpath, dirnames, filenames in os.walk(datasets_dir):
    print(f'There are {len(dirnames)} directories and {len(filenames)} files in {dirpath}')

There are 5 directories and 4 files in /home/amar/projects/audino/datasets
There are 24 directories and 0 files in /home/amar/projects/audino/datasets/RAVDESS
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_13
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_18
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_07
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_02
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_05
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_12
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_14
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_17
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_24
Ther

In [4]:
# list out the datasets
datasets = ['RAVDESS', 'CREMA-D', 'SAVEE', 'TESS']
ravdess_dir = os.path.join(datasets_dir, datasets[0])
crema_dir = os.path.join(datasets_dir, datasets[1])
savee_dir = os.path.join(datasets_dir, datasets[2])
tess_dir = os.path.join(datasets_dir, datasets[3])
print(ravdess_dir, crema_dir, savee_dir, tess_dir)

/home/amar/projects/audino/datasets/RAVDESS /home/amar/projects/audino/datasets/CREMA-D /home/amar/projects/audino/datasets/SAVEE /home/amar/projects/audino/datasets/TESS


## RAVDESS

In [6]:
for dirpath, dirnames, filenames in os.walk(ravdess_dir):
    print(f'There are {len(dirnames)} directories and {len(filenames)} files in {dirpath}')

There are 24 directories and 0 files in /home/amar/projects/audino/datasets/RAVDESS
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_13
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_18
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_07
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_02
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_05
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_12
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_14
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_17
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAVDESS/Actor_24
There are 0 directories and 60 files in /home/amar/projects/audino/datasets/RAV

### Populating ravdess

In [5]:
# filename breakdown
emotions = {
    '01': 'neutral', 
    '02': 'calm', 
    '03': 'happy', 
    '04': 'sad', 
    '05': 'angry', 
    '06': 'fear', 
    '07': 'disgust', 
    '08': 'surprised'
}
emotional_intensity = {
    '01': 'medium', 
    '02': 'high'
}
statements = {
    '01': 'Kids are talking by the door', 
    '02': 'Dogs are sitting by the door'
}

In [6]:
ravdess_rows = list()
for dirpath, dirnames, filenames in os.walk(ravdess_dir):
    if len(filenames) > 0:
        for filename in filenames:
            filename_breakdown = filename.split('-')
            ravdess_row = dict()
            ravdess_row['dataset_name'] = 'RAVDESS'
            ravdess_row['gender'] = 'male' if int(filename_breakdown[-1][:2]) % 2 != 0 else 'female'
            ravdess_row['emotion'] = emotions[filename_breakdown[2]]
            ravdess_row['emotional_intensity'] = emotional_intensity[filename_breakdown[3]]
            ravdess_row['text'] = statements[filename_breakdown[4]]
            ravdess_row['audio_file_name'] = filename
            ravdess_row['actor'] = 'ravdess_' + filename_breakdown[-1][:2]
            ravdess_rows.append(ravdess_row)

In [7]:
ravdess_df = pd.DataFrame(ravdess_rows)
ravdess_df.head()

Unnamed: 0,dataset_name,gender,emotion,emotional_intensity,text,audio_file_name,actor
0,RAVDESS,male,sad,medium,Dogs are sitting by the door,03-01-04-01-02-02-13.wav,ravdess_13
1,RAVDESS,male,calm,medium,Dogs are sitting by the door,03-01-02-01-02-02-13.wav,ravdess_13
2,RAVDESS,male,sad,medium,Kids are talking by the door,03-01-04-01-01-01-13.wav,ravdess_13
3,RAVDESS,male,neutral,medium,Dogs are sitting by the door,03-01-01-01-02-02-13.wav,ravdess_13
4,RAVDESS,male,sad,high,Kids are talking by the door,03-01-04-02-01-02-13.wav,ravdess_13


In [8]:
ravdess_df.shape

(1440, 7)

## TESS

In [11]:
for dirpath, dirnames, filenames in os.walk(tess_dir):
    print(f'There are {len(dirnames)} directories and {len(filenames)} files in {dirpath}')

There are 14 directories and 0 files in /home/amar/projects/audino/datasets/TESS
There are 0 directories and 200 files in /home/amar/projects/audino/datasets/TESS/OAF_disgust
There are 0 directories and 200 files in /home/amar/projects/audino/datasets/TESS/YAF_fear
There are 0 directories and 200 files in /home/amar/projects/audino/datasets/TESS/OAF_neutral
There are 0 directories and 200 files in /home/amar/projects/audino/datasets/TESS/OAF_fear
There are 0 directories and 200 files in /home/amar/projects/audino/datasets/TESS/OAF_happy
There are 0 directories and 200 files in /home/amar/projects/audino/datasets/TESS/OAF_pleasant_surprise
There are 0 directories and 200 files in /home/amar/projects/audino/datasets/TESS/YAF_happy
There are 0 directories and 200 files in /home/amar/projects/audino/datasets/TESS/YAF_pleasant_surprise
There are 0 directories and 200 files in /home/amar/projects/audino/datasets/TESS/OAF_sad
There are 0 directories and 200 files in /home/amar/projects/audino

### Populating Tess

In [9]:
# mapping
speakers = {
    'OAF': 'tess_old', 
    'YAF': 'tess_young'
}
emotions = {
    'ps': 'surprised'
}

In [10]:
tess_rows = list()
for dirpath, dirnames, filenames in os.walk(tess_dir):
    if len(filenames) > 0:
        for filename in filenames:
            filename_breakdown = filename.split('_')
            tess_row = dict()
            tess_row['dataset_name'] = 'TESS'
            tess_row['gender'] = 'female'
            emotion = filename_breakdown[-1].split('.')[0]
            tess_row['emotion'] = emotion if emotion != 'ps' else 'surprised'
            tess_row['emotional_intensity'] = 'medium'
            tess_row['text'] = 'say the word ' + filename_breakdown[1]
            tess_row['audio_file_name'] = filename 
            tess_row['actor'] = 'tess_old' if filename_breakdown[0] == 'OAF' else 'tess_young'
            tess_rows.append(tess_row)

In [11]:
tess_df = pd.DataFrame(tess_rows)
tess_df.head()

Unnamed: 0,dataset_name,gender,emotion,emotional_intensity,text,audio_file_name,actor
0,TESS,female,disgust,medium,say the word time,OAF_time_disgust.wav,tess_old
1,TESS,female,disgust,medium,say the word vote,OAF_vote_disgust.wav,tess_old
2,TESS,female,disgust,medium,say the word goal,OAF_goal_disgust.wav,tess_old
3,TESS,female,disgust,medium,say the word sell,OAF_sell_disgust.wav,tess_old
4,TESS,female,disgust,medium,say the word came,OAF_came_disgust.wav,tess_old


In [12]:
tess_df.shape

(2800, 7)

## SAVEE

In [85]:
for dirpath, dirnames, filenames in os.walk(savee_dir):
    print(f'There are {len(dirnames)} directories and {len(filenames)} files in {dirpath}')

There are 4 directories and 1 files in /home/amar/projects/audino/datasets/SAVEE
There are 0 directories and 120 files in /home/amar/projects/audino/datasets/SAVEE/JE
There are 0 directories and 120 files in /home/amar/projects/audino/datasets/SAVEE/JK
There are 0 directories and 120 files in /home/amar/projects/audino/datasets/SAVEE/KL
There are 0 directories and 120 files in /home/amar/projects/audino/datasets/SAVEE/DC


### Populating savee

In [88]:
savee_rows = list()
for dirpath, dirnames, filenames in os.walk(savee_dir):
    if len(filenames) > 0:
        for filename in filenames:
            if 'Info' in filename:
                continue
            else:
                print(filename)
                break

h08.wav
h08.wav
h08.wav
h08.wav


In [None]:
# Skipping savee for now as text is too complicated

## Crema-d

In [15]:
i = 0
for dirpath, dirnames, filenames in os.walk(crema_dir):
    for filename in filenames:
        if i == 5:
            break
        print(filename.split('_'))
        i += 1

['1070', 'TIE', 'SAD', 'XX.wav']
['1054', 'DFA', 'FEA', 'XX.wav']
['1033', 'ITS', 'SAD', 'XX.wav']
['1025', 'WSI', 'HAP', 'XX.wav']
['1081', 'TIE', 'FEA', 'XX.wav']


### Populating crema

In [13]:
# mapping
crema_emotions_map = {
    'SAD': 'sad',
    'FEA': 'fear', 
    'HAP': 'happy', 
    'ANG': 'angry', 
    'DIS': 'disgust', 
    'NEU': 'neutral'
}
emotional_intensity_crema_map = {
    'HI': 'high', 
    'LO': 'low', 
    'MD': 'medium', 
    'XX': 'unknown'
}
sentence_map = {
    'DFA': "Don't forget a jacket", 
    'IEO': "It's eleven'o clock", 
    'IOM': 'I am on my way to the meeting', 
    'ITH': "I think i have a doctor's appointment", 
    'ITS': 'I think i have seen this before', 
    'IWL': 'I would like a new alarm clock', 
    'IWW': 'I wonder what this is about', 
    'MTI': 'Maybe tomorrow it will be cold', 
    'TAI': 'The airplane is almost full', 
    'TIE': 'That is exactly what happened', 
    'TSI': 'The surface is slick', 
    'WSI': 'Will stop in a couple of minutes'
}
male = set([1,5,11,14,15,16,17,19,22,23,26,27,31,32,33,34,35,36,38,39,40,41,42,44,45,48,50,51,57,59,62,64,65,66,67,68,69,70,71,77,80,81,83,85,86,87,88,90])
female = set(range(1,92)) - male

In [14]:
crema_rows = list()
for dirpath, dirnames, filenames in os.walk(crema_dir):
    for filename in filenames:
        filename_breakdown = filename.split('_')
        crema_row = dict()
        crema_row['dataset_name'] = 'CREMA'
        crema_row['gender'] = 'male' if int(filename_breakdown[0][2:]) in male else 'female'
        crema_row['emotion'] = crema_emotions_map[filename_breakdown[2]] 
        crema_row['emotional_intensity'] = emotional_intensity_crema_map[filename_breakdown[-1][:2]]
        crema_row['text'] = sentence_map[filename_breakdown[1]]
        crema_row['audio_file_name'] = filename
        crema_row['actor'] = 'crema_' + filename_breakdown[0][2:]
        crema_rows.append(crema_row)

crema_df = pd.DataFrame(crema_rows)
crema_df.head()

Unnamed: 0,dataset_name,gender,emotion,emotional_intensity,text,audio_file_name,actor
0,CREMA,male,sad,unknown,That is exactly what happened,1070_TIE_SAD_XX.wav,crema_70
1,CREMA,female,fear,unknown,Don't forget a jacket,1054_DFA_FEA_XX.wav,crema_54
2,CREMA,male,sad,unknown,I think i have seen this before,1033_ITS_SAD_XX.wav,crema_33
3,CREMA,female,happy,unknown,Will stop in a couple of minutes,1025_WSI_HAP_XX.wav,crema_25
4,CREMA,male,fear,unknown,That is exactly what happened,1081_TIE_FEA_XX.wav,crema_81


# Final dataframe

In [15]:
full_df = pd.concat([ravdess_df, tess_df, crema_df])
full_df.head()

Unnamed: 0,dataset_name,gender,emotion,emotional_intensity,text,audio_file_name,actor
0,RAVDESS,male,sad,medium,Dogs are sitting by the door,03-01-04-01-02-02-13.wav,ravdess_13
1,RAVDESS,male,calm,medium,Dogs are sitting by the door,03-01-02-01-02-02-13.wav,ravdess_13
2,RAVDESS,male,sad,medium,Kids are talking by the door,03-01-04-01-01-01-13.wav,ravdess_13
3,RAVDESS,male,neutral,medium,Dogs are sitting by the door,03-01-01-01-02-02-13.wav,ravdess_13
4,RAVDESS,male,sad,high,Kids are talking by the door,03-01-04-02-01-02-13.wav,ravdess_13


In [16]:
full_df.shape

(11682, 7)

In [19]:
## Save to a csv file
processed_data_path = os.path.join(paths_dict['datasets_dir'], 'processed', 'full_df.csv')
full_df.to_csv(processed_data_path, index=False)