# 0. Imports

---

In [None]:
import os
import shutil
import urllib
import urllib.request
from os.path import join

import numpy as np
import pandas as pd
from sklearn.utils import shuffle

# 1. Download the data
---

- The data will be downloaded to the `download` directory
- The wav files will be copied to the `audios` directory

In [None]:
# Fetch the SUBESCO archive, unpack it and copy the audio files to `./audios`.
# Every step is skipped when its result already exists. Each step writes to a
# temporary location and is renamed only after it succeeded, so an interrupted
# run never leaves a partial result that later runs would mistake for complete.
source = "https://zenodo.org/records/4526477/files/SUBESCO.zip?download=1"

# download zip
zip_dir = join("download", "zip_files")
zip_path = join(zip_dir, "subesco.zip")
if not os.path.exists(zip_path):
    os.makedirs(zip_dir, exist_ok=True)
    partial_zip = zip_path + ".part"
    urllib.request.urlretrieve(source, partial_zip)
    os.replace(partial_zip, zip_path)

download_dir = join("download", "subesco")
audio_dir = join("audios", "subesco")

# unpack zip
if not os.path.exists(download_dir):
    unpack_tmp = download_dir + ".partial"
    # drop leftovers of a previously interrupted unpack
    shutil.rmtree(unpack_tmp, ignore_errors=True)
    os.makedirs(unpack_tmp)
    shutil.unpack_archive(zip_path, unpack_tmp)
    os.rename(unpack_tmp, download_dir)

# copy audio files to `./audios`
if not os.path.exists(audio_dir):
    copy_tmp = audio_dir + ".partial"
    # drop leftovers of a previously interrupted copy
    shutil.rmtree(copy_tmp, ignore_errors=True)
    os.makedirs(copy_tmp)
    src = join(download_dir, "SUBESCO")
    for file in os.listdir(src):
        shutil.copy2(join(src, file), copy_tmp)
    os.rename(copy_tmp, audio_dir)

# 2. Preprocessing
---

- create tables
- set filepaths as index
- normalize labels
  - rename categorical labels to nouns
  - scale numerical labels to range (0, 1)
- filter bad annotations


In [None]:
# Build the files table: one row per wav file, indexed by file name, with the
# labels parsed from the underscore-separated parts of the file name.

# One file name contains a stray "]" before the extension; rename it so it
# parses like every other file.
if 'F_02_MONIKA_S_2_SURPRISE_3].wav' in os.listdir(audio_dir):
    os.rename(
        join(audio_dir, 'F_02_MONIKA_S_2_SURPRISE_3].wav'),
        join(audio_dir, 'F_02_MONIKA_S_2_SURPRISE_3.wav'),
    )

# create DataFrame containing audio files
# - only `.wav` entries are kept, because the "take" parsing below assumes
#   that extension
# - the listing is sorted, because `os.listdir` returns entries in arbitrary
#   order and the row order feeds into the seeded shuffle / speaker choice
index = pd.Index(
    sorted(name for name in os.listdir(audio_dir) if name.endswith(".wav"))
)
df = pd.DataFrame(index=index)


# define maps to normalize labels
gender_map = {"M": "male", "F": "female"}

emotion_map = {
    'SURPRISE': 'surprise',
    'NEUTRAL': 'neutral',
    'DISGUST': 'disgust',
    'HAPPY': 'happiness',
    'SAD': 'sadness',
    'FEAR': 'fear',
    'ANGRY': 'anger',
}


def get_take(x):
    """Return the take number from the last file name part, e.g. "3.wav" -> 3."""
    return int(x.replace(".wav", ""))


# set columns; the file name is split once and the parts are reused
name_parts = df.index.str.split("_")
df["gender"] = name_parts.str[0].map(gender_map)
df["speaker_number"] = name_parts.str[0:2].str.join("_")  # see paper
df["speaker"] = name_parts.str[2]
df["sentence"] = name_parts.str[3:5].str.join("_")
df["emotion"] = name_parts.str[5].map(emotion_map)
df["take"] = name_parts.str[6].map(get_take)

# create tables dir
tables_dir = "tables"
os.makedirs(tables_dir, exist_ok=True)

# save files table
df.to_csv(join(tables_dir, "subesco_files.csv"))

df.head()

# 3. Create train/test split
---

create splits, which are:
- **speaker independent**
- **gender balanced**

In [None]:
# Create a speaker-independent, gender-balanced train/test split of `df` and
# save the label tables as csv.

# Dimensional labels are exported only when the table actually has such
# columns; indexing a missing column would raise a KeyError.
dimensions = [
    dimension
    for dimension in ("arousal", "valence", "dominance")
    if dimension in df.columns
]

# select test speakers randomly: one speaker per gender, so that the test set
# is gender balanced by construction. Candidates are sorted so the seeded
# choice does not depend on the row order of `df`.
speakers = np.sort(df.speaker.dropna().unique())
np.random.seed(42)
test_speakers = np.array([
    np.random.choice(np.sort(group.dropna().unique()))
    for _, group in df.groupby("gender")["speaker"]
])
held_out = set(test_speakers)
train_speakers = [sp for sp in speakers if sp not in held_out]

# shuffle dataframe (sorted first, so the seeded shuffle is reproducible)
shuffled_df = shuffle(df.sort_index(), random_state=8)

# split data into train/test dataframes; no speaker appears in both
test_mask = shuffled_df.speaker.isin(held_out)
test_df = shuffled_df[test_mask]
train_df = shuffled_df[~test_mask]

# save tables as csv
os.makedirs("tables", exist_ok=True)
for split_name, split_df in (("test", test_df), ("train", train_df)):
    split_df["emotion"].to_csv(
        join("tables", f"subesco_emotions_{split_name}.csv")
    )
    for dimension in dimensions:
        split_df[dimension].to_csv(
            join("tables", f"subesco_{dimension}_{split_name}.csv")
        )