# 0. Import packages
---

In [None]:
import numpy as np
import os
import pandas as pd
import shutil
from sklearn.utils import shuffle
import urllib.request

# 1. Download the data
---

- The zip archive is downloaded to the working directory and unpacked into the `download/quechua` directory
- The audio files are then copied to the `audios/quechua` directory

In [None]:
# download zip
# Fetch to a temporary name first and rename afterwards, so an interrupted
# transfer never leaves a truncated "quechua.zip" that a later run would
# mistake for a complete download.
zip_path = "./quechua.zip"
if not os.path.exists(zip_path):
    partial_zip_path = zip_path + ".part"
    urllib.request.urlretrieve(
        "https://figshare.com/ndownloader/files/37361143", partial_zip_path
    )
    os.replace(partial_zip_path, zip_path)

download_dir = "./download/quechua/"
audio_dir = "./audios/quechua/"

# unpack zip
# Unpack when the target directory is missing *or* empty. If unpacking fails,
# the partial result is removed so the next run retries instead of skipping.
if not os.path.isdir(download_dir) or not os.listdir(download_dir):
    os.makedirs(download_dir, exist_ok=True)
    unpacked = False
    try:
        shutil.unpack_archive(zip_path, download_dir)
        unpacked = True
    finally:
        if not unpacked:
            shutil.rmtree(download_dir, ignore_errors=True)

# copy the audio files into `audio_dir`
# Only regular files are copied, and only when the destination is missing or
# has a different size, so an interrupted copy is completed on the next run.
src = download_dir + "Audios/"
os.makedirs(audio_dir, exist_ok=True)
for file in os.listdir(src):
    src_path = os.path.join(src, file)
    dst_path = os.path.join(audio_dir, file)
    if not os.path.isfile(src_path):
        continue
    already_copied = (
        os.path.exists(dst_path)
        and os.path.getsize(dst_path) == os.path.getsize(src_path)
    )
    if not already_copied:
        shutil.copy2(src_path, dst_path)


# 2. Preprocessing
---

- create tables
- set filepaths as index
- normalize labels
  - rename categorical labels to nouns
  - scale numerical labels to range (0, 1)
- filter bad annotations


In [110]:
# Load every sheet of the `Data` workbook; the "map" sheet holds the emotion
# category and the actor of each recording.
data_sheet = pd.read_excel(
    "./download/quechua/Data/Data/Data.xlsx",
    sheet_name=None,
)
df = data_sheet["map"]

# Attach the dimensional ratings, joined on the shared "Audio" column.
dimensions_df = pd.read_csv("./download/quechua/Labels/Labels/Labels.csv")
df = df.merge(dimensions_df, on="Audio")

# Discard the columns that are not needed downstream.
df = df.drop(columns=['File', 'Duration (s)'])

# Lower-case every column name, with two explicit renames taking precedence.
rename_dict = {
    **{col: col.lower() for col in df.columns},
    "Audio": "file",
    "Actor": "speaker",
}
df = df.rename(columns=rename_dict)

# Turn the audio ids into wav paths below `audio_dir` and index the table by them.
df["file"] = df["file"].map(lambda audio_id: f"{audio_dir}{audio_id!s}.wav")
df = df.set_index("file")

# Remove rows whose labels are unusable: one malformed valence entry and two
# malformed speaker ids.
has_valid_valence = df["valence"].ne('2.333.333.333')
has_valid_speaker = ~df["speaker"].isin(["6-", "2_"])
df = df[has_valid_valence & has_valid_speaker].copy()

# Speaker ages (see https://www.nature.com/articles/s41597-022-01855-9/).
# An unknown speaker id raises KeyError.
age_map = {
    "a1": 43,
    "a2": 36,
    "a3": 49,
    "a4": 28,
    "a5": 45,
    "a6": 36,
}
df["age"] = df["speaker"].map(age_map.__getitem__)

# Speaker genders. An unknown speaker id raises KeyError.
gender_map = {
    "a1": "female",
    "a2": "male",
    "a3": "female",
    "a4": "male",
    "a5": "female",
    "a6": "male",
}
df["gender"] = df["speaker"].map(gender_map.__getitem__)

# Rename the categorical labels to nouns (both spellings of anger/boredom
# collapse onto one label). An unknown label raises KeyError.
emotion_names_map = {
    'anger': 'anger',
    'boredom': 'boredom',
    'happy': 'happiness',
    'sleepy': 'sleepiness',
    'sadness': 'sadness',
    'calm': 'calmness',
    'fear': 'fear',
    'excited': 'excitement',
    'neutral': 'neutral',
    'angry': 'anger',
    'bored': 'boredom',
}
df["emotion"] = df["emotion"].map(emotion_names_map.__getitem__)
# normalize dimensional labels
def min_max_scale(x, old_range, new_range):
    """Linearly map ``x`` from ``old_range`` onto ``new_range``.

    Args:
        x: Value to rescale.
        old_range: Indexable whose items 0 and 1 are the bounds ``x`` is
            currently expressed in.
        new_range: Indexable whose items 0 and 1 are the bounds of the
            target interval.

    Returns:
        The rescaled value; ``old_range[0]`` maps to ``new_range[0]`` and
        ``old_range[1]`` maps to ``new_range[1]``.
    """
    old_low, old_high = old_range[0], old_range[1]
    new_low, new_high = new_range[0], new_range[1]

    # Position of x inside the old interval, as a fraction of its width.
    fraction = (x - old_low) / (old_high - old_low)
    return fraction * (new_high - new_low) + new_low
# Rescale the three dimensional ratings from the [1, 5] range to [0, 1].
rating_range = [1, 5]
unit_range = [0, 1]
df["arousal"] = df["arousal"].apply(
    lambda x: min_max_scale(x, rating_range, unit_range)
)
# valence is passed through float() first; presumably its values are still
# strings here because the raw column contained a malformed entry.
df["valence"] = df["valence"].apply(
    lambda x: min_max_scale(float(x), rating_range, unit_range)
)
df["dominance"] = df["dominance"].apply(
    lambda x: min_max_scale(x, rating_range, unit_range)
)

# create tables dir (no error if it already exists)
tables_dir = "./tables/"
os.makedirs(tables_dir, exist_ok=True)

# save files table; the path is derived from `tables_dir` so the two cannot
# drift apart
df.to_csv(os.path.join(tables_dir, "quechua_files.csv"))

df.head()

Unnamed: 0_level_0,emotion,speaker,valence,arousal,dominance,age,gender
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
./audios/quechua/10001.wav,anger,a2,0.0625,0.9375,0.875,36,male
./audios/quechua/10002.wav,boredom,a2,0.625,0.25,0.3125,36,male
./audios/quechua/10003.wav,anger,a2,0.5625,0.4375,0.375,36,male
./audios/quechua/10004.wav,boredom,a5,0.6875,0.3125,0.375,45,female
./audios/quechua/10005.wav,happiness,a2,0.625,0.625,0.6875,36,male


# 3. Create train/test split
---

create splits, which are:
- **speaker independent**
- **gender balanced**

In [111]:
# Draw two held-out speakers at random; the fixed seed makes the draw repeatable.
# NOTE(review): gender is not consulted here, so any gender balance of the
# resulting split comes from the seed, not from the code; confirm.
speakers = df["speaker"].unique()
np.random.seed(42)
test_speakers = np.random.choice(speakers, size=2, replace=False)
train_speakers = [sp for sp in speakers if sp not in test_speakers]

# Shuffle the rows (seeded independently of the speaker draw).
shuffled_df = shuffle(df, random_state=8)

# Partition the shuffled rows by speaker, so no speaker appears in both parts.
in_test = shuffled_df["speaker"].isin(test_speakers)
in_train = shuffled_df["speaker"].isin(train_speakers)
test_df = shuffled_df.loc[in_test]
train_df = shuffled_df.loc[in_train]

# Pull each label column out of the test and train frames.
emotions_test, emotions_train = test_df["emotion"], train_df["emotion"]
arousal_test, arousal_train = test_df["arousal"], train_df["arousal"]
dominance_test, dominance_train = test_df["dominance"], train_df["dominance"]
valence_test, valence_train = test_df["valence"], train_df["valence"]

# Write one CSV per (label, split) pair, in the order listed.
label_tables = {
    "emotions_test": emotions_test,
    "emotions_train": emotions_train,
    "arousal_test": arousal_test,
    "arousal_train": arousal_train,
    "dominance_test": dominance_test,
    "dominance_train": dominance_train,
    "valence_test": valence_test,
    "valence_train": valence_train,
}
for table_name, label_series in label_tables.items():
    label_series.to_csv(f"./tables/quechua_{table_name}.csv")
