# 0. Import packages
---

In [None]:
# from google.colab import drive
import os
import pandas as pd
import shutil
import urllib.request

# 1. Download the data
---

- The archive is downloaded as `quechua.zip` and unpacked into the `download` directory
- The wav files will be copied to the `audios` directory

In [None]:
# download zip
# Fetch into a ".part" file and rename it only after the transfer finished, so
# an interrupted download can never leave a truncated "quechua.zip" that the
# existence check would accept on the next run.
if not os.path.exists("./quechua.zip"):
  urllib.request.urlretrieve("https://figshare.com/ndownloader/files/37361143", "./quechua.zip.part")
  os.replace("./quechua.zip.part", "./quechua.zip")

# unpack zip
# Extract into a staging directory first: "./download" only comes into
# existence once the whole archive was unpacked, so a failed extraction is
# retried on the next run instead of being silently skipped.
if not os.path.exists("./download"):
  shutil.rmtree("./download.part", ignore_errors=True)  # leftovers of an aborted run
  os.mkdir("./download.part")
  shutil.unpack_archive("quechua.zip", "./download.part")
  os.replace("./download.part", "./download")

# copy the wav files into a flat "./audios" directory
# Same staging scheme as above; the unpacked originals stay in "./download".
if not os.path.exists("./audios"):
  src="./download/Audios/"
  shutil.rmtree("./audios.part", ignore_errors=True)  # leftovers of an aborted run
  os.mkdir("./audios.part")
  for file in os.listdir(src):
    shutil.copy2(os.path.join(src, file), "./audios.part")
  os.replace("./audios.part", "./audios")


# 2. Preprocessing
---

- create tables
- set filepaths as index
- normalize labels
  - rename categorical labels to nouns
  - scale numerical labels to the range [0, 1]
- filter bad annotations


In [None]:
# Load the `Data` workbook (all sheets); its "map" sheet contains the
# emotional categories and the actors.
data_sheet = pd.read_excel("./download/Data/Data/Data.xlsx", sheet_name=None)
df = data_sheet["map"]

# Join the dimensional values from the labels table on the shared "Audio"
# column, then discard the columns that are not needed.
dimensions_df = pd.read_csv("./download/Labels/Labels/Labels.csv")
df = pd.merge(df, dimensions_df, on="Audio").drop(columns=['File', 'Duration (s)'])

# Lower-case every column name; "Audio" is renamed to "file" instead.
rename_dict = {name: name.lower() for name in df.columns}
rename_dict.update({"Audio": "file"})
df = df.rename(columns=rename_dict)

# Turn each audio id into its relative wav path and use the paths as index.
df = df.assign(
    file=df["file"].map(lambda audio_id: f"audios/{audio_id!s}.wav")
).set_index("file")

# Normalize the categorical labels: every raw emotion label is mapped to its
# noun form. A label missing from this table raises a KeyError.
emotion_names_map = {
    'anger': 'anger',
    'boredom': 'boredom',
    'happy': 'happiness',
    'sleepy': 'sleepiness',
    'sadness': 'sadness',
    'calm': 'calmness',
    'fear': 'fear',
    'excited': 'excitement',
    'neutral': 'neutral',
    'angry': 'anger',
    'bored': 'boredom',
}
df["emotion"] = df["emotion"].map(lambda label: emotion_names_map[label])

# normalize dimensional labels
def min_max_scale(x,old_range,new_range):
  """Linearly map `x` from `old_range` onto `new_range`.

  Args:
    x: value to rescale; anything that supports `-`, `/`, `*` and `+`
      with the range bounds.
    old_range: indexable whose items 0 and 1 are the lower and upper bound
      of the scale `x` is currently expressed in.
    new_range: indexable whose items 0 and 1 are the lower and upper bound
      of the target scale.

  Returns:
    The rescaled value. No clipping is applied, so an `x` outside
    `old_range` ends up outside `new_range`.
  """
  old_min, old_max = old_range[0], old_range[1]
  # position of x inside the old range, as a fraction (0 -> lower bound, 1 -> upper bound)
  fraction = (x-old_min)/(old_max-old_min)
  new_min, new_max = new_range[0], new_range[1]
  return fraction*(new_max-new_min)+new_min
# Drop the files with bad annotations: rows whose valence is the malformed
# string '2.333.333.333'.
df=df[df["valence"]!='2.333.333.333'].copy()

# Only valence needs an explicit float conversion (presumably the column was
# read as text because of the malformed entry above - TODO confirm).
df["valence"]=df["valence"].astype(float)

# Rescale every dimensional label from the range [1, 5] to [0, 1]. The scaling
# is plain arithmetic, so it is applied to whole columns at once instead of
# element by element.
for column in ("arousal","valence","dominance"):
  df[column]=min_max_scale(df[column],[1,5],[0,1])
df.head()

Unnamed: 0_level_0,emotion,actor,valence,arousal,dominance
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
audios/10001.wav,anger,a2,0.0625,0.9375,0.875
audios/10002.wav,boredom,a2,0.625,0.25,0.3125
audios/10003.wav,anger,a2,0.5625,0.4375,0.375
audios/10004.wav,boredom,a5,0.6875,0.3125,0.375
audios/10005.wav,happiness,a2,0.625,0.625,0.6875


## 3. Create train/test split
---

- create splits that are **speaker-independent** and **gender-balanced**