In [1]:
import glob
import pandas as pd

In [3]:
DATA_DIR = "../data"
files = glob.glob(f"{DATA_DIR}/sautidb_v1/*.wav", recursive=True)

### Fix Efik_Ibibio filename convention

In [4]:
old_fname = [f.split("/")[-1] for f in files]

# bug fix: instead of having EFIK_IBIBIO_EFIK_IBIBIO_..., we should have EFIKIBIBIO_EFIKIBIBIO_..
new_fname = [f.split("/")[-1].replace("EFIK_IBIBIO", "EFIKIBIBIO") for f in files]

In [5]:
fname_mapping = pd.DataFrame({"old_filename": old_fname, "new_filename": new_fname})
fname_mapping.to_csv(f"sautidb_v1_filename_rename_map.csv", index=False, header=False)

## Create text transcript

In [6]:
import json

In [7]:
with open(f"{DATA_DIR}/cmuarctic.json", "r") as f:
    sentences = json.load(f)

In [8]:
df = pd.DataFrame(new_fname, columns=["filename"])
df["nativeLanguage"] = df["filename"].apply(lambda x: x.split("_")[0])
df["fluentLanguage"] = df["filename"].apply(lambda x: x.split("_")[1])
df["speakerID"] = df["filename"].apply(lambda x: x.split("_")[2])
df["gender"] = df["filename"].apply(lambda x: x.split("_")[3])
df["sentenceID"] = df["filename"].apply(lambda x: x.split("_")[-1].split(".")[0])
df["sentence"] = df["sentenceID"].apply(lambda x: sentences[x.capitalize()])
df.head()

Unnamed: 0,filename,nativeLanguage,fluentLanguage,speakerID,gender,sentenceID,sentence
0,YORUBA_YORUBA_0230_M_A0194.wav,YORUBA,YORUBA,230,M,A0194,He drank of the water cautiously.
1,YORUBA_YORUBA_0006_M_B0354.wav,YORUBA,YORUBA,6,M,B0354,It's that much junk.
2,YORUBA_YORUBA_0006_M_B0383.wav,YORUBA,YORUBA,6,M,B0383,A bush chief had died a natural death.
3,YORUBA_YORUBA_0048_M_B0151.wav,YORUBA,YORUBA,48,M,B0151,"It was steel, a fisher trap."
4,EDO_OTHERS_0175_F_A0186.wav,EDO,OTHERS,175,F,A0186,Like a flash he launched himself into the feat...


In [9]:
df["speakerID"].min(), df["speakerID"].max()

('0001', '1241')

### Compute audio duration

In [10]:
import librosa

In [11]:
AUDIO_DIR="sautidb_v1.1"
duration = lambda x: librosa.get_duration(filename=f"{DATA_DIR}/{AUDIO_DIR}/{x}")
df["duration"] = df["filename"].apply(lambda x: f"{duration(x):.2f}")
df.head()

Unnamed: 0,filename,nativeLanguage,fluentLanguage,speakerID,gender,sentenceID,sentence,duration
0,YORUBA_YORUBA_0230_M_A0194.wav,YORUBA,YORUBA,230,M,A0194,He drank of the water cautiously.,1.73
1,YORUBA_YORUBA_0006_M_B0354.wav,YORUBA,YORUBA,6,M,B0354,It's that much junk.,1.67
2,YORUBA_YORUBA_0006_M_B0383.wav,YORUBA,YORUBA,6,M,B0383,A bush chief had died a natural death.,2.96
3,YORUBA_YORUBA_0048_M_B0151.wav,YORUBA,YORUBA,48,M,B0151,"It was steel, a fisher trap.",3.13
4,EDO_OTHERS_0175_F_A0186.wav,EDO,OTHERS,175,F,A0186,Like a flash he launched himself into the feat...,4.27


In [12]:
df.to_csv("audio_metadata.csv", index=None)

In [13]:
from IPython.display import Audio

In [14]:
audio = lambda x: f"{DATA_DIR}/{AUDIO_DIR}/{x}"

In [15]:
Audio(audio("EDO_IGBO_0122_M_A0551.wav"))