This code fetches the audio files from S3, passes them to the embedding model and saves them along with metadata into a file. It uses mislabelled_data.txt, which can be created using identify_mislabelled_data.ipynb

In [1]:
from trim import trim_audio
from utils import *
from phonemes import get_phoneme_prediction
from consonant_sound_detector import get_probability_of_consonant_sound_isf, get_probability_of_consonant_sound_clf, get_speech_embedding_model, embed_audio, cut_middle_frame, normalize_audio, \
    pad_audio

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import s3fs
import nlpaug.augmenter.audio as ag

from IPython.display import Audio



In [3]:
consonant_list = ['b','p','k','g','m','n','t','d']
RATE = 16_000

In [84]:
s3 = s3fs.S3FileSystem()
wav_files = s3.glob('<your-bucket>/data/01_raw/ah_consonants_ah/*/*.wav')

# remove all files not in s3 or with no consonants
mislabel_df = pd.read_csv('mislabelled_data.txt')
delete_df = mislabel_df[mislabel_df['correction'] == 'delete']
delete_list = delete_df['filepath'].tolist()
for filepath_delete in delete_list:
    filepath_delete = filepath_delete.replace("s3://", "")
    idx = wav_files.index(filepath_delete)
    wav_files.pop(idx)

In [6]:
emb_df = pd.DataFrame()

for wav_file in wav_files:
    try:
        filename = wav_file.split('/')[-1]
        sound_name = filename.replace('.wav', '')
        sound_name = re.sub(r'\d+', '', sound_name).strip('-')
        sound_name = re.sub(r'ah', '', sound_name).strip('-')
        if str(sound_name) in consonant_list:
            filepath = 's3://' + wav_file
            wav = load_wav(filepath)
            wav_trimmed = trim_audio(wav)

            wav_n = normalize_audio(wav_trimmed)
            wav_p = pad_audio(wav_n)
            
            # uncomment to augment data
            #aug = ag.VtlpAug(RATE)
            #wav_aug = aug.augment(wav_p)

            embedding_model = get_speech_embedding_model()
            wav_e = embed_audio(wav_p, embedding_model)
            wav_e = np.squeeze(wav_e, axis=0)  # 4d to 3d
            wav_m = cut_middle_frame(wav_e, 16, flatten=True)
            
            wav_m_df = pd.DataFrame(wav_m)  # column DataFrame
            wav_m_df = wav_m_df.T  # row DataFrame
            wav_m_df['consonants'] = sound_name
            wav_m_df['source_id'] = filepath.split('/')[-2]
            wav_m_df['filepath'] = filepath

            emb_df = emb_df.append(wav_m_df, ignore_index=True)
            
    except Exception as e:
        print(e)
        print('Failed to load:', wav_file)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
Embedding model loaded, embedding shape: (None, None, 1, 96)
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
Embedding model loaded, embedding shape: (None, None, 1, 96)
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
Embedding model loaded, embedding shape: (None, None, 1, 96)
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
Embedding model loaded, embedding shape: (None, None, 1, 96)
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
Embedding model loaded, embedding shape: (None, None, 1, 96)
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
Embedding model loaded, embedding shape: (None, None, 1, 96)
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
Embedding m

In [7]:
emb_df.to_csv('embeddings.txt', index=False)