In [None]:

from tqdm import tqdm
from dataset import get_data_loader, move_data_to_device, MyDataset
from hparams import Hparams


# Hyper-parameter dictionary used to configure the dataset below.
args = Hparams.args

# Training split of MyDataset: the four audio/framing settings are copied
# straight from `args`, and no explicit song list is supplied.
# NOTE(review): a later cell runs `import dataset`, which rebinds this name
# to the module; cells that use this instance must run before that one.
dataset = MyDataset(
    split='train',
    song_fns=None,
    **{
        key: args[key]
        for key in ('dataset_root', 'sampling_rate', 'sample_length', 'frame_size')
    },
)


In [None]:
import librosa
def testItem(idx=0):
    print(f"samplerate: {dataset.sampling_rate }")
    pYinFrameTime = librosa.frames_to_time(dataset.sample_length, sr=dataset.sampling_rate, hop_length=200)
    melspecFrameTime = librosa.frames_to_time(dataset.sample_length, sr=dataset.sampling_rate, hop_length=321)
    print(f"melspec FrameTime: {melspecFrameTime}")
    print(f"pYin FrameTime: {pYinFrameTime}")
    
    mel_spectrogram, yin, pyin = dataset.__getitem__(idx)
    print(mel_spectrogram.shape[1])
    print(yin.shape)
    print(pyin.shape)

# Run the check on the first dataset item.
testItem(0)

In [None]:
import os
import numpy as np
import pandas as pd
import tarfile
import pinyin


def read_aidatatang_index(data_root=os.path.join(os.getcwd(),"data_full")):
    """Build a per-utterance index of the aidatatang corpus as a DataFrame.

    Reads ``<data_root>/aidatatang/transcript/aidatatang_200_zh_transcript.txt``
    and lists the archives under ``<data_root>/aidatatang/corpus/{dev,test,train}``.

    Every non-blank transcript line is expected to look like
    ``T0055G0002S0002 <text>``: characters 5-9 of the leading ID are taken as
    the participant ID and everything after them as the sentence ID. Blank
    lines are skipped.

    Args:
        data_root: Directory that contains the ``aidatatang`` folder. The
            default is computed once, when the function is defined.

    Returns:
        pandas.DataFrame with the columns ``participantID``, ``sentenceID``,
        ``transcript`` and ``toneclass`` (the two values returned by
        ``extract_pinyin`` for the sentence, split into single characters)
        and ``folder`` (``"dev"``, ``"test"`` or ``"train"``).

    Raises:
        ValueError: If ``<participantID>.tar.gz`` is not present in any of
            the three corpus subfolders.
    """
    subfolders = ["dev", "test", "train"]
    corpus_root = os.path.join(data_root, 'aidatatang', 'corpus')
    transcript_path = os.path.join(data_root, 'aidatatang', 'transcript', 'aidatatang_200_zh_transcript.txt')

    def parse_line(line):
        # "<utterance id> <sentence>": the ID is everything before the first space.
        utterance_id, _, sentence = line.strip().partition(' ')
        participant_id = utterance_id[5:10]
        sentence_id = utterance_id[10:]
        # list(...) splits the sentence into single characters (spaces included).
        text, text_ids = extract_pinyin(list(sentence.strip()))
        return participant_id, sentence_id, text, text_ids

    with open(transcript_path, 'r', encoding='utf-8') as f:
        # Blank lines carry no utterance ID and would otherwise fail the folder lookup.
        records = [parse_line(line) for line in f if line.strip()]

    # archive file name -> subfolder; setdefault keeps the first match in
    # dev/test/train order if a name appears in several subfolders.
    folder_of_archive = {}
    for subfolder in subfolders:
        for archive_name in os.listdir(os.path.join(corpus_root, subfolder)):
            folder_of_archive.setdefault(archive_name, subfolder)

    df = pd.DataFrame.from_records(data=records, columns=["participantID", "sentenceID", "transcript", "toneclass"])

    def folder_for(participant_id):
        fname = f"{participant_id}.tar.gz"
        if fname not in folder_of_archive:
            raise ValueError(f"Could not find {fname} in any subfolder")
        return folder_of_archive[fname]

    # Element-wise map over one column: cheaper than a row-wise apply and
    # well-defined for an empty index.
    df["folder"] = df["participantID"].map(folder_for)
    return df

def extract_pinyin(sentence_word_list):
    """Convert tokens to numbered pinyin and derive one tone class per token.

    Args:
        sentence_word_list: Iterable of strings. A plain string is iterated
            character by character.

    Returns:
        Tuple ``(pinyin_word_list, tone_classes)``. ``pinyin_word_list`` holds
        ``pinyin.get(token, format="numerical", delimiter=" ")`` for every
        token. ``tone_classes`` holds the trailing digit of each result as an
        int, or 0 when the result is a single character or does not end in a
        digit (for example text that comes back unchanged).
    """
    def tone_class(syllable):
        # Only a trailing decimal digit is a tone number; anything else used
        # to reach int() and raise ValueError.
        if len(syllable) > 1 and syllable[-1].isdecimal():
            return int(syllable[-1])
        return 0

    pinyin_word_list = [
        pinyin.get(token, format="numerical", delimiter=" ")
        for token in sentence_word_list
    ]
    return pinyin_word_list, [tone_class(syllable) for syllable in pinyin_word_list]
    
def read_aidatatang_data(participantID, sentenceID, data_root=None, sr=16000, verbose=False):
    """Load audio, pinyin metadata and transcript for one utterance.

    The utterance is read from ``<participantID>.tar.gz``, searched for in the
    ``dev``, ``test`` and ``train`` corpus subfolders. Inside the archive the
    members ``./<participantID>/T0055<participantID><sentenceID>{.wav,.txt,.trn}``
    are read.

    Args:
        participantID: Participant part of the utterance ID (e.g. ``"G0002"``).
        sentenceID: Sentence part of the utterance ID (e.g. ``"S0002"``).
        data_root: Directory holding the corpus folder. Defaults to
            ``<cwd>/data_full``.
        sr: Sampling rate passed to ``librosa.load``. Other cells of this
            notebook plot and analyse the returned audio with ``sr=16000``,
            so the audio is loaded at that rate rather than at librosa's
            default.
        verbose: When True, print the archive listing and the loaded values.

    Returns:
        dict with the keys ``"AudioData"`` (waveform returned by
        ``librosa.load``), ``"MetaData"`` (result of ``extract_pinyin`` on the
        decoded ``.txt`` member) and ``"Transcript"`` (decoded ``.trn`` member).

    Raises:
        FileNotFoundError: If the participant archive is in none of the
            searched folders.
    """
    if data_root is None:
        data_root = os.path.join(os.getcwd(), "data_full")

    fullFileName = f"T0055{participantID}{sentenceID}"
    archive_name = f"{participantID}.tar.gz"

    # 'aidatatang_200zh' is the folder this function has always used;
    # read_aidatatang_index reads from 'aidatatang', so that name is tried too.
    candidates = [
        os.path.join(data_root, dataset_dir, 'corpus', subfolder, archive_name)
        for dataset_dir in ('aidatatang_200zh', 'aidatatang')
        for subfolder in ("dev", "test", "train")
    ]
    zipped_path = next((path for path in candidates if os.path.exists(path)), None)
    if zipped_path is None:
        raise FileNotFoundError(
            f"Could not find {archive_name} for utterance {fullFileName} under {data_root}"
        )

    # result key -> (member suffix, loader applied to the extracted file object)
    loaders = {
        'AudioData': (".wav", lambda fh: librosa.load(fh, sr=sr)[0]),
        "MetaData": (".txt", lambda fh: extract_pinyin(fh.read().decode("utf-8"))),
        "Transcript": (".trn", lambda fh: fh.read().decode("utf-8")),
    }

    data = {}
    with tarfile.open(zipped_path, 'r') as tar_ref:
        if verbose:
            print(tar_ref.getnames())
        for key, (suffix, load) in loaders.items():
            member = f"./{participantID}/{fullFileName}{suffix}"
            data[key] = load(tar_ref.extractfile(member))

    if verbose:
        for key, value in data.items():
            print(key, value)

    return data

aidatatang_index = read_aidatatang_index()

# Load one utterance into `d`: the waveform and pitch-analysis cells below read
# d['AudioData']. Row 3 mirrors the `[3]` of the previously commented-out call,
# which indexed the DataFrame as if it were nested lists.
sample_row = aidatatang_index.iloc[3]
d = read_aidatatang_data(sample_row["participantID"], sample_row["sentenceID"])

aidatatang_index.head()

In [None]:
# Draw the waveform stored under d['AudioData'], passing 16000 as its sampling rate.
# NOTE(review): `d` is presumably the dict returned by read_aidatatang_data —
# confirm it is assigned before this cell runs.
librosa.display.waveshow(d['AudioData'], sr=16000)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.signal as signal

def spectralFlux(audioData=None, sr=16000, hop_length=200):
    """Plot waveform, pitch track, pitch gradient, voicing and mel spectrogram.

    Six stacked panels are drawn: the waveform with detected onsets, the
    min-max normalised pYIN fundamental frequency, its gradient, the voiced
    flag, the voicing probability and a mel spectrogram in dB.

    Args:
        audioData: 1-D waveform. When omitted, the notebook-level
            ``d['AudioData']`` is used, which was the only input before.
        sr: Sampling rate in Hz assumed for ``audioData``.
        hop_length: Hop in samples used for onset detection and pYIN. The
            frame-level series are plotted at ``int(sr / hop_length)`` frames
            per second.
    """
    # NOTE(review): imported explicitly because older librosa releases
    # reportedly do not expose `librosa.display` after a bare `import librosa`.
    import librosa.display

    if audioData is None:
        audioData = d['AudioData']

    mel_hop_length = 128
    frame_rate = int(sr / hop_length)

    melspec = librosa.feature.melspectrogram(
        y=audioData, sr=sr, n_fft=2048, hop_length=mel_hop_length, n_mels=256
    )

    onsets = librosa.onset.onset_detect(y=audioData, sr=sr, hop_length=hop_length)
    timings = librosa.frames_to_time(onsets, sr=sr, hop_length=hop_length)

    # fmax is capped at sr / 2 so that a lower `sr` still gives a valid search range.
    fundamental, voiced, probability = librosa.pyin(
        audioData, sr=sr, hop_length=hop_length, fmin=20, fmax=min(8000, sr / 2)
    )

    # Min-max normalise the pitch track. Unvoiced frames (NaN) are filled with
    # the mean voiced pitch first. A fully unvoiced or constant track has no
    # range to normalise by, so it is plotted as zeros instead of dividing by
    # zero or NaN.
    voiced_f0 = fundamental[np.isfinite(fundamental)]
    if voiced_f0.size == 0:
        fundamental = np.zeros_like(fundamental)
    else:
        fundamental_min = voiced_f0.min()
        fundamental_range = voiced_f0.max() - fundamental_min
        fundamental = np.nan_to_num(fundamental, nan=voiced_f0.mean())
        if fundamental_range > 0:
            fundamental = (fundamental - fundamental_min) / fundamental_range
        else:
            fundamental = np.zeros_like(fundamental)

    gradients = np.gradient(fundamental)

    fig, axs = plt.subplots(nrows=6, figsize=(15, 20))
    ax, ax2, ax3, ax4, ax5, ax6 = axs

    librosa.display.waveshow(audioData, sr=sr, ax=ax)
    ax.vlines(timings, -1, 1, color='r', alpha=0.9, linestyle='--', label='Onsets')
    ax.legend()
    ax.set_title("Waveform with detected onsets")

    librosa.display.waveshow(fundamental, sr=frame_rate, ax=ax2)
    ax2.set_ylim(0, 1)
    ax2.set_title("Normalised fundamental frequency (pYIN)")

    librosa.display.waveshow(gradients, sr=frame_rate, ax=ax3)
    ax3.set_title("Gradient of normalised fundamental frequency")

    librosa.display.waveshow(voiced.astype(np.float32), sr=frame_rate, ax=ax4)
    ax4.set_title("Voiced flag")

    librosa.display.waveshow(probability.astype(np.float32), sr=frame_rate, ax=ax5)
    ax5.set_title("Voicing probability")

    # melspectrogram is called with its default settings, which yield a power
    # spectrogram, so the dB conversion must be power_to_db, not amplitude_to_db.
    librosa.display.specshow(
        librosa.power_to_db(melspec), sr=sr, hop_length=mel_hop_length, ax=ax6
    )
    ax6.set_title("Mel spectrogram (dB)")

spectralFlux()
# wo3 lao3 po2 shi4 da4 ben4 dan4 

In [None]:
def read_michigan_dataset_index(data_root=os.path.join(os.getcwd(),"data_full")):
    """Index the audio files under ``michigan/tone_perfect_all_mp3/tone_perfect``.

    File names are split on ``"_"``: the first field is the word, whose last
    character is read as the tone class, and the second field is the
    participant ID. Files that do not fit this pattern (fewer than two
    fields, or a word that does not end in a digit) are skipped.

    Args:
        data_root: Directory that contains the ``michigan`` folder. The
            default is computed once, when the function is defined.

    Returns:
        pandas.DataFrame with the columns ``participantID``, ``word``,
        ``toneclass`` (int) and ``filename``, one row per indexed file, in
        sorted file-name order.
    """
    audio_dir = os.path.join(data_root, 'michigan', 'tone_perfect_all_mp3', 'tone_perfect')

    records = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # (and any seeded split computed from row positions) stable across runs.
    for filename in sorted(os.listdir(audio_dir)):
        fields = filename.split("_")
        word = fields[0]
        if len(fields) < 2 or not word[-1:].isdecimal():
            # Not a "<word><tone>_<participant>_..." file name.
            continue
        participant_id = fields[1]
        records.append((participant_id, word, int(word[-1]), filename))

    return pd.DataFrame.from_records(data=records, columns=["participantID", "word", "toneclass", "filename"])


def read_michigan_dataset_audio(filename,
                                data_root=os.path.join(os.getcwd(),"data_full"),
                                sr=16000,
                                mono=True):
    """Load one audio file from ``michigan/tone_perfect_all_mp3/tone_perfect``.

    Args:
        filename: File name inside the audio folder.
        data_root: Directory that contains the ``michigan`` folder.
        sr: Forwarded to ``librosa.load`` as ``sr``.
        mono: Forwarded to ``librosa.load`` as ``mono``.

    Returns:
        The first element of the value returned by ``librosa.load``.
    """
    audio_dir = os.path.join(data_root, 'michigan', 'tone_perfect_all_mp3', 'tone_perfect')
    loaded = librosa.load(os.path.join(audio_dir, filename), sr=sr, mono=mono)
    return loaded[0]


# Load the audio of the first indexed file; the returned value is the cell output.
read_michigan_dataset_audio(read_michigan_dataset_index().iloc[0]["filename"])

In [None]:
import dataset
import sklearn.model_selection
from hparams import Hparams_michigan
from dataset import read_michigan_dataset_index

def get_data_loader_michigan(args, test_size=0.2, random_state=42):
    """Split the Michigan index by tone class and build train/test datasets.

    The index from ``read_michigan_dataset_index()`` is split by row position
    with ``sklearn.model_selection.train_test_split`` (shuffled, stratified
    on the ``toneclass`` column), and one ``dataset.DatasetMichigan`` is
    built per part with identical settings.

    Args:
        args: Mapping providing ``dataset_root``, ``sampling_rate``,
            ``preload_audio``, ``sample_length`` and ``pad_audio``.
        test_size: Forwarded to ``train_test_split``; 0.2 was the previously
            hard-coded value.
        random_state: Forwarded to ``train_test_split``; 42 was the
            previously hard-coded value.

    Returns:
        Tuple ``(train_ds, test_ds)`` of ``dataset.DatasetMichigan`` objects.
        Despite the function name, these are the datasets themselves, not
        data loaders.
    """
    index = read_michigan_dataset_index()
    positions = list(range(len(index)))
    train_ids, test_ids = sklearn.model_selection.train_test_split(
        positions,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
        stratify=index["toneclass"].values,
    )

    # Both datasets share every setting except the slice of the index they cover.
    shared_kwargs = {
        key: args[key]
        for key in ('dataset_root', 'sampling_rate', 'preload_audio', 'sample_length', 'pad_audio')
    }
    train_ds = dataset.DatasetMichigan(dataset_index=index.iloc[train_ids], **shared_kwargs)
    test_ds = dataset.DatasetMichigan(dataset_index=index.iloc[test_ids], **shared_kwargs)
    return train_ds, test_ds
# Build the split and plot item 0 of the first returned dataset (the training part).
get_data_loader_michigan(Hparams_michigan.args)[0].plot_item(0)

In [None]:
# Plot item 0 of the second returned dataset (the test part).
# NOTE(review): this calls get_data_loader_michigan again, so both datasets are rebuilt.
get_data_loader_michigan(Hparams_michigan.args)[1].plot_item(0)

In [None]:
# NOTE(review): `test` is not assigned in any visible cell — presumably a test
# dataset left over from an earlier session; confirm it is defined before this runs.
test.plot_item(0)