In [3]:
import argparse
import yaml

In [11]:
import os, sys
# Make the parent of the notebook's working directory importable, so that
# project packages can be imported by absolute name from inside Jupyter.
dir2 = os.path.abspath('')       # the notebook's current working directory
dir1 = os.path.dirname(dir2)     # its parent directory
if dir1 not in sys.path:
    sys.path.append(dir1)

In [12]:
# /processor/ljspeech.py
import os
import librosa
import numpy as np
from scipy.io import wavfile
from tqdm import tqdm
from text import _clean_text

In [16]:
# Build the same CLI parser the preprocessing script uses, but feed it the
# config path by hand because a notebook has no real command line.
parser = argparse.ArgumentParser()
parser.add_argument("config", type=str, help="path to preprocess.yaml")

# Path is relative to the notebook's working directory.
argString = '../config/LJSpeech/preprocess.yaml'
# args = parser.parse_args()
# Pass the path as a single argument rather than splitting on whitespace,
# so a config path that contains spaces is not broken into several arguments.
args = parser.parse_args([argString])

# Use a context manager so the file handle is closed once parsing is done.
# NOTE(review): FullLoader is kept from the original; yaml.safe_load is the
# stricter choice if this config could ever come from an untrusted source.
with open(args.config, "r") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [17]:
config

{'dataset': 'LJSpeech',
 'path': {'corpus_path': '/home/ming/Data/LJSpeech-1.1',
  'lexicon_path': 'lexicon/librispeech-lexicon.txt',
  'raw_path': './raw_data/LJSpeech',
  'preprocessed_path': './preprocessed_data/LJSpeech'},
 'preprocessing': {'val_size': 512,
  'text': {'text_cleaners': ['english_cleaners'], 'language': 'en'},
  'audio': {'sampling_rate': 22050, 'max_wav_value': 32768.0},
  'stft': {'filter_length': 1024, 'hop_length': 256, 'win_length': 1024},
  'mel': {'n_mel_channels': 80, 'mel_fmin': 0, 'mel_fmax': 8000},
  'pitch': {'feature': 'phoneme_level', 'normalization': True},
  'energy': {'feature': 'phoneme_level', 'normalization': True}}}

In [18]:
def prepare_align(config):
    """Convert the LJSpeech corpus into per-utterance ``.wav`` / ``.lab`` pairs.

    Reads ``metadata.csv`` from ``config["path"]["corpus_path"]``. Each line is
    split on ``"|"``; field 0 is the utterance base name and field 2 is the
    text, which is passed through ``_clean_text`` with the configured cleaners.
    For every utterance whose ``wavs/<base_name>.wav`` exists, the audio is
    loaded at the configured sampling rate, peak-normalised, scaled by
    ``max_wav_value`` and written as 16-bit PCM next to a ``.lab`` file holding
    the cleaned text, under ``<raw_path>/LJSpeech/``.

    Args:
        config: Mapping with ``path.corpus_path``, ``path.raw_path``,
            ``preprocessing.audio.sampling_rate``,
            ``preprocessing.audio.max_wav_value`` and
            ``preprocessing.text.text_cleaners``.

    Returns:
        None. The results are the files written to disk.
    """
    in_dir = config["path"]["corpus_path"]
    out_dir = config["path"]["raw_path"]
    sampling_rate = config["preprocessing"]["audio"]["sampling_rate"]
    max_wav_value = config["preprocessing"]["audio"]["max_wav_value"]
    cleaners = config["preprocessing"]["text"]["text_cleaners"]
    speaker = "LJSpeech"

    speaker_dir = os.path.join(out_dir, speaker)
    int16_limits = np.iinfo(np.int16)

    with open(os.path.join(in_dir, "metadata.csv"), encoding="utf-8") as f:
        for line in tqdm(f):
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line has no fields to index.
                continue
            parts = stripped.split("|")
            base_name = parts[0]
            text = parts[2]
            text = _clean_text(text, cleaners)

            wav_path = os.path.join(in_dir, "wavs", "{}.wav".format(base_name))
            if not os.path.exists(wav_path):
                continue

            os.makedirs(speaker_dir, exist_ok=True)
            # `sr` must be passed by keyword: recent librosa releases make
            # every argument after the path keyword-only.
            wav, _ = librosa.load(wav_path, sr=sampling_rate)

            # Peak-normalise, but leave silent or empty clips untouched
            # instead of dividing by zero.
            peak = np.max(np.abs(wav)) if wav.size else 0.0
            if peak > 0:
                wav = wav / peak * max_wav_value
            # With max_wav_value == 32768 a positive peak lands on +32768,
            # which does not fit in int16; clip so it saturates at +32767
            # instead of overflowing.
            wav = np.clip(wav, int16_limits.min, int16_limits.max)

            wavfile.write(
                os.path.join(speaker_dir, "{}.wav".format(base_name)),
                sampling_rate,
                wav.astype(np.int16),
            )
            # Write the transcript with the same encoding the metadata was
            # read with, independent of the platform's locale.
            with open(
                os.path.join(speaker_dir, "{}.lab".format(base_name)),
                "w",
                encoding="utf-8",
            ) as f1:
                f1.write(text)

In [19]:
# Input (corpus) and output (raw data) directories, as given in the loaded config.
in_dir = config["path"]["corpus_path"]
out_dir = config["path"]["raw_path"]

In [20]:
in_dir

'/home/ming/Data/LJSpeech-1.1'

In [21]:
out_dir

'./raw_data/LJSpeech'

In [23]:
# Override the configured corpus path with one relative to the notebook's
# working directory; the value from the config is an absolute path.
in_dir = '../Data/LJSpeech-1.1'

In [24]:
# Audio settings used by the conversion loop: the sample rate written to the
# output WAV files, and the amplitude the peak-normalised waveform is scaled to.
sampling_rate = config["preprocessing"]["audio"]["sampling_rate"]
max_wav_value = config["preprocessing"]["audio"]["max_wav_value"]

In [25]:
# Text-cleaner names from the config; passed to _clean_text for every transcript.
cleaners = config["preprocessing"]["text"]["text_cleaners"]

In [27]:
# Name of the sub-directory under out_dir that receives the converted files.
speaker = "LJSpeech"

In [36]:
# Convert every LJSpeech utterance listed in metadata.csv into a 16-bit WAV
# plus a .lab file holding the cleaned transcript, under <out_dir>/<speaker>/.
speaker_dir = os.path.join(out_dir, speaker)
int16_limits = np.iinfo(np.int16)

with open(os.path.join(in_dir, "metadata.csv"), encoding="utf-8") as f:
    for line in tqdm(f):
        stripped = line.strip()
        if not stripped:
            # A blank (e.g. trailing) line has no fields to index.
            continue
        # Fields are "|"-separated: field 0 is the base name, field 2 the text.
        parts = stripped.split("|")
        base_name = parts[0]
        text = parts[2]
        text = _clean_text(text, cleaners)

        wav_path = os.path.join(in_dir, "wavs", "{}.wav".format(base_name))
        if not os.path.exists(wav_path):
            continue

        os.makedirs(speaker_dir, exist_ok=True)
        # Load at the configured rate: the file below is written with
        # `sampling_rate` in its header, so the samples must match it rather
        # than librosa's own default rate.
        wav, _ = librosa.load(wav_path, sr=sampling_rate)

        # Peak-normalise, but leave silent or empty clips untouched instead of
        # dividing by zero.
        peak = np.max(np.abs(wav)) if wav.size else 0.0
        if peak > 0:
            wav = wav / peak * max_wav_value
        # A positive peak scaled by 32768 does not fit in int16; clip so it
        # saturates at +32767 instead of overflowing.
        wav = np.clip(wav, int16_limits.min, int16_limits.max)

        wavfile.write(
            os.path.join(speaker_dir, "{}.wav".format(base_name)),
            sampling_rate,
            wav.astype(np.int16),
        )
        # Same encoding as the metadata file, independent of the locale.
        with open(
            os.path.join(speaker_dir, "{}.lab".format(base_name)),
            "w",
            encoding="utf-8",
        ) as f1:
            f1.write(text)

13100it [01:48, 121.21it/s]


In [None]:
# Next, preprocess the Ego4D dataset.

In [42]:
# Build the same CLI parser the preprocessing script uses, but feed it the
# Ego4D config path by hand because a notebook has no real command line.
parser = argparse.ArgumentParser()
parser.add_argument("config", type=str, help="path to preprocess.yaml")

# Path is relative to the notebook's working directory.
argString = '../config/Ego4D_final_v2/preprocess.yaml'
# args = parser.parse_args()
# Pass the path as a single argument rather than splitting on whitespace,
# so a config path that contains spaces is not broken into several arguments.
args = parser.parse_args([argString])

# Use a context manager so the file handle is closed once parsing is done.
# NOTE(review): FullLoader is kept from the original; yaml.safe_load is the
# stricter choice if this config could ever come from an untrusted source.
with open(args.config, "r") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [43]:
config

{'dataset': 'Ego4D_final_v2',
 'path': {'corpus_path': './Data/Ego4D_final_v2/final_dataset_v2',
  'lexicon_path': 'lexicon/librispeech-lexicon.txt',
  'raw_path': './raw_data/Ego4D_final_v2',
  'preprocessed_path': './preprocessed_data/Ego4D_final_v2',
  'transcript_train_path': './Data/Ego4D_final_v2/final_dataset_v2/utterances_final_train_v2.csv',
  'transcript_val_path': './Data/Ego4D_final_v2/final_dataset_v2/utterances_final_val_v2.csv'},
 'preprocessing': {'val_size': 512,
  'text': {'text_cleaners': ['english_cleaners'], 'language': 'en'},
  'audio': {'sampling_rate': 16000, 'max_wav_value': 32768.0},
  'stft': {'filter_length': 1024, 'hop_length': 256, 'win_length': 1024},
  'mel': {'n_mel_channels': 80, 'mel_fmin': 0, 'mel_fmax': 8000},
  'pitch': {'feature': 'phoneme_level', 'normalization': True},
  'energy': {'feature': 'phoneme_level', 'normalization': True}}}

In [48]:
def _from_notebook_dir(repo_relative_path):
    """Re-root a config path so it resolves from the notebook's directory.

    The config stores paths such as ``./Data/...``, while this notebook runs
    one directory below that root (its own config is opened via ``../config``).
    Joining with the parent directory and normalising turns ``./Data/x`` into
    ``../Data/x``; unlike prefixing a literal ".", it also copes with paths
    that lack the leading ``./`` and leaves absolute paths unchanged.
    """
    return os.path.normpath(os.path.join(os.pardir, repo_relative_path))


in_dir = _from_notebook_dir(config['path']['corpus_path'])
out_dir = _from_notebook_dir(config['path']['raw_path'])
sampling_rate = config["preprocessing"]["audio"]["sampling_rate"]
max_wav_value = config["preprocessing"]["audio"]["max_wav_value"]
# Take the cleaners from the Ego4D config explicitly; the conversion loop uses
# `cleaners`, and would otherwise depend on the value left over from the
# LJSpeech cells earlier in the notebook.
cleaners = config["preprocessing"]["text"]["text_cleaners"]
# Name of the sub-directory under out_dir that receives the converted files.
speaker = "Ego4D_final_v2"
transcript_train_path = _from_notebook_dir(config['path']['transcript_train_path'])
transcript_val_path = _from_notebook_dir(config['path']['transcript_val_path'])

In [49]:
import pandas as pd

In [50]:
# Load the training transcript table and list its columns; the conversion loop
# reads the 'utterance_id' and 'transcription' columns from it.
train_df = pd.read_csv(transcript_train_path)
print(train_df.columns)

Index(['utterance_id', 'video_id', 'person_id', 'video_start_time',
       'video_end_time', 'transcription', 'sample_duration'],
      dtype='object')


In [52]:
in_dir

'../Data/Ego4D_final_v2/final_dataset_v2'

In [53]:
out_dir

'../raw_data/Ego4D_final_v2'

In [61]:
# Convert every training utterance in train_df into a 16-bit WAV plus a .lab
# file holding the cleaned transcript, under <out_dir>/<speaker>/.
speaker_dir = os.path.join(out_dir, speaker)
int16_limits = np.iinfo(np.int16)

# iterrows() has no length, so give tqdm the row count for a real progress bar.
for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    uid = row['utterance_id']
    text = row['transcription']
    text = _clean_text(text, cleaners)

    wav_path = os.path.join(in_dir, "train", f"{uid}.wav")
    if not os.path.exists(wav_path):
        continue

    os.makedirs(speaker_dir, exist_ok=True)
    # Resample to the configured rate on load. The output file is stamped with
    # `sampling_rate`, so loading at the file's native rate (sr=None) would
    # produce audio with a wrong header whenever the two rates differ.
    wav, _ = librosa.load(wav_path, sr=sampling_rate)

    # Peak-normalise, but leave silent or empty clips untouched instead of
    # dividing by zero.
    peak = np.max(np.abs(wav)) if wav.size else 0.0
    if peak > 0:
        wav = wav / peak * max_wav_value
    # A positive peak scaled by 32768 does not fit in int16; clip so it
    # saturates at +32767 instead of overflowing.
    wav = np.clip(wav, int16_limits.min, int16_limits.max)

    wavfile.write(
        os.path.join(speaker_dir, f"{uid}.wav"),
        sampling_rate,
        wav.astype(np.int16),
    )
    # Fixed encoding so the transcript does not depend on the platform locale.
    with open(os.path.join(speaker_dir, f"{uid}.lab"), "w", encoding="utf-8") as f1:
        f1.write(text)

4208it [00:16, 261.80it/s]


KeyboardInterrupt: 