In [None]:
import torch
import whisper
from whisper.transcribe import transcribe
from whisper.audio import SAMPLE_RATE


# Load the Whisper checkpoint and place both halves of the network on one GPU.
model = whisper.load_model('large-v1')
device = torch.device('cuda:0')

for submodule in (model.encoder, model.decoder):
    submodule.to(device)


def _decoder_inputs_to_device(_module, inputs):
    """Move the decoder's first two positional inputs to `device`; pass the rest through."""
    return (inputs[0].to(device), inputs[1].to(device), *inputs[2:])


def _decoder_output_to_device(_module, _inputs, outputs):
    """Move the decoder's output to `device`."""
    return outputs.to(device)


# The hooks make sure tensors entering and leaving the decoder live on `device`.
model.decoder.register_forward_pre_hook(_decoder_inputs_to_device)
model.decoder.register_forward_hook(_decoder_output_to_device)

In [None]:
from pathlib import Path
import torchaudio
import IPython.display as ipd

# Smoke test: transcribe a short excerpt of one recording and listen to it.
audio_path = Path('/home/server2/librivox/Stephan/island_dr_moreau_08_wells.mp3')
audio, sr = torchaudio.load(audio_path)

# First channel only, seconds 20..45 of the recording (indices are in samples).
audio = audio[0, 20 * sr:45 * sr]

# Resample to Whisper's SAMPLE_RATE when the file uses a different rate.
if sr != SAMPLE_RATE:
    audio = torchaudio.functional.resample(
        audio,
        orig_freq=sr,
        new_freq=SAMPLE_RATE,
    )

out = transcribe(model, audio, verbose=True, language='en')

player = ipd.Audio(data=audio, rate=SAMPLE_RATE)
ipd.display(player)

In [None]:
# Output dataset layout: <out_ds_folder>/wavs/*.wav plus a pipe-separated meta.txt.
out_ds_folder = Path('~/datasets/Stephan').expanduser()
out_ds_wav_folder = out_ds_folder / 'wavs'
# parents=True: on a fresh machine the dataset folder itself may not exist yet,
# and a plain mkdir() raises FileNotFoundError when a parent directory is missing.
out_ds_wav_folder.mkdir(parents=True, exist_ok=True)
# NOTE: the misspelled variable name is kept because a later cell opens the
# file list through it.
out_ds_filelsit = out_ds_folder / 'meta.txt'

# All recordings are expected to sit next to the sample file used above.
audio_folder = audio_path.parent

# Sample rate of the exported dataset clips (Hz).
target_sr = 22050
audio_paths = sorted(path for path in audio_folder.iterdir() if path.suffix == '.mp3')

# Sum up the total length of the source material, in seconds.
duration = 0
for audio_path in audio_paths:
    audio, sr = torchaudio.load(audio_path)
    duration += audio.shape[-1] / sr
    print(str(audio_path))
print(f'Duration: {duration / (60 * 60):.2f} hours')

In [None]:
from tqdm import tqdm
import math

# Build the dataset: transcribe every recording with Whisper, cut the audio at
# the segment boundaries it reports, and store one wav file per segment.
# `meta` collects (wav_path, text, speaker) tuples for the file list.
meta = []

for audio_idx, audio_path in enumerate(tqdm(audio_paths)):
    # Decode the file once; both resampled versions derive from this waveform,
    # so the mp3 does not have to be read and decoded a second time.
    waveform, sr = torchaudio.load(audio_path)
    waveform = waveform[0, :]  # first channel only

    # Whisper expects audio at SAMPLE_RATE.
    audio = waveform
    if sr != SAMPLE_RATE:
        audio = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=SAMPLE_RATE)
    out = transcribe(model, audio, verbose=True, language='en')

    # The exported clips use the dataset's own sample rate.
    audio = waveform
    if sr != target_sr:
        audio = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=target_sr)

    for s_idx, segment in enumerate(out['segments']):
        # Segment boundaries are given in seconds; convert to sample indices,
        # rounding outwards so no audio is cut off at either end.
        start_idx = math.floor(target_sr * segment['start'])
        end_idx = math.ceil(target_sr * segment['end'])
        wav = audio[start_idx:end_idx]
        if wav.numel() == 0:
            # A zero-length clip is useless as a dataset entry; skip it.
            # s_idx still advances, so file names keep matching segment indices.
            continue
        wav_path = out_ds_wav_folder / f'{audio_idx}_{s_idx}.wav'
        torchaudio.save(filepath=wav_path, src=wav.unsqueeze(0), sample_rate=target_sr)
        meta.append((wav_path, segment['text'], 'stephan'))

In [None]:
import random
import IPython.display as ipd

# Spot-check: pick one random dataset entry, print its transcript and play the clip.
r_idx = random.randrange(len(meta))
wav_path, text, speaker = meta[r_idx]

print(text)

player = ipd.Audio(filename=wav_path)
ipd.display(player)

In [None]:
# Write the file list: one `relative/wav/path|transcript|speaker` line per clip.
# The encoding is set explicitly because transcripts may contain non-ASCII
# characters and the platform default encoding is not guaranteed to be UTF-8.
with open(out_ds_filelsit, 'w', encoding='utf-8') as file:
    for wav_path, text, speaker in meta:
        wav_path = Path(wav_path)
        text = text.strip()
        # Paths are stored relative to the dataset root so the folder can be moved.
        rel_wav_path = wav_path.relative_to(out_ds_folder)
        # NOTE(review): a '|' or newline inside `text` would corrupt the record;
        # the text is written unescaped here.
        file.write(f'{str(rel_wav_path)}|{text}|{speaker}\n')