In [1]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import numpy as np
import librosa

from pytube import Channel, YouTube
import torch

# Dataset creation

## Data entries

In [2]:
channel_url = "https://www.youtube.com/channel/UCGeBogGDZ9W3dsGx-mWQGJA"

channel = Channel(channel_url)
channel.channel_name, len(channel), channel.channel_id

('IMPAULSIVE', 336, 'UCGeBogGDZ9W3dsGx-mWQGJA')

In [3]:
video_attributes_name = ("title", "video_id", "watch_url", "length", "publish_date", "thumbnail_url", "author", "channel_id", "channel_url", "description", "keywords")
video_attributes = {attr: [] for attr in video_attributes_name}

channel_dataframes_path = Path(".") / "dataset" / "full" / (channel.channel_id + ".pkl")

if not channel_dataframes_path.is_file():
    video: YouTube
    for video in tqdm(channel.videos):
        video.check_availability()
        for attr in video_attributes_name:
            video_attributes[attr].append(getattr(video, attr))

    channel_dataframes = pd.DataFrame(video_attributes)
    channel_dataframes.to_pickle(Path(".") / "dataset" / (channel.channel_id + ".pkl"))

## Download audio sample

In [4]:
dataset_audio_sample_stream = channel.videos[0].streams.filter(type="audio", mime_type="audio/mp4", abr="48kbps").first()
dataset_audio_sample_path = Path(".").resolve() / "dataset" / "samples" / f"{channel.videos[0].video_id}.mp4"

print(f"\nDownload stream {dataset_audio_sample_stream} at {dataset_audio_sample_path}")
dataset_audio_sample_stream.download(
    output_path = dataset_audio_sample_path.parent,
    filename = dataset_audio_sample_path.name
)

In [None]:
dataset_audio_sample_rate = 16000
dataset_audio_sample_data, _ = librosa.load(dataset_audio_sample_path, duration=540, offset=25, sr=dataset_audio_sample_rate)

  return f(*args, **kwargs)


# Model creation

In [None]:
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print("CUDA not detected")

NVIDIA GeForce RTX 2060 with Max-Q Design


### Speech2Text

In [7]:
import torch
from pathlib import Path
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
from datasets import load_dataset
import soundfile as sf

device = torch.device("cuda")

small_model_name = "facebook/s2t-small-librispeech-asr"
medium_model_name = "facebook/s2t-medium-librispeech-asr"
model = Speech2TextForConditionalGeneration.from_pretrained(small_model_name).to(device)
processor = Speech2TextProcessor.from_pretrained(small_model_name)

def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch

ds = load_dataset(
    "patrickvonplaten/librispeech_asr_dummy",
    "clean",
    split="validation"
)
ds = ds.map(map_to_array)

input_features = processor(
    # ds["speech"][0],
    dataset_audio_sample_data,
    sampling_rate=dataset_audio_sample_rate,
    return_tensors="pt"
).input_features  # Batch size 1
# sf.write(file=Path(".") / "audio_test.wav", data=ds["speech"][0], samplerate=dataset_audio_sample_rate)

generated_ids = model.generate(input_features.to(device))
print(generated_ids.shape)
transcription = processor.batch_decode(generated_ids)
transcription


Reusing dataset librispeech_asr_dummy (/home/arthur/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr_dummy/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)
Loading cached processed dataset at /home/arthur/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr_dummy/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc/cache-8a8433b3bbe68d1e.arrow
  input_lengths = (input_lengths - 1) // 2 + 1


RuntimeError: CUDA out of memory. Tried to allocate 3.35 GiB (GPU 0; 6.00 GiB total capacity; 4.44 GiB already allocated; 0 bytes free; 4.53 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### Wav2Vec2

In [7]:
# !pip install transformers
# !pip install datasets
import soundfile as sf
import torch
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# load pretrained model
device = torch.device("cuda")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
model.to(device)

librispeech_samples_ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

# load audio
# audio_input, sample_rate = sf.read(librispeech_samples_ds[0]["file"])
# audio_input, sample_rate = sf.read(dataset_audio_sample_path)
# audio_input = audio_input[:17416]
# sf.write(file=Path(".") / "audio_test.wav", data=audio_input, samplerate=sample_rate)

# pad input values and return pt tensor
audio = torch.from_numpy(dataset_audio_sample_data).float().to(device)
input_values = processor(audio, sampling_rate=dataset_audio_sample_rate, return_tensors="pt").input_values.to(device)

# INFERENCE

# retrieve logits & take argmax
outputs = model.wav2vec2(input_values)
logits = model.lm_head(model.dropout(outputs[0]))
predicted_ids = torch.argmax(logits, dim=-1)

# transcribe
transcription = processor.decode(predicted_ids[0])

transcription

# # FINE-TUNE

# target_transcription = "A MAN SAID TO THE UNIVERSE I EXIST"

# # encode labels
# with processor.as_target_processor():
#   labels = processor(target_transcription, return_tensors="pt").input_ids

# # compute loss by passing labels
# loss = model(input_values, labels=labels).loss
# loss.backward()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Reusing dataset librispeech_asr_dummy (/home/arthur/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr_dummy/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)


"A WHOLISTIC PERSON CANNOT BE PERFECT IN EVERY ASPECT THERE'S ALWAYS FLAWSERS ALWAYS I HAVE FLAWS WE ALA FLWS YOU OF SO MANY FOR A A A  A KNOW WY I'VE NEVER GOT INSULTED WITH SUCH A SEXY VOICE I  O WAS EVEN O FIND IT IT ALLA LIKE  ES TALKING TO ME A HA A A A AA NOSIS SIR THE LAST VIDIO THAT I POSTED AM I YOU TO"

In [18]:
model

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 64, kernel_size=(40,), stride=(30,), bias=False)
          (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(64, 64, kernel_size=(40,), stride=(30,), bias=False)
          (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=64, out_features=64, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): Wav2Vec2EncoderStableLayerNorm(
      (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
        (con

In [9]:
input_values.shape, outputs[0].shape

(torch.Size([1, 480000]), torch.Size([1, 1499, 768]))

In [10]:
import librosa
import numpy
import soundfile as sf

sr = 16000
y, _ = librosa.load(dataset_audio_sample_path, duration=15, sr=sr) # load first seconds

sf.write("test.wav", y, sr)

# # Calculate RMS
# rms_window = 1.0 # in seconds 
# rms = librosa.feature.rms(y=y, hop_length=int(sr*rms_window))
# rms_db = librosa.core.amplitude_to_db(rms, ref=0.0)
# print(list(rms_db[0]))
# Calculate RMS
rms_window = 1.0 # in seconds 
rms = librosa.feature.rms(y=y, hop_length=int(sr*rms_window))
rms_db = librosa.core.amplitude_to_db(rms, ref=0.0)
rms_db.shape

  return f(*args, **kwargs)


(1, 16)