<a href="https://colab.research.google.com/github/10udCryp7/Speech-Practice/blob/main/notebooks/01_Working_with_audio_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# pip install -U datasets
!pip install -U datasets[audio]

# Load dataset

In [None]:
from datasets import load_dataset

# Load the 'validation' split of the dummy LibriSpeech ASR dataset.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    split="validation",
)

# Get Waveform

In [None]:
# Index the first example once and reuse it for both views of the waveform,
# instead of looking up dataset[0] separately for each one.
sample_audio = dataset[0]['audio']

# Waveform exposed through the 'array' key.
sample_array = sample_audio['array']
print(f'array: {sample_array}')

# Waveform exposed as the `.data` attribute of get_all_samples().
sample_tensor = sample_audio.get_all_samples().data
print(f'tensor: {sample_tensor}')

In [None]:
import gradio as gr

with gr.Blocks() as demo:
  # The player's value is a (sampling_rate, samples) pair.
  audio = dataset[0]['audio']['sampling_rate'], sample_array
  # gr.Audio accepts only `value` positionally; the label has to be passed
  # by keyword, otherwise the call fails with a TypeError.
  output = gr.Audio(value=audio, label='test')

# NOTE(review): per the Gradio docs, debug=True blocks the main thread, so this
# cell keeps running until interrupted and later cells wait on it during a
# "Run all" — confirm that this is wanted.
demo.launch(debug=True)


# Visualize Waveform

In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt

# Waveform and sampling rate of the first example.
array = sample_array
sr = dataset[0]['audio']['sampling_rate']

# Make the figure 12 inches wide. The first positional argument of plt.figure
# is the figure *number*, not a size, so the width is set on the Figure itself.
plt.figure().set_figwidth(12)
librosa.display.waveshow(array, sr=sr)

# Resampling with cast_column

In [None]:
from datasets import Audio

# Re-declare the 'audio' column with a 32 kHz Audio feature and keep the
# result under a new name so the original dataset is left untouched.
audio_32k = Audio(sampling_rate=32000)
dataset_32k = dataset.cast_column("audio", audio_32k)

# Show the feature definition now attached to the 'audio' column.
dataset_32k.features["audio"]

# Filter

In [None]:
# NOTE(review): `!` commands run in a temporary subshell, so this `cd` does not
# change the notebook's working directory for later cells. Use `%cd ..` if a
# directory change is actually intended — confirm.
!cd ..

In [None]:
# Upper bound used by filter_length: only durations strictly below this value
# pass. Presumably seconds, matching librosa.get_duration — TODO confirm.
MAX_DUR: float = 10.0

In [None]:
def filter_length(audio_dur, max_dur=None):
  """Tell whether a clip is short enough to keep.

  Args:
    audio_dur: Duration of one clip, in the same unit as the limit.
    max_dur: Optional upper bound. When omitted, the notebook-level
      ``MAX_DUR`` constant is used, so existing one-argument callers
      (such as ``Dataset.filter`` with ``input_columns``) behave as before.

  Returns:
    True if ``audio_dur`` is strictly less than the limit, otherwise False.
  """
  # Resolve the limit at call time so a later change to MAX_DUR is picked up.
  limit = MAX_DUR if max_dur is None else max_dur
  return audio_dur < limit

# Compute one duration per clip from its samples and sampling rate, then
# attach the values to the dataset as a new "duration" column.
dur_col = [
    librosa.get_duration(y=clip['array'], sr=clip['sampling_rate'])
    for clip in dataset['audio']
]
dataset = dataset.add_column("duration", dur_col)

In [None]:
# filter() returns a new dataset instead of modifying the existing one, so the
# result must be assigned back; otherwise the later cells keep working on the
# unfiltered data.
dataset = dataset.filter(filter_length, input_columns=['duration'])

# Display the filtered dataset.
dataset

In [None]:
# The helper "duration" column was only needed for filtering; drop it again.
helper_columns = ['duration']
dataset = dataset.remove_columns(helper_columns)

# Feature Extractor

In [None]:
from transformers import WhisperFeatureExtractor

# Load the feature extractor configuration published with the
# "openai/whisper-small" checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small"
)

In [None]:
# Run the extractor on the first waveform and display the result.
# assumes sample_array is sampled at 16 kHz — TODO confirm against the dataset.
sample_features = feature_extractor(
    sample_array,
    sampling_rate=16000,
    padding=True,
)
sample_features

In [None]:
def prep_data(sample):
  """Run the notebook's ``feature_extractor`` on one dataset example.

  Args:
    sample: A dataset example whose 'audio' entry provides 'array' and
      'sampling_rate'.

  Returns:
    Whatever ``feature_extractor`` returns for that clip (called with
    ``padding=True``).
  """
  clip = sample['audio']
  return feature_extractor(
      clip['array'],
      sampling_rate=clip['sampling_rate'],
      padding=True,
  )

In [None]:
# Apply prep_data across the dataset and keep the result under the same name.
dataset = dataset.map(function=prep_data)

In [None]:
import librosa
import numpy as np
import matplotlib.pyplot as plt


# Take the extracted features of the first example ...
example = dataset[0]
first_features = np.asarray(example['input_features'][0])

# ... and draw them with a time x-axis and a mel y-axis, using the
# extractor's own sampling rate and hop length for the axis scaling.
librosa.display.specshow(
    first_features,
    sr=feature_extractor.sampling_rate,
    hop_length=feature_extractor.hop_length,
    x_axis='time',
    y_axis='mel',
)

plt.colorbar()

# AutoProcessor

In [None]:
from transformers import AutoProcessor

# Let AutoProcessor pick the processor class for the
# "openai/whisper-small" checkpoint.
processor = AutoProcessor.from_pretrained(
    "openai/whisper-small"
)

# Streaming

In [None]:
!pip install -U datasets[audio]

In [None]:
from datasets import load_dataset

# Load the same 'validation' split again, this time with streaming=True.
# Note that this rebinds `dataset`, replacing the object used in earlier cells.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    split="validation",
    streaming=True,
)

In [None]:
# Pull only the first example from the streamed dataset and print it.
first_example = next(iter(dataset))
print(first_example)