# Libraries Installation

In [None]:
# Register the jonathonf/ffmpeg-4 PPA, refresh the apt package index,
# then install ffmpeg non-interactively (-y answers the prompts).
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg

In [None]:
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install tensorflow

In [None]:
# Install the Hugging Face Hub client; notebook_login is imported from it
# in this notebook.
! pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Run the Hub login helper inside the notebook. The dataset loads in this
# file pass use_auth_token=True, so they presumably rely on the credentials
# stored by this call -- confirm.
notebook_login()

# Feature dataset creation and pre-processing

In [None]:
from datasets import load_dataset, DatasetDict

# Split name -> split expression. Training data is the first 40% of the
# "train" split plus the first 40% of "validation"; evaluation data is the
# first 40% of "test".
split_expressions = (
    ("train", "train[:40%]+validation[:40%]"),
    ("test", "test[:40%]"),
)

common_voice = DatasetDict()
for split_name, split_expression in split_expressions:
    # "nl" configuration of Common Voice 11.0; use_auth_token=True is passed
    # so the stored Hub credentials are used for the download.
    common_voice[split_name] = load_dataset(
        "mozilla-foundation/common_voice_11_0",
        "nl",
        split=split_expression,
        use_auth_token=True,
    )

print(common_voice)

In [None]:
# Columns dropped from every split. prepare_dataset only reads "audio" and
# "sentence", neither of which is listed here.
unused_columns = [
    "accent",
    "age",
    "client_id",
    "down_votes",
    "gender",
    "locale",
    "path",
    "segment",
    "up_votes",
]
common_voice = common_voice.remove_columns(unused_columns)

print(common_voice)

In [None]:
from transformers import WhisperFeatureExtractor

# Feature extractor published with the "openai/whisper-small" checkpoint;
# prepare_dataset calls it to fill the "input_features" field.
extractor_checkpoint = "openai/whisper-small"
feature_extractor = WhisperFeatureExtractor.from_pretrained(extractor_checkpoint)

In [None]:
from transformers import WhisperTokenizer

# Tokenizer for the same checkpoint, configured for Dutch transcription;
# prepare_dataset calls it to turn each sentence into label ids.
tokenizer_options = {"language": "Dutch", "task": "transcribe"}
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", **tokenizer_options)

In [None]:
from transformers import WhisperProcessor

# Processor for the same checkpoint, loaded with the same language/task
# settings as the tokenizer.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    task="transcribe",
    language="Dutch",
)

In [None]:
# Show the first training example as it looks before the audio column is
# re-cast to 16 kHz.
train_split = common_voice["train"]
print(train_split[0])

In [None]:
from datasets import Audio

# Declare 16000 Hz as the sampling rate of the "audio" column in every split.
audio_16khz = Audio(sampling_rate=16000)
common_voice = common_voice.cast_column("audio", audio_16khz)

In [None]:
# Show the first training example again, now read through the re-cast
# audio column.
first_example = common_voice["train"][0]
print(first_example)

In [None]:
def prepare_dataset(batch):
    """Convert one dataset example into model inputs and labels.

    Reads ``batch["audio"]`` (using its ``"array"`` and ``"sampling_rate"``
    entries) and ``batch["sentence"]``, then stores two new fields on the
    same mapping:

    * ``"input_features"`` -- first element of the features returned by the
      module-level ``feature_extractor``.
    * ``"labels"`` -- ``input_ids`` returned by the module-level
      ``tokenizer`` for the sentence.

    Returns the mutated ``batch``.
    """
    # The audio column is read once; per the original author's note this is
    # where the 48 kHz -> 16 kHz load/resample takes place.
    sample = batch["audio"]
    waveform = sample["array"]
    rate = sample["sampling_rate"]

    # Log-Mel input features for the waveform.
    extracted = feature_extractor(waveform, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]

    # Target transcription encoded as label ids.
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

In [None]:
# Apply prepare_dataset to every example with num_proc=2, removing the
# columns the train split had before mapping so only the fields added by
# prepare_dataset remain.
source_columns = common_voice.column_names["train"]
common_voice = common_voice.map(
    prepare_dataset,
    remove_columns=source_columns,
    num_proc=2,
)

# Save the dataset to disk

In [None]:
# Write each split to a local directory named after the split.
for split_name in ("train", "test"):
    common_voice[split_name].save_to_disk(split_name)

# Upload the feature dataset to Google Drive

In [None]:
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

# Authenticate the Colab user first, then build the Drive v3 client used
# by the upload cell.
auth.authenticate_user()
drive_service = build('drive', 'v3')

In [None]:
import os.path


def _upload_directory(directory, drive_name):
    """Upload every entry of ``directory`` to Google Drive.

    Each entry is sent as a resumable upload through the module-level
    ``drive_service`` client, and the id returned by Drive is printed.

    Args:
        directory: Local directory whose entries are uploaded.
        drive_name: Value stored as the ``name`` of every created Drive file.
    """
    for file in os.listdir(directory):
        print("UPLOADING", file)
        # NOTE(review): every file in the directory is created under the same
        # Drive name, so the local file names are not recorded on Drive.
        # Kept as-is in case a consumer looks files up by this name --
        # confirm whether `file` should be used instead.
        file_metadata = {"name": drive_name}
        media = MediaFileUpload(os.path.join(directory, file), resumable=True)
        # Keep the API response: the original discarded it and then read an
        # undefined `created`, raising NameError after the first upload.
        created = drive_service.files().create(
            body=file_metadata, media_body=media, fields="id"
        ).execute()
        print('File ID: {}'.format(created.get('id')))


_upload_directory("train", "file_train")
_upload_directory("test", "file_test")
