<a href="https://colab.research.google.com/github/Antares28/id2223_kth_lab2/blob/main/feature_engineering_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Prepare Environment

Install packages

In [None]:
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
! pip install -U accelerate
! pip install -U transformers

Log in to the Hugging Face Hub

In [None]:
from huggingface_hub import notebook_login

# Opens an interactive prompt for a Hugging Face access token.
# Never store the token in the notebook itself (not even in a comment):
# anything saved here is published with the file. Revoke any token that was
# ever committed to this notebook's history.
notebook_login()

Mount Google Drive

In [None]:
from google.colab import drive

# Directory inside the Colab VM under which Google Drive is mounted; the
# dataset paths used further down live below this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

## Load Dataset

In [None]:
from datasets import load_dataset, DatasetDict

# Collect the "train" and "test" splits of the zh-CN Common Voice 11.0 config
# into one DatasetDict, issuing one load_dataset call per split.
# NOTE(review): newer `datasets` releases are reported to replace
# `use_auth_token` with `token` - confirm against the installed version.
common_voice = DatasetDict()
for split_name in ("train", "test"):
    common_voice[split_name] = load_dataset(
        "mozilla-foundation/common_voice_11_0",
        "zh-CN",
        split=split_name,
        use_auth_token=True,
    )

print(common_voice)

In [None]:
# Metadata columns to drop from every split. Only the columns left over are
# carried forward (prepare_dataset below reads "audio" and "sentence").
metadata_columns = [
    "accent",
    "age",
    "client_id",
    "down_votes",
    "gender",
    "locale",
    "path",
    "segment",
    "up_votes",
]
common_voice = common_voice.remove_columns(metadata_columns)

In [None]:
import os

# Folder on the mounted Google Drive that receives the dataset.
dataset_dir = '/content/drive/MyDrive/whisper/ASR_data'

# Create the folder (and any missing parents); an existing folder is not an error.
os.makedirs(dataset_dir, exist_ok=True)

# Write the current DatasetDict to that folder.
common_voice.save_to_disk(dataset_dir)

In [None]:
from datasets import load_dataset, DatasetDict

# Read the DatasetDict back from the Google Drive folder it was saved to.
# The import and the path are repeated here so this cell does not depend on
# the save cell having been run in the same session.
dataset_dir = '/content/drive/MyDrive/whisper/ASR_data'
common_voice = DatasetDict.load_from_disk(dataset_dir)

print(common_voice)

## Prepare Feature Extractor, Tokenizer and Data

In [None]:
from transformers import WhisperFeatureExtractor

# Feature extractor loaded from the "openai/whisper-small" checkpoint;
# prepare_dataset calls it to compute the model's input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small",
)

In [None]:
from transformers import WhisperTokenizer

# Tokenizer loaded from the same checkpoint, configured for Mandarin
# transcription; prepare_dataset calls it to encode target sentences.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small",
    language="mandarin",
    task="transcribe",
)

In [None]:
from transformers import WhisperProcessor

# Use the same language name as the tokenizer cell ("mandarin"). "zh-CN" is
# the Common Voice config name, not a Whisper language identifier; the Whisper
# tokenizer wrapped by the processor rejects unknown languages when it builds
# its prompt tokens, so the previous value would fail on first use.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    language="mandarin",
    task="transcribe",
)

### Prepare Data

In [None]:
# Show the first training example as loaded, before the audio column is recast.
first_raw_example = common_voice["train"][0]
print(first_raw_example)


In [None]:
from datasets import Audio

# Re-declare the "audio" column with a 16 kHz sampling rate so that examples
# are delivered at that rate when they are read.
audio_16khz = Audio(sampling_rate=16000)
common_voice = common_voice.cast_column("audio", audio_16khz)

In [None]:
# Show the same first training example again, now read through the recast
# audio column.
first_resampled_example = common_voice["train"][0]
print(first_resampled_example)

In [None]:
# Disabled sanity check: encode the first training sentence with the
# tokenizer, decode it with and without special tokens, and compare the
# decoded text to the original. Uncomment to run.
# input_str = common_voice["train"][0]["sentence"]
# labels = tokenizer(input_str).input_ids
# decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
# decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

# print(f"Input: {input_str}")
# print(f"Decoded w/ special: {decoded_with_special}")
# print(f"Decoded w/out special: {decoded_str}")
# print(f"Are equal: {input_str == decoded_str}")

In [None]:
def prepare_dataset(batch):
    """Convert one dataset example into model inputs and label ids.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["sentence"]``, then stores two new entries on the same mapping:

    * ``"input_features"`` - first item of the ``input_features`` returned by
      the notebook-level ``feature_extractor``.
    * ``"labels"`` - the ``input_ids`` returned by the notebook-level
      ``tokenizer`` for the sentence.

    Returns the ``batch`` object that was passed in, with those entries added.
    """
    # Fetch the audio entry once; reading it is what loads the clip
    # (resampled from 48 kHz to 16 kHz by the recast audio column).
    clip = batch["audio"]

    # Log-Mel input features for the waveform.
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Label ids for the target transcription.
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

In [None]:
# Run prepare_dataset over every split with two worker processes and drop the
# columns the train split had going in, so only the entries added by
# prepare_dataset remain.
source_columns = common_voice.column_names["train"]
common_voice = common_voice.map(
    prepare_dataset,
    remove_columns=source_columns,
    num_proc=2,
)

In [None]:
import os

# Folder on the mounted Google Drive that receives the processed dataset.
data_dir = '/content/drive/MyDrive/whisper/processed_data'

# Create the folder (and any missing parents); an existing folder is not an error.
os.makedirs(data_dir, exist_ok=True)

# Write the processed DatasetDict to that folder.
common_voice.save_to_disk(data_dir)

In [None]:
# Summarise the first processed example instead of printing it whole: the
# "input_features" entry is a large numeric matrix that floods the cell output
# when dumped as raw numbers.
processed_example = common_voice["train"][0]
features = processed_example["input_features"]
print({
    "input_features_shape": (len(features), len(features[0])),
    "labels": processed_example["labels"],
})

In [None]:
# Display the processed DatasetDict as the cell's output.
common_voice