# Prepare environment

## install packages

In [None]:
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt update
!apt install -y ffmpeg

!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install hopsworks

## login huggingface and hopsworks

In [None]:
from huggingface_hub import notebook_login

import hopsworks

# SECURITY: access tokens must never be stored in the notebook, not even in
# comments. The credentials that used to be written here have been removed;
# they should be treated as leaked and revoked in the Hugging Face and
# Hopsworks account settings.

# Authenticate against the Hugging Face Hub; the token is entered at run time.
notebook_login()

# Authenticate against Hopsworks; `project` is the handle used later to
# reach the project's dataset API.
project = hopsworks.login()

# Load dataset

## download dataset

In [None]:
from datasets import load_dataset, DatasetDict

# Build the two splits used downstream. The training split is the
# concatenation of the upstream "train" and "validation" splits; the
# upstream "test" split is kept as-is for evaluation.
common_voice = DatasetDict(
    {
        target_split: load_dataset(
            "mozilla-foundation/common_voice_11_0",
            "zh-CN",
            split=source_split,
            use_auth_token=True,
        )
        for target_split, source_split in (
            ("train", "train+validation"),
            ("test", "test"),
        )
    }
)

print(common_voice)

## remove useless information

In [None]:
# Metadata columns that are dropped from every split before preprocessing.
# NOTE: this cell is not idempotent -- re-running it on the already-trimmed
# dataset will fail because the columns no longer exist.
unused_columns = [
    "accent",
    "age",
    "client_id",
    "down_votes",
    "gender",
    "locale",
    "path",
    "segment",
    "up_votes",
]
common_voice = common_voice.remove_columns(unused_columns)

# Prepare Feature Extractor, Tokenizer and Data

## create a WhisperProcessor

In [None]:
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor

# All three components are loaded from the same pretrained checkpoint.
WHISPER_CHECKPOINT = "openai/whisper-small"

# Turns raw audio arrays into the model's input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_CHECKPOINT)

# Encodes target sentences to label ids, configured for Chinese transcription.
tokenizer = WhisperTokenizer.from_pretrained(
    WHISPER_CHECKPOINT,
    language="Chinese",
    task="transcribe",
)

# Processor loaded with the same language/task settings as the tokenizer.
processor = WhisperProcessor.from_pretrained(
    WHISPER_CHECKPOINT,
    language="Chinese",
    task="transcribe",
)

## prepare data

In [None]:
from datasets import Audio

def prepare_dataset(batch):
    """Add model inputs and labels to one dataset record.

    Reads ``batch["audio"]`` (a mapping with ``"array"`` and
    ``"sampling_rate"`` entries) and ``batch["sentence"]``, then stores:

    * ``batch["input_features"]`` -- first element of the feature
      extractor's ``input_features`` for the audio array;
    * ``batch["labels"]`` -- the tokenizer's ``input_ids`` for the sentence.

    Relies on the module-level ``feature_extractor`` and ``tokenizer``.
    The record is modified in place and also returned.
    """
    # Fetch the audio entry once. Any resampling is presumably applied by the
    # dataset's audio column when it is decoded, not by this function.
    decoded_audio = batch["audio"]

    # Compute the input features from the raw waveform.
    extracted = feature_extractor(
        decoded_audio["array"],
        sampling_rate=decoded_audio["sampling_rate"],
    )
    batch["input_features"] = extracted.input_features[0]

    # Encode the target transcription as label ids.
    encoded_text = tokenizer(batch["sentence"])
    batch["labels"] = encoded_text.input_ids

    return batch

# Columns present before preprocessing; all of them are removed by `map`,
# leaving only what `prepare_dataset` adds.
source_columns = common_voice.column_names["train"]

# Declare the audio column at 16 kHz, then run the preprocessing function
# over every split using two worker processes.
common_voice = (
    common_voice
    .cast_column("audio", Audio(sampling_rate=16000))
    .map(prepare_dataset, remove_columns=source_columns, num_proc=2)
)

# Upload dataset

In [None]:
from pathlib import Path

# Local directory written by `save_to_disk`, and the remote directory that
# mirrors it in the Hopsworks project file system.
LOCAL_DATASET_DIR = Path("common_voice")
REMOTE_DATASET_DIR = "fsorg_Training_Datasets/common_voice"

common_voice.save_to_disk(str(LOCAL_DATASET_DIR))
dataset_api = project.get_dataset_api()

# Upload every file that was actually written, preserving the directory
# layout, instead of hard-coding file names. The names of the Arrow data
# files depend on the installed `datasets` version (e.g. a single
# `dataset.arrow` versus sharded `data-*.arrow` files -- verify against the
# installed release), so a fixed list can miss files or reference files that
# do not exist.
saved_files = sorted(
    path for path in LOCAL_DATASET_DIR.rglob("*") if path.is_file()
)
if not saved_files:
    raise FileNotFoundError(
        f"No files found under {LOCAL_DATASET_DIR}; nothing to upload."
    )

for local_file in saved_files:
    # Sub-directory relative to the dataset root: "." for top-level files,
    # otherwise e.g. "train" or "test".
    relative_dir = local_file.parent.relative_to(LOCAL_DATASET_DIR).as_posix()
    if relative_dir == ".":
        remote_dir = f"{REMOTE_DATASET_DIR}/"
    else:
        remote_dir = f"{REMOTE_DATASET_DIR}/{relative_dir}/"

    uploaded_file_path = dataset_api.upload(
        local_path=str(local_file),
        upload_path=remote_dir,
        overwrite=True,
    )