In [12]:
# Install required errors
!pip install datasets==1.13.3
!pip install transformers==4.11.3
!pip install librosa
!pip install jiwer





## Prepare Data, Tokenizer, Feature Extractor


In [13]:
# Load dataset
from datasets import load_dataset, load_metric

timit = load_dataset("timit_asr")
timit = timit.remove_columns(["phonetic_detail", "word_detail", "dialect_region", "id", "sentence_type", "speaker_id"])

Reusing dataset timit_asr (/Users/reza/.cache/huggingface/datasets/timit_asr/clean/2.0.1/5bebea6cd9df0fc2c8c871250de23293a94c1dc49324182b330b6759ae6718f8)


  0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# display random sample of data
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [15]:
# remove special character
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'

def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() + " "
    return batch

timit = timit.map(remove_special_characters)

Loading cached processed dataset at /Users/reza/.cache/huggingface/datasets/timit_asr/clean/2.0.1/5bebea6cd9df0fc2c8c871250de23293a94c1dc49324182b330b6759ae6718f8/cache-ec1c787be5bba22a.arrow
Loading cached processed dataset at /Users/reza/.cache/huggingface/datasets/timit_asr/clean/2.0.1/5bebea6cd9df0fc2c8c871250de23293a94c1dc49324182b330b6759ae6718f8/cache-c3721faf74ba518b.arrow


In [16]:
show_random_elements(timit["train"].remove_columns(["audio", "file"]))

Unnamed: 0,text
0,many shapes in bathtubs
1,i took her word for it but is she really going with you
2,academic aptitude guarantees your diploma
3,don't ask me to carry an oily rag like that
4,for what do the utopians labor
5,it motivates his behavior
6,my instructions desperately need updating
7,how permanent are their records
8,i ate every oyster on nora's plate
9,don't ask me to carry an oily rag like that


In [17]:
# enumerate chars
def extract_all_chars(batch):
  all_text = " ".join(batch["text"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [18]:
vocabs = timit.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=timit.column_names["train"])
vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [19]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'i': 0,
 'k': 1,
 'a': 2,
 'x': 3,
 'n': 4,
 'h': 5,
 'z': 6,
 'r': 7,
 'c': 8,
 't': 9,
 "'": 10,
 'f': 11,
 'p': 12,
 'y': 13,
 'v': 14,
 'o': 15,
 'e': 16,
 'b': 17,
 'm': 18,
 'd': 19,
 'u': 20,
 'w': 21,
 'l': 22,
 's': 23,
 'q': 24,
 'j': 25,
 'g': 26,
 ' ': 27}

In [20]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

30

In [21]:
# save chars to json
import json
with open('out/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [22]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("out/vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token=" ")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [23]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

In [24]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [25]:
timit["train"][0]["file"]
timit["train"][0]["audio"]

{'path': '/Users/reza/.cache/huggingface/datasets/downloads/extracted/7893e0473a0581fb9cec4c41b1782068ca05a4f780a5dc974566150e7ba0af6b/data/TRAIN/DR4/MMDM0/SI681.WAV',
 'array': array([-2.1362305e-04,  6.1035156e-05,  3.0517578e-05, ...,
        -3.0517578e-05, -9.1552734e-05, -6.1035156e-05], dtype=float32),
 'sampling_rate': 16000}

In [26]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(timit["train"]))

print(timit["train"][rand_int]["text"])
ipd.Audio(data=np.asarray(timit["train"][rand_int]["audio"]["array"]), autoplay=True, rate=16000)

rich purchased several signed lithographs 
