This notebook tries to train on the finalised dataset (196GB).

In [4]:
import pandas as pd

df = pd.read_json("ParlaSpeech-HR.v1.0.jsonl", orient="records", lines=True)
#df["sentence"] = df.norm_words.apply(" ".join)
df = df.rename(columns={"path":"hashname"})

def process(text:str):
    from parse import compile
    from string import punctuation
    p = compile("{hit:d}.")
    in_list = text.split()
    out_list = list()
    for seg in in_list:
        parse_result = p.parse(seg)
        if parse_result:
            # We got a number with a dot afterward:
            out_list.append(seg.lower())
        else:
            out_list.append(seg.translate(str.maketrans('', '', punctuation)).lower())
    return " ".join(out_list)
df["sentence"] = df.words.apply(" ".join).apply(process)


In [5]:
df = pd.concat([df, df.speaker_info.apply(pd.Series)], axis=1)

In [6]:
test_df = df[df.split=="dev"].copy()
is_train = df.split=="train" 
has_speaker_name =~df.Speaker_name.isna()
speaker_name_is_not_dash = df.Speaker_name != "-"
train_df = df[is_train & has_speaker_name & speaker_name_is_not_dash].copy()


In [7]:
new_indices = []
target_size = 220000
while len(new_indices) < target_size:
    speakers = train_df.Speaker_name.unique()
    for speaker in speakers:
        ind = train_df[train_df.Speaker_name == speaker].index[0]
        new_indices.append(ind)
    train_df = train_df.drop(index=ind)
    print(f"{len(new_indices)=}, {len(speakers)=}")

train_df = df[is_train & has_speaker_name & speaker_name_is_not_dash].copy()
train_df = train_df.loc[new_indices, :]

len(new_indices)=304, len(speakers)=304
len(new_indices)=608, len(speakers)=304
len(new_indices)=912, len(speakers)=304
len(new_indices)=1215, len(speakers)=303


In [None]:
train_df.shape

(220148, 22)

In [None]:
columns_to_keep = ["hashname", "sentence"]
train_df = train_df.loc[:, columns_to_keep]
test_df = test_df.loc[:, columns_to_keep]


In [None]:
import os
# Use old or new vocab?
os.system("cp vocab_300_with_numbers.json vocab.json")


from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token=" ")

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, tokenizer=tokenizer)

import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:

import gc
gc.collect()

train_df["audio"] = train_df.hashname.apply(lambda s: "data_16000_mono/seg."+s)
test_df["audio"] = test_df.hashname.apply(lambda s: "data_16000_mono/seg."+s)


import datasets
from datasets import load_dataset, load_metric, Audio

train_dataset = datasets.Dataset.from_pandas(train_df)
test_dataset = datasets.Dataset.from_pandas(test_df)

train_dataset = train_dataset.cast_column("audio", Audio())
test_dataset = test_dataset.cast_column("audio", Audio())

del train_df
del test_df
gc.collect()

35

In [14]:

from transformers import Trainer
from transformers import TrainingArguments
from transformers import Wav2Vec2ForCTC
from typing import Any, Dict, List, Optional, Union
from dataclasses import dataclass, field
import torch


def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch

train_mapped = train_dataset.map(
    prepare_dataset, remove_columns=train_dataset.column_names, 
    #num_proc=8, 
    cache_file_name=".cache_train", )
test_mapped = test_dataset.map(
    prepare_dataset, remove_columns=test_dataset.column_names, 
    #num_proc=8, 
    cache_file_name=".cache_test", )
print("Data Preparation Complete!")

del train_dataset
del test_dataset

gc.collect()

  0%|          | 0/220148 [00:00<?, ?ex/s]

  normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
  ret = ret.dtype.type(ret / rcount)
  normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/500 [00:00<?, ?ex/s]

Data Preparation Complete!


2683

In [15]:
train_mapped.save_to_disk("train220k_mapped")

In [16]:
test_mapped.save_to_disk("test_mapped")