In [None]:
%pip install datasets transformers accelerate -U transformers[torch] dask flash-attn --no-build-isolation

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.42.3-py3-none-any.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
Collecting dask
  Downloading dask-2024.7.0-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flash-attn
  Downloading flash_attn-2.5.9.post1.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m


In [2]:
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import dask.dataframe as dd
import numpy as np
import pandas as pd
import re

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [1]:
# Dataset taken from https://commonvoice.mozilla.org/pt/datasets
# (the common_voice_11_0 release).
def dataset(linguagem: str):
    """Load the Common Voice 11.0 splits for one language.

    Args:
        linguagem: Language configuration name passed to ``load_dataset``
            (for example ``"pt"``).

    Returns:
        A ``(train_plus_validation, test)`` tuple: the first element is the
        ``"train+validation"`` split, the second is the ``"test"`` split.
    """
    repository = "mozilla-foundation/common_voice_11_0"
    train_plus_validation, held_out_test = (
        load_dataset(repository, linguagem, split=split_name, trust_remote_code=True)
        for split_name in ("train+validation", "test")
    )
    return train_plus_validation, held_out_test

# Load the Portuguese ("pt") splits. The import cell must run first: the
# NameError recorded under this cell came from running it before the imports.
dataset_voice, test_dataset_voice = dataset("pt")

NameError: name 'load_dataset' is not defined

In [None]:
# Show the "audio" field of the first training example.
dataset_voice[0]["audio"]

In [None]:

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint shared by the processor and the model.
# Alternative checkpoint: facebook/wav2vec2-base-960h
MODEL_CHECKPOINT = "facebook/wav2vec2-large-960h"

# NOTE(review): sampling_rate=48000 overrides the rate stored in the checkpoint's
# feature-extractor config. Confirm the checkpoint was actually trained on 48 kHz
# audio; if not, the dataset should be resampled instead of overriding this value.
processor = Wav2Vec2Processor.from_pretrained(MODEL_CHECKPOINT, sampling_rate=48000)

# NOTE(review): the weights are loaded in float16 and no mixed-precision option is
# set on the training arguments later in the notebook. Confirm this combination
# is intended for fine-tuning.
model = Wav2Vec2ForCTC.from_pretrained(
    MODEL_CHECKPOINT,
    ctc_loss_reduction="mean",
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",
    pad_token_id=processor.tokenizer.pad_token_id,
)
model = model.to(device)

In [None]:
# Show the Python types of the two loaded splits.
type(dataset_voice), type(test_dataset_voice)

In [None]:
# Display the repr of both raw splits.
dataset_voice, test_dataset_voice

In [None]:
# Speaker and vote metadata columns dropped from both splits; the same list is
# applied to the training and the test data.
columns_to_drop = ['client_id', 'accent', 'gender', 'age', 'up_votes', 'path', 'down_votes']
dataset_voice_processed = dataset_voice.remove_columns(columns_to_drop)
test_dataset_voice_processed = test_dataset_voice.remove_columns(columns_to_drop)

In [None]:
# Display both splits after the metadata columns were removed.
dataset_voice_processed, test_dataset_voice_processed

In [None]:
# Punctuation stripped from transcripts. Written as a raw string: the previous
# non-raw literal relied on invalid escape sequences such as "\," and "\?",
# which newer Python versions report as a SyntaxWarning.
CHARS_TO_IGNORE_PATTERN = re.compile(r'[,?.!\-;:"]')


def remove_special_characters(observation):
    """Normalise the transcript of one example in place.

    Removes the characters ``, ? . ! - ; : "`` from ``observation["sentence"]``,
    lowercases the result and appends a single trailing space.

    Args:
        observation: Mapping with a string under the ``"sentence"`` key.

    Returns:
        The same ``observation`` object, with ``"sentence"`` replaced.
    """
    # NOTE(review): the text is lowercased here; confirm the tokenizer's
    # vocabulary is lowercase too, otherwise characters may map to <unk>.
    cleaned = CHARS_TO_IGNORE_PATTERN.sub('', observation["sentence"])
    observation["sentence"] = cleaned.lower() + " "
    return observation

# Clean the transcripts of both splits, training split first, then test split.
dataset_voice_processed, test_dataset_voice_processed = (
    split.map(remove_special_characters)
    for split in (dataset_voice_processed, test_dataset_voice_processed)
)

In [None]:
# Build a copy of the raw training split without its first 10,000 rows.
# NOTE(review): this selects from the raw `dataset_voice`, not from
# `dataset_voice_processed`, and `filtered_train_dataset` is not used by any
# other cell visible in this notebook. Confirm whether it is still needed.
num_lines_to_exclude = 10_000
total_lines = len(dataset_voice)
indices_to_keep = [*range(num_lines_to_exclude, total_lines)]
filtered_train_dataset = dataset_voice.select(indices_to_keep)

In [None]:
def prepare_dataset(observation):
    """Turn one raw example into model inputs and CTC labels.

    Args:
        observation: Mapping with ``"audio"`` (a dict holding ``"array"`` and
            ``"sampling_rate"``) and ``"sentence"`` (the transcript string).

    Returns:
        The same ``observation`` with two keys added:
        ``"input_ids"`` — the first (only) entry of the processor's
        ``input_values`` for this waveform, and ``"labels"`` — the token ids
        the tokenizer produces for the transcript.
    """
    audio = observation["audio"]
    # NOTE(review): the processed waveform is stored under "input_ids", as in
    # the original notebook. Wav2Vec2ForCTC is normally fed "input_values";
    # verify that the collator used for training maps this column accordingly.
    observation["input_ids"] = processor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_values[0]
    # Tokenize the transcript with the tokenizer directly instead of going
    # through the deprecated `processor.as_target_processor()` context manager.
    observation["labels"] = processor.tokenizer(observation["sentence"]).input_ids
    return observation

# Featurize both splits (training first, then test). Every pre-existing column
# is dropped, so only the columns added by prepare_dataset remain.
dataset_voice_processed, test_dataset_voice_processed = (
    split.map(prepare_dataset, remove_columns=split.column_names)
    for split in (dataset_voice_processed, test_dataset_voice_processed)
)

In [None]:
# Rename the 'input_values' column to 'input_ids' in the training dataset
#dataset_voice_processed = dataset_voice_processed.rename_column("input_values", "input_ids")

# Rename the 'input_values' column to 'input_ids' in the test dataset
#test_dataset_voice_processed = test_dataset_voice_processed.rename_column("input_values", "input_ids")

In [None]:
# Show the column names left in each split after preprocessing.
dataset_voice_processed.column_names, test_dataset_voice_processed.column_names

In [None]:
# Metric function: word error rate (WER) of the decoded predictions.
def compute_metrics(pred):
    """Compute the WER between decoded predictions and reference labels.

    Args:
        pred: Object exposing ``predictions`` (logits; the arg-max is taken
            over the last axis) and ``label_ids`` (reference token ids, where
            ``-100`` marks positions to ignore).

    Returns:
        ``{"wer": value}`` where ``value`` is whatever the loaded "wer" metric
        returns for the decoded predictions and references.

    Note:
        ``pred.label_ids`` is modified in place: every ``-100`` entry is
        overwritten with the tokenizer's pad token id before decoding.
    """
    wer_metric = load_metric("wer")

    # Pick the highest-scoring token at every position.
    predicted_ids = np.argmax(pred.predictions, axis=-1)

    # Replace the ignore marker so the references can be decoded.
    reference_ids = pred.label_ids
    reference_ids[reference_ids == -100] = processor.tokenizer.pad_token_id

    hypotheses = processor.batch_decode(predicted_ids)
    # References are decoded without grouping repeated tokens.
    references = processor.batch_decode(reference_ids, group_tokens=False)

    return {"wer": wer_metric.compute(predictions=hypotheses, references=references)}

In [None]:
# Build a padding collator around the processor's tokenizer and display it.
# NOTE(review): this collator pads through the tokenizer; verify that it can
# batch float waveforms and variable-length label sequences before relying on it.
data_collator = DataCollatorWithPadding(processor.tokenizer)
data_collator

In [None]:
# Training hyperparameters.
training_args = TrainingArguments(
    output_dir="./wav2vec2",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=1e-4,
    weight_decay=0.01,
    # Keep every dataset column: the waveform is stored under "input_ids", a
    # name the model's forward() does not accept, so the Trainer would
    # otherwise drop it before the collator ever sees it.
    remove_unused_columns=False,
)


def ctc_data_collator(features):
    """Pad a list of preprocessed examples into one CTC batch.

    Waveforms and label sequences have different lengths and need different
    padding, so they are padded separately: waveforms through the feature
    extractor, labels through the tokenizer.

    Args:
        features: List of examples, each holding the waveform under
            ``"input_values"`` or ``"input_ids"`` and the target token ids
            under ``"labels"``.

    Returns:
        The padded batch with ``"input_values"`` and ``"labels"`` tensors
        (plus whatever else the feature extractor's ``pad`` returns). Padded
        label positions are set to ``-100``.
    """
    # Accept both column names: prepare_dataset stores the waveform as
    # "input_ids", while the model expects it as "input_values".
    audio_features = [
        {"input_values": item["input_values"] if "input_values" in item else item["input_ids"]}
        for item in features
    ]
    label_features = [{"input_ids": item["labels"]} for item in features]

    batch = processor.feature_extractor.pad(audio_features, padding=True, return_tensors="pt")
    labels_batch = processor.tokenizer.pad(label_features, padding=True, return_tensors="pt")

    # Match the dtype of the model weights (loaded as float16 in this notebook)
    # so the first layer does not receive float32 inputs.
    batch["input_values"] = batch["input_values"].to(model.dtype)

    # Mark padded label positions with -100; compute_metrics maps -100 back to
    # the pad token before decoding.
    batch["labels"] = labels_batch["input_ids"].masked_fill(
        labels_batch["attention_mask"].ne(1), -100
    )
    return batch


# Collator used by the Trainer below.
data_collator = ctc_data_collator

# NOTE(review): compute_metrics loads the "wer" metric; confirm the package that
# metric depends on is installed in this environment before evaluating.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_voice_processed,
    eval_dataset=test_dataset_voice_processed,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # previously defined but never wired in
    tokenizer=processor.feature_extractor,  # saved alongside checkpoints
)

# Train the model.
trainer.train()

In [None]:
# Evaluate on the preprocessed test split, then save the model and processor.
# The original call passed `test_dataset`, a name that is never defined in this
# notebook; the preprocessed test split is the one the Trainer was built with.
eval_metrics = trainer.evaluate(test_dataset_voice_processed)
model.save_pretrained("./wav2vec2")
processor.save_pretrained("./wav2vec2")

# Last expression of the cell, so the evaluation metrics are displayed.
eval_metrics