### Dependencies

In [2]:
!pip install datasets
!pip install numpy soundfile

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[

In [3]:
from huggingface_hub import login

from datasets import load_dataset, Dataset, DatasetDict
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import numpy as np
import torchaudio

### Load dataset

In [2]:
# Prompts you to enter your Hugging Face authentication token
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [82]:
# Load the English ("en") training split of the Common Voice 6.1 dataset.
cv_1_train = load_dataset("mozilla-foundation/common_voice_6_1", "en", split="train")

In [83]:
# Load the remaining English splits (test, then validation) of the same dataset.
cv_1_test, cv_1_validation = (
    load_dataset("mozilla-foundation/common_voice_6_1", "en", split=split_name)
    for split_name in ("test", "validation")
)

### Preprocess dataset

In [97]:
# Convert every split to a pandas DataFrame (train, test, validation — in that order).
train_df, test_df, validation_df = (
    split.to_pandas() for split in (cv_1_train, cv_1_test, cv_1_validation)
)

In [98]:
### Balance the data by accent and gender

# Alternative (disabled): pool the train, validation and test DataFrames first
# df = pd.concat([train_df, validation_df, test_df], ignore_index=True)

# Work on a random subset of the training split. The draw is seeded so that the
# balanced set (and everything trained on it) is the same on every run, and the
# sample size is capped at the number of available rows so it cannot raise.
df = train_df.sample(n=min(50000, len(train_df)), random_state=42)

# Keep only rows whose gender is male/female and whose accent is us/england
df = df[df['gender'].isin(['male', 'female']) & df['accent'].isin(['us', 'england'])]

# Count the samples in each (gender, accent) combination
group_counts = df.groupby(['gender', 'accent']).size().reset_index(name='count')

# The smallest subgroup sets the size every subgroup is down-sampled to
min_count = group_counts['count'].min()

# Down-sample each (gender, accent) subgroup to min_count rows
balanced_dfs = []
for (gender, accent), group in df.groupby(['gender', 'accent']):
    balanced_group = group.sample(n=min_count, random_state=42)
    balanced_dfs.append(balanced_group)

# Combine the balanced subgroups into one DataFrame
balanced_df = pd.concat(balanced_dfs, ignore_index=True)

# Check the result: every (accent, gender) pair should have the same count
balanced_df[['accent', 'gender']].value_counts()


Unnamed: 0_level_0,Unnamed: 1_level_0,count
accent,gender,Unnamed: 2_level_1
england,female,741
england,male,741
us,female,741
us,male,741


Audio duration

In [96]:
import torchaudio

# Helper that measures how long an audio clip lasts
def get_audio_duration(row):
    """Return the duration, in seconds, of the audio file at ``row['path']``.

    The file is decoded with ``torchaudio.load``; the size of dimension 1 of the
    returned waveform is divided by the returned sample rate.
    """
    audio, rate = torchaudio.load(row['path'])
    n_frames = audio.size(1)
    return n_frames / rate

# Apply the function to every row of the balanced DataFrame.
# NOTE: this adds a 'duration' column to balanced_df in place.
balanced_df['duration'] = balanced_df.apply(get_audio_duration, axis=1)

# Compute the mean duration of the audio clips
average_duration = balanced_df['duration'].mean()

# Show the mean duration
print(f"La duración promedio de los audios es de {average_duration:.2f} segundos.")


La duración promedio de los audios es de 5.55 segundos.


In [99]:
### Balance by accent only (disabled alternative; kept commented out)
# df = cv_1_train.to_pandas()

# # Keep only the US and England accents
# us_df = df[df['accent'] == 'us']
# england_df = df[df['accent'] == 'england']

# # Balance the datasets (same number of samples)
# england_count = len(england_df)
# us_df = us_df.sample(n=england_count, random_state=42)

# # Combine the balanced datasets
# balanced_df = pd.concat([us_df, england_df])


# ## Use fewer samples than are available
# balanced_df = balanced_df.sample(n=3000)

# Load the Wav2Vec2 processor (only once)
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

# Audio loading and feature-extraction helper
def load_audio(row, max_seconds=6.0):
    """Load one clip and convert it to fixed-length Wav2Vec2 input values.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) whose ``'path'`` entry
            points at an audio file readable by ``torchaudio.load``.
        max_seconds: Length, in seconds, of the window every clip is padded or
            truncated to. Defaults to 6, the value this function always used.

    Returns:
        list: The processor's ``input_values`` for the clip as a plain Python
        list of ``int(sampling_rate * max_seconds)`` numbers.
    """
    target_rate = processor.feature_extractor.sampling_rate

    # Load the audio file; torchaudio returns a (channels, frames) tensor
    waveform, sample_rate = torchaudio.load(row['path'])

    # Down-mix multi-channel audio to mono. Without this, squeezing a stereo clip
    # yields a 2-D array, which the feature extractor would treat as a batch of
    # clips, and only the first channel would be kept by the indexing below.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample only when the file's rate differs from the one the model expects
    if sample_rate != target_rate:
        waveform = torchaudio.transforms.Resample(sample_rate, target_rate)(waveform)

    # Window length in samples (6 seconds at the processor's rate by default)
    max_length = int(target_rate * max_seconds)

    # Wav2Vec2 preprocessing; pad or truncate so every example has the same length
    inputs = processor(waveform.squeeze(0).numpy(),
                       sampling_rate=target_rate,
                       return_tensors="pt",
                       padding="max_length",
                       truncation=True,
                       max_length=max_length)

    # Return the processed input values of the (single) clip
    return inputs.input_values[0].tolist()

# Run the audio loading/processing function on every row
balanced_df['input_values'] = balanced_df.apply(load_audio, axis=1)


# Convert accent names to integer class ids
label2id = {'us': 0, 'england': 1}
id2label = {v: k for k, v in label2id.items()}
balanced_df['label'] = balanced_df['accent'].map(label2id)

# Convert the DataFrame to a Hugging Face Dataset, keeping only the columns the
# model pipeline uses. Selecting the wanted columns (instead of removing a fixed
# list of unwanted ones) keeps helper columns added by other cells, such as
# 'duration', and the pandas index out of the dataset.
model_columns = ['accent', 'input_values', 'label']
balanced_dataset = Dataset.from_pandas(balanced_df[model_columns], preserve_index=False)


# Split into train (80%), test (10%) and validation (10%).
# Both splits are seeded so the partitions are identical on every run.
train_testvalid = balanced_dataset.train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']
})

# Inspect the structure of the final dataset
dataset




DatasetDict({
    train: Dataset({
        features: ['accent', 'input_values', 'label'],
        num_rows: 2371
    })
    test: Dataset({
        features: ['accent', 'input_values', 'label'],
        num_rows: 297
    })
    valid: Dataset({
        features: ['accent', 'input_values', 'label'],
        num_rows: 296
    })
})

### Finetuning Wav2Vec

In [100]:
# Load the pretrained model with a sequence-classification head
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./wav2vec2-accent-classification",
    evaluation_strategy="epoch",
    # Checkpoint once per epoch, in step with evaluation. The previous
    # save_steps=500 never triggered here: 2371 training rows at batch size 16
    # is ~149 steps per epoch (on a single device), i.e. ~447 steps over 3
    # epochs, so an interrupted run left no checkpoint behind. eval_steps was
    # dropped because it is ignored when evaluating per epoch.
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_steps=100,
    learning_rate=1e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    report_to="none",  # No reporting to Weights & Biases, etc.
)

# Metric callback used during evaluation
def compute_metrics(pred):
    """Compute classification accuracy for a set of predictions.

    ``pred`` must expose ``label_ids`` (the true class ids) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.
    Returns a dict with a single ``"accuracy"`` entry: the fraction of
    predictions that equal their label.
    """
    predicted_ids = pred.predictions.argmax(-1)
    n_correct = (predicted_ids == pred.label_ids).sum()
    return {"accuracy": n_correct / len(pred.label_ids)}

# Initialize the Trainer: trains on the 'train' split and evaluates on the
# 'valid' split, reporting the accuracy computed by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
    tokenizer=processor,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop
trainer.train()

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6538,0.689925,0.540541
2,0.696,0.693112,0.540541


KeyboardInterrupt: 

Note: with more than 5,000 samples, the model reached 0.92 accuracy and about 0.18 validation and training loss.

### Testing


Evaluation

In [None]:
# Evaluate the trainer's model on the held-out 'test' split and print its accuracy
results = trainer.evaluate(dataset["test"])
print(f"Accuracy: {results['eval_accuracy']:.4f}")


### Save model

In [None]:
# Save the model and its processor to the same directory so both can be
# reloaded from that one path later.
model.save_pretrained("./model-wav2vec2-accent-classification")
processor.save_pretrained("./model-wav2vec2-accent-classification")


[]

### Load and make inference

In [31]:
# Shuffle the rows of the DataFrame (seeded) and renumber the index from 0
shuffled_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Show the first rows of the shuffled DataFrame
shuffled_df.head()


Unnamed: 0,client_id,path,audio,sentence,up_votes,down_votes,age,gender,accent,locale,segment
0,344bf332f298134d3880f9e38710e4c7e7037157bd63a3...,/root/.cache/huggingface/datasets/downloads/ex...,"{'bytes': None, 'path': '/root/.cache/huggingf...",She also plays the guitar and lute.,2,0,twenties,female,england,en,
1,27684391c8f6095629c2c12a88cb47055fa3abc33d53e3...,/root/.cache/huggingface/datasets/downloads/ex...,"{'bytes': None, 'path': '/root/.cache/huggingf...","""He's totally daft about birds.""",2,0,teens,female,us,en,
2,500d18a3257795d8ea82b3c39440f7656ecda7b29519f3...,/root/.cache/huggingface/datasets/downloads/ex...,"{'bytes': None, 'path': '/root/.cache/huggingf...",The proceeds of the book benefited several cha...,3,0,thirties,female,us,en,
3,2429806e5f5039a19ed9376741f82f6512e720f71bd07f...,/root/.cache/huggingface/datasets/downloads/ex...,"{'bytes': None, 'path': '/root/.cache/huggingf...",The river is crossed by the Murray Valley High...,2,0,twenties,male,england,en,
4,4875a5fc17c3936321a179429a287bc6971775f7f8b5d2...,/root/.cache/huggingface/datasets/downloads/ex...,"{'bytes': None, 'path': '/root/.cache/huggingf...",This car park is available for users of both M...,2,0,fifties,female,us,en,


In [64]:
# Take the sample at index 1 of the shuffled DataFrame as the inference example:
# its true accent and its transcript.
expected_accent = shuffled_df.loc[1, 'accent']
sentence = shuffled_df.loc[1, 'sentence']
expected_accent

'us'

In [66]:
import torch
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import torchaudio

# Reload the fine-tuned model and its processor from the directory they were saved to
model = Wav2Vec2ForSequenceClassification.from_pretrained("./model-wav2vec2-accent-classification")
processor = Wav2Vec2Processor.from_pretrained("./model-wav2vec2-accent-classification")

# Move the model to the GPU if one is available, otherwise use the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Path of the audio file to classify (the sample at index 1 of the shuffled DataFrame)
audio_path = shuffled_df['path'][1]

# Preprocess a new audio clip for inference
def preprocess_audio(audio_path, max_seconds=6.0):
    """Load an audio file and build the model inputs for a single clip.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.
        max_seconds: Length, in seconds, of the window the clip is padded or
            truncated to. Defaults to 6 so that inference uses the same window
            as the training-time ``load_audio`` helper; the previous 10-second
            window fed the model inputs padded differently from those it was
            trained on.

    Returns:
        The processor output (PyTorch tensors) for the clip, ready to be passed
        to the model as keyword arguments.
    """
    target_rate = processor.feature_extractor.sampling_rate

    # torchaudio returns a (channels, frames) tensor
    waveform, sample_rate = torchaudio.load(audio_path)

    # Down-mix multi-channel audio to mono so the feature extractor receives a
    # single 1-D clip rather than a 2-D array it would treat as a batch.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample only when the file's rate differs from the one the model expects
    if sample_rate != target_rate:
        waveform = torchaudio.transforms.Resample(sample_rate, target_rate)(waveform)

    # Window length in samples
    max_length = int(target_rate * max_seconds)
    inputs = processor(waveform.squeeze(0).numpy(),
                       sampling_rate=target_rate,
                       return_tensors="pt",
                       padding="max_length",
                       truncation=True,
                       max_length=max_length)

    return inputs

# Preprocess the new audio clip
inputs = preprocess_audio(audio_path)

# Move the inputs to the same device as the model
inputs = {k: v.to(device) for k, v in inputs.items()}

# Run inference without tracking gradients
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=-1).item()

# Map the predicted class id to the accent name using the mapping stored in the
# loaded model's config, rather than a re-typed literal that could drift from
# the labels the model was actually trained with.
id2label = model.config.id2label
predicted_accent = id2label[predicted_class_id]

print(f"The predicted accent is: {predicted_accent}")


The predicted accent is: england


In [77]:
from IPython.display import Audio

# Path of the audio file at index 1 of the shuffled DataFrame
# (the same sample used for the inference example)
audio_path = shuffled_df['path'][1]

# Play the audio file
Audio(audio_path)
