### Install dependencies

In [1]:
pip install tensorflow librosa huggingface_hub torchaudio


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.1->torchaudio)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.1->torchaudio)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.1->torchaudio)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.1->torchaudio)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.2.1->torchaudio)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.2.1->torchaudio)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-an

### Load dataset


In [3]:
from datasets import load_dataset

# Load the "hvb" configuration of the SLUE Phase-2 dataset from the Hugging Face Hub.
dataset = load_dataset("asapp/slue-phase-2", name="hvb")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/383M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/387M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/110M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/407M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11344 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1690 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6121 [00:00<?, ? examples/s]

### Data preprocessing

In [4]:
import librosa
import numpy as np

def pad_mfcc(mfcc, max_len=100):
    """Force an MFCC matrix to exactly ``max_len`` rows along its first axis.

    Inputs with fewer than ``max_len`` rows get zero-filled rows appended at
    the end; longer inputs are cut down to their first ``max_len`` rows.

    Args:
        mfcc: 2-D array; the first axis is the one padded or truncated.
        max_len: Target number of rows. Make sure it suits the data.

    Returns:
        The padded or truncated array with ``max_len`` rows.
    """
    missing_rows = max_len - len(mfcc)
    if missing_rows > 0:
        # Pad only at the end of the first axis; the second axis is left untouched.
        mfcc = np.pad(mfcc, ((0, missing_rows), (0, 0)), mode='constant')
    # Slicing guarantees the result never has more than max_len rows.
    return mfcc[:max_len]


def preprocess_audio(audio, sr=16000, n_mfcc=13, max_len=100):
    """Turn a raw waveform into a fixed-length MFCC feature matrix.

    Args:
        audio: Waveform samples, passed to librosa as ``y``.
        sr: Sampling rate of ``audio`` in Hz.
        n_mfcc: Number of MFCC coefficients computed per frame.
        max_len: Number of frames kept after padding/truncation.

    Returns:
        The transposed MFCC matrix, padded or truncated by ``pad_mfcc`` to
        ``max_len`` rows (time frames first, coefficients second).
    """
    # Extract MFCC features and transpose so that time becomes the first axis.
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc).T
    # Pad or truncate along the time axis.
    # NOTE: no feature normalization is applied here.
    return pad_mfcc(mfcc, max_len=max_len)


In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Initialize the tokenizer used to convert text into integer sequences.
# char_level=True tokenizes per character; the vocabulary is fit on the
# training-split transcriptions only.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(dataset['train']['text'])

from tensorflow.keras.preprocessing.sequence import pad_sequences

def encode_transcriptions(text, maxlen=100):
    """Encode one transcription as a fixed-length row of token ids.

    Args:
        text: Transcription string to encode with the module-level ``tokenizer``.
        maxlen: Length of the encoded sequence; adjust as needed.

    Returns:
        Integer array of shape ``(1, maxlen)``: a single padded/truncated row.
    """
    sequence = tokenizer.texts_to_sequences([text])
    # Pad AND truncate at the end. pad_sequences truncates at the start by
    # default, which would drop the beginning of long transcriptions while
    # pad_mfcc keeps the beginning of the audio.
    padded_sequence = pad_sequences(
        sequence, maxlen=maxlen, padding='post', truncating='post'
    )
    return padded_sequence



### Model definition

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, InputLayer

def build_model(input_dim, output_vocab_size):
    """Assemble and compile the sequence model used in this notebook.

    The network is an LSTM that emits one output per time step, followed by a
    softmax layer over the output vocabulary.

    Args:
        input_dim: Number of features per time step of the input.
        output_vocab_size: Number of classes predicted at each time step.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # The time dimension is left unspecified (None).
    model.add(InputLayer(input_shape=(None, input_dim)))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dense(output_vocab_size, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return model

# Build the model with the matching input and output sizes: 13 features per
# frame, and one class per tokenizer entry plus one extra for index 0.
model = build_model(input_dim=13, output_vocab_size=len(tokenizer.word_index) + 1)


### Training

In [12]:
def batch_generator(dataset, batch_size=32, split='train'):
    """Yield consecutive lists of examples from one split of ``dataset``.

    Args:
        dataset: Mapping from split name to an iterable of examples.
        batch_size: Maximum number of examples per batch; must be >= 1.
        split: Key of the split to iterate over. Defaults to ``'train'``.

    Yields:
        Lists of at most ``batch_size`` examples, in iteration order. The
        final list may be shorter when the split size is not a multiple of
        ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is less than 1 (raised when iteration
            starts, since this is a generator).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    batch = []
    for item in dataset[split]:
        batch.append(item)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:
        # Emit the leftover examples that did not fill a whole batch.
        yield batch


In [None]:
# Inspect the first training example to see which fields each item carries.
print(dataset['train'][0])

{'issue_id': '0002f70f7386445b', 'audio': {'path': '0002f70f7386445b_1669_4339.wav', 'array': array([0., 0., 0., ..., 0., 0., 0.]), 'sampling_rate': 16000}, 'speaker_id': '46', 'text': 'hello this is harper valley national bank', 'utt_index': 1, 'channel': 2, 'role': 'agent', 'start_ms': 1669, 'duration_ms': 2670, 'intent': 'replace card', 'dialog_acts': ['statement_open']}


In [None]:
def train_model(model, dataset, epochs=10, batch_size=32):
    """Train ``model`` on the training split, one generated batch at a time.

    For every epoch the training examples are streamed through
    ``batch_generator``; each batch is converted to MFCC features and encoded
    transcriptions and passed to ``model.fit``.

    Args:
        model: Compiled Keras model to update in place.
        dataset: Dataset mapping handed to ``batch_generator``.
        epochs: Number of full passes over the training split.
        batch_size: Number of examples per generated batch.
    """
    for _ in range(epochs):
        for batch in batch_generator(dataset, batch_size):
            # Each item is a dataset record; the waveform lives under ['audio']['array'].
            audio_features = np.array(
                [preprocess_audio(item['audio']['array']) for item in batch]
            )
            # encode_transcriptions returns a (1, maxlen) row per text; take row 0
            # so the labels stack to (batch, maxlen) rather than (batch, 1, maxlen),
            # giving one label per model time step.
            transcriptions = np.array(
                [encode_transcriptions(item['text'])[0] for item in batch]
            )

            # Process the whole generated batch in a single step, whatever its size.
            model.fit(audio_features, transcriptions, batch_size=len(batch), verbose=1)

train_model(model, dataset)




### Evaluation

In [None]:
def evaluate_model(model):
    """Evaluate ``model`` on the validation split and print the loss.

    Reads the module-level ``dataset``, converts every validation example to
    MFCC features and encoded transcriptions, and prints the value returned
    by ``model.evaluate``.

    Args:
        model: Compiled Keras model to evaluate.
    """
    validation_data = dataset['validation']
    # Each entry of the 'audio' column is a dict (path / array / sampling_rate);
    # preprocess_audio needs the waveform itself, i.e. the 'array' field.
    audio_features = np.array(
        [preprocess_audio(audio['array']) for audio in validation_data['audio']]
    )
    # encode_transcriptions returns a (1, maxlen) row per text; take row 0 so
    # the labels stack to (num_examples, maxlen).
    transcriptions = np.array(
        [encode_transcriptions(text)[0] for text in validation_data['text']]
    )
    loss = model.evaluate(audio_features, transcriptions)
    print("Validation Loss:", loss)

evaluate_model(model)
