# Tarea 3. Transformer con llamadas a funciones

Una evolución interesante de los primeros LLMs fue la incorporación de llamadas a funciones para superar las limitaciones de los modelos de lenguaje en cálculos concretos. <br>
Para esta tarea final utilizamos los ficheros fechas2_train_function.en.csv y fechas2_train_function.es.csv, en los que se añade al final del texto a reconocer un separador ‘|’ y un fragmento de código Python con la llamada a la función que el sistema debe ejecutar. <br>
Prepare un sistema como el de la tarea 1 que entrene combinando las listas de los dos idiomas para que sea bilingüe dentro del contexto simplificado de esta tarea.
En test, esta vez, en lugar de medir la tasa de error WER, ejecutaremos el código de la sección que sigue al separador y compararemos la fecha que genera con la de referencia de la tabla: <br>
fechas2_test_function.csv <br>
y evaluaremos la tasa de error. <br>

# Imports

In [1]:
import os
import torch
import torch, torchaudio, glob
import numpy as np
import random
import csv

from Trabajo_Utils import NoiseAug, RIRAug, identity, wer
from Trabajo_Model import *

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Fixed token length: default seq_len of the datasets and seq_len of the model.
SEQ_LEN = 20
# Number of epochs of the training loop (that loop is currently commented out).
NB_EPOCHS = 5
# Mini-batch size of the training DataLoader.
BATCH_SIZE = 32

def seed_everything(seed):
    """Seed every random number generator used here so runs are repeatable.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; it is a no-op when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick convolution algorithms at
    # run time, which may differ between runs; it has to be off for the
    # deterministic flag above to give reproducible results.
    torch.backends.cudnn.benchmark = False


seed_everything(42)

### Data Downloads

In [3]:
# dataset = load_dataset("FluidInference/musan", split="train")

# if not os.path.exists("musan_small"):
#     os.makedirs("musan_small", exist_ok=True)
#     for i, example in enumerate(dataset):
#         audio = example["audio"]["array"]
#         sr = example["audio"]["sampling_rate"]
#         sf.write(f"musan_small/file_{i}.wav", audio, sr)
        
# url = 'https://openslr.elda.org/resources/28/rirs_noises.zip'

# if not os.path.exists('RIRS_NOISES'):
#     if not os.path.exists('rirs_noises.zip'):
#         os.system('wget ' + url)
#     os.system('unzip -q rirs_noises.zip')
#     os.system('rm rirs_noises.zip')

In [4]:
from datetime import datetime, timedelta

def relative_day(days_to_add, current_date='21/11/2025'):
    """Return the date lying ``days_to_add`` days away from ``current_date``.

    Args:
        days_to_add: Day offset; anything ``int()`` accepts (``2``, ``-1``,
            ``"+3"``). Negative values move backwards in time.
        current_date: Reference date written as ``dd/mm/YYYY``.

    Returns:
        The shifted date as a ``dd/mm/YYYY`` string.

    Raises:
        ValueError: If ``current_date`` does not match the format or
            ``days_to_add`` cannot be converted to an integer.
    """
    pattern = "%d/%m/%Y"
    start = datetime.strptime(current_date, pattern)
    return (start + timedelta(days=int(days_to_add))).strftime(pattern)

def next_day(day_name, current_date='21/11/2025'):
    """Return the next occurrence of a weekday strictly after ``current_date``.

    Args:
        day_name: English weekday name; surrounding whitespace and letter
            case are ignored.
        current_date: Reference date written as ``dd/mm/YYYY``.

    Returns:
        The date of the next such weekday as a ``dd/mm/YYYY`` string. When
        ``current_date`` already falls on that weekday the result is one
        week later.

    Raises:
        ValueError: If ``day_name`` is not a known weekday name or
            ``current_date`` does not match the format.
    """
    pattern = "%d/%m/%Y"
    start = datetime.strptime(current_date, pattern)

    # Position in this tuple + 1 is the ISO weekday code (Monday=1 .. Sunday=7).
    weekdays = (
        "monday",
        "tuesday",
        "wednesday",
        "thursday",
        "friday",
        "saturday",
        "sunday",
    )
    day_name = day_name.strip().lower()
    if day_name not in weekdays:
        raise ValueError(f"Invalid day name: {day_name}")
    iso_target = weekdays.index(day_name) + 1

    # A gap of 0 would mean "today"; `or 7` pushes it to the following week.
    gap = (iso_target - start.isoweekday()) % 7 or 7
    return (start + timedelta(days=gap)).strftime(pattern)

In [5]:
class WordTokenizer:
    """Whitespace word-level tokenizer whose vocabulary is read from CSV files.

    Ids 0-3 are reserved for ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>``.
    Every other (lower-cased) word receives the next free id in order of
    first appearance, so the order in which files are added matters.
    """

    def __init__(self, csv_file):
        """Register the special tokens, then add the words of ``csv_file``."""
        specials = ('<pad>', '<sos>', '<eos>', '<unk>')
        self.word2index = {token: idx for idx, token in enumerate(specials)}
        self.index2word = dict(enumerate(specials))

        self.build_vocab(csv_file)

    def build_vocab(self, csv_file):
        """Extend the vocabulary with every new word in the ``txt`` column.

        Can be called repeatedly to merge several CSV files into one
        vocabulary; words that are already known keep their id.
        """
        with open(csv_file, newline='', encoding='utf-8') as handle:
            for row in csv.DictReader(handle):
                for token in row['txt'].lower().split():
                    if token in self.word2index:
                        continue
                    new_id = len(self.word2index)
                    self.word2index[token] = new_id
                    self.index2word[new_id] = token

    def encode(self, text, seq_len=-1):
        """Turn ``text`` into a tensor of ids wrapped in ``<sos>`` / ``<eos>``.

        Unknown words map to ``<unk>``. Sequences shorter than ``seq_len``
        are right-padded with ``<pad>``; otherwise the list is sliced to
        ``ids[:seq_len]``.

        NOTE: with the default ``seq_len=-1`` that slice is ``ids[:-1]``,
        i.e. the trailing ``<eos>`` is dropped.
        """
        unk_id = self.word2index['<unk>']
        words = ['<sos>', *text.lower().split(), '<eos>']
        ids = [self.word2index.get(word, unk_id) for word in words]

        missing = seq_len - len(ids)
        if missing > 0:
            ids.extend([self.word2index['<pad>']] * missing)
        else:
            ids = ids[:seq_len]

        return torch.tensor(ids)

    def decode(self, ids):
        """Turn a tensor or list of ids back into text.

        Ids that are not in the vocabulary are rendered as ``<unk>``; the
        ``<sos>``, ``<eos>`` and ``<pad>`` markers are removed and the
        result is stripped of leading and trailing whitespace.
        """
        if isinstance(ids, torch.Tensor):
            ids = ids.tolist()

        text = ' '.join(self.index2word.get(idx, '<unk>') for idx in ids)
        for marker in ('<sos>', '<eos>', '<pad>'):
            text = text.replace(marker, '')
        return text.strip()


In [6]:
class TrainDataset(torch.utils.data.Dataset):
    """Training set of (waveform, token ids) pairs listed in a CSV file.

    The CSV needs a ``wav`` column with the audio path and a ``txt`` column
    with the target text.
    """

    def __init__(self, csv_file, tokenizer, audio_len=4*16000, seq_len=SEQ_LEN):
        """Read the (wav path, text) pairs from ``csv_file``.

        Args:
            csv_file: Path of the UTF-8 CSV file to read.
            tokenizer: Object whose ``encode(text, seq_len)`` returns the
                target ids.
            audio_len: Number of samples every waveform is padded or cropped
                to (default 4*16000 — presumably 4 s at 16 kHz; the sample
                rate is not checked here).
            seq_len: Token length handed to ``tokenizer.encode``.
        """
        self.audio_len = audio_len
        self.seq_len = seq_len
        self.tokenizer = tokenizer

        # newline='' is what the csv module asks for (it keeps newlines that
        # sit inside quoted fields intact) and matches how
        # WordTokenizer.build_vocab opens the very same files.
        with open(csv_file, newline='', encoding='utf-8') as f:
            self.data = [(row['wav'], row['txt']) for row in csv.DictReader(f)]

    def __len__(self):
        """Return the number of utterances."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(waveform, token_ids)`` for utterance ``idx``."""
        wav_path, text = self.data[idx]
        # The second value returned by torchaudio.load is ignored.
        waveform, _ = torchaudio.load(wav_path)

        # Axis 1 is treated as time: zero-pad on the right or crop so that
        # every item has exactly audio_len samples.
        n_samples = waveform.shape[1]
        if n_samples < self.audio_len:
            waveform = torch.nn.functional.pad(
                waveform, (0, self.audio_len - n_samples)
            )
        else:
            waveform = waveform[:, :self.audio_len]

        # Keep only the first row (first channel) as a 1-D signal.
        return waveform[0], self.tokenizer.encode(text, self.seq_len)

class TestDataset(torch.utils.data.Dataset):
    """Test set of (waveform, token ids) pairs listed in a CSV file.

    The CSV needs a ``wav`` column with the audio path and a ``txt`` column.
    The evaluation code in this notebook reads ``self.data[i][1]`` (the
    ``txt`` value) as the reference date to compare predictions against.
    """

    def __init__(self, csv_file, tokenizer, audio_len=4*16000, seq_len=SEQ_LEN):
        """Read the (wav path, text) pairs from ``csv_file`` and print their count.

        Args:
            csv_file: Path of the UTF-8 CSV file to read.
            tokenizer: Object whose ``encode(text, seq_len)`` returns ids.
            audio_len: Number of samples every waveform is padded or cropped
                to (default 4*16000 — presumably 4 s at 16 kHz; the sample
                rate is not checked here).
            seq_len: Token length handed to ``tokenizer.encode``.
        """
        self.audio_len = audio_len
        self.seq_len = seq_len
        self.tokenizer = tokenizer

        # newline='' is what the csv module asks for (it keeps newlines that
        # sit inside quoted fields intact) and matches how
        # WordTokenizer.build_vocab opens its CSV files.
        with open(csv_file, newline='', encoding='utf-8') as f:
            self.data = [(row['wav'], row['txt']) for row in csv.DictReader(f)]

        # Report how many test utterances were found.
        print(len(self.data))

    def __len__(self):
        """Return the number of utterances."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(waveform, token_ids)`` for utterance ``idx``."""
        wav_path, text = self.data[idx]

        # The second value returned by torchaudio.load is not needed.
        waveform, _ = torchaudio.load(wav_path)

        # Axis 1 is treated as time: zero-pad on the right or crop so that
        # every item has exactly audio_len samples.
        n_samples = waveform.shape[1]
        if n_samples < self.audio_len:
            waveform = torch.nn.functional.pad(
                waveform, (0, self.audio_len - n_samples), value=0
            )
        else:
            waveform = waveform[:, :self.audio_len]

        # Keep only the first row (first channel) as a 1-D signal.
        return waveform[0], self.tokenizer.encode(text, self.seq_len)

In [7]:
# Build one shared vocabulary: Spanish transcripts first, then English ones.
# Ids are handed out in order of first appearance, so this order fixes the
# word -> id mapping.
# NOTE(review): presumably the mapping must match the one used when
# model_function.pt was trained — confirm before reordering these two lines.
tokenizer = WordTokenizer('fechas2/fechas2_train_function.es.csv')
tokenizer.build_vocab('fechas2/fechas2_train_function.en.csv')

# One training dataset per language, both encoded with the shared tokenizer.
train_es = TrainDataset(
    'fechas2/fechas2_train_function.es.csv',
    tokenizer
)

train_en = TrainDataset(
    'fechas2/fechas2_train_function.en.csv',
    tokenizer
)

# Concatenate both languages so a single model is trained on the mix.
trainset = torch.utils.data.ConcatDataset([train_es, train_en])

# Test set; its constructor prints the number of utterances it read.
testset = TestDataset(
    'fechas2/fechas2_test_function.csv',
    tokenizer
)

# Shuffled mini-batches for training.
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

# One utterance at a time, in file order, for testing.
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=1,
    shuffle=False
)

1000


In [8]:
# Vocabulary size: the four special tokens plus every distinct word found in
# the two training CSVs.
vocab_size = len(tokenizer.word2index)

# AudioTransformer is imported from Trabajo_Model (not shown in this file).
# Note that n_heads * d_head = 8 * 32 = 256, which equals d_model.
model = AudioTransformer(
    vocab_size=vocab_size,
    d_model=256,
    nb_layers=8,
    d_ff=512,
    n_heads=8,
    d_head=32,
    dropout=0.1,
    seq_len=SEQ_LEN
)

model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
# Id of '<pad>'; the (commented-out) training loop passes it to model.loss —
# presumably so padded positions are ignored; confirm in Trabajo_Model.
pad_idx = tokenizer.word2index['<pad>']

In [9]:
# import math

# BATCH_FACTOR = 3
# MAX_BATCH = None
# if device == 'cpu':
#     MAX_BATCH = math.ceil(len(trainloader) / BATCH_FACTOR)

# model.train()
# for epoch in range(NB_EPOCHS):
#     total_loss = 0

#     for i, (x, y) in enumerate(trainloader):
#         if MAX_BATCH is not None and i >= MAX_BATCH:
#             break

#         x = x.to(device)
#         y = y.to(device)

#         optimizer.zero_grad()
#         loss = model.loss(x, y, pad_idx=pad_idx)
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#     print(f"Epoch {epoch+1}/{NB_EPOCHS} - Loss: {total_loss/(i+1):.3f}")

# torch.save(model.state_dict(), "model_function.pt")


In [11]:
def extract_code(text):
    """Return the code that follows the first ``|`` separator in ``text``.

    Args:
        text: Decoded hypothesis of the form ``"<transcript> | <code>"``.

    Returns:
        Everything after the first ``|`` with surrounding whitespace
        removed (later ``|`` characters are kept), or ``None`` when the
        text contains no separator.
    """
    if '|' in text:
        return text.split('|', 1)[1].strip()
    return None

def execute_code(code):
    """Evaluate a generated function call and return its result.

    Args:
        code: Python expression produced by the model, e.g.
            ``"relative_day(+2)"`` or ``"next_day('thursday')"``.

    Returns:
        Whatever the expression evaluates to, or ``None`` when evaluating
        it raises an exception (malformed or unsupported code).
    """
    # SECURITY: eval() executes text generated by the model. Only the two
    # date helpers are exposed, and "__builtins__" is set to an empty dict
    # because eval() would otherwise inject the full builtins module
    # (__import__, open, ...) into the namespace. This narrows the attack
    # surface but is not a real sandbox; do not feed it untrusted text.
    # The namespace is built outside the try block so that a missing helper
    # definition fails loudly instead of turning every prediction into None.
    allowed_names = {
        "__builtins__": {},
        "relative_day": relative_day,
        "next_day": next_day,
    }
    try:
        return eval(code, allowed_names)
    except Exception:
        # Best effort: code that cannot be evaluated yields no prediction.
        return None

In [12]:
# Rebuild the network with the same hyper-parameters as the model defined for
# training, so that the saved state dict can be loaded into it.
model = AudioTransformer(
    vocab_size=vocab_size,
    d_model=256,
    nb_layers=8,
    d_ff=512,
    n_heads=8,
    d_head=32,
    dropout=0.1,
    seq_len=SEQ_LEN
)

# Load the weights stored in model_function.pt, mapping tensors to `device`.
state = torch.load("model_function.pt", map_location=device)
model.load_state_dict(state)
model.to(device)
# Switch to evaluation mode for inference.
model.eval()

AudioTransformer(
  (fe): AudioFeatures(
    (fe): MelSpectrogram(
      (spectrogram): Spectrogram()
      (mel_scale): MelScale()
    )
    (spec_aug): SpecAug()
    (linear): Linear(in_features=80, out_features=256, bias=True)
  )
  (enc): Encoder(
    (att): ModuleList(
      (0-7): 8 x SelfAttention(
        (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (q_linear): Linear(in_features=256, out_features=256, bias=True)
        (v_linear): Linear(in_features=256, out_features=256, bias=True)
        (k_linear): Linear(in_features=256, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (out): Linear(in_features=256, out_features=256, bias=True)
      )
    )
    (ff): ModuleList(
      (0-7): 8 x FeedForward(
        (ff): Sequential(
          (0): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (1): Linear(in_features=256, out_features=512, bias=True)
          (2): ReLU()
          (3): Dropout(p=0.1, inplac

In [13]:
# Qualitative check on the first five test utterances.
for i in range(5):
    x, _ = testset[i]
    # For the test CSV the second field ('txt') is used as the reference date.
    wav, ref_date = testset.data[i]

    # Add a leading batch dimension of size 1 before generation.
    # NOTE(review): x is not moved to `device` here; presumably
    # model.generate takes care of that — confirm in Trabajo_Model.
    hyp_ids = model.generate(x.unsqueeze(0), tokenizer)
    hyp_text = tokenizer.decode(hyp_ids)

    # Run the code found after '|' (if any) to obtain the predicted date.
    code = extract_code(hyp_text)
    pred_date = execute_code(code) if code else None

    print("REF DATE :", ref_date)
    print("HYP TEXT :", hyp_text)
    print("PRED DATE:", pred_date)
    print("-"*40)


REF DATE : 23/11/2025
HYP TEXT : the day after tomorrow thanks | relative_day(+2)
PRED DATE: 23/11/2025
----------------------------------------
REF DATE : 23/11/2025
HYP TEXT : please this coming thursday | next_day('thursday')
PRED DATE: 27/11/2025
----------------------------------------
REF DATE : 23/11/2025
HYP TEXT : por favor en un par de días | relative_day(+2)
PRED DATE: 23/11/2025
----------------------------------------
REF DATE : 27/11/2025
HYP TEXT : this thursday thank you | next_day('thursday')
PRED DATE: 27/11/2025
----------------------------------------
REF DATE : 26/11/2025
HYP TEXT : please next wednesday | next_day('wednesday')
PRED DATE: 26/11/2025
----------------------------------------


In [14]:
def evaluate_function(model, dataset, tokenizer, gen_mode='greedy'):
    """Measure how often the generated code reproduces the reference date.

    For every item the model generates a hypothesis, the code after ``|`` is
    extracted and executed, and its result is compared with the reference
    stored in ``dataset.data[i][1]``.

    Args:
        model: Model exposing ``generate``, ``generate_sampling``,
            ``generate_topk`` and ``generate_beam_search``.
        dataset: Indexable dataset returning ``(x, y)`` items and exposing
            a ``data`` list whose entries hold the reference at index 1.
        tokenizer: Tokenizer passed to the generation method and used to
            decode its output.
        gen_mode: One of ``'greedy'``, ``'sampling'``, ``'topk'`` or
            ``'beam'`` (beam search uses ``beam_size=5``).

    Returns:
        Fraction of items whose executed code equals the reference.
        Hypotheses without a ``|`` separator count as errors. An empty
        dataset yields ``0.0``.

    Raises:
        ValueError: If ``gen_mode`` is not one of the supported modes.
    """
    # Dispatch table; the lambdas defer attribute lookup on `model` until a
    # mode is actually used.
    decoders = {
        'greedy': lambda x: model.generate(x, tokenizer),
        'sampling': lambda x: model.generate_sampling(x, tokenizer),
        'topk': lambda x: model.generate_topk(x, tokenizer),
        'beam': lambda x: model.generate_beam_search(x, tokenizer, beam_size=5),
    }
    # Fail early: previously an unknown mode left hyp_ids unbound.
    if gen_mode not in decoders:
        raise ValueError(
            f"Unknown gen_mode {gen_mode!r}; expected one of {sorted(decoders)}"
        )
    generate = decoders[gen_mode]

    total = len(dataset)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    correct = 0
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for i in range(total):
            x, _ = dataset[i]
            hyp = tokenizer.decode(generate(x.unsqueeze(0)))

            code = extract_code(hyp)
            if code is None:
                # No separator means no code to run; counted as an error.
                continue

            if execute_code(code) == dataset.data[i][1]:
                correct += 1

    return correct / total


In [13]:
# Reload the checkpoint weights (same file as loaded earlier) and make sure
# the model is in evaluation mode before scoring.
model.load_state_dict(torch.load("model_function.pt", map_location=device))
model.eval()

# Accuracy = fraction of test utterances whose executed code yields the
# reference date, measured once per decoding strategy.
acc = evaluate_function(model, testset, tokenizer, gen_mode = 'greedy')
print(f"Accuracy Tarea 3: {acc:.3f}")

# NOTE(review): sampling and top-k are presumably stochastic, so these two
# figures may vary between runs — confirm in Trabajo_Model.
acc = evaluate_function(model, testset, tokenizer, gen_mode = 'sampling')
print(f"Accuracy Tarea 3 - sampling: {acc:.3f}")

acc = evaluate_function(model, testset, tokenizer, gen_mode = 'topk')
print(f"Accuracy Tarea 3 - topk: {acc:.3f}")

Accuracy Tarea 3: 0.951
Accuracy Tarea 3 - sampling: 0.931
Accuracy Tarea 3 - topk: 0.917


In [15]:
# Same accuracy measure using beam search (evaluate_function uses beam_size=5).
acc = evaluate_function(model, testset, tokenizer, gen_mode = 'beam')
print(f"Accuracy Tarea 3 - beam: {acc:.3f}")

Accuracy Tarea 3 - beam: 0.954
