# import and setup

In [5]:
import collections

import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm


from datasets.arrow_dataset import Dataset

In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [29]:
from src.model.text_vae import TextVAE
from src.model.decoder import Decoder
from src.model.encoder import Encoder

In [43]:
from src.model.text_vae import TextVAE
from src.constants import PAD_INDEX, UNK, PAD, SOS, EOS
from src.train.eval import eval_text_vae
from src.gaussian_kldiv import GaussianKLDiv
import math

In [33]:
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
logger = logging.getLogger(__name__)

In [6]:
from datasets import Dataset
import pandas as pd

# dataloader

In [7]:
seed = 1234

np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [None]:


train_df = pd.read_csv('data/train.tsv', sep='\t')
train_df = pd.DataFrame(train_df)
train_data = Dataset.from_pandas(train_df, split='train')

test_df = pd.read_csv('data/test.tsv', sep='\t')
test_df = pd.DataFrame(test_df)
test_data = Dataset.from_pandas(train_df, split='test')

In [8]:
tokenizer = torchtext.data.utils.get_tokenizer("basic_english")



def tokenize_example(example, tokenizer, max_length):
    tokens = tokenizer(example["sentence"])[:max_length]
    return {"tokens": tokens}


print(train_data[0])

{'sentence': 'thanks again dustin for all your help !', 'label': 1}


In [9]:
max_length = 256

train_data = train_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)
test_data = test_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)

Map:   0%|          | 0/443259 [00:00<?, ? examples/s]

Map:   0%|          | 0/443259 [00:00<?, ? examples/s]

In [11]:
test_size = 0.25

train_valid_data = train_data.train_test_split(test_size=test_size)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

In [12]:
min_freq = 5
special_tokens = ["<unk>", "<pad>"]

vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

In [13]:
unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]

In [14]:
vocab.set_default_index(unk_index)

In [18]:
vocab_size = len(vocab)

In [19]:
def numericalize_example(example, vocab):
    ids = vocab.lookup_indices(example["tokens"])
    return {"ids": ids}

In [21]:
train_data = train_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
valid_data = valid_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
test_data = test_data.map(numericalize_example, fn_kwargs={"vocab": vocab})

Map:   0%|          | 0/249333 [00:00<?, ? examples/s]

Map:   0%|          | 0/83111 [00:00<?, ? examples/s]

Map:   0%|          | 0/443259 [00:00<?, ? examples/s]

In [22]:
valid_data = valid_data.with_format(type="torch", columns=["ids", "label" ])
train_data = train_data.with_format(type="torch", columns=["ids", "label" ])
test_data = test_data.with_format(type="torch", columns=["ids", "label"])

In [23]:
def pad_to_max_length(batch):
    # Extract `ids` as a list of tensors
    ids = batch["ids"]
    
    # Pad each tensor in `ids` to the same length
    padded_ids = nn.utils.rnn.pad_sequence(ids, batch_first=True, padding_value=pad_index)
    
    # Truncate to max_seq_len if necessary
    if padded_ids.shape[1] > max_length:
        padded_ids = padded_ids[:, :max_length]
    
    # Add padded data back to batch
    batch["padded_ids"] = [padded_ids[i] for i in range(len(padded_ids))]
    return batch

# Apply padding function to each batch in the dataset
# dataset = test_data.map(pad_to_max_length, batched=True, batch_size=len(test_data))


## i think padding is dones here later in the model....

# train_data = train_data.map(pad_to_max_length, batched=True, batch_size=len(train_data))
# valid_data = valid_data.map(pad_to_max_length, batched=True, batch_size=len(valid_data))
# test_data = test_data.map(pad_to_max_length, batched=True, batch_size=len(test_data))


Map:   0%|          | 0/249333 [00:00<?, ? examples/s]

Map:   0%|          | 0/83111 [00:00<?, ? examples/s]

Map:   0%|          | 0/443259 [00:00<?, ? examples/s]

In [24]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_ids = [i["padded_ids"] for i in batch]
        # batch_ids = nn.utils.rnn.pad_sequence(
        #     batch_ids, padding_value=pad_index, batch_first=True
        # )
        batch_ids = torch.stack(batch_ids)
        batch_label = [i["label"] for i in batch]
        batch_label = torch.stack(batch_label)
        batch = {"ids": batch_ids, "label": batch_label}
        return batch

    return collate_fn

In [25]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [26]:
batch_size = 320

train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [27]:
print("number of training samples: ", len(train_data))
print(1875-(batch_size * 5))

number of training samples:  249333
275


In [28]:
print(f"Number of train batches: {len(train_data_loader)}")
print(f"Number of valid batches: {len(valid_data_loader)}")
print(f"Number of test batches: {len(test_data_loader)}")

Number of train batches: 780
Number of valid batches: 260
Number of test batches: 1386


In [30]:
import yaml

config_path = 'config.yaml'
config = yaml.safe_load(open(config_path, 'r', encoding='utf-8'))

In [31]:
model = TextVAE(
    vocab_size=vocab_size,
    embed_size=config['embed_size'],
    hidden_size=config['hidden_size'],
    num_layers=config['num_layers'],
    dropout=config['dropout'],
    enc_dec_tying=config['enc_dec_tying'],
    dec_gen_tying=config['dec_gen_tying']
)


In [35]:
save_dict = model.state_dict()

In [44]:
logger.info('transfer model to GPU')
model = model.to(device)

logger.info('set up criterion and optimizer')
criterion = nn.CrossEntropyLoss(ignore_index=PAD_INDEX)
kldiv = GaussianKLDiv()
optimizer = optim.Adam(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])

26-Jan-25 21:25:09 - transfer model to GPU
26-Jan-25 21:25:09 - set up criterion and optimizer


In [37]:
save_path = 'model.pth'

In [45]:
logger.info('start train')

min_dev_loss = 1e9
corr_dev_wer = 1

for epoch in range(config['epoches']):

    total_tokens = 0
    correct_tokens = 0
    total_loss = 0

    for i, batch in enumerate(train_data_loader):

        model.train()
        optimizer.zero_grad()
        sentence = batch["ids"]

        ## move to desired device
        sentence = sentence.to(device)
        
        src = sentence[:, 1:]
        trg_input = sentence
        batch_size = sentence.size(0)
        pad = torch.zeros(size=(batch_size, 1), dtype=torch.long, device=sentence.device)
        trg_output = torch.cat((sentence[:, 1:], pad), dim=-1)

        logit, mean, std = model(src, trg_input)
        trg_output = trg_output.view(-1)
        output_size = logit.size(-1)
        logit = logit.view(-1, output_size)
        loss = criterion(logit, trg_output) + kldiv(mean, std) * config['lambd']
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), config['clip_grad_norm'])
        optimizer.step()

        mask = (trg_output != PAD_INDEX)
        token_num = mask.long().sum().item()
        total_tokens += token_num
        total_loss += token_num * loss.item()
        prediction = logit.argmax(dim=-1)
        correct_tokens += (prediction.masked_select(mask) == trg_output.masked_select(mask)).long().sum().item()

        if i % config['eval_freq'] == 0:
            train_loss = total_loss / total_tokens
            train_wer = 1 - correct_tokens / total_tokens
            total_loss = 0
            correct_tokens = 0
            total_tokens = 0
            dev_loss, dev_wer = eval_text_vae(model, valid_data_loader, criterion)
            logger.info('[epoch %2d step %4d]\ttrain_ppl: %.4f train_wer: %.4f dev_ppl: %.4f dev_wer: %.4f' %
                        (epoch, i, math.exp(train_loss), train_wer, math.exp(dev_loss), dev_wer))
            if dev_loss < min_dev_loss:
                min_dev_loss = dev_loss
                corr_dev_wer = dev_wer
                torch.save(model, save_path)

logger.info('dev_ppl: %.4f\tdev_wer: %.4f' % (math.exp(min_dev_loss), corr_dev_wer))
logger.info('finish')

26-Jan-25 21:25:13 - start train


RuntimeError: 'lengths' argument should be a 1D CPU int64 tensor, but got 1D cuda:0 Long tensor