In [8]:
import pandas as pd

# Locations of the tab-separated training and validation files
train_tsv_file_path = '/tmp/cyc/23Fall-269/Train_GCC-training.tsv'
val_tsv_file_path = '/tmp/cyc/23Fall-269/Validation_GCC-1.1.0-Validation.tsv'

# The files are parsed without a header row (header=None), so pandas labels the
# columns 0, 1, ...; only column 0 is kept, which makes each result a Series.
train_df = pd.read_csv(train_tsv_file_path, sep='\t', header=None)[0]
val_df = pd.read_csv(val_tsv_file_path, sep='\t', header=None)[0]

0            author : a life in photography -- in pictures
1                  an angler fishes river on a snowy day .
2        photograph of the sign being repaired by brave...
3        the player staring intently at a computer scre...
4        globes : the green 3d person carrying in hands...
                               ...                        
15835    a bougainvillea with pink flowers on a white b...
15836        ingredient hanging over river during festival
15837            the general circulation of the atmosphere
15838    young teenager and her black horse in a traini...
15839    person warms up during a game against american...
Name: 0, Length: 15840, dtype: object


In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer

class Seq2SeqDataset(Dataset):
    """Dataset of identity (input == target) examples built from text entries.

    Args:
        dataframe: Container of source texts that supports ``len()`` and
            positional access through ``.iloc[idx]`` (e.g. a pandas Series).
        tokenizer: Callable tokenizer. It is invoked with the text plus the
            keyword arguments used in ``__getitem__`` and must return a mapping
            holding ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every example is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of entries in the underlying container."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize entry ``idx`` and return it as both input and target.

        Returns:
            dict with ``input_ids``, ``attention_mask``, ``target_ids`` and
            ``target_mask``. The target entries are the same tensors as the
            input entries.
        """
        source_sentence = self.data.iloc[idx]

        # Calling the tokenizer directly is the supported replacement for the
        # deprecated ``encode_plus`` and takes the same arguments.
        encoded = self.tokenizer(
            source_sentence,
            add_special_tokens=True,
            max_length=self.max_length,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
        )

        # return_tensors='pt' yields a leading batch axis of size 1. Remove only
        # that axis: a bare squeeze() would also drop the sequence axis when
        # max_length == 1 and produce 0-d tensors.
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)

        # NOTE: target_ids still contain the padding ids; a consumer computing a
        # loss should ignore the positions where target_mask is 0.
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'target_ids': input_ids,  # Target is the same as the input
            'target_mask': attention_mask,
        }


  from .autonotebook import tqdm as notebook_tqdm
tokenizer_config.json: 100%|██████████| 2.32k/2.32k [00:00<00:00, 18.2MB/s]
spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 1.01MB/s]
tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 1.67MB/s]
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
from tqdm import tqdm

# Define the autoencoder model
class Autoencoder(nn.Module):
    """Thin wrapper around a pretrained T5 model used as a text autoencoder.

    Args:
        model_name: Checkpoint name passed to
            ``T5ForConditionalGeneration.from_pretrained``.
    """

    def __init__(self, model_name='t5-small'):
        super().__init__()
        self.encoder_decoder = T5ForConditionalGeneration.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, target_ids=None, target_mask=None):
        """Return the training loss, or generated token ids when no target is given.

        Args:
            input_ids: Token ids of the source sequences.
            attention_mask: Mask for ``input_ids`` (0 marks padding).
            target_ids: Token ids to reconstruct. When ``None`` the model
                generates instead of computing a loss.
            target_mask: Mask for ``target_ids`` (0 marks padding). When given,
                padded target positions are excluded from the loss; when
                ``None`` the targets are used unchanged.

        Returns:
            The loss reported by the wrapped model in training mode, otherwise
            the output of ``generate``.
        """
        if target_ids is not None:
            # Training mode: input and target are provided
            labels = target_ids
            if target_mask is not None:
                # -100 is the label value the loss ignores. Without this the
                # padding positions are scored like real tokens. masked_fill
                # returns a new tensor, so the caller's target_ids is untouched.
                labels = target_ids.masked_fill(target_mask == 0, -100)

            outputs = self.encoder_decoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            return outputs.loss

        # Inference mode: only input is provided.
        # early_stopping is a beam-search option and has no effect with
        # num_beams=1, so it is not passed.
        # NOTE(review): no_repeat_ngram_size=2 forbids any repeated bigram, so an
        # input that legitimately repeats one cannot be reproduced exactly, and
        # max_length=50 caps the output length — confirm both are intended.
        return self.encoder_decoder.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=50,  # Set a reasonable maximum length for generated sequences
            num_beams=1,  # Set to 1 for greedy decoding
            no_repeat_ngram_size=2,  # Avoid repeating bigrams in the output
        )
        


In [None]:
# Load the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Create the datasets and DataLoaders.
# The validation set is built from val_df so that the EM score is measured on
# held-out data rather than on the training captions.
train_dataset = Seq2SeqDataset(train_df, tokenizer)
val_dataset = Seq2SeqDataset(val_df, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

# Initialize the autoencoder model on the GPU when one is available.
# The model is moved before the optimizer is created so the optimizer tracks
# the parameters on their final device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
autoencoder_model = Autoencoder().to(device)

# Define the optimizer and learning rate scheduler (LR is multiplied by 0.9 each epoch)
optimizer = optim.AdamW(autoencoder_model.parameters(), lr=5e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    total_loss = 0
    autoencoder_model.train()

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        # Batches come off the DataLoader on the CPU; move them next to the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        target_ids = batch['target_ids'].to(device)
        target_mask = batch['target_mask'].to(device)

        optimizer.zero_grad()
        loss = autoencoder_model(input_ids, attention_mask, target_ids, target_mask)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}')

    # Optionally update the learning rate
    scheduler.step()

    # Evaluate with Exact Match (EM) on a validation set
    autoencoder_model.eval()
    with torch.no_grad():
        em_count = 0
        total_samples = 0

        for val_batch in tqdm(val_dataloader, desc=f'Validation - Epoch {epoch + 1}'):
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)

            # Generate sequences
            generated_ids = autoencoder_model(input_ids, attention_mask).cpu().tolist()

            # Decode token IDs to strings. The dataset returns no raw-text
            # 'target' entry, so the references are decoded from target_ids;
            # both sides then go through the same tokenizer round trip.
            generated_sentences = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]
            target_sentences = [
                tokenizer.decode(ids, skip_special_tokens=True)
                for ids in val_batch['target_ids'].tolist()
            ]

            # Check for exact match
            em_count += sum(1 for gen, target in zip(generated_sentences, target_sentences) if gen == target)
            total_samples += len(generated_sentences)

        # Guard against an empty validation loader.
        em_score = em_count / total_samples if total_samples else 0.0
        print(f'Validation EM Score: {em_score}')

In [14]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m608.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
