In [9]:
import pandas as pd
import string

# Specify the path to your TSV file
train_tsv_file_path = '/home/allenfu/cyc/23Fall-269/Train_GCC-training.tsv'
val_tsv_file_path = '/home/allenfu/cyc/23Fall-269/Validation_GCC-1.1.0-Validation.tsv'

# Read the TSV file into a DataFrame
train_df = pd.read_csv(train_tsv_file_path, delimiter='\t', header=None)[0]
val_df = pd.read_csv(val_tsv_file_path, delimiter='\t', header=None)[0]

def remove_spaces(sentence):
    for punctuation in string.punctuation:
        sentence = sentence.replace(f' {punctuation}', punctuation)
    return ' '.join(sentence.split())

train_df = train_df.apply(remove_spaces)
val_df = val_df.apply(remove_spaces)

In [10]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer

class Seq2SeqDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=64):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_sentence = self.data.iloc[idx]
        
        # Tokenize and encode the source sentence
        source_tokens = self.tokenizer.encode_plus(
            source_sentence,
            add_special_tokens=True,
            max_length=self.max_length,
            return_tensors='pt',
            padding='max_length',
            truncation=True
        )

        return {
            'input_ids': source_tokens['input_ids'].squeeze(),
            'attention_mask': source_tokens['attention_mask'].squeeze(),
            'target_ids': source_tokens['input_ids'].squeeze(),  # Target is the same as the input
            'target_mask': source_tokens['attention_mask'].squeeze(),
            'target': source_sentence
        }

In [11]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
from tqdm import tqdm

device = 'cuda'
# Load the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Create the dataset and DataLoader
train_dataset = Seq2SeqDataset(train_df, tokenizer)
val_dataset = Seq2SeqDataset(val_df, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4)

# Initialize the autoencoder model
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

# Define the optimizer and learning rate scheduler
optimizer = optim.AdamW(t5_model.parameters(), lr=5e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Training loop
num_epochs = 3
best_em_score = 0.0

for epoch in range(num_epochs):
    total_loss = 0
    t5_model.train()

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        target_ids = batch['target_ids'].to(device)
        target_mask = batch['target_mask'].to(device)

        # Training mode: input and target are provided
        outputs = t5_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=target_ids
        )
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}')

    # Optionally update the learning rate
    scheduler.step()

    # Evaluate with Exact Match (EM) on a validation set
    t5_model.eval()
    with torch.no_grad():
        em_count = 0
        total_samples = 0

        for val_batch in tqdm(val_dataloader, desc=f'Validation - Epoch {epoch + 1}'):
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            target_ids = val_batch['target_ids'].to(device)
            target_mask = val_batch['target_mask'].to(device)

            # Inference mode: only input is provided
            outputs = t5_model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=64,  # Set a reasonable maximum length for generated sequences
                num_beams=1,  # Set to 1 for greedy decoding
                no_repeat_ngram_size=2,  # Avoid repeating bigrams in the output
                early_stopping=True
            )

            # Generate sequences
            generated_ids = outputs.detach().cpu().numpy()

            # Decode token IDs to strings
            generated_sentences = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]
            target_sentences = val_batch['target']

            # Check for exact match
            em_count += sum(1 for gen, target in zip(generated_sentences, target_sentences) if gen == target)
            total_samples += len(generated_sentences)

        em_score = em_count / total_samples
        print(f'Validation EM Score: {em_score}')

        # Save the model if the EM score improves
        if em_score > best_em_score:
            best_em_score = em_score
            torch.save(t5_model.state_dict(), 't5_model.pth')
            print("Model saved!")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Epoch 1/3: 100%|██████████| 51849/51849 [3:02:02<00:00,  4.75it/s]  


Epoch 1/3, Average Loss: 0.005091056606930831


Validation - Epoch 1: 100%|██████████| 248/248 [01:59<00:00,  2.08it/s]


Validation EM Score: 0.8863636363636364
Model saved!


Epoch 2/3:   2%|▏         | 907/51849 [03:10<2:58:37,  4.75it/s]


KeyboardInterrupt: 

In [14]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m608.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [9]:
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    CLIPProcessor,
    CLIPTextModel,
)
import torch
model = T5ForConditionalGeneration.from_pretrained("sonoisa/t5-base-japanese")
tokenizer = T5Tokenizer.from_pretrained("sonoisa/t5-base-japanese", is_fast=True)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

tokenized_inputs = tokenizer(
    ["今日は良い天気です", "今日は良い天気です", "今良天気です今日今日今日"],
    add_special_tokens=True,
    max_length=64,
    return_tensors='pt',
    padding='max_length',
    truncation=True
) # It's sunny today

clip_inputs = processor(
    text=["今日は良い天気です", "今日は良い天気です", "今良天気です今日今日今日"],
    images=torch.zeros(3, 3, 224, 224),
    add_special_tokens=True,
    max_length=64,
    return_tensors='pt',
    padding='max_length',
    truncation=True
)

clip_model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
inputs_embeds = model.get_input_embeddings()(tokenized_inputs["input_ids"])
print(inputs_embeds.shape)

# **NOTE**: pad_token_id is used as decoder_start_token_id
dummy_decoder_input_ids = torch.tensor([[tokenizer.pad_token_id]] * 3) 

output_ids = model.generate(
    inputs_embeds=inputs_embeds,
    # attention_mask=tokenized_inputs["attention_mask"],
    decoder_input_ids=dummy_decoder_input_ids
)

output_ids

torch.Size([3, 64, 768])


tensor([[    0, 32099,     0,  ..., 32099,     0, 32099],
        [    0, 32099,     0,  ..., 32099,     0, 32099],
        [    0, 32099,     0,  ..., 32099,     0, 32099]])

In [12]:
!pip install pillow

Collecting pillow
  Using cached Pillow-10.1.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (9.5 kB)
Using cached Pillow-10.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (3.6 MB)
Installing collected packages: pillow
Successfully installed pillow-10.1.0
