In [1]:
import pandas as pd
import string

# Specify the path to your TSV file
train_tsv_file_path = '/home/allenfu/cyc/23Fall-269/Train_GCC-training.tsv'
val_tsv_file_path = '/home/allenfu/cyc/23Fall-269/Validation_GCC-1.1.0-Validation.tsv'

# Read the TSV file into a DataFrame
train_df = pd.read_csv(train_tsv_file_path, delimiter='\t', header=None)[0]
val_df = pd.read_csv(val_tsv_file_path, delimiter='\t', header=None)[0]

def remove_spaces(sentence):
    for punctuation in string.punctuation:
        sentence = sentence.replace(f' {punctuation}', punctuation)
    return ' '.join(sentence.split())

train_df = train_df.apply(remove_spaces)
val_df = val_df.apply(remove_spaces)

In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer

class Seq2SeqDataset(Dataset):
    def __init__(self, dataframe, tokenizer, processor, max_length=64):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_sentence = self.data.iloc[idx]
        
        # Tokenize and encode the source sentence
        t5_tokens = self.tokenizer.encode_plus(
            source_sentence,
            add_special_tokens=True,
            max_length=self.max_length,
            return_tensors='pt',
            padding='max_length',
            truncation=True
        )

        t5_inputs =  {
            'input_ids': t5_tokens['input_ids'].squeeze(),
            'attention_mask': t5_tokens['attention_mask'].squeeze(),
            'target_ids': t5_tokens['input_ids'].squeeze(),  # Target is the same as the input
            'target_mask': t5_tokens['attention_mask'].squeeze(),
            'target': source_sentence
        }

        clip_tokens = self.processor(
            text=source_sentence, 
            images=torch.zeros((3, 224, 224)), 
            return_tensors="pt", 
            padding='max_length', 
            max_length=self.max_length, 
            truncation=True
        )

        clip_inputs = {
            'input_ids': clip_tokens['input_ids'].squeeze(),
            'attention_mask': clip_tokens['attention_mask'].squeeze(),
            'pixel_values': clip_tokens["pixel_values"].view(3, 224, 224),
            'target_ids': clip_tokens['input_ids'].squeeze(),  # Target is the same as the input
            'target_mask': clip_tokens['attention_mask'].squeeze(),
            'target': source_sentence
        }

        return clip_inputs, t5_inputs

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, CLIPModel, CLIPProcessor
from tqdm import tqdm
        
class Bottleneck(nn.Module):
    def __init__(self, input_dim, output_dim, bottleneck_dim=4096):
        super(Bottleneck, self).__init__()
        self.blocks = nn.Sequential(
            # nn.Linear(input_dim, bottleneck_dim),
            # nn.LayerNorm(bottleneck_dim),
            # nn.ReLU(),
            # nn.Linear(bottleneck_dim, output_dim),
            nn.Linear(input_dim, output_dim),
            nn.LayerNorm(output_dim),
            nn.ReLU()
        )

        # self.layer = nn.Linear(input_dim, output_dim)
        # self.norm = nn.LayerNorm(output_dim)

    def forward(self, x):
        return self.blocks(x)

class CLIPEval(nn.Module):
    def __init__(self, t5_model_path, device='cuda'):
        super(CLIPEval, self).__init__()
        self.encoder = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.encoder.load_state_dict(torch.load('/home/allenfu/269/clip.pt'))
        self.bottleneck = Bottleneck(512, 768)
        self.decoder = T5ForConditionalGeneration.from_pretrained('t5-base')
        self.decoder.load_state_dict(torch.load(t5_model_path))
        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        self.device = device

        # Set requires_grad to False for encoder and decoder parameters
        for param in self.encoder.parameters():
            param.requires_grad = False

        for param in self.decoder.parameters():
            param.requires_grad = False

        # Set requires_grad to True for dimension transform layer parameters
        for param in self.bottleneck.parameters():
            param.requires_grad = True

    def forward(self, clip_inputs, t5_inputs, train=True):
        if train:
            # inputs_embeds = self.encoder.text_model.embeddings.token_embedding(clip_inputs['input_ids'].to(self.device))
            # t5_inputs_embeds = self.bottleneck(inputs_embeds)
            # t5_inputs_embeds = self.decoder.get_input_embeddings()(t5_inputs["input_ids"].to(self.device))[:, 0, :]
            encoder_outputs = self.encoder.text_model(
                input_ids=clip_inputs["input_ids"].to(self.device), 
                attention_mask=clip_inputs["attention_mask"].to(self.device),
                # output_attentions=True,
                # output_hidden_states=True,
            )
            encoder_outputs['last_hidden_state'] = self.bottleneck(encoder_outputs['last_hidden_state'])
            output = self.decoder(
                # inputs_embeds=t5_inputs_embeds,
                encoder_outputs=encoder_outputs,
                # input_ids=t5_inputs['input_ids'].to(self.device),
                attention_mask=t5_inputs['attention_mask'].to(self.device),
                # decoder_input_ids=torch.tensor([[self.tokenizer.pad_token_id]] * t5_inputs_embeds.shape[0]),
                labels=t5_inputs['target_ids'].to(self.device)
            )
            return output.loss
        else:
            # inputs_embeds = self.encoder.text_model.embeddings.token_embedding(clip_inputs['input_ids'].to(self.device))
            # t5_inputs_embeds = self.bottleneck(inputs_embeds)
            # t5_inputs_embeds = self.decoder.get_input_embeddings()(t5_inputs["input_ids"].to(self.device))[:, 0, :]
            encoder_outputs = self.encoder.text_model(
                input_ids=clip_inputs["input_ids"].to(self.device), 
                attention_mask=clip_inputs["attention_mask"].to(self.device),
                # output_attentions=True,
                # output_hidden_states=True,
            )
            encoder_outputs['last_hidden_state'] = self.bottleneck(encoder_outputs['last_hidden_state'])
            output = self.decoder.generate(
                # inputs_embeds=t5_inputs_embeds,
                encoder_outputs=encoder_outputs,
                attention_mask=t5_inputs['attention_mask'].to(self.device),
                decoder_input_ids=torch.tensor([[self.tokenizer.pad_token_id]] * t5_inputs['input_ids'].shape[0]).to(self.device)
            )
            return output

In [9]:
device = 'cuda'
# Load the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Create the dataset and DataLoader
train_dataset = Seq2SeqDataset(train_df, tokenizer, processor)
val_dataset = Seq2SeqDataset(val_df, tokenizer, processor)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=False, num_workers=4)

# Initialize the autoencoder model
t5_model_path = '/home/allenfu/cyc/23Fall-269/t5_model.pth'
clip_model = CLIPEval(t5_model_path, device).to(device)
clip_model.load_state_dict(torch.load('clip_model.pth'))

# Define the optimizer and learning rate scheduler
optimizer = optim.AdamW(clip_model.bottleneck.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Training loop
num_epochs = 3
best_em_score = 0.0
for epoch in range(num_epochs):
    total_loss = 0
    clip_model.train()

    for clip_inputs, t5_inputs in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        loss = clip_model(clip_inputs, t5_inputs, train=True)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}')

    # Optionally update the learning rate
    scheduler.step()

    # Evaluate with Exact Match (EM) on a validation set
    clip_model.eval()
    with torch.no_grad():
        em_count = 0
        total_samples = 0

        for clip_inputs, t5_inputs in tqdm(val_dataloader, desc=f'Validation - Epoch {epoch + 1}'):
            # Generate sequences
            generated_ids = clip_model(clip_inputs, t5_inputs, train=False).detach().cpu().numpy()

            # Decode token IDs to strings
            generated_sentences = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]
            target_sentences = t5_inputs['target']

            # Check for exact match
            em_count += sum(1 for gen, target in zip(generated_sentences, target_sentences) if gen == target)
            total_samples += len(generated_sentences)

        em_score = em_count / total_samples
        print(f'Validation EM Score: {em_score}')

        # Save the model if the EM score improves
        if em_score > best_em_score:
            best_em_score = em_score
            torch.save(clip_model.state_dict(), 'clip_model.pth')
            print("Model saved!")



Epoch 1/3: 100%|██████████| 25925/25925 [1:24:59<00:00,  5.08it/s]


Epoch 1/3, Average Loss: 0.17930788880316478


Validation - Epoch 1: 100%|██████████| 124/124 [00:59<00:00,  2.10it/s]


Validation EM Score: 0.35795454545454547
Model saved!


Epoch 2/3:  25%|██▍       | 6392/25925 [20:56<1:04:00,  5.09it/s]


KeyboardInterrupt: 

In [39]:
with torch.no_grad():
    em_count = 0
    total_samples = 0

    for clip_inputs, t5_inputs in val_dataloader:
        # Generate sequences
        generated_ids = clip_model(clip_inputs, t5_inputs, train=False).detach().cpu().numpy()

        # Decode token IDs to strings
        generated_sentences = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]
        target_sentences = t5_inputs['target']

        # Check for exact match
        em_count += sum(1 for gen, target in zip(generated_sentences, target_sentences) if gen == target)
        total_samples += len(generated_sentences)
        print(generated_sentences[0], '-------', target_sentences[0])

    em_score = em_count / total_samples
    print(f'Validation EM Score: {em_score}')

author of life, pictures in pictures ------- author: a life in photography-- in pictures
leaves vector illustration on a seamless pattern ------- leaves vector illustration on a seamless pattern background
a lot of dried fruit and nuts for sale in old quarters in city. ------- a lot of dried fruits and nuts for sale in old fashioned traditional grocery store in city on peninsula
how to tell a girl how you feel ------- how to tell if a girl likes you
today, today was another reminder of how person found the guy who invented the flip, the first ------- today this may look familiar, but person was the guy who discovered the first flip on a bicycle.
is your child's school or a parade? strappy?? ------- is your child's school or nursery having a parade? stuck for inspiration? there are loads of great ideas over at profession, including this gorgeously scruffy spring meadow!
cyclist shown in action during competitions in yellow ------- cyclist shown in action next to competitors wearing the 

In [14]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m608.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
