In [1]:
import pandas as pd
import string

# Specify the path to your TSV file
train_tsv_file_path = '/home/allenfu/cyc/23Fall-269/Train_GCC-training.tsv'
val_tsv_file_path = '/home/allenfu/cyc/23Fall-269/Validation_GCC-1.1.0-Validation.tsv'

# Read the TSV file into a DataFrame
train_df = pd.read_csv(train_tsv_file_path, delimiter='\t', header=None)[0]
val_df = pd.read_csv(val_tsv_file_path, delimiter='\t', header=None)[0]

def remove_spaces(sentence):
    for punctuation in string.punctuation:
        sentence = sentence.replace(f' {punctuation}', punctuation)
    return ' '.join(sentence.split())

train_df = train_df.apply(remove_spaces)
val_df = val_df.apply(remove_spaces)

In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer

class Seq2SeqDataset(Dataset):
    def __init__(self, dataframe, tokenizer, processor, max_length=64):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_sentence = self.data.iloc[idx]
        
        # Tokenize and encode the source sentence
        t5_tokens = self.tokenizer.encode_plus(
            source_sentence,
            add_special_tokens=True,
            max_length=self.max_length,
            return_tensors='pt',
            padding='max_length',
            truncation=True
        )

        t5_inputs =  {
            'input_ids': t5_tokens['input_ids'].squeeze(),
            'attention_mask': t5_tokens['attention_mask'].squeeze(),
            'target_ids': t5_tokens['input_ids'].squeeze(),  # Target is the same as the input
            'target_mask': t5_tokens['attention_mask'].squeeze(),
            'target': source_sentence
        }

        clip_tokens = self.processor(
            text=source_sentence, 
            images=torch.zeros((3, 224, 224)), 
            return_tensors="pt", 
            padding='max_length', 
            max_length=self.max_length, 
            truncation=True
        )

        clip_inputs = {
            'input_ids': clip_tokens['input_ids'].squeeze(),
            'attention_mask': clip_tokens['attention_mask'].squeeze(),
            'pixel_values': clip_tokens["pixel_values"].view(3, 224, 224),
            'target_ids': clip_tokens['input_ids'].squeeze(),  # Target is the same as the input
            'target_mask': clip_tokens['attention_mask'].squeeze(),
            'target': source_sentence
        }

        return clip_inputs, t5_inputs

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, CLIPModel, CLIPProcessor
from tqdm import tqdm

class KLIPModel(torch.nn.Module):

    def __init__(self, class_num = 1000):
        super(KLIPModel, self).__init__()

        self.clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

        self.clip_to_llama = nn.Sequential(
          torch.nn.Linear(512, 64),
          torch.nn.ReLU(),
          torch.nn.Linear(64, 4096),
        )

        self.classifier = nn.Sequential(
          torch.nn.Linear(512, 64),
          torch.nn.ReLU(),
          torch.nn.Linear(64, class_num),
          torch.nn.Softmax(),
        )

    def forward(self, input_ids, attention_mask, pixel_values):
        clip_output = self.clip(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)

        class_logits = self.classifier(clip_output.image_embeds)
        clip_to_llama_embeds = self.clip_to_llama(clip_output.text_embeds)

        similarity_loss = None
        labels = torch.ones((input_ids.shape[0],), dtype=torch.float32, device=input_ids.device)
        criterion = nn.CosineEmbeddingLoss(margin=0.2)
        similarity_loss = criterion(clip_output.text_embeds, clip_output.image_embeds, labels)

        return clip_output, clip_to_llama_embeds, class_logits, similarity_loss

In [4]:
class Bottleneck(nn.Module):
    def __init__(self, input_dim, output_dim, bottleneck_dim=4096):
        super(Bottleneck, self).__init__()
        self.blocks = nn.Sequential(
            # nn.Linear(input_dim, bottleneck_dim),
            # nn.LayerNorm(bottleneck_dim),
            # nn.ReLU(),
            # nn.Linear(bottleneck_dim, output_dim),
            nn.Linear(input_dim, output_dim),
            nn.LayerNorm(output_dim),
            nn.ReLU()
        )

        # self.layer = nn.Linear(input_dim, output_dim)
        # self.norm = nn.LayerNorm(output_dim)

    def forward(self, x):
        return self.blocks(x)

class KLIPEval(nn.Module):
    def __init__(self, klip_model_path, t5_model_path, device='cuda'):
        super(KLIPEval, self).__init__()
        self.encoder = KLIPModel()
        self.encoder.load_state_dict(torch.load(klip_model_path))
        self.bottleneck = Bottleneck(512, 768)
        self.decoder = T5ForConditionalGeneration.from_pretrained('t5-base')
        self.decoder.load_state_dict(torch.load(t5_model_path))
        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        self.device = device

        # Set requires_grad to False for encoder and decoder parameters
        for param in self.encoder.parameters():
            param.requires_grad = False

        for param in self.decoder.parameters():
            param.requires_grad = False

        # Set requires_grad to True for dimension transform layer parameters
        for param in self.bottleneck.parameters():
            param.requires_grad = True

    def forward(self, clip_inputs, t5_inputs, train=True):
        if train:
            encoder_outputs = self.encoder.clip.text_model(
                input_ids=clip_inputs["input_ids"].to(self.device), 
                attention_mask=clip_inputs["attention_mask"].to(self.device),
            )

            encoder_outputs['last_hidden_state'] = self.bottleneck(encoder_outputs['last_hidden_state'])
            output = self.decoder(
                encoder_outputs=encoder_outputs,
                attention_mask=t5_inputs['attention_mask'].to(self.device),
                labels=t5_inputs['target_ids'].to(self.device)
            )
            return output.loss
        else:
            encoder_outputs = self.encoder.clip.text_model(
                input_ids=clip_inputs["input_ids"].to(self.device), 
                attention_mask=clip_inputs["attention_mask"].to(self.device),
            )

            encoder_outputs['last_hidden_state'] = self.bottleneck(encoder_outputs['last_hidden_state'])
            output = self.decoder.generate(
                # inputs_embeds=t5_inputs_embeds,
                encoder_outputs=encoder_outputs,
                attention_mask=t5_inputs['attention_mask'].to(self.device),
                decoder_input_ids=torch.tensor([[self.tokenizer.pad_token_id]] * t5_inputs['input_ids'].shape[0]).to(self.device),
                # max_length=64,  # Set a reasonable maximum length for generated sequences
                # num_beams=1,  # Set to 1 for greedy decoding
                # no_repeat_ngram_size=2,  # Avoid repeating bigrams in the output
                # early_stopping=True
            )
            return output

In [5]:
device = 'cuda'
# Load the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Create the dataset and DataLoader
train_dataset = Seq2SeqDataset(train_df, tokenizer, processor)
val_dataset = Seq2SeqDataset(val_df, tokenizer, processor)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=False, num_workers=4)

# Initialize the autoencoder model
klip_model_path = '/home/allenfu/269/klip_1.pt'
# klip_model_path = '/home/allenfu/cyc/23Fall-269/klip.pt'
t5_model_path = '/home/allenfu/cyc/23Fall-269/t5_model.pth'
klip_model = KLIPEval(klip_model_path, t5_model_path, device).to(device)

# Define the optimizer and learning rate scheduler
optimizer = optim.AdamW(klip_model.bottleneck.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Training loop
num_epochs = 3
best_em_score = 0.0
for epoch in range(num_epochs):
    total_loss = 0
    klip_model.train()

    for clip_inputs, t5_inputs in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        loss = klip_model(clip_inputs, t5_inputs, train=True)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}')

    # Optionally update the learning rate
    scheduler.step()

    # Evaluate with Exact Match (EM) on a validation set
    klip_model.eval()
    with torch.no_grad():
        em_count = 0
        total_samples = 0

        for clip_inputs, t5_inputs in tqdm(val_dataloader, desc=f'Validation - Epoch {epoch + 1}'):
            # Generate sequences
            generated_ids = klip_model(clip_inputs, t5_inputs, train=False).detach().cpu().numpy()

            # Decode token IDs to strings
            generated_sentences = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]
            target_sentences = t5_inputs['target']

            # Check for exact match
            em_count += sum(1 for gen, target in zip(generated_sentences, target_sentences) if gen == target)
            total_samples += len(generated_sentences)

        em_score = em_count / total_samples
        print(f'Validation EM Score: {em_score}')

        # Save the model if the EM score improves
        if em_score > best_em_score:
            best_em_score = em_score
            torch.save(klip_model.state_dict(), 'klip_model.pth')
            print("Model saved!")



For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
Epoch 1/3: 100%|██████████| 25925/25925 [1:25:02<00:00,  5.08it/s]


Epoch 1/3, Average Loss: 0.1965398128375253


Validation - Epoch 1: 100%|██████████| 124/124 [00:58<00:00,  2.14it/s]


Validation EM Score: 0.40448232323232325
Model saved!


Epoch 2/3: 100%|██████████| 25925/25925 [1:25:00<00:00,  5.08it/s]


Epoch 2/3, Average Loss: 0.14528056094507358


Validation - Epoch 2: 100%|██████████| 124/124 [00:58<00:00,  2.14it/s]


Validation EM Score: 0.414520202020202
Model saved!


Epoch 3/3: 100%|██████████| 25925/25925 [1:25:05<00:00,  5.08it/s]


Epoch 3/3, Average Loss: 0.14117604197767225


Validation - Epoch 3: 100%|██████████| 124/124 [00:58<00:00,  2.14it/s]


Validation EM Score: 0.42455808080808083
Model saved!


In [23]:
with torch.no_grad():
    em_count = 0
    total_samples = 0

    for clip_inputs, t5_inputs in tqdm(val_dataloader):
        # Generate sequences
        generated_ids = klip_model(clip_inputs, t5_inputs, train=False).detach().cpu().numpy()

        # Decode token IDs to strings
        generated_sentences = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]
        target_sentences = t5_inputs['target']

        # Check for exact match
        em_count += sum(1 for gen, target in zip(generated_sentences, target_sentences) if gen == target)
        total_samples += len(generated_sentences)
        print(generated_sentences[0], '-------', target_sentences[0])

    em_score = em_count / total_samples
    print(f'Validation EM Score: {em_score}')

 25%|██▌       | 1/4 [00:01<00:03,  1.27s/it]

- a woman's ad in the middle of the night ------- author: a life in photography-- in pictures


 50%|█████     | 2/4 [00:01<00:01,  1.25it/s]

a solitary tree with a solitary tree ------- a lot of dried fruits and nuts for sale in old fashioned traditional grocery store in city on peninsula


 75%|███████▌  | 3/4 [00:02<00:00,  1.54it/s]

i'm a fan of the 'finger's' and i ------- today this may look familiar, but person was the guy who discovered the first flip on a bicycle.


100%|██████████| 4/4 [00:02<00:00,  1.46it/s]

person walking on the street ------- cyclist shown in action next to competitors wearing the yellow
Validation EM Score: 0.00390625





In [None]:
clip_model = 
clip_model.load_state_dict(torch.load('clip_model.pth'))
with torch.no_grad():
    em_count = 0
    total_samples = 0

    for clip_inputs, t5_inputs in tqdm(val_dataloader):
        # Generate sequences
        generated_ids = klip_model(clip_inputs, t5_inputs, train=False).detach().cpu().numpy()

        # Decode token IDs to strings
        generated_sentences = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]
        target_sentences = t5_inputs['target']

        # Check for exact match
        em_count += sum(1 for gen, target in zip(generated_sentences, target_sentences) if gen == target)
        total_samples += len(generated_sentences)
        # print(generated_sentences[0], '-------', target_sentences[0])

    em_score = em_count / total_samples
    print(f'Validation EM Score: {em_score}')

In [14]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m608.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [30]:
a = torch.load(klip_model_path)
torch.save(a.state_dict(), 'tmp.pth')

In [45]:
import torch
import torch.nn as nn

class MeanPoolingLayer(nn.Module):
    def forward(self, token_embeddings, attention_mask):
        # Apply mean pooling along the sequence dimension
        # Use attention mask to mask out padding tokens
        sum_embeddings = torch.sum(token_embeddings * attention_mask.unsqueeze(-1), dim=1)
        mean_pooled = sum_embeddings / attention_mask.sum(dim=1, keepdim=True)
        return mean_pooled

# Example usage:
embedding_dim = 768  # Assuming your token embeddings have dimension 768
max_sequence_length = 50  # Adjust based on your actual sequence length
batch_size = 32  # Adjust based on your batch size

# Example token embeddings (replace this with your actual token embeddings)
token_embeddings = torch.rand((batch_size, max_sequence_length, embedding_dim))

# Example attention mask (1 for valid tokens, 0 for padding)
attention_mask = torch.randint(0, 2, (batch_size, max_sequence_length))

# Instantiate the MeanPoolingLayer
mean_pooling_layer = MeanPoolingLayer()

# Apply mean pooling to get a single vector representation
mean_pooled_output = mean_pooling_layer(token_embeddings, attention_mask)

# Print the shape of the mean-pooled output
print(mean_pooled_output.shape)


torch.Size([32, 768])


In [54]:
import torch
import torch.nn as nn
from transformers import T5EncoderModel, T5Tokenizer

class CustomT5Model(nn.Module):
    def __init__(self, model_name='t5-base'):
        super(CustomT5Model, self).__init__()

        # Load pre-trained T5 model and tokenizer
        self.t5_encoder = T5EncoderModel.from_pretrained(model_name)
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)

        # Linear transformation and layer normalization for conditioning the decoder
        self.linear_transform = nn.Linear(self.t5_encoder.config.hidden_size, self.t5_encoder.config.hidden_size)
        self.layer_norm = nn.LayerNorm(self.t5_encoder.config.hidden_size)

    def forward(self, input_ids, attention_mask):
        # Forward pass through the T5 encoder
        encoder_outputs = self.t5_encoder(input_ids=input_ids, attention_mask=attention_mask)

        # Extract token embeddings
        token_embeddings = encoder_outputs.last_hidden_state

        # Mean pooling over token embeddings
        attention_mask_expanded = attention_mask.unsqueeze(-1)
        mean_pooled_embeddings = torch.sum(token_embeddings * attention_mask_expanded, dim=1) / attention_mask_expanded.sum(dim=1, keepdim=True)

        # Linear transformation and layer normalization for conditioning the decoder
        conditioned_embeddings = self.layer_norm(self.linear_transform(mean_pooled_embeddings))

        return conditioned_embeddings

# Example usage:
model = CustomT5Model()

# Input text
input_text = "Your input text goes here."
input_ids = model.tokenizer.encode(input_text, return_tensors='pt')

# Attention mask (1 for valid tokens, 0 for padding)
attention_mask = (input_ids != model.tokenizer.pad_token_id).float()

# Forward pass through the custom T5 model
output = model(input_ids=input_ids, attention_mask=attention_mask)

# The 'output' now contains the conditioned embeddings for the decoder
output.shape

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


torch.Size([1, 1, 768])