<a href="https://colab.research.google.com/github/CopotronicRifat/CSE-437-PATTERN-RECOGNITION/blob/master/CAPTMFN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook setup (IPython shell escapes): install OpenAI CLIP from source and
# clone the two repositories that the paths below point into -- TomBERT for the
# TSV files and TwitterImageData for the images.
!pip install git+https://github.com/openai/CLIP.git
!git clone https://github.com/jefferyYu/TomBERT.git
!git clone https://github.com/Porky-Pig/TwitterImageData.git

import os

import clip
import pandas as pd
import torch
from PIL import Image
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.models import vit_b_16, ViT_B_16_Weights
from tqdm import tqdm
from transformers import RobertaModel, RobertaTokenizer, BertConfig, RobertaConfig

# Where the TSV files and the images live, relative to the working directory
tsv_base_path = 'TomBERT/absa_data/twitter2015'
image_base_path = 'TwitterImageData/twitter2015_images'

# Column names assigned to the five tab-separated fields of each TSV file
columns = ['index', 'Label', 'ImageID', 'String1', 'String2']

# Report whether both data directories are present before anything is loaded
print("TSV Directory exists:", os.path.exists(tsv_base_path))
print("Image Directory exists:", os.path.exists(image_base_path))

# Function to load and prepare data with the correct number of columns
def load_and_prepare_data(filename, base_path=None, names=None):
    """Read one tab-separated split file into a DataFrame.

    The first line of the file is consumed as a header row and its labels are
    replaced by ``names``.

    Args:
        filename: name of the TSV file, relative to ``base_path``.
        base_path: directory containing the file; defaults to the
            module-level ``tsv_base_path``.
        names: column names to assign; defaults to the module-level
            ``columns`` list.

    Returns:
        A ``pandas.DataFrame`` with one row per data line of the file.
    """
    # Resolve the defaults lazily so the module-level values are only needed
    # when the caller does not override them.
    if base_path is None:
        base_path = tsv_base_path
    if names is None:
        names = columns

    file_path = os.path.join(base_path, filename)
    return pd.read_csv(file_path, sep='\t', header=0, names=names)

# Load the data files
train_df = load_and_prepare_data('train.tsv')
dev_df = load_and_prepare_data('dev.tsv')
test_df = load_and_prepare_data('test.tsv')

# Combine train and dev sets; ignore_index gives the merged frame a unique
# 0..n-1 index instead of two overlapping ranges
full_train_df = pd.concat([train_df, dev_df], ignore_index=True)

# Split into new train and validation sets. The seed is fixed so that the
# validation set -- and therefore the checkpoint selected on it -- is the same
# on every run.
train_df, valid_df = train_test_split(full_train_df, test_size=0.1, random_state=42)

# Initialize OpenAI CLIP model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Custom Dataset class
class CustomDataset(Dataset):
    """Dataset yielding tokenized tweet, aspect term, image, caption and label.

    Every item is the 8-tuple ``(input_ids, attention_mask, aspect_ids,
    aspect_attention_mask, image, caption_ids, caption_attention_mask,
    label)`` built from one row of ``dataframe``. Text is tokenized with the
    ``roberta-base`` tokenizer and padded/truncated to a fixed length: 50
    tokens for the tweet and the caption, 20 for the aspect term.

    Args:
        dataframe: table providing the columns ``String1`` (tweet),
            ``String2`` (aspect term), ``ImageID`` (file name relative to
            ``image_base_path``) and ``Label``.
        image_base_path: directory that contains the image files.
        transform: optional callable applied to the RGB PIL image; when it is
            ``None`` the PIL image itself is returned.
        caption_prompt: text used as the caption of every sample.

    NOTE(review): the caption is a fixed prompt, not an image-specific
    description. CLIP scores image/text pairs and does not generate text, so
    the earlier per-item CLIP forward pass (whose result was discarded) has
    been removed. The prompt is now tokenized directly with the RoBERTa
    tokenizer instead of decoding CLIP token ids with the RoBERTa vocabulary,
    which produced unrelated tokens.
    """

    def __init__(self, dataframe, image_base_path, transform=None,
                 caption_prompt="This is a photo of"):
        self.dataframe = dataframe
        self.image_base_path = image_base_path
        self.transform = transform
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        # The caption does not depend on the sample, so encode it once here
        # rather than on every __getitem__ call.
        self._caption_ids, self._caption_attention_mask = self._encode(caption_prompt, 50)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to fixed-length 1-D id and attention-mask tensors."""
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            padding='max_length',
            max_length=max_length,
            truncation=True,
        )
        # return_tensors="pt" adds a leading batch dimension of size 1; drop it
        return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        input_ids, attention_mask = self._encode(row['String1'], 50)
        aspect_ids, aspect_attention_mask = self._encode(row['String2'], 20)

        image_path = os.path.join(self.image_base_path, row['ImageID'])
        # The context manager closes the file handle as soon as the pixels
        # have been converted, instead of leaving it to garbage collection.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        label = torch.tensor(row['Label'], dtype=torch.long)

        # Clone the cached caption tensors so no two samples share storage
        caption_ids = self._caption_ids.clone()
        caption_attention_mask = self._caption_attention_mask.clone()

        return input_ids, attention_mask, aspect_ids, aspect_attention_mask, image, caption_ids, caption_attention_mask, label

# Define transforms
# Channel statistics used by the torchvision ImageNet-pretrained weights; the
# ViT encoder was trained on inputs normalized this way.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training transform: includes the random flip as augmentation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Evaluation transform: no random augmentation, so validation and test scores
# are deterministic for a given model
eval_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Create DataLoader
train_dataset = CustomDataset(train_df, image_base_path, transform=transform)
valid_dataset = CustomDataset(valid_df, image_base_path, transform=eval_transform)
test_dataset = CustomDataset(test_df, image_base_path, transform=eval_transform)

# Only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

class TMFN(nn.Module):
    """Text/aspect/image fusion classifier with an auxiliary caption head.

    The tweet and the aspect term are encoded by one shared RoBERTa encoder
    (first-token hidden state, 768-d each) and the image by a torchvision
    ViT-B/16 (1000-d output). The three vectors are concatenated, projected
    to 512-d, and classified into 3 classes. A second, randomly initialized
    RoBERTa encodes the caption tokens and a linear layer maps its hidden
    states to vocabulary logits.

    NOTE(review): the ViT keeps its ImageNet classification head, so the
    "image features" are the 1000 class logits rather than the 768-d
    embedding -- confirm this is intended.
    NOTE(review): the caption branch receives ``caption_ids`` and is scored
    against the same, unshifted ``caption_ids`` by the training code, and it
    never sees the image or text features; as written it can only learn to
    copy its input.
    """

    def __init__(self):
        super().__init__()
        self.text_encoder = RobertaModel.from_pretrained('roberta-base')
        self.image_encoder = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
        self.dropout = nn.Dropout(p=0.3)
        # 768 (tweet) + 1000 (ViT output) + 768 (aspect term)
        self.fusion_layer = nn.Linear(768 + 1000 + 768, 512)
        self.classifier = nn.Linear(512, 3)

        # Caption generation components.
        # The config is loaded with the RoBERTa config class: reading a
        # roberta-base checkpoint through BertConfig mixes model types, and
        # transformers warns that this is not supported for all configurations.
        decoder_config = RobertaConfig.from_pretrained('roberta-base')
        # Built from the config only, so these weights are NOT pretrained
        self.caption_decoder = RobertaModel(decoder_config)
        self.caption_linear = nn.Linear(768, self.text_encoder.config.vocab_size)

    def forward(self, input_ids, attention_mask, aspect_ids, aspect_attention_mask, images, caption_ids, caption_attention_mask):
        """Compute class logits and per-token caption logits.

        Args:
            input_ids, attention_mask: tokenized tweet and its mask.
            aspect_ids, aspect_attention_mask: tokenized aspect term and mask.
            images: image batch passed to the ViT encoder.
            caption_ids, caption_attention_mask: tokenized caption and mask.

        Returns:
            ``(logits, caption_logits)``: ``logits`` has one row of 3 class
            scores per sample; ``caption_logits`` has one vocabulary-sized
            score vector per caption token.
        """
        # Hidden state of the first token summarizes each text sequence
        text_features = self.text_encoder(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        aspect_features = self.text_encoder(aspect_ids, attention_mask=aspect_attention_mask).last_hidden_state[:, 0, :]
        image_features = self.image_encoder(images)

        combined_features = torch.cat([text_features, aspect_features, image_features], dim=1)
        fusion_output = torch.relu(self.fusion_layer(combined_features))
        fusion_output = self.dropout(fusion_output)
        logits = self.classifier(fusion_output)

        # Caption generation
        caption_outputs = self.caption_decoder(input_ids=caption_ids, attention_mask=caption_attention_mask)
        caption_logits = self.caption_linear(caption_outputs.last_hidden_state)

        return logits, caption_logits

def evaluate_model(model, dataloader, device):
    """Return the macro-averaged F1 score of ``model`` on ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Each batch is a sequence of tensors whose last element holds the labels;
    the preceding tensors are passed to the model in order. The predicted
    class is the argmax of the first model output.
    """
    model.eval()
    predictions, targets = [], []

    with torch.no_grad():
        for batch in dataloader:
            *features, labels = (tensor.to(device) for tensor in batch)
            class_logits, _ = model(*features)
            predictions.extend(torch.argmax(class_logits, dim=1).cpu().numpy())
            targets.extend(labels.cpu().numpy())

    return f1_score(targets, predictions, average='macro')

model = TMFN().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()
# Padding positions of the caption do not contribute to the caption loss
caption_criterion = nn.CrossEntropyLoss(ignore_index=model.text_encoder.config.pad_token_id)

# Start below any reachable F1 so the first epoch always writes a checkpoint;
# starting at 0 would leave 'best_model.pth' missing if F1 never exceeded 0.
best_valid_f1 = float('-inf')
classification_weight = 0.5
caption_weight = 0.5

for epoch in range(20):
    model.train()
    # Running sum of the per-batch objective for this epoch (plain float)
    total_loss = 0.0
    for data in tqdm(train_loader):
        input_ids, attention_mask, aspect_ids, aspect_attention_mask, images, caption_ids, caption_attention_mask, labels = [d.to(device) for d in data]
        outputs, caption_logits = model(input_ids, attention_mask, aspect_ids, aspect_attention_mask, images, caption_ids, caption_attention_mask)

        loss = classification_weight * criterion(outputs, labels)
        # Flatten (batch, seq, vocab) logits and (batch, seq) targets so every
        # caption token is scored as one classification example
        caption_loss = caption_weight * caption_criterion(caption_logits.view(-1, caption_logits.size(-1)), caption_ids.view(-1))

        # Keep the batch objective in its own variable: reusing `total_loss`
        # here overwrote the epoch accumulator on every step, so the reported
        # value only reflected the last batch.
        batch_loss = loss + caption_loss

        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += batch_loss.item()

    valid_f1 = evaluate_model(model, valid_loader, device)
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}, Validation F1: {valid_f1}')

    # Keep only the weights with the best validation F1 seen so far
    if valid_f1 > best_valid_f1:
        best_valid_f1 = valid_f1
        torch.save(model.state_dict(), 'best_model.pth')

# Load the best model.
# map_location places the tensors on the device currently in use, so the
# checkpoint also loads when it was written from a different device
# (e.g. saved on GPU, loaded on a CPU-only machine).
model.load_state_dict(torch.load('best_model.pth', map_location=device))

# Evaluate on the test set
test_f1 = evaluate_model(model, test_loader, device)
print(f'Test F1 Score: {test_f1}')


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-tt0tl3i0
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-tt0tl3i0
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... done
Cloning into 'TomBERT'...
remote: Enumerating objects: 84, done.
remote: Total 84 (delta 0), reused 0 (delta 0), pack-reused 84
Unpacking objects: 100% (84/84), 663.85 KiB | 5.72 MiB/s, done.
Cloning into 'TwitterImageData'...
remote: Enumerating objects: 8248, done.
remote: Total 8248 (delta 0), reused 0 (delta 0), pack-reused 8248
Receiving objects: 100% (8248/8248), 576.77 MiB | 51.60 MiB/s, done.
Updating files: 100% (8288/8288), done.
TSV Directory exists: True
Image Directory exists: True


100%|████████████████████████████████████████| 338M/338M [00:01<00:00, 214MiB/s]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth
100%|██████████| 330M/330M [00:02<00:00, 169MB/s] 
You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
100%|██████████| 484/484 [04:55<00:00,  1.64it/s]


Epoch 1, Loss: 0.0004606588918250054, Validation F1: 0.6218879410810195


100%|██████████| 484/484 [04:51<00:00,  1.66it/s]


Epoch 2, Loss: 0.0014732758281752467, Validation F1: 0.6852813784047104


100%|██████████| 484/484 [04:51<00:00,  1.66it/s]


Epoch 3, Loss: 0.0011947550810873508, Validation F1: 0.680328884798282


100%|██████████| 484/484 [04:50<00:00,  1.66it/s]


Epoch 4, Loss: 2.2677546439808793e-05, Validation F1: 0.6854494807683736


100%|██████████| 484/484 [04:51<00:00,  1.66it/s]


Epoch 5, Loss: 0.00039242187631316483, Validation F1: 0.7116339489880344


100%|██████████| 484/484 [04:47<00:00,  1.68it/s]


Epoch 6, Loss: 4.644658474717289e-05, Validation F1: 0.7161710270405922


100%|██████████| 484/484 [04:44<00:00,  1.70it/s]


Epoch 7, Loss: 0.00015478592831641436, Validation F1: 0.7398150511698341


100%|██████████| 484/484 [04:44<00:00,  1.70it/s]


Epoch 8, Loss: 2.7561189199332148e-05, Validation F1: 0.7155966943206638


100%|██████████| 484/484 [04:44<00:00,  1.70it/s]


Epoch 9, Loss: 2.9995494514878374e-06, Validation F1: 0.70438854541468


100%|██████████| 484/484 [04:44<00:00,  1.70it/s]


Epoch 10, Loss: 4.42182099504862e-05, Validation F1: 0.703563447088469


100%|██████████| 484/484 [04:44<00:00,  1.70it/s]


Epoch 11, Loss: 0.0009183380752801895, Validation F1: 0.7331632668204091


100%|██████████| 484/484 [04:45<00:00,  1.70it/s]


Epoch 12, Loss: 0.00010331592056900263, Validation F1: 0.7138810293045439


100%|██████████| 484/484 [04:44<00:00,  1.70it/s]


Epoch 13, Loss: 0.00010863624629564583, Validation F1: 0.7489877725171842


100%|██████████| 484/484 [04:44<00:00,  1.70it/s]


Epoch 14, Loss: 6.746343501617957e-07, Validation F1: 0.699209260387014


100%|██████████| 484/484 [04:44<00:00,  1.70it/s]


Epoch 15, Loss: 3.939548150810879e-06, Validation F1: 0.6872682369214783


100%|██████████| 484/484 [04:44<00:00,  1.70it/s]


Epoch 16, Loss: 4.136903498874744e-06, Validation F1: 0.7136145541783288


100%|██████████| 484/484 [04:44<00:00,  1.70it/s]


Epoch 17, Loss: 6.341974312817911e-06, Validation F1: 0.7025048541122224


100%|██████████| 484/484 [04:44<00:00,  1.70it/s]


Epoch 18, Loss: 1.1716309018083848e-05, Validation F1: 0.7238584440481324


100%|██████████| 484/484 [04:45<00:00,  1.70it/s]


Epoch 19, Loss: 5.05276284457068e-06, Validation F1: 0.698923654341146


100%|██████████| 484/484 [04:44<00:00,  1.70it/s]


Epoch 20, Loss: 1.5787418305990286e-06, Validation F1: 0.6880972522886081
Test F1 Score: 0.7232051445879897
