In [1]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch import nn
from sklearn import metrics

In [2]:
def print_metrics(y_pred2, y_dev):
    print(f"accuracy: {metrics.accuracy_score(y_pred2, y_dev)}")
    print(f"f1 score: {metrics.f1_score(y_pred2, y_dev)}")
    print(f"AUROC: {metrics.roc_auc_score(y_pred2, y_dev)}")
    print(f"Recall: {metrics.recall_score(y_pred2, y_dev)}") 
    print(f"Precision: {metrics.precision_score(y_pred2, y_dev)}")

In [3]:
df= pd.read_pickle("../commons/clip_embeddings.pkl")
df.head(3)

Unnamed: 0,label,text,file,sets,preprocessed text,polarity,subjectivity,hate count,text encoding,cleaned encoding,image encoding
0,0,berserk 2016 is a good adaptation you're kidd...,71094.png,[test_unseen],berserk good adaptation you kid right,0.492857,0.567857,0.0,"[-0.09067752957344055, 0.2159278392791748, 0.0...","[-0.2386399209499359, 0.26847022771835327, 0.1...","[0.08480989187955856, -0.07862813770771027, 0...."
1,0,my life goal? make somebody this fucking trig...,91724.png,[train],life goal make somebody this fucking trigger,-0.6,0.8,0.125,"[0.10644109547138214, 0.03420567512512207, 0.0...","[-0.038524020463228226, -0.08633303642272949, ...","[-0.23801393806934357, -0.2769679129123688, -0..."
2,0,""" i don't wanna, just get it, get it, get it, ...",64280.png,[train],not wanna get get get get that shit hard chanc...,-0.163889,0.480556,0.08,"[0.33546730875968933, -0.33385685086250305, 0....","[0.2787761092185974, -0.23599806427955627, -0....","[-0.0899612158536911, -0.3343321681022644, -0...."


In [4]:
df2= df.drop(["text", "file", "preprocessed text", "polarity", "subjectivity", "hate count", "cleaned encoding"], axis=1)
df2.head(3)

Unnamed: 0,label,sets,text encoding,image encoding
0,0,[test_unseen],"[-0.09067752957344055, 0.2159278392791748, 0.0...","[0.08480989187955856, -0.07862813770771027, 0...."
1,0,[train],"[0.10644109547138214, 0.03420567512512207, 0.0...","[-0.23801393806934357, -0.2769679129123688, -0..."
2,0,[train],"[0.33546730875968933, -0.33385685086250305, 0....","[-0.0899612158536911, -0.3343321681022644, -0...."


In [5]:
train_set= df2[["train" in val for val in df2["sets"]]].drop(["sets"], axis=1)
test_set= df2[["test_seen" in val for val in df2["sets"]]].drop(["sets"], axis=1)
val_set= df2[["dev_seen" in val for val in df2["sets"]]].drop(["sets"], axis=1)

test_set_unseen= df2[["test_unseen" in val for val in df2["sets"]]].drop(["sets"], axis=1)
val_set_unseen= df2[["dev_unseen" in val for val in df2["sets"]]].drop(["sets"], axis=1)

In [6]:
#Undersampling is done to reduce imbalance. It is however found that the model performs similar without the undersampling
#The state dicts of original model are in original folder while the undersampled one is in same directory
#Every new rerun stores the state dicts in the rerun folder

class_0_data = train_set[train_set['label'] == 0]
class_1_data = train_set[train_set['label'] == 1]
undersampled_class_0 = class_0_data.sample(n=3019, random_state=42) #there are 3019 label 1 rows
undersampled_df = pd.concat([undersampled_class_0, class_1_data])
undersampled_df = undersampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
train_set = undersampled_df

In [7]:
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        text_encoding = torch.tensor(self.data.iloc[index]['text encoding'])
        image_encoding = torch.tensor(self.data.iloc[index]['image encoding'])
        label = torch.tensor(self.data.iloc[index]['label'])
        
        return text_encoding, image_encoding, label

train_dataset = CustomDataset(train_set)
val_dataset = CustomDataset(val_set)
test_dataset = CustomDataset(test_set)

val_dataset_unseen = CustomDataset(val_set_unseen)
test_dataset_unseen = CustomDataset(test_set_unseen)

In [8]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.model= nn.Sequential(
            nn.Linear(512, 216),
            nn.ReLU(),
            nn.Linear(216, 128),
            nn.ReLU(),
        )
    def forward(self, x):
        return self.model(x)
    
class Outputter(nn.Module):
    def __init__(self):
        super(Outputter, self).__init__()
        self.model= nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.Dropout(0.5),
            nn.Linear(16, 2),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Softmax()
        )
    def forward(self, x, y):
        val= torch.mul(x, y)
        return self.model(val)

In [9]:
batch_size= 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [10]:
lr= 0.0005
np.random.seed(42)
torch.manual_seed(42)

text_encoder= Encoder()
image_encoder= Encoder()
output_model= Outputter()

loss_fn= nn.CrossEntropyLoss()
# optim= torch.optim.SGD(chain(text_encoder.parameters(), image_encoder.parameters(), output_model.parameters()), lr=lr)

optim_text= torch.optim.Adam(text_encoder.parameters(), lr=lr)
optim_img= torch.optim.Adam(image_encoder.parameters(), lr=lr)
optim_total= torch.optim.SGD(output_model.parameters(), lr=lr)

In [11]:
best_val_loss = float('inf')
best_model_state_text = None
best_model_state_img = None
best_model_state_total = None

In [12]:
num_epochs = 20
for epoch in range(num_epochs):
    # Training
    text_encoder.train()
    image_encoder.train()
    output_model.train()
    
    train_loss = 0.0
    correct = 0
    total = 0
    for batch_idx, (text_data, image_data, targets) in enumerate(train_loader):
        
        optim_text.zero_grad()
        optim_img.zero_grad()
        optim_total.zero_grad()
        
        # Forward pass
        text_output = text_encoder(text_data)
        image_output = image_encoder(image_data)
        outputs = output_model(text_output, image_output)
        
        # Calculate loss
        loss = loss_fn(outputs, targets)
        
        # Backward pass
        loss.backward()
        optim_text.step()
        optim_img.step()
        optim_total.step()
        
        # Update statistics
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
    
    train_accuracy = 100. * correct / total
    
    #Validation
    text_encoder.eval()
    image_encoder.eval()
    output_model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (text_data, image_data, targets) in enumerate(val_loader):
            # Forward pass
            text_output = text_encoder(text_data)
            image_output = image_encoder(image_data)
            outputs = output_model(text_output, image_output)
            
            # Calculate loss
            loss = loss_fn(outputs, targets)
            
            # Update statistics
            val_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    
    val_accuracy = 100. * correct / total
    
    # Testing
    text_encoder.eval()
    image_encoder.eval()
    output_model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (text_data, image_data, targets) in enumerate(test_loader):
            # Forward pass
            text_output = text_encoder(text_data)
            image_output = image_encoder(image_data)
            outputs = output_model(text_output, image_output)
            
            # Calculate loss
            loss = loss_fn(outputs, targets)
            
            # Update statistics
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    
    if((test_loss+val_loss)/2<=best_val_loss):
        best_model_state_total= output_model.state_dict()
        best_model_state_img= image_encoder.state_dict()
        best_model_state_text= text_encoder.state_dict()
    
    test_accuracy = 100. * correct / total
    
    print(f"""Epoch {epoch + 1}/{num_epochs},
          Train Acc: {train_accuracy:.2f}%,
          Val Acc: {val_accuracy:.2f}%,
          Test Acc: {test_accuracy:.2f}%\n""")

torch.save(best_model_state_total, './reruns/best_model_total.pth')
torch.save(best_model_state_img, './reruns/best_model_img.pth')
torch.save(best_model_state_text, './reruns/best_model_text.pth')

  return self._call_impl(*args, **kwargs)


Epoch 1/20,
          Train Acc: 49.98%,
          Val Acc: 50.60%,
          Test Acc: 51.00%

Epoch 2/20,
          Train Acc: 53.71%,
          Val Acc: 60.00%,
          Test Acc: 63.20%

Epoch 3/20,
          Train Acc: 60.93%,
          Val Acc: 60.20%,
          Test Acc: 63.70%

Epoch 4/20,
          Train Acc: 63.65%,
          Val Acc: 61.60%,
          Test Acc: 65.90%

Epoch 5/20,
          Train Acc: 65.14%,
          Val Acc: 62.60%,
          Test Acc: 65.90%

Epoch 6/20,
          Train Acc: 66.35%,
          Val Acc: 63.20%,
          Test Acc: 64.90%

Epoch 7/20,
          Train Acc: 67.11%,
          Val Acc: 63.20%,
          Test Acc: 65.90%

Epoch 8/20,
          Train Acc: 68.77%,
          Val Acc: 64.40%,
          Test Acc: 65.70%

Epoch 9/20,
          Train Acc: 69.12%,
          Val Acc: 65.20%,
          Test Acc: 66.40%

Epoch 10/20,
          Train Acc: 70.53%,
          Val Acc: 64.00%,
          Test Acc: 66.10%

Epoch 11/20,
          Train Acc: 69.73

In [13]:
def evaluate_model(test_dataset):
    text_enc= torch.stack([val[0] for val in test_dataset])
    image_enc= torch.stack([val[1] for val in test_dataset])
    y_test= torch.stack([val[2] for val in test_dataset])

    text_output = text_encoder(text_enc)
    image_output = image_encoder(image_enc)
    outputs = output_model(text_output, image_output)
    _, predicted = outputs.max(1)
    print_metrics(predicted, y_test)

In [14]:
text_encoder.load_state_dict(torch.load('best_model_text.pth'))
image_encoder.load_state_dict(torch.load('best_model_img.pth'))
output_model.load_state_dict(torch.load('best_model_total.pth'))
text_encoder.eval()
image_encoder.eval()
output_model.eval()

Outputter(
  (model): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): Dropout(p=0.2, inplace=False)
    (4): ReLU()
    (5): Linear(in_features=32, out_features=16, bias=True)
    (6): Dropout(p=0.5, inplace=False)
    (7): Linear(in_features=16, out_features=2, bias=True)
    (8): Dropout(p=0.3, inplace=False)
    (9): ReLU()
    (10): Softmax(dim=None)
  )
)

In [15]:
evaluate_model(test_dataset)

accuracy: 0.68
f1 score: 0.6404494382022472
AUROC: 0.6854166666666666
Recall: 0.7125
Precision: 0.5816326530612245


In [16]:
evaluate_model(val_dataset)

accuracy: 0.634
f1 score: 0.5924276169265034
AUROC: 0.6379327530068444
Recall: 0.6584158415841584
Precision: 0.5384615384615384


  return self._call_impl(*args, **kwargs)


In [17]:
evaluate_model(test_dataset_unseen)

accuracy: 0.676
f1 score: 0.5462184873949579
AUROC: 0.6514532821014017
Recall: 0.5752212389380531
Precision: 0.52


  return self._call_impl(*args, **kwargs)


In [18]:
evaluate_model(val_dataset_unseen)

accuracy: 0.662962962962963
f1 score: 0.5404040404040404
AUROC: 0.6377847650688182
Recall: 0.5459183673469388
Precision: 0.535


  return self._call_impl(*args, **kwargs)


This model performs better than all the models that were trained only on text or only on image. This is because this model takes into account both modalities- text and image, and takes their dot product. We are able to train and optimise the three constituent models individually and this allows for further flexibility.