In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 4.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 35.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 64.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Unins

In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/iiitd_research'
import os
print(os.listdir())
os.chdir('gdrive/My Drive/iiitd_research')

Mounted at /content/gdrive
['.config', 'gdrive', 'sample_data']


In [None]:

import json
with open('./annotations/train2.jsonl') as f:
    data = [json.loads(line) for line in f]
print(data[4])
data_new = []
k=0
for i in data :

  i['entities']=i['hero']+i['villain']+i['victim']+i['other']
  #if len(i["OCR"])<=1500:
  for en in i['entities']:
      new_dict={}

      new_dict['image'] = i['image']
      new_dict['OCR'] = i['OCR']
      new_dict['entity'] = en
      if en in i['hero']:
        new_dict['label'] =0 #'hero'
      elif en in i['villain'] :
        new_dict['label'] =1 #'villain'
      elif en in i['victim'] :
        new_dict['label'] =2 #'victim'
      else:
        new_dict['label'] =3 #'other'

      data_new.append(new_dict)
  k=k+1

{'OCR': '"The power to create\nthis new world\nis not in our hopes,\nit\'s not in our dreams -\nit\'s in our hands."\n- JILL STEIN\n', 'image': 'memes_2208.png', 'hero': [], 'villain': [], 'victim': [], 'other': ['jil stein', 'jill stein']}


In [None]:
data_new

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
labels = {'hero':0,
          'villian':1,
          'victim':2,
          'other':3
          }

class Dataset(torch.utils.data.Dataset):

    def __init__(self, data_new):

        self.labels = [x['label'] for x in data_new]
        self.texts = [tokenizer(x['OCR']+x['entity'],
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for x in data_new]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        #self.dropout = nn.Dropout(dropout)
        #self.linear = nn.Linear(768, 5)
        # self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        #dropout_output = self.dropout(pooled_output)
        #linear_output = self.linear(dropout_output)
        # final_layer = self.relu(linear_output)

        return pooled_output

In [None]:
train_output=[]

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=4, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                train_output.append(output)
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()

                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()

            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()

                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc

            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')


In [None]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')


####################
df_train, df_val = data_new[:200], data_new[200:250]

print(len(df_train),len(df_val))#, len(df_test))
EPOCHS = 5
model = BertClassifier()
LR = 1e-6

train(model, df_train, df_val, LR, EPOCHS)
#evaluate(model, df_test)

200 50


Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 50/50 [00:18<00:00,  2.77it/s]


Epochs: 1 | Train Loss:  1.881 | Train Accuracy:  0.000 | Val Loss:  3.706 | Val Accuracy:  0.000


100%|██████████| 50/50 [00:18<00:00,  2.73it/s]


Epochs: 2 | Train Loss:  1.789 | Train Accuracy:  0.000 | Val Loss:  3.253 | Val Accuracy:  0.000


100%|██████████| 50/50 [00:18<00:00,  2.66it/s]


Epochs: 3 | Train Loss:  1.542 | Train Accuracy:  0.000 | Val Loss:  3.034 | Val Accuracy:  0.000


100%|██████████| 50/50 [00:18<00:00,  2.66it/s]


Epochs: 4 | Train Loss:  1.492 | Train Accuracy:  0.010 | Val Loss:  2.991 | Val Accuracy:  0.000


100%|██████████| 50/50 [00:18<00:00,  2.71it/s]


Epochs: 5 | Train Loss:  1.473 | Train Accuracy:  0.065 | Val Loss:  2.965 | Val Accuracy:  0.020


In [None]:
train_output

In [None]:
train_output[0]

tensor([[-0.5575,  0.4350,  0.9994,  ...,  0.9999, -0.9327,  0.9885],
        [-0.4474,  0.3238,  0.9992,  ...,  0.9998, -0.8576,  0.9872],
        [-0.7580,  0.4861,  0.9999,  ...,  1.0000, -0.2680,  0.9787],
        [-0.5086,  0.4510,  0.9997,  ...,  0.9999, -0.9379,  0.9944]],
       device='cuda:0', grad_fn=<TanhBackward0>)

In [None]:
len(train_output)

250

|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

bert embeddings other code

In [None]:
!python -m pip install transformers

from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import torch

bertmodel = BertModel.from_pretrained('bert-base-uncased',
           output_hidden_states = True,)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [None]:
def bert_text_preparation(text, tokenizer):
    """Preparing the input for BERT

    Takes a string argument and performs
    pre-processing like adding special tokens,
    tokenization, tokens to ids, and tokens to
    segment ids. All tokens are mapped to seg-
    ment id = 1.

    Args:
        text (str): Text to be converted
        tokenizer (obj): Tokenizer object
            to convert text into BERT-re-
            adable tokens and ids

    Returns:
        list: List of BERT-readable tokens
        obj: Torch tensor with token ids
        obj: Torch tensor segment ids


    """

    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer(marked_text, padding='max_length', max_length = 100, truncation=True,return_tensors="pt")
    # indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    # tokens_tensor = torch.tensor([indexed_tokens])
    # segments_tensors = torch.tensor([segments_ids])

    return tokenized_text#, tokens_tensor, segments_tensors

In [None]:
  # train_label = train_label.to(device)
  #               mask = train_input['attention_mask'].to(device)
  #               input_id = train_input['input_ids'].squeeze(1).to(device)

  #               output = model(input_id, mask)

In [None]:
def get_bert_embeddings(tokenized_text,model):#tokens_tensor, segments_tensors, model):
    """Get embeddings from an embedding model

    Args:
        tokens_tensor (obj): Torch tensor size [n_tokens]
            with token ids for each token in text
        segments_tensors (obj): Torch tensor size [n_tokens]
            with segment ids for each token in text
        model (obj): Embedding model to generate embeddings
            from token and segment ids

    Returns:
        list: List of list of floats of size
            [n_tokens, n_embedding_dimensions]
            containing embeddings for each token

    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    mask = tokenized_text['attention_mask'].to(device)
    input_id = tokenized_text['input_ids'].squeeze(1).to(device)
    # Gradient calculation id disabled
    # Model is in inference mode
    with torch.no_grad():
        #outputs = model(tokens_tensor, segments_tensors)
        outputs = model(input_id, mask)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

In [None]:
import os
import skimage
import IPython.display
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

from collections import OrderedDict
import torch

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

original_images = []
images = []
ocr = []
entity=[]
labels=[]

n=0
temp=''
for i in data_new:
    # n+=1
    # if n==1001:
    #   break

    if i['image'] not in os.listdir('images'):
        continue






       #image_input = torch.tensor(np.stack(images)).cuda()
     #original_images.append(image)
    # images.append(preprocess(image))
    # temp = i['image']


    ocr.append(i['OCR'])
    entity.append(i['entity'])
    labels.append(i['label'])




In [None]:
bert_train_embeddings = []
p=0
for text in ocr:
    tokenized_text = bert_text_preparation(text, tokenizer)
    list_token_embeddings = get_bert_embeddings(tokenized_text, bertmodel.cuda())
    p+=1
    if p%50==0:
      print('still running, at ',p)
    # Find the position 'bank' in list of tokens
    #word_index = tokenized_text.index('bank')
    # Get the embedding for bank
    #word_embedding = list_token_embeddings[word_index]
    # word=[]
    # np.sum(list_token_embeddings,axis=0)/100
    # for e in list_token_embeddings:
    #   word+=  e/len(list_token_embeddings)

    bert_train_embeddings.append(np.sum(list_token_embeddings,axis=0)/100)


still running, at  50
still running, at  100
still running, at  150
still running, at  200
still running, at  250
still running, at  300
still running, at  350
still running, at  400
still running, at  450
still running, at  500
still running, at  550
still running, at  600
still running, at  650
still running, at  700
still running, at  750
still running, at  800
still running, at  850
still running, at  900
still running, at  950
still running, at  1000
still running, at  1050
still running, at  1100
still running, at  1150
still running, at  1200
still running, at  1250
still running, at  1300
still running, at  1350
still running, at  1400
still running, at  1450
still running, at  1500
still running, at  1550
still running, at  1600
still running, at  1650
still running, at  1700
still running, at  1750
still running, at  1800
still running, at  1850
still running, at  1900
still running, at  1950
still running, at  2000
still running, at  2050
still running, at  2100
still runnin

In [None]:
len(bert_train_embeddings)

10280

In [None]:
print((bert_train_embeddings[0]))
print((bert_train_embeddings[1]))
print(len(bert_train_embeddings[2]))
print(len(bert_train_embeddings[3]))
print(len(bert_train_embeddings[4]))
print(len(bert_train_embeddings[5]))
print(len(bert_train_embeddings[6]))
print(len(bert_train_embeddings[6]))
print(len(bert_train_embeddings[23]))
print(len(bert_train_embeddings[22]))
print(len(bert_train_embeddings[12]))
print(len(bert_train_embeddings[35]))
print(len(bert_train_embeddings[45]))

[-6.31748836e-02  5.41181042e-02  3.48523440e-01  5.45532662e-02
  1.41182548e-01 -2.54798909e-01  1.00039121e-01  6.51276378e-01
  7.76337361e-02 -3.43260589e-01 -6.12750886e-02 -4.50686137e-01
 -4.44336240e-01  1.32623382e-01 -6.08658306e-02  3.33967544e-01
  8.63517017e-02  1.29008831e-01 -1.39632409e-01  3.85028542e-01
  1.21430747e-01  2.82636842e-02 -1.80261406e-01 -2.09685533e-01
  1.41027302e-01 -2.72387832e-02  3.12327214e-02 -2.52793065e-01
 -3.05056836e-02  1.13360344e-01  3.74761681e-01  1.60916616e-01
 -1.79938069e-02 -1.34708514e-01 -9.89248446e-02 -1.33927614e-01
 -8.39003858e-02 -7.48792787e-02 -8.92938461e-02  2.16883211e-01
 -5.24063379e-01 -3.52528966e-01  5.14559628e-02 -1.15608181e-01
 -3.17964216e-02 -4.37082762e-01  5.39770149e-01 -7.86113443e-02
  1.69981474e-01 -4.52476868e-02 -1.49194086e-01  1.48477758e-01
 -1.99722672e-01  1.14548801e-01 -1.44117180e-01  2.16935856e-01
 -1.42909446e-01 -5.65172546e-01 -2.73541110e-01 -2.64024428e-01
  3.36009480e-01 -1.34563

In [None]:
print(len(bert_train_embeddings[0][2]))
print(len(bert_train_embeddings[1][34]))
print(len(bert_train_embeddings[2][34]))
print(len(bert_train_embeddings[3][34]))
print(len(bert_train_embeddings[4][64]))
print(len(bert_train_embeddings[5][3]))
print(len(bert_train_embeddings[6][4]))
print(len(bert_train_embeddings[7][4]))
print(len(bert_train_embeddings[8]))
print(len(bert_train_embeddings[9]))
print(len(bert_train_embeddings[10]))
print(len(bert_train_embeddings[11]))
print(len(bert_train_embeddings[12]))
len(bert_train_embeddings[122])
len(bert_train_embeddings[233])
len(bert_train_embeddings[344])
len(bert_train_embeddings[422])
len(bert_train_embeddings[555])
len(bert_train_embeddings[633])
len(bert_train_embeddings[722])

TypeError: ignored

In [None]:
import pickle
with open("uspoliticsberttrainembeddings.pickle", 'wb') as f:
    pickle.dump(bert_train_embeddings, f)


In [None]:
imp = np.array(bert_train_embeddings)

In [None]:
imp[0][0]

TypeError: ignored

In [None]:
bert_entity_train_embeddings = []
p=0
for en in entity:
    tokenized_text = bert_text_preparation(en, tokenizer)
    list_token_embeddings = get_bert_embeddings(tokenized_text, bertmodel)
    p+=1
    if p%50==0:
      print('still running, at ',p)


    bert_entity_train_embeddings.append(np.sum(list_token_embeddings,axis=0)/100)


still running, at  50
still running, at  100
still running, at  150
still running, at  200
still running, at  250
still running, at  300
still running, at  350
still running, at  400
still running, at  450
still running, at  500
still running, at  550
still running, at  600
still running, at  650
still running, at  700
still running, at  750
still running, at  800
still running, at  850
still running, at  900
still running, at  950
still running, at  1000
still running, at  1050
still running, at  1100
still running, at  1150
still running, at  1200
still running, at  1250
still running, at  1300
still running, at  1350
still running, at  1400
still running, at  1450
still running, at  1500
still running, at  1550
still running, at  1600
still running, at  1650
still running, at  1700
still running, at  1750
still running, at  1800
still running, at  1850
still running, at  1900
still running, at  1950
still running, at  2000
still running, at  2050
still running, at  2100
still runnin

In [None]:
len(bert_entity_train_embeddings )
# print((bert_entity_train_embeddings[0]))

# print(len(bert_entity_train_embeddings[6]))


10280

In [None]:
import pickle
with open("uspoliticsbertentitytrainembeddings.pickle", 'wb') as f:
    pickle.dump(bert_entity_train_embeddings, f)
