In [62]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [63]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

In [64]:
import torch
import torch.nn as nn
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [65]:
import numpy as np
import pandas as pd

In [66]:
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None) 

In [67]:
train = pd.read_pickle("train_hierarchial.pkl")

In [68]:
train=train.sample(1000)

In [69]:
post,context, label = train.post.values, train.context, train.label

In [70]:
# from transformers import BertModel

# from transformers import BertTokenizer, BertModel

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# bert = BertModel.from_pretrained('bert-base-uncased')

# def bert_representations(sentence):
#     input = tokenizer(sentence, return_tensors = "pt")
#     outputs = bert(**input)
#     return outputs.last_hidden_state[:,0,:]

In [71]:
import tensorflow as tf
import tensorflow_hub as hub

from bert import bert_tokenization

class MuRIL:
    def __init__(self, df, model_name, avg_pooling=False):
        self.tokenizer = None
        self.model = None
        self.tokenized_padded_text = None
        self.attention_mask = None
        self.textip = None
        self.pooledOp = None
        self.input_dfs = None
        self.data_frame = df
        self.feature_df = None
        self.model_name = None
        self.InitModel(model_name, avg_pooling)

    def InitModel(self, model_name, avg_pooling):
        
        if model_name == 'muril':
            self.textip = tf.keras.layers.Input(shape=(), dtype=tf.string)
            self.max_seq_length = 128
            muril_model, muril_layer = self.init_muril(
                model_url="https://tfhub.dev/google/MuRIL/1", max_seq_length=self.max_seq_length,
                avg_pooling=avg_pooling)
            vocab_file = muril_layer.resolved_object.vocab_file.asset_path.numpy()
            do_lower_case = muril_layer.resolved_object.do_lower_case.numpy()
            self.tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)
            self.model_name = 'muril'
            self.model = muril_model
            self.avg_pooling = avg_pooling

    def tokenize(self, column):
        tokenized_text = column.apply((lambda x: self.tokenizer.encode(x, add_special_tokens=True)))
 
        max_len = 0
        for i in tokenized_text.values:
            if len(i) > max_len:
                max_len = len(i)

        self.tokenized_padded_text = np.array([i + [0]*(max_len-len(i)) for i in tokenized_text.values])
        self.create_attention_mask()

    def create_attention_mask(self):
        self.attention_mask = np.where(self.tokenized_padded_text != 0, 1, 0)
        self.input_ids = torch.tensor(self.tokenized_padded_text)
        self.attention_mask = torch.tensor(self.attention_mask)

    def init_muril(self, model_url, max_seq_length, avg_pooling):
        inputs = dict(
            input_word_ids=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
            input_mask=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
            input_type_ids=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
        )

        muril_layer = hub.KerasLayer(model_url, trainable=True)
        outputs = muril_layer(inputs)
        print(outputs)
        assert 'sequence_output' in outputs
        assert 'pooled_output' in outputs
        assert 'encoder_outputs' in outputs
        assert 'default' in outputs
        if avg_pooling:
            return tf.keras.Model(inputs=inputs, outputs=outputs["encoder_outputs"]), muril_layer
        else:
            return tf.keras.Model(inputs=inputs, outputs=outputs["pooled_output"]), muril_layer

    def create_input(self, input_strings, tokenizer, max_seq_length):
        input_ids_all, input_mask_all, input_type_ids_all = [], [], []
        for input_string in input_strings:
            input_tokens = ["[CLS]"] + tokenizer.tokenize(input_string) + ["[SEP]"]
            input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
            sequence_length = min(len(input_ids), max_seq_length)

            if len(input_ids) >= max_seq_length:
                input_ids = input_ids[:max_seq_length]
            else:
                input_ids = input_ids + [0] * (max_seq_length - len(input_ids))

            input_mask = [1] * sequence_length + [0] * (max_seq_length - sequence_length)

            input_ids_all.append(input_ids)
            input_mask_all.append(input_mask)
            input_type_ids_all.append([0] * max_seq_length)

        return np.array(input_ids_all), np.array(input_mask_all), np.array(input_type_ids_all)

    def encode(self, input_text):
        input_ids, input_mask, input_type_ids = self.create_input(input_text,
                                                                  self.tokenizer,
                                                                  self.max_seq_length)
        inputs = dict(
            input_word_ids=input_ids,
            input_mask=input_mask,
            input_type_ids=input_type_ids,
        )
        return self.model(inputs)

    def GetFeatures(self, input=None):
        if self.model_name != 'muril':
            with torch.no_grad():
                last_hidden_states = self.model(self.input_ids, attention_mask=self.attention_mask)
            last_hidden_states['last_hidden_state'].size()
            self.features = last_hidden_states[0][:, 0, :].numpy()
            self.features = pd.DataFrame(self.features)
            
        elif self.model_name == 'muril':
            embeddings = self.encode(input)
            if not self.avg_pooling:
                self.features = pd.DataFrame(embeddings.numpy())
            else:
                f1 = embeddings[7][:, 0, :].numpy()
                f2 = embeddings[6][:, 0, :].numpy()
                f3 = embeddings[5][:, 0, :].numpy()
                self.features = pd.DataFrame((f1+f2+f3)/3)
        return self.features

In [72]:
class CustomBERTModel(nn.Module):
    def __init__(self):
          super(CustomBERTModel, self).__init__()
     
          self.linear1 = nn.Linear(1536, 768)
          self.linear2 = nn.Linear(768, 300)
          self.linear3 = nn.Linear(1068, 768)
          self.linear4 = nn.Linear(768,2)
          self.outputlayer = nn.Softmax(dim=1)

    def concatenate(self, tensor1, tensor2):
      return torch.cat((tensor1, tensor2), 1)
      
    def forward(self, post=None, context=None, text=None, only_post = False, post_context=False, post_context_text=False):

      if only_post:
        input_post_embeddings = post
        
        linear4_output = self.linear4(input_post_embeddings)

        outputs = self.outputlayer(linear4_output)

      if post_context:

        input_post_embeddings = post

        input_context_embeddings = context

        post_context = self.concatenate(input_post_embeddings, input_context_embeddings)

        linear1_output = self.linear1(post_context)

        linear4_output = self.linear4(linear1_output)

        outputs = self.outputlayer(linear4_output)

      if post_context_text:

        input_post_embeddings = post

        input_context_embeddings = context

        input_text_embeddings = text

        context_text = self.concatenate(input_context_embeddings, input_post_embeddings)

        linear1_output = self.linear1(context_text)

        linear2_output = self.linear2(linear1_output)

        post_context_text = self.concatenate(input_text_embeddings, linear2_output)

        linear3_output = self.linear3(post_context_text)

        linear4_output = self.linear4(linear3_output)

        outputs = self.outputlayer(linear4_output)

      return outputs

In [73]:
model = CustomBERTModel()

In [74]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-7:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 8 different named parameters.

==== Embedding Layer ====

linear1.weight                                           (768, 1536)
linear1.bias                                                  (768,)
linear2.weight                                            (300, 768)
linear2.bias                                                  (300,)
linear3.weight                                           (768, 1068)

==== First Transformer ====

linear3.bias                                                  (768,)
linear4.weight                                              (2, 768)
linear4.bias                                                    (2,)

==== Output Layer ====

linear1.bias                                                  (768,)
linear2.weight                                            (300, 768)
linear2.bias                                                  (300,)
linear3.weight                                           (768, 1068)
linear3.bias                            

In [75]:
learning_rate = 1e-3
epochs = 4

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)

In [76]:
model = model.to(device)

RuntimeError: CUDA error: out of memory

In [None]:
muril = MuRIL(train,model_name='muril',avg_pooling=True)

In [31]:
print("Training....")

for epoch in range(epochs):
    total_train_loss=0

    print("==========Epochs:{}===========".format(epoch)) 

    for post, context,text, label in zip(train["post"], train["context"], train["text"], train["label"]):

      if post!="" and context=="" and text=="":

        input_post=muril.GetFeatures([post]).iloc[0].to_list()
        input_post = torch.FloatTensor([input_post]).to(device)

        logits = model(input_post, only_post=True)
        

      if post!="" and context!="" and text=="":

        input_post=muril.GetFeatures([post]).iloc[0].to_list()
        input_post = torch.FloatTensor([input_post]).to(device)

        input_context=muril.GetFeatures([context]).iloc[0].to_list()
        input_context = torch.FloatTensor([input_context]).to(device)

        logits = model(input_post, input_context,post_context=True)

      if post!="" and context!="" and text!="":

        input_post=muril.GetFeatures([post]).iloc[0].to_list()
        input_post = torch.Tensor([input_post]).to(device)

        input_context=muril.GetFeatures([context]).iloc[0].to_list()
        input_context = torch.Tensor([input_context]).to(device)

        input_text=muril.GetFeatures([text]).iloc[0].to_list()
        input_text = torch.Tensor([input_text]).to(device)

        logits = model(input_post, input_context,input_text, post_context_text=True)
       
      print(logits)

      one_hot = torch.zeros(1, 2)

      one_hot[0, label] = 1.

      one_hot = one_hot.to(device)
        
      loss = criterion(input = logits, target = one_hot)

      optimizer.zero_grad()

      loss.backward()

      optimizer.step()

      total_train_loss += loss
      
      torch.save(model.state_dict(), "rev_muril_model_adamw.pth")
    print("Training Loss: {}".format((total_train_loss/len(train)).item()))

Training....


NameError: name 'muril' is not defined

In [None]:
torch.save(model.state_dict(), "rev_muril_model_adamw.pth")

In [None]:
model = CustomBERTModel()
model.load_state_dict(torch.load("muril_model_adamw.pth"))

In [None]:
# Testing Setup

In [None]:
test = pd.read_pickle("test_hierarchial.pkl")

In [None]:
from sklearn.metrics import accuracy_score, f1_score

print("Testing....")

predictions = []
labels = []

model.eval()

model=model.to(device)

with torch.no_grad():
  for post, context, text, label in zip(test["post"], test["context"], test["text"], test["label"]):

    if post!="" and context=="" and text=="":

      input_post=muril.GetFeatures([post]).iloc[0].to_list()
      input_post = torch.FloatTensor([input_post]).to(device)

      logits = model(input_post, only_post=True)
        
    if post!="" and context!="" and text=="":

      input_post=muril.GetFeatures([post]).iloc[0].to_list()
      input_post = torch.FloatTensor([input_post]).to(device)

      input_context=muril.GetFeatures([context]).iloc[0].to_list()
      input_context = torch.FloatTensor([input_context]).to(device)

      logits = model(input_post, input_context,post_context=True)

    if post!="" and context!="" and text!="":

      input_post=muril.GetFeatures([post]).iloc[0].to_list()
      input_post = torch.Tensor([input_post]).to(device)

      input_context=muril.GetFeatures([context]).iloc[0].to_list()
      input_context = torch.Tensor([input_context]).to(device)

      input_text=muril.GetFeatures([text]).iloc[0].to_list()
      input_text = torch.Tensor([input_text]).to(device)

      logits = model(input_post, input_context,input_text, post_context_text=True)

    y_pred = torch.argmax(logits, dim=-1).item()

    print(logits, y_pred, label)

    predictions.append(y_pred)

    labels.append(label)

  print("Accuracy: {}".format(accuracy_score(predictions, labels)))
  print("F1 Score: {}".format(f1_score(predictions, labels)))

In [None]:
import pickle

with open('Muril_1e-4.pkl', 'wb') as f:
  pickle.dump(predictions, f)