# 1.0 Install and import library

In [None]:

!pip install sentencepiece
!pip install transformers
!pip install fire


In [4]:
import torch
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange

## Define tokenizer, dataloader, bertmodel

In [228]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bertmodel=BertModel.from_pretrained('bert-base-cased').to(device)
for name, param in bertmodel.named_parameters():
  param.requires_grad=True

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# 2.0 Prepare dataset into the train, test

## Read original texts, labels

In [6]:
# Load original texts and labels
df=pd.read_json('All_Beauty_5.json', lines=True)
texts = [' '.join([str(i),str(j)]) for i,j in zip(df['reviewText'],df['summary'])]
labels = [i for i in df['overall']]

# Classes are imbalanced, so we need to remove some samples. 
new_texts=[]
new_labels=[]
N_5=0
for i in range(len(labels)):
  if labels[i]==5:
    N_5+=1
    if N_5<156:new_texts.append(texts[i]);new_labels.append(labels[i])
  else: new_texts.append(texts[i]);new_labels.append(labels[i])

# Assign new texts and labels as our dataset
texts=new_texts
labels=new_labels

## We want to get several things before injecting KG_embeddings.
* `list_entities`: for each input, it contains a list of entities that appear in the list. like `[[I love], [you]]`
* `entities_list_inputs_bert_ids`: for each input, it contains a list of bert_ids of entities. Like `[[3599,129],[9982]]`
* `entities_token_indexes`: for each input, it contains a list of position ids where an entity appears in the input. Like `[[8,9],[11]]`. This is obtained by comparing the bert_ids of entities (above) and bert_ids of the input.
* `list_kg_embeddings`: for each input, it contains a list of embeddings of the entities trained from KG. Like `[[emb_3599, emb_129],[emb_9982]]`
* So for each input, we want to combine the `list_kg_embeddings` with the original embeddings.

In [7]:
#Create bert_tokenizer ids of the entities
KG=torch.load('/content/data_for_KGE_training.pt')
entities=KG['entities']
#list: [1533, m]
bert_ids_of_entities=[]
for i in entities:
  bert_ids_of_entities.append(tokenizer(i, return_tensors="pt")['input_ids'][0:,1:-1].squeeze(0).tolist())

In [8]:
# for getting the following three lists
list_entities=[]
entities_list_inputs_bert_ids=[]
entities_token_indexes=[]

for i in texts:
  input_id=tokenizer(i,padding=False, max_length=512, truncation=True, return_tensors="pt")['input_ids'].squeeze(0).to(device) #[seq]
  a=input_id.tolist()
  entities_token_index=[]
  entities_list_inputs=[]
  list_entity=[]

  for j in range (len(bert_ids_of_entities)):
    b=bert_ids_of_entities[j]
    if (' '+(' ').join([str(i) for i in b])+' ') in ((' ').join([str(i) for i in a])):
      entity_id=[0]*len(b)
      entity_id[0]=a.index(b[0])
      for k in range(1,len(b)):
        entity_id[k]=entity_id[k-1]+1
      entities_token_index.append(entity_id)
      entities_list_inputs.append(b)
      list_entity.append(entities[j])
  entities_token_indexes.append(entities_token_index)
  entities_list_inputs_bert_ids.append(entities_list_inputs)
  list_entities.append(list_entity)

In [9]:
# for getting list_kg_embeddings
embeddings=torch.load('/content/KG_embeddings.pt')
list_kg_embeddings=[]
for i in list_entities:
  list_kg_embedding=[]
  for j in i:
    list_kg_embedding.append(embeddings[j])
  list_kg_embeddings.append(list_kg_embedding)

## Integrate all things into data list

In [10]:
# Map labels
target_names = list(set(labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}
print(label2idx)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4}


In [11]:
#create data list, which can be used for dataloader. 
labels_tensor=[torch.tensor(label2idx[i]) for i in labels]
data=[[i,j,k,l,m,n]for i,j,k,l,m,n in zip (texts,labels_tensor,list_entities,entities_list_inputs_bert_ids,entities_token_indexes,list_kg_embeddings)]
#divide into train, test
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, test_size=0.13, random_state=0)

# 3.0 Create the model and define the training process

## Create our model using Class

### model0 

In [240]:
# model0:insert kg_embedding after transformer, the pass through another transformer
class Bert_KG0(torch.nn.Module):
    def __init__(self,bertmodel): 
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features=150, out_features=768, bias=True)
        self.bertmodel=bertmodel
        self.linear2 = torch.nn.Linear(in_features=768, out_features=5, bias=True)

    def forward(self, batch): #input_ids,[m,768]
        input_id=tokenizer(batch[0], return_tensors="pt")['input_ids'].to(device) #[seq]
        logits0 = bertmodel(input_id[:,:512])[0].squeeze(0) #[seq 768]
        logits=logits0.clone()
        for j in range(len(batch[-1])):
          embedding_to_add=self.linear1(torch.tensor(batch[-1][j]).float().to(device))
          for k in range(len(batch[-2][j])):
            logits[batch[-2][j][k]]=(0.7*logits[batch[-2][j][k]]+0.3*embedding_to_add)
        logits=bertmodel.encoder(logits.unsqueeze(0))[0].squeeze(0)
        CLS=logits[0]
        logits=self.linear2(CLS) # [m,5]
        return logits #(m,5)
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(0)
model=Bert_KG0(bertmodel).to(device)

### model1

In [223]:
# model1: just bert with linear layer
class Bert_KG1(torch.nn.Module):
    def __init__(self,bertmodel): 
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features=150, out_features=768, bias=True)
        self.bertmodel=bertmodel
        self.linear2 = torch.nn.Linear(in_features=768, out_features=5, bias=True)

    def forward(self, batch): #input_ids,[m,768]
        input_id=tokenizer(batch[0], return_tensors="pt")['input_ids'].to(device) #[seq]
        logits0 = bertmodel(input_id[:,:512])[0].squeeze(0) #[seq 768]
        CLS=logits0[0]
        logits=self.linear2(CLS) # [m,5]
        return logits #(m,5)
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(0)
model=Bert_KG1(bertmodel).to(device)

### model 2

In [237]:
# model2:insert the kg_embedding into before transformer
class Bert_KG2(torch.nn.Module):
    def __init__(self,bertmodel): 
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features=150, out_features=768, bias=True)
        self.bertmodel=bertmodel
        self.linear2 = torch.nn.Linear(in_features=768, out_features=5, bias=True)

    def forward(self, batch): #input_ids,[m,768]
        input_id=tokenizer(batch[0], return_tensors="pt")['input_ids'].to(device) #[seq]
        logits0 = bertmodel.embeddings(input_id[:,:512])[0].squeeze(0) #[seq 768]
        logits=logits0.clone()
        for j in range(len(batch[-1])):
          embedding_to_add=self.linear1(torch.tensor(batch[-1][j]).float().to(device))
          for k in range(len(batch[-2][j])):
            logits[batch[-2][j][k]]=(0.7*logits[batch[-2][j][k]]+0.3*embedding_to_add)
        logits=bertmodel.encoder(logits.unsqueeze(0))[0].squeeze(0)
        CLS=logits[0]
        logits=self.linear2(CLS) # [m,5]
        return logits #(m,5)
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(0)
model=Bert_KG2(bertmodel).to(device)

## Define the training process

In [231]:
def train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer,criterion, val_dataloader,len_val):
    for epoch in range(epochs):
        print(f"Training epoch {epoch+1}")
        model.train()
        loss_accumulate=0
        for i, batch in enumerate(train_dataloader):
            logits = model(batch) #m,5
            targets=batch[1].to(device) #m
            loss = criterion(logits.unsqueeze(0),targets)/batchsize_grad
            loss_accumulate+=loss
            loss.backward() #The gradients are computed when we call loss. backward() and are stored by PyTorch until we call optimizer.
            
            if (i+1) % batchsize_grad == 0: #when accumulated batch=16, we do optimizer after 16 batches of gradients are accumulated
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()
                if (i+1) % 100 == 0: print (i+1, loss_accumulate)
                loss_accumulate=0
                
        #for evaluate the model after an epoch
        model.eval()
        accuracy=0
        for i, batch in enumerate(val_dataloader):
            with torch.no_grad():
              logits = model(batch) #[m,5]
            softmaxed=torch.softmax(logits,-1) #[m,5]
            predict_label=torch.argmax(softmaxed,-1).to('cpu')
            targets=batch[1].to('cpu') #m
            from sklearn.metrics import accuracy_score
            accuracy+=accuracy_score([targets[0]],[predict_label])
        print ("accuracy",accuracy/len_val)

# 4.0 Start training:remember to save the model

In [None]:
batch_size=1
batchsize_grad=10
epochs=12
lr=5e-5
torch.manual_seed(0)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(test_data, batch_size=1, shuffle=True)
len_val=len(val_dataloader)
criterion=torch.nn.CrossEntropyLoss(weight=torch.tensor([1, 1, 1, 1, 1],dtype=torch.float).to(device))
torch.manual_seed(0)
optimizer = AdamW(model.parameters(), lr=lr)
torch.manual_seed(0)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*epochs/batchsize_grad)

In [None]:
train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer,criterion,val_dataloader, len_val)

# 5.0 Result:

Refer to the 3 files. But the general result is as follows:
* KG_embeddings after transformer can get the best result (Model 1), the highest accuracy reach 95.5% at epoch 8. Time: 11m 23s
* KG_embeddings before transformer reach 93.1% at epoch 4. Time: 5m 40s
* Bert+linear reach 91.1% at epoch 4. Time: 5m 17s

# 6.0 Conclusion:
KG_embedding is useful, because it fully captures the correlation between different entities in the entire corpus. 