# Necessary

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm

In [2]:
train_file = pd.read_json("../nyt_dataset/train_data.json",encoding="utf-8", orient='records')
valid_file = pd.read_json("../nyt_dataset/valid_data.json",encoding="utf-8", orient='records')
test_file = pd.read_json("../nyt_dataset/test_data.json",encoding="utf-8", orient='records')

In [3]:
train_tokens = [train_file['text'][i] for i in range(len(train_file['text']))]
train_label = [train_file['relation_list'][i][0]['predicate'] for i in range(len(train_file['relation_list']))]
valid_tokens = [valid_file['text'][i] for i in range(len(valid_file['text']))]
valid_label = [valid_file['relation_list'][i][0]['predicate'] for i in range(len(valid_file['relation_list']))]
test_tokens = [test_file['text'][i] for i in range(len(test_file['text']))]
test_label = [test_file['relation_list'][i][0]['predicate'] for i in range(len(test_file['relation_list']))]

In [4]:
training_df = pd.DataFrame(); testing_df = pd.DataFrame(); validating_df = pd.DataFrame()
training_df['Text'] = train_tokens; training_df['Ori_Label'] = train_label
validating_df['Text'] = valid_tokens; validating_df['Ori_Label'] = valid_label
testing_df['Text'] = test_tokens; testing_df['Ori_Label'] = test_label

In [5]:
# Make a list of all of the unique labels in the training and testing dataframes
labellist=sorted(list(set(training_df['Ori_Label'].unique()).union(set(testing_df['Ori_Label'].unique()))))
# Create a label dictionary
labels={label:i for i,label in enumerate(labellist)}
reverse_index={value:key for (key,value)in labels.items()}

In [6]:
tokenizer=BertTokenizer.from_pretrained('bert-base-cased')
# Create a dataset class that inherits the properties of the Dataset class in Torch to pre-process and store the data
class Dataset(torch.utils.data.Dataset):
    def __init__(self,df,column='Text'):
        # Convert label names to label indices using the `labels` dictionary 
        self.labels=[labels[label] for label in df['Ori_Label']]
        # Tokenize the text data using the BERT tokenizer
        self.texts=[tokenizer(text.lower(),padding='max_length',max_length=512,truncation=True,return_tensors="pt") for text in df[column]]
    def classes(self):
        return self.labels
    def __len__(self):
        return len(self.labels)
    def get_batch_labels(self,idx):
        return np.array(self.labels[idx])
    def get_batch_texts(self,idx):
        return self.texts[idx]
    def __getitem__(self,idx):
        batch_texts=self.get_batch_texts(idx)
        batch_y=self.get_batch_labels(idx)
        return batch_texts,batch_y
# Create training and test datasets using the defined `Dataset` class
train_data=Dataset(training_df)
valid_data=Dataset(validating_df)
test_data=Dataset(testing_df)

# BERT

In [7]:
# Prepare inputs for the specific device (GPU or CPU) on which the model will run. Pre-check that GPU/CUDA is enabled
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
def prepare_inputs(input1,label,device):
  label=label.to(device)
  mask=input1['attention_mask'].to(device)
  input_id=input1['input_ids'].squeeze(1).to(device)
  return (input_id,mask,label)

In [9]:
# Create a classification layer on top of BERT
class BertClassifier(nn.Module):
    # Define Bert model, Dropout layer, linear layer and activation function
    def __init__(self,dropout=0.5,num_classes=2):
        super(BertClassifier,self).__init__()
        self.bert=BertModel.from_pretrained('bert-base-cased')
        self.dropout=nn.Dropout(dropout)
        self.linear=nn.Linear(768,num_classes)
        self.relu=nn.ReLU()
    # Define the computational flow of the model    
    def forward(self,input_id,mask):
        last_hidden_layer,pooled_output = self.bert(input_ids=input_id,attention_mask=mask,return_dict=False)
        dropout_output=self.dropout(pooled_output)
        linear_output=self.linear(dropout_output)
        final_layer=self.relu(linear_output)
        return final_layer

In [10]:
# Use the definitions of the initialisation method and the forward method of the BertClassifier class to sketch out what the neural network architecture looks like.
def train(model, train_data,val_data,learning_rate,epochs,batchSize):
    train_dataloader=torch.utils.data.DataLoader(train_data,batch_size=batchSize,shuffle=True)
    val_dataloader=torch.utils.data.DataLoader(val_data,batch_size=batchSize)
    # Determine if cuda can be called, and call cpu if cuda cannot be called
    use_cuda=torch.cuda.is_available()
    device=torch.device("cuda" if use_cuda else "cpu")
    # Define loss function, optimizer and learning rate
    criterion=nn.CrossEntropyLoss()
    optimizer=Adam(model.parameters(),lr=learning_rate)
    if use_cuda:
        model=model.cuda()
        criterion=criterion.cuda()
        
    for epoch_num in range(epochs):
        # Initialize the cumulative training set accuracy and training set loss
        total_acc_train=0
        total_loss_train=0
        # Set the model to training mode
        model.train()
        for train_input,train_label in tqdm(train_dataloader):
            input_id,mask, train_label=prepare_inputs(train_input,train_label,device)
            output=model(input_id,mask)
            # Pass the input data to the model for forward propagation, calculate the loss, and accumulate the training set accuracy and training set loss
            batch_loss=criterion(output,train_label.long())
            total_loss_train +=batch_loss.item()
            acc=(output.argmax(dim=1)==train_label).sum().item()
            total_acc_train+=acc
            # Clear the calculated gradient information
            model.zero_grad()
            # Calculate the gradient of the loss function with respect to the model parameters
            batch_loss.backward()
            # Update the parameters in the model to make it more optimal towards the training goal
            optimizer.step()
        # Initialize the cumulative validation set accuracy and validation set loss    
        total_acc_val=0
        total_loss_val=0
        # Set the model to evaluation mode
        model.eval()
        # Close gradient calculation
        with torch.no_grad():
            # Iterate through all the data in the validation set to evaluate the performance of the model
            for val_input,val_label in val_dataloader:
                input_id,mask, val_label=prepare_inputs(val_input,val_label,device)
                output=model(input_id,mask)
                # Calculate and accrue losses
                batch_loss=criterion(output,val_label.long())
                total_loss_val+=batch_loss.item()
                # Cumulative number of correctly predicted samples
                acc=(output.argmax(dim=1)==val_label).sum().item()
                total_acc_val+=acc
        print(f'Epochs: {epoch_num+1} | Train Loss: {total_loss_train / len(train_data):.3f} | Train Accuracy: {total_acc_train/len(train_data):.3f}')
        print(f'Val loss: {total_loss_val/len(val_data):.3f} | Val Accuracy: {total_acc_val / len(val_data):.3f}')
      

# Hyperparameter

In [11]:
# Define the number of epochs, the learning rate and an instance of BertClassifier network.
model=BertClassifier(num_classes=len(labels.keys()))
model=model.to(device)
EPOCHS=6
LR=1e-5
batch_size=2

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Train

In [12]:
# Train the BERT model
train(model,train_data,valid_data,LR,EPOCHS,batch_size)

100%|██████████| 28098/28098 [1:38:56<00:00,  4.73it/s]


Epochs: 1 | Train Loss: 0.431 | Train Accuracy: 0.723
Val loss: 0.323 | Val Accuracy: 0.750


100%|██████████| 28098/28098 [1:37:53<00:00,  4.78it/s]


Epochs: 2 | Train Loss: 0.281 | Train Accuracy: 0.780
Val loss: 0.301 | Val Accuracy: 0.772


100%|██████████| 28098/28098 [1:36:09<00:00,  4.87it/s]


Epochs: 3 | Train Loss: 0.221 | Train Accuracy: 0.814
Val loss: 0.288 | Val Accuracy: 0.781


100%|██████████| 28098/28098 [1:35:05<00:00,  4.92it/s]


Epochs: 4 | Train Loss: 0.184 | Train Accuracy: 0.837
Val loss: 0.304 | Val Accuracy: 0.776


100%|██████████| 28098/28098 [1:35:17<00:00,  4.91it/s]


Epochs: 5 | Train Loss: 0.158 | Train Accuracy: 0.857
Val loss: 0.311 | Val Accuracy: 0.780


100%|██████████| 28098/28098 [1:34:42<00:00,  4.95it/s]


Epochs: 6 | Train Loss: 0.136 | Train Accuracy: 0.878
Val loss: 0.351 | Val Accuracy: 0.772


## Save Model

In [13]:
# Save model
output_dir = "bert-base-cased-textM-E" + str(EPOCHS)
torch.save(model,output_dir)

# Test

In [8]:
# Loda model
class BertClassifier(nn.Module):
    # Define Bert model, Dropout layer, linear layer and activation function
    def __init__(self,dropout=0.5,num_classes=2):
        super(BertClassifier,self).__init__()
        self.bert=BertModel.from_pretrained('bert-base-cased')
        self.dropout=nn.Dropout(dropout)
        self.linear=nn.Linear(768,num_classes)
        self.relu=nn.ReLU()
    # Define the computational flow of the model    
    def forward(self,input_id,mask):
        last_hidden_layer,pooled_output = self.bert(input_ids=input_id,attention_mask=mask,return_dict=False)
        dropout_output=self.dropout(pooled_output)
        linear_output=self.linear(dropout_output)
        final_layer=self.relu(linear_output)
        return final_layer

EPOCHS = 6
input_dir = "bert-base-cased-textM-E" + str(EPOCHS)
complete_model = torch.load(input_dir)

In [16]:
batchsize=2
# Model evaluation function
def evaluate(model,test_dataset):
    model.eval()
    test_dataloader=torch.utils.data.DataLoader(test_dataset,batch_size=batchsize)
    
    # Determine if cuda can be called, and call cpu if cuda cannot be called
    use_cuda=torch.cuda.is_available()
    device=torch.device("cuda" if use_cuda else "cpu")
    
    if use_cuda:
        model=model.cuda()
        
    total_acc_test=0
    with torch.no_grad():
        count=0
        predictions=[]
        for test_input,test_label in tqdm(test_dataloader):
            count+=batchsize
            test_label=test_label.to(device)
            mask=test_input['attention_mask'].to(device)
            input_id=test_input['input_ids'].squeeze(1).to(device)
            output=model(input_id,mask)
            # save the prediction for further analysis
            predictions.append(output.argmax(dim=1))  
            acc=(output.argmax(dim=1)==test_label).sum().item()
            
            total_acc_test+=acc
            
    # Show label prediction accuracy        
    print(f'Test accuracy: {total_acc_test/len(test_dataset): .3f}')
    return predictions

In [17]:
# Test set label prediction
predictions = evaluate(complete_model, test_data)

100%|██████████| 2500/2500 [03:39<00:00, 11.37it/s]

Test accuracy:  0.757





In [18]:
# Add the predicted label for each test item to the dataframe with the test data
flattened=[]
for batch in predictions:
    for pred in batch:
        flattened.append(reverse_index[pred.item()])
testing_df['Predict_Label']=flattened

In [None]:
result_pre_label = testing_df['Predict_Label']
result_ori_label = testing_df['Ori_Label']
resule_labels = list(set(list(result_ori_label)))

# Save Predict Result

In [30]:
# Save the dataframe with test data and predicted label
test_dir = "BERT_result_TextM_E" + str(EPOCHS) + ".csv"
testing_df.to_csv(test_dir)

# Load Predict

In [9]:
try:
  test_dir = "BERT_result_TextM_E" + str(EPOCHS) + ".csv"
  result_file_dir = test_dir
  result_file = pd.read_csv(result_file_dir)
  result_pre_label = result_file['Predict_Label']
  result_ori_label = result_file['Ori_Label']
  resule_labels = list(set(list(result_ori_label)))
except:
  print('Select File Wrong!')

# Evaluation

In [21]:
tp={}; fp={}; fn={}; tn={}
# Iteration statistics prediction results
for label1,pred1 in zip(result_ori_label,result_pre_label):
  for label in resule_labels:
    if label1==label:
      if pred1==label:
        tp[label]=tp.get(label,0)+1
      else:
        fn[label]=fn.get(label,0)+1
    else:
      if pred1==label:
        fp[label]=fp.get(label,0)+1
      else:
        tn[label]=tn.get(label,0)+1
# Computational performance evaluation metrics
precision = {label:value/(value+fp.get(label,0)) for label,value in tp.items()}
recall = {label:value/(value+fn.get(label,0)) for label,value in tp.items()}
f1 = {label:(2*value*recall.get(label,0))/(value+recall.get(label,0)) for label,value in precision.items()}
accuracy = {'test2_accuracy':sum(tp.values())/len(result_pre_label) for label,value in precision.items()}
f1_weighted = sum([value*len(result_file[result_file['Ori_Label']==label])/len(result_file) for label,value in f1.items()])

## Evaluation Result

In [22]:
accuracy

{'test2_accuracy': 0.7574}

In [23]:
precision

{'/people/person/children': 0.8055555555555556,
 '/location/location/contains': 0.869155206286837,
 '/business/person/company': 0.7567567567567568,
 '/people/person/nationality': 0.8507462686567164,
 '/people/person/place_lived': 0.7762376237623763,
 '/people/person/place_of_birth': 0.5849056603773585,
 '/location/neighborhood/neighborhood_of': 0.48120300751879697,
 '/location/country/capital': 0.5343137254901961,
 '/location/country/administrative_divisions': 0.27715355805243447,
 '/people/deceased_person/place_of_death': 0.3333333333333333,
 '/business/company/founders': 0.5,
 '/business/company/place_founded': 0.4857142857142857,
 '/location/administrative_division/country': 0.3170731707317073,
 '/sports/sports_team_location/teams': 0.6666666666666666,
 '/business/company/advisors': 1.0,
 '/business/company_shareholder/major_shareholder_of': 0.43478260869565216,
 '/people/person/religion': 0.5714285714285714}

In [24]:
recall

{'/people/person/children': 0.9354838709677419,
 '/location/location/contains': 0.8540540540540541,
 '/business/person/company': 0.9333333333333333,
 '/people/person/nationality': 0.8807947019867549,
 '/people/person/place_lived': 0.795131845841785,
 '/people/person/place_of_birth': 0.3875,
 '/location/neighborhood/neighborhood_of': 0.3764705882352941,
 '/location/country/capital': 0.40671641791044777,
 '/location/country/administrative_divisions': 0.5211267605633803,
 '/people/deceased_person/place_of_death': 0.5294117647058824,
 '/business/company/founders': 0.3793103448275862,
 '/business/company/place_founded': 0.5862068965517241,
 '/location/administrative_division/country': 0.16666666666666666,
 '/sports/sports_team_location/teams': 0.3076923076923077,
 '/business/company/advisors': 0.6666666666666666,
 '/business/company_shareholder/major_shareholder_of': 0.7692307692307693,
 '/people/person/religion': 0.8}

In [25]:
f1

{'/people/person/children': 0.8656716417910448,
 '/location/location/contains': 0.8615384615384616,
 '/business/person/company': 0.835820895522388,
 '/people/person/nationality': 0.8655097613882864,
 '/people/person/place_lived': 0.7855711422845693,
 '/people/person/place_of_birth': 0.46616541353383456,
 '/location/neighborhood/neighborhood_of': 0.4224422442244224,
 '/location/country/capital': 0.461864406779661,
 '/location/country/administrative_divisions': 0.36185819070904646,
 '/people/deceased_person/place_of_death': 0.409090909090909,
 '/business/company/founders': 0.4313725490196078,
 '/business/company/place_founded': 0.53125,
 '/location/administrative_division/country': 0.21848739495798317,
 '/sports/sports_team_location/teams': 0.42105263157894735,
 '/business/company/advisors': 0.8,
 '/business/company_shareholder/major_shareholder_of': 0.5555555555555555,
 '/people/person/religion': 0.6666666666666666}

In [26]:
f1_weighted

0.7536582269247721