# Install dependencies and mount Google Drive

In [None]:
%%capture
# The cell magic above has to stay on the first line of the cell; it captures
# this cell's output so the pip and drive-mount logs are not displayed.

# Install Hugging Face transformers (no version is pinned here).
!pip install transformers

# Mount Google Drive; every path used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Experiment configuration: which data folder to evaluate, how many classes
# it has, and the file names of the saved models and the evaluation set.
num_classes = 2
folder_name = '1_2'
data_eval = 'test.csv'
model_name = 'mixtext_model.pt'
bert_fine_model_name = 'bert_model.pt'

# path folder with models and data
PATH = f'/content/drive/MyDrive/Masterthesis/MixText/data/{folder_name}/'

# Method to plot number of patents used vs accuracy

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

def plot_confidence(probabilities, true):
  """Plot accuracy against the number of patents kept as the confidence cut-off rises.

  For every threshold ``c`` in ``np.arange(0, 1, 0.005)`` only the rows whose
  highest score is at least ``c`` are kept. The accuracy on the kept rows is
  plotted against how many rows were kept.

  Args:
    probabilities: 2-D array-like (rows = samples, columns = classes) of
      per-class scores. Column ``j`` is treated as class label ``j + 1``.
    true: array-like of true class labels, one per row. If it is 2-D (for
      example a one-column DataFrame) the first column is used.

  Returns:
    None. The figure is shown with ``plt.show()``.
  """
  # Plain numpy is enough here, so the function does not rely on pandas
  # having been imported by some other cell before it is called.
  scores = np.asarray(probabilities)
  if scores.ndim == 1:
    scores = scores.reshape(-1, 1)
  labels = np.asarray(true)
  if labels.ndim > 1:
    labels = labels[:, 0]

  # None of these depend on the threshold, so compute them once instead of
  # once per threshold.
  top_score = scores.max(axis=1)
  predictions = scores.argmax(axis=1) + 1
  is_correct = predictions == labels

  acc = []
  patents_included = []
  for c in np.arange(0, 1, 0.005):
    keep = top_score >= c
    kept = int(np.count_nonzero(keep))
    if kept == 0:
      # Thresholds only increase, so no later threshold keeps anything
      # either. Stopping here avoids averaging an empty selection, which
      # would yield NaN and a RuntimeWarning.
      break
    acc.append(is_correct[keep].mean())
    patents_included.append(kept)

  plt.title('Accuracy with unconfident predictions removed')
  plt.xlabel('Number of patents included')
  plt.ylabel('Accuracy')
  plt.plot(patents_included, acc);
  plt.show();

# Method to evaluate models

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def model_eval(true, probabilities):
  """Print accuracy and F1 scores, draw the confusion matrix and the confidence plot.

  Args:
    true: array-like of true class labels.
    probabilities: 2-D array of per-class scores, one row per sample.
      Column ``j`` is mapped to class label ``j + 1``.

  Returns:
    None. Results are printed and plotted.
  """
  pred = np.argmax(probabilities, axis=1) + 1

  # create confusion matrix and get accuracy
  cf_matrix = confusion_matrix(true, pred)
  accuracy  = np.trace(cf_matrix) / float(np.sum(cf_matrix))

  # get axis for confusion matrix: with no explicit `labels`, confusion_matrix
  # orders rows/columns by the sorted labels that occur in `true` or `pred`,
  # so the same sorted union gives the matching tick labels (instead of the
  # 0-based positions that 'auto' would print).
  labels = np.union1d(true, pred)

  # each F1 variant is computed once and reused below
  f1_macro = f1_score(true, pred, average='macro')
  f1_micro = f1_score(true, pred, average='micro')

  print(round(f1_macro*100,1), "/", round(f1_micro*100,1), sep='')

  sns.heatmap(cf_matrix, annot=True, fmt='g', xticklabels=labels, yticklabels=labels);
  plt.ylabel('True label')
  plt.xlabel('Predicted label')
  plt.show()

  # accuracy
  print('Accuracy: ', accuracy)

  #f1 score
  print('F1 score macro: ', f1_macro)
  print('F1 score micro: ', f1_micro)

  plot_confidence(probabilities, true)


# PATH for evaluation

# Mixtext model

In [None]:
# Change the working directory to the MixText code folder — presumably so that
# the `mixtext` module imported in the next cell can be found.
%cd /content/drive/MyDrive/Masterthesis/MixText/code

In [None]:
import torch
import torch.nn as nn
from mixtext import MixText

# Build MixText on the GPU inside a DataParallel wrapper, restore the saved
# weights and switch to evaluation mode.
# NOTE(review): the wrapper is applied before load_state_dict — presumably
# the checkpoint was saved from a DataParallel model; confirm.
model = nn.DataParallel(MixText(num_classes, True).cuda())
model.load_state_dict(torch.load(PATH + model_name))
model.eval();

In [None]:
from transformers import AutoTokenizer
import pandas as pd

# create a tokenizer from patent bert
tokenizer = AutoTokenizer.from_pretrained('anferico/bert-for-patents', do_lower_case=True)

# load the test data for the mixtext model
# (header=None: columns are addressed by position; later cells read column 1
# as the label and column 2 as the patent text)
df = pd.read_csv(PATH + data_eval, header=None)
df.head()

In [None]:
# number of rows in the evaluation set
len(df)

In [None]:
# frequency of each value in column 4
# NOTE(review): the meaning of column 4 is not visible here — confirm against the CSV schema
df[4].value_counts()

### Process data to be put into the MixText model

In [None]:
from keras.preprocessing.sequence import pad_sequences
from sys import float_repr_style
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
MAX_LEN = 256
batch_size = 32

# tokenize all the patent text (column 2 of the test file)
# truncation is requested explicitly: passing max_length alone makes the
# tokenizer warn and rely on an implicit fallback strategy
test_input_ids = [
    tokenizer.encode(
        str(sen),
        add_special_tokens = True,
        max_length = MAX_LEN,
        truncation = True,
    )
    for sen in df[2]
]

# get the true labels (column 1 of the test file)
test_labels = df[1].to_numpy().astype(int)

# pad the sequences so they all are the same length
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN,
                               dtype='long', truncating='post', padding='post')

# create a attention mask for each text: 1.0 for real tokens, 0.0 for the
# zero padding added above (computed on the whole array at once instead of
# token by token)
test_attention_masks = (test_input_ids > 0).astype('float32')

# convert to tensors
test_inputs = torch.tensor(test_input_ids)
test_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(test_labels)

# create a dataloader for the test set; the sequential sampler keeps the
# batches in file order
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

### Prediction with MixText model

In [None]:
from scipy.special import softmax
import numpy as np

# use a GPU to evaluate
device = torch.device('cuda')

# Per-batch results are collected in lists and joined once after the loop;
# concatenating onto a growing array in every iteration copies all earlier
# results again each time.
batch_probabilities = []
batch_labels = []

# get the model to predict
for batch in test_dataloader:
  b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

  # NOTE(review): b_input_mask is not passed to the model — confirm that
  # MixText's forward is meant to be called with the input ids only.
  with torch.no_grad():
    logits = model(b_input_ids)

  # softmax turns each row of logits into class probabilities
  batch_probabilities.append(softmax(logits.detach().cpu().numpy(), axis=1))
  batch_labels.append(b_labels.to('cpu').numpy())

# numpy arrays for true labels and the models predictions
# (empty arrays when the dataloader yielded no batch)
probabilities = np.concatenate(batch_probabilities) if batch_probabilities else np.array([])
true_labels = np.concatenate(batch_labels) if batch_labels else np.array([])

In [None]:
# Diagnostic only: show the GPU device name TensorFlow reports.
# The result is not used by the PyTorch evaluation in this notebook.
import tensorflow as tf
tf.test.gpu_device_name()

In [None]:
# Diagnostic only: list the local devices TensorFlow reports for this runtime.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

### Save predictions and true labels

In [None]:
# Predicted class per sample: index of the highest probability, shifted by
# one so it lines up with the label values it is compared against.
mixtext_pred = probabilities.argmax(axis=1) + 1

In [None]:
# fraction of test samples whose predicted class equals the true label
np.mean(mixtext_pred == true_labels)

### Evaluation of MixText model

In [None]:
# Evaluate the MixText probabilities, then show how many samples were scored.
model_eval(true_labels, probabilities)
len(true_labels)

# Fine-tuned BERT

In [None]:
from transformers import BertForSequenceClassification
# load the saved fine-tuned BERT classifier and put it into evaluation mode
# NOTE(review): num_labels is num_classes + 1 — presumably because the class
# labels start at 1, which leaves output index 0 unused (only output columns
# 1 and 2 are evaluated later); confirm against the fine-tuning code.
model2 = BertForSequenceClassification.from_pretrained(
    'anferico/bert-for-patents',
    num_labels = num_classes + 1,
    output_attentions = False,
    output_hidden_states = False
).cuda()
model2.load_state_dict(torch.load(PATH + bert_fine_model_name))
model2.eval();

In [None]:
# create a tokenizer from patent bert
# (same call as in the MixText section; repeated here)
tokenizer = AutoTokenizer.from_pretrained('anferico/bert-for-patents', do_lower_case=True)

# load the test data for the fine-tuned BERT model
# (the same file that was evaluated with the MixText model)
df = pd.read_csv(PATH + data_eval, header=None)
df.head()

In [None]:
MAX_LEN = 256
batch_size = 32

# tokenize all the patent text (column 2 of the test file)
# truncation is requested explicitly: passing max_length alone makes the
# tokenizer warn and rely on an implicit fallback strategy
test_input_ids = [
    tokenizer.encode(
        str(sen),
        add_special_tokens = True,
        max_length = MAX_LEN,
        truncation = True,
    )
    for sen in df[2]
]

# get the true labels (column 1 of the test file)
test_labels = df[1].to_numpy().astype(int)

# pad the sequences so they all are the same length
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN,
                               dtype='long', truncating='post', padding='post')

# create a attention mask for each text: 1.0 for real tokens, 0.0 for the
# zero padding added above (computed on the whole array at once instead of
# token by token)
test_attention_masks = (test_input_ids > 0).astype('float32')

# convert to tensors
test_inputs = torch.tensor(test_input_ids)
test_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(test_labels)

# create a dataloader for the test set; the sequential sampler keeps the
# batches in file order
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
from scipy.special import softmax
import numpy as np

# use a GPU to evaluate
device = torch.device('cuda')

# Per-batch results are collected in lists and joined once after the loop;
# concatenating onto a growing array in every iteration copies all earlier
# results again each time.
batch_guesses = []
batch_labels = []

# get the model to predict
for batch in test_dataloader:
  b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

  # NOTE(review): the attention mask is built and moved to the GPU but never
  # handed to the model, so padding positions are not masked out. If the
  # model was fine-tuned with attention_mask=..., pass b_input_mask here too.
  with torch.no_grad():
    # the first element of the model output holds the logits
    logits = model2(b_input_ids)[0]

  # softmax turns each row of logits into class probabilities
  batch_guesses.append(softmax(logits.detach().cpu().numpy(), axis=1))
  batch_labels.append(b_labels.to('cpu').numpy())

# numpy arrays for true labels and the models predictions
# (empty arrays when the dataloader yielded no batch)
guesses_bert = np.concatenate(batch_guesses) if batch_guesses else np.array([])
true_labels_bert = np.concatenate(batch_labels) if batch_labels else np.array([])

In [None]:
# Only output columns 1 and 2 of the three-output BERT model are evaluated;
# model_eval maps column j of what it receives to label j + 1.
# NOTE(review): output index 0 is dropped — presumably unused because labels
# start at 1; the two remaining columns are not renormalised.
model_eval(true_labels_bert, guesses_bert[:,[1,2]])

# Label MixText model 

In [None]:
import pickle

# load the model
# FIX THE PATH
# NOTE: unpickling can execute arbitrary code — only load model files that
# you created yourself.
# The with-block closes the file handle once the model has been read.
with open(PATH + '5_label_model.sav', 'rb') as model_file:
  loaded_model = pickle.load(model_file)

In [None]:
# The label table is read from a fixed path (it is matched to the test set by
# id below) and reduced to one row per id.
label_data = pd.read_csv(
    '/content/drive/MyDrive/Masterthesis/data/destilled_and_labels.csv'
).drop_duplicates(subset=['id'])

# Left-join the labels onto the test rows (test column 3 against `id`) and
# keep the columns from position 14 onwards.
# NOTE(review): 14 is a positional cut-off — presumably the first feature
# column of the merged frame; confirm against both CSV schemas.
test_data = pd.read_csv(PATH + data_eval, header = None)
test_label_data = test_data.merge(
    label_data, how = 'left', left_on = [3], right_on = 'id'
).iloc[:, 14:]

In [None]:
# inspect the feature columns that are passed to the label model below
test_label_data

In [None]:
# get per-class probabilities from the label model for every test row
label_proba = loaded_model.predict_proba(test_label_data)

### Evaluation of Label MixText model

In [None]:
# Evaluate the label model against the true labels collected in the MixText
# prediction loop (both follow the row order of the test file — the label
# features come from a left merge on the test rows).
model_eval(true_labels, label_proba)

# Combination of MixText model and Label model

### Evaluation of the combined models

In [None]:
# Average the two models' class probabilities instead of summing them. The
# predicted class is the same, but the scores stay within [0, 1]; with the
# plain sum the largest score per row is at least 1, so every confidence
# threshold (all below 1) in the confidence plot would keep every patent.
model_eval(true_labels, (probabilities + label_proba) / 2)

# Combination of all three models

In [None]:
import pandas as pd
import numpy as np

import re

# open LOTClass prediction probabilities
df = pd.read_csv('/content/drive/MyDrive/Masterthesis/LOTClass/datasets/' + folder_name + '/probabilities.csv')

# A float literal, optionally in scientific notation (e.g. 0.1234, 1., 9.87e-05).
_FLOAT_LITERAL = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')

def _parse_probability(cell, offset=7):
  """Return the number that starts at `offset` inside a stored probability string.

  The whole literal is read, so values written in scientific notation or with
  fewer/more than six characters are parsed correctly instead of being cut by
  a fixed-width slice. If no number starts at `offset`, the previous
  fixed-width slice is tried so malformed cells still raise ValueError.
  NOTE(review): offset 7 presumably skips a 'tensor(' prefix — confirm
  against the CSV contents.
  """
  text = str(cell)
  match = _FLOAT_LITERAL.match(text, offset)
  return float(match.group(0) if match else text[offset:offset + 6])

# Build the (n_samples, 2) array in one go instead of appending row by row,
# which copies the whole array on every iteration.
results = np.array(
    [[_parse_probability(p0), _parse_probability(p1)] for p0, p1 in zip(df['0'], df['1'])],
    dtype=float,
).reshape(-1, 2)

In [None]:
# Evaluate the LOTClass probabilities on their own.
# NOTE(review): assumes the rows of probabilities.csv are in the same order
# as the test file that produced true_labels — confirm.
model_eval(true_labels, results)

In [None]:
# Average the three models' class probabilities instead of summing them. The
# ranking of the classes is the same, but the scores stay within [0, 1]; with
# the plain sum the largest score per row exceeds 1, so every confidence
# threshold (all below 1) in the confidence plot would keep every patent.
model_eval(true_labels, (probabilities + label_proba + results) / 3)