In [None]:
import numpy as np
import torch
from keras.utils import pad_sequences
from transformers import RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer, BertForSequenceClassification


In [None]:
import pandas as pd

# Read the full corpus once (kept as `datat` for the label lookup later on),
# then keep only the 'text' column of the rows marked as the test split.
datat = pd.read_csv('ILDC.csv')
data = datat.loc[datat['split'] == 'test', ['text']]
data.head()


In [None]:
# Sanity check: (rows, columns) of the test-split frame; it should have a single 'text' column.
data.shape


In [4]:
def input_id_maker(dataf, tokenizer, max_len=512):
  """Tokenize every document in ``dataf['text']`` into a fixed-width id matrix.

  Each text is tokenized with ``tokenizer.tokenize``. When a document has more
  than ``max_len - 2`` tokens only its LAST ``max_len - 2`` tokens are kept
  (the head of the document is dropped, not the tail). The kept tokens are
  wrapped as ``[CLS] ... [SEP]``, converted to ids and right-padded with 0.

  Args:
    dataf: DataFrame (or mapping) whose ``'text'`` entry iterates over strings.
    tokenizer: object exposing ``tokenize``, ``convert_tokens_to_ids``,
      ``cls_token`` and ``sep_token``.
    max_len: width of the returned matrix, including the CLS and SEP tokens.
      Defaults to 512, the value that was previously hard-coded.

  Returns:
    ``(input_ids, lengths)`` where ``input_ids`` is an int64 NumPy array of
    shape ``(len(dataf), max_len)`` and ``lengths`` is a list with the unpadded
    length (CLS and SEP included) of each row.

  Raises:
    ValueError: if ``max_len`` cannot hold the CLS and SEP tokens.
  """
  if max_len < 2:
    raise ValueError("max_len must be at least 2 to fit the CLS and SEP tokens")

  body_len = max_len - 2
  # The special tokens do not change per document, so look them up once.
  cls_tok = tokenizer.cls_token
  sep_tok = tokenizer.sep_token

  encoded = []
  lengths = []
  for text in dataf['text']:
    tokens = tokenizer.tokenize(text)
    if len(tokens) > body_len:
      # Keep the tail of the document. Slicing from an explicit start index
      # (rather than `tokens[-body_len:]`) stays correct when body_len == 0.
      tokens = tokens[len(tokens) - body_len:]
    ids = tokenizer.convert_tokens_to_ids([cls_tok] + tokens + [sep_tok])
    encoded.append(ids)
    lengths.append(len(ids))

  # Right-pad with 0: att_masking() treats ids > 0 as real tokens. The dtype is
  # pinned to int64 so torch.tensor() always yields a LongTensor, independent of
  # what the platform's C "long" happens to be.
  input_ids = np.zeros((len(encoded), max_len), dtype=np.int64)
  for row, ids in enumerate(encoded):
    input_ids[row, :len(ids)] = ids
  return input_ids, lengths


def att_masking(input_ids):
  """Build one attention mask per row of ``input_ids``.

  A position gets 1 when its token id is strictly greater than 0 and 0
  otherwise, so padding written as id 0 is masked out.

  Args:
    input_ids: iterable of rows, each an iterable of token ids.

  Returns:
    A list of lists of ints (0/1) with the same layout as ``input_ids``.
  """
  return [[int(tok > 0) for tok in row] for row in input_ids]


In [5]:

# Directory holding the fine-tuned checkpoint and its tokenizer files.
MODEL_DIR = "Transformers_GPU/bert/bert-large-uncased_L2e-06_E15_B8"

config = MODEL_DIR + "/config.json"
vocab = MODEL_DIR + "/vocab.txt"
# NOTE(review): BertTokenizer is built from the vocab file alone. This path is
# kept only so the name stays defined; it is not passed to the tokenizer.
merges = MODEL_DIR + "/merges.txt"

# The second positional parameter of BertTokenizer is `do_lower_case`, not a
# merges file. Passing the merges path there only worked because a non-empty
# string is truthy. State the intent explicitly (the checkpoint is "uncased").
tokenizer = BertTokenizer(vocab, do_lower_case=True)

# SECURITY: torch.load unpickles a whole Python object here, which can execute
# arbitrary code. Only load checkpoints from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects fully pickled models like this one - confirm against the installed version.
model = torch.load(MODEL_DIR + "/model_bert_8_15.bin", map_location=torch.device('cpu'))

# A pickled module keeps whatever train/eval flag it was saved with; force eval
# mode so dropout is disabled and predictions are deterministic.
model.eval();

In [6]:

# Build the attention mask from the NumPy id matrix BEFORE converting to a
# tensor: iterating a torch tensor element by element creates a 0-d tensor for
# every single token, which is far slower and yields exactly the same mask.
input_ids_np, lengths = input_id_maker(data, tokenizer)

input_masks = torch.tensor(att_masking(input_ids_np))
input_ids = torch.tensor(input_ids_np)

In [None]:
# Score the test set in mini-batches. A single forward pass over every
# 512-token document at once needs memory proportional to the whole split and
# will not fit on typical hardware.
BATCH_SIZE = 8

logit_chunks = []
with torch.no_grad():
    for start in range(0, input_ids.shape[0], BATCH_SIZE):
        stop = start + BATCH_SIZE
        batch_out = model(input_ids[start:stop], token_type_ids=None,
                          attention_mask=input_masks[start:stop])
        # Element 0 of the model output is what the next cell reads as the logits.
        logit_chunks.append(batch_out[0])

if not logit_chunks:
    raise ValueError("no rows to score: input_ids is empty")

# Keep the `outputs[0]` access used downstream working: a 1-tuple whose first
# element is the logits of all rows, in the original row order.
outputs = (torch.cat(logit_chunks, dim=0),)
print(outputs)


In [8]:
# Take the first element of the model output (used as the logits), drop any
# autograd history, move it to host memory and expose it as a NumPy array.
logits = outputs[0].detach().cpu().numpy()


In [9]:
# Predicted class per row = index of that row's largest logit.
prediction = list(map(np.argmax, logits))

In [10]:
# Gold labels for the same test rows (same filter as the text frame), as an int array.
datal = datat.loc[datat['split'].eq('test')]
labels = datal['label'].to_numpy().astype('int')

In [None]:
# Fraction of test rows whose predicted class equals the gold label.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=labels, y_pred=prediction)

print("Accuracy: {:.2f}%".format(100 * accuracy))


In [None]:
from sklearn.metrics import (precision_score, recall_score, f1_score, classification_report,
                             confusion_matrix, average_precision_score)
import json
import matplotlib.pyplot as plt
import seaborn as sns

# `labels` are the gold classes and `prediction` the argmax classes for the test rows.

# Support-weighted precision, recall and F1 over the hard predictions.
precision = precision_score(labels, prediction, average='weighted')
recall = recall_score(labels, prediction, average='weighted')
f1 = f1_score(labels, prediction, average='weighted')

# NOTE(review): target_names are applied in sorted-label order, i.e.
# 0 -> 'Accepted' and 1 -> 'Rejected'. Confirm this matches the dataset's label encoding.
class_report = classification_report(labels, prediction, target_names=['Accepted', 'Rejected'])

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(labels, prediction)

# Average precision is a ranking metric: it needs a continuous score for the
# positive class, not the 0/1 predictions. The logit margin (class 1 minus
# class 0) orders the rows exactly like the softmax probability of class 1.
assert logits.ndim == 2 and logits.shape[1] == 2, "expected one logit per class for a binary task"
positive_scores = logits[:, 1] - logits[:, 0]
avg_precision = average_precision_score(labels, positive_scores)

# Collect everything in one JSON-serialisable dictionary.
results_dict = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1,
    'classification_report': class_report,
    'confusion_matrix': conf_matrix.tolist()
}
results_dict['average_precision'] = avg_precision

print(results_dict)

# Save results to a JSON file
with open('metrics_results.json', 'w') as json_file:
    json.dump(results_dict, json_file, indent=4)

# Plot the confusion matrix with one tick label per class actually present in
# the matrix (the previous hard-coded list named three classes for a 2x2 matrix).
class_ticks = ['Class {}'.format(i) for i in range(conf_matrix.shape[0])]
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_ticks,
            yticklabels=class_ticks)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

# A ROC curve sweeps a decision threshold over continuous scores. Feeding it the
# hard 0/1 predictions collapses the curve to a single operating point, so use
# the logit margin (class 1 minus class 0) instead; it ranks the rows exactly
# like the softmax probability of class 1.
assert logits.ndim == 2 and logits.shape[1] == 2, "expected one logit per class for a binary task"
positive_scores = logits[:, 1] - logits[:, 0]

fpr, tpr, _roc_thresholds = roc_curve(labels, positive_scores)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
# Diagonal = performance of a random classifier, for reference.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.savefig('roc_curve.png')
plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve

# The precision-recall curve needs a continuous score per row; with hard 0/1
# predictions it degenerates to a single operating point. The logit margin
# (class 1 minus class 0) ranks the rows like the softmax probability of class 1.
assert logits.ndim == 2 and logits.shape[1] == 2, "expected one logit per class for a binary task"
positive_scores = logits[:, 1] - logits[:, 0]

# NOTE: `precision` and `recall` are rebound here from the scalar summary
# metrics to per-threshold arrays, as in the original cell.
precision, recall, _pr_thresholds = precision_recall_curve(labels, positive_scores)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='blue', lw=2)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.savefig('precision_recall_curve.png')
plt.show()
