In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import transformers
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import evaluate
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import confusion_matrix, classification_report
from datetime import datetime
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define path to label mapping
label_mapping_path = './label_mapping_binary.json'

# Check if GPU is available
if (torch.backends.mps.is_available()) and (torch.backends.mps.is_built()): # type: ignore (pylance confused about torch.backends.mps)
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

#device = torch.device("cpu")
print(f'Using device: {device}')

# Load X, y training and validation data
data_dir = 'data_binary'
X_test = pd.read_csv(f'./{data_dir}/test.csv')
y_test = pd.read_csv(f'./{data_dir}/test_labels.csv')

Using device: mps


In [3]:
model_path = '/Users/anvil/Documents/python/github/colasreqs/checkpoints/2023-10-04_15-59-AND_lr=1e-06/model'

In [4]:
with open(label_mapping_path, 'r') as f:
    label_mapping = json.load(f)
num_labels = len(label_mapping.values())
print(f'Number of labels: {num_labels}')

Number of labels: 2


In [5]:
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large', do_lower_case=True)

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)
model.to(device)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


BartForSequenceClassification(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): Layer

In [27]:
test_dataset = Dataset.from_pandas(X_test)

# Encode text
test_encodings = tokenizer(test_dataset['text'], truncation=True, padding=True, max_length=512)

# Create conversion class

class BertProcessedDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]).to(device) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).to(device)
        return item

    def __len__(self):
        return len(self.labels)

# Convert encodings to PyTorch tensors
test_dataset = BertProcessedDataset(test_encodings, y_test['label'].tolist())



In [35]:
training_args = TrainingArguments(
    output_dir=f'./',
    per_device_eval_batch_size=8,
    use_mps_device=True,
    eval_accumulation_steps=50,
)
trainer = Trainer(
        model=model,
        args=training_args,
)

In [36]:
X_test.shape

(3112, 1)

In [37]:
y_pred = trainer.predict(test_dataset)

In [38]:
predictions, label_ids, metrics = y_pred

In [47]:
pred_frame = pd.DataFrame()

In [48]:
pred_frame['text'] = X_test['text']
pred_frame['y_true'] = y_test
pred_frame['y_pred'] = label_ids

In [52]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [51]:
accuracy_score(pred_frame['y_true'], pred_frame['y_pred'])

1.0

In [53]:
precision_score(pred_frame['y_true'], pred_frame['y_pred'])

1.0

In [54]:
recall_score(pred_frame['y_true'], pred_frame['y_pred'])

1.0

In [55]:
confusion_matrix(pred_frame['y_true'], pred_frame['y_pred'])

array([[2285,    0],
       [   0,  827]])

In [60]:
id_to_label = {v: i for i,v in label_mapping.items()}
id_to_label

{0: 'Other', 1: 'Heading'}

In [71]:
pred_frame_decoded = pred_frame.copy()
pred_frame_decoded['y_true'] = pred_frame_decoded['y_true'].map(id_to_label)
pred_frame_decoded['y_pred'] = pred_frame_decoded['y_pred'].map(id_to_label)

In [76]:
pred_frame_decoded['Prediction is mistaken (1 if prediction is mistaken, 0 if prediction is right)'] = (
    pred_frame_decoded['y_true'] != pred_frame_decoded['y_pred']
).astype(int)

In [77]:
pred_frame_decoded

Unnamed: 0,text,y_true,y_pred,"Prediction is mistaken (1 if prediction is mistaken, 0 if prediction is right)"
0,(f) dc switch gear for the regenerati...,Other,Other,0
1,(4) the contractor shall be responsib...,Other,Other,0
2,as for specifically designed for lighting cont...,Other,Other,0
3,► motor operated lbd units\ndc tracti...,Other,Other,0
4,preventive measures shall be taken to avoid th...,Other,Other,0
...,...,...,...,...
3107,(a) the boarding step made of aluminu...,Other,Other,0
3108,(7) emergency mode\n(a) in e...,Other,Other,0
3109,(1) the contractor shall conduct firs...,Other,Other,0
3110,ato shall control the opening and closing of p...,Other,Other,0


In [78]:
pred_frame_decoded.to_excel('./binary_model_predictions.xlsx', index=False)

In [79]:
X_train = pd.read_csv(f'./{data_dir}/train.csv')
X_val = pd.read_csv(f'./{data_dir}/val.csv')

In [80]:
X_train.shape

(9956, 1)

In [81]:
X_val.shape

(2490, 1)