# Legal Rhetorical Roles Classification using LEGAL-BERT and LEGAL-ToBERT

In [1]:
import os

import joblib

from rhetorical_roles_classification import (
    RhetoricalRolesDataset,
    RhetoricalRolesDatasetForTransformerOverBERT
)
from rhetorical_roles_classification.test import test_BERT, test_ToBERT

In [2]:
DATA_FOLDER = "./BUILD/data"
MODELS_FOLDER = "./models/eng"

In [3]:
BERT, BERT_config = joblib.load(
    os.path.join(MODELS_FOLDER, "LEGAL-BERT.joblib")
)
ToBERT, ToBERT_config = joblib.load(
    os.path.join(MODELS_FOLDER, "LEGAL-ToBERT.joblib")
)

In [4]:
BERT_test_dataset = RhetoricalRolesDataset(
    data_filepath=os.path.join(DATA_FOLDER, "test.csv"),
    max_segment_length=ToBERT_config["max_segment_length"],
    tokenizer_model_name=ToBERT_config["tokenizer_model_name"]
)
ToBERT_test_dataset = RhetoricalRolesDatasetForTransformerOverBERT(
    data_filepath=os.path.join(DATA_FOLDER, "test.json"),
    max_document_length=ToBERT_config["max_document_length"],
    max_segment_length=ToBERT_config["max_segment_length"],
    tokenizer_model_name=ToBERT_config["tokenizer_model_name"]
)

In [5]:
test_BERT(
    model=BERT,
    test_dataset=BERT_test_dataset,
    label2rhetRole=BERT_config["label2rhetRole"]
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████████████████████████████████████████████████████████████████| 23/23 [10:30<00:00, 27.43s/it]


Overall
	Accuracy: 0.6561306009030914
	MCC: 0.5594308542642985
	Macro F1: 0.4723346093512131
	Macro Precision: 0.5321535407608603
	Macro Recall: 0.45666527835444104
	Micro F1: 0.6561306009030914
	Micro Precision: 0.6561306009030914
	Micro Recall: 0.6561306009030914
Rhetorical role: PREAMBLE
	Macro F1: 0.8368535342545244
	Macro Precision: 0.874517200031333
	Macro Recall: 0.8096196827146922
Rhetorical role: FAC
	Macro F1: 0.7893681195430859
	Macro Precision: 0.7712810714837115
	Macro Recall: 0.8162551934124282
Rhetorical role: RLC
	Macro F1: 0.6334970322033109
	Macro Precision: 0.7031749288174369
	Macro Recall: 0.6019678135959164
Rhetorical role: ISSUE
	Macro F1: 0.8895394343068495
	Macro Precision: 0.9261040334357218
	Macro Recall: 0.8589395546129375
Rhetorical role: ARG_PETITIONER
	Macro F1: 0.6460517504760709
	Macro Precision: 0.6503813463465081
	Macro Recall: 0.6419900320398718
Rhetorical role: ARG_RESPONDENT
	Macro F1: 0.49667832167832165
	Macro Precision: 0.4934004862799583
	Macro 

In [6]:
test_ToBERT(
    model=ToBERT,
    test_dataset=ToBERT_test_dataset,
    label2rhetRole=ToBERT_config["label2rhetRole"]
)

100%|██████████████████████████████████████████████████████████████████████| 30/30 [46:13<00:00, 92.46s/it]


Overall
	Accuracy: 0.7846474470302188
	MCC: 0.7267670113883938
	Macro F1: 0.5738055350853982
	Macro Precision: 0.6233189100752514
	Macro Recall: 0.564134905480945
	Micro F1: 0.7846474470302188
	Micro Precision: 0.7846474470302188
	Micro Recall: 0.7846474470302188
Rhetorical role: PREAMBLE
	Macro F1: 0.972360717424781
	Macro Precision: 0.9650979366930781
	Macro Recall: 0.9800347539328567
Rhetorical role: FAC
	Macro F1: 0.8728314702807327
	Macro Precision: 0.8770990744745795
	Macro Recall: 0.868752906061106
Rhetorical role: RLC
	Macro F1: 0.7119394276215356
	Macro Precision: 0.7941883986660105
	Macro Recall: 0.6678897250614649
Rhetorical role: ISSUE
	Macro F1: 0.886315915977784
	Macro Precision: 0.875589205357262
	Macro Recall: 0.8977023683280312
Rhetorical role: ARG_PETITIONER
	Macro F1: 0.5753200137675577
	Macro Precision: 0.6372785829307568
	Macro Recall: 0.553760870670803
Rhetorical role: ARG_RESPONDENT
	Macro F1: 0.6979748896386189
	Macro Precision: 0.6810762903052533
	Macro Recall: