In [2]:
import torch

# Fail fast if PyTorch cannot see a CUDA device.
assert torch.cuda.is_available()

# Report how many GPUs are visible and the name of the current one.
n_gpu = torch.cuda.device_count()
device_name = torch.cuda.get_device_name()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

# Handle for the default CUDA device.
device = torch.device("cuda")

Found device: Tesla T4, n_gpu: 1


In [3]:
!pip install torch
!pip install transformers
!pip install -U -q PyDrive
!pip install datasets
!pip install transformers accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Using cached datasets-2.12.0-py3-none-any.whl (474 kB)
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import os
import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification



In [5]:
# Entity classes used by the tagger; a label's position in this list is its integer id.
# The leading space in every name is part of the stored label string.
labels_list = [
  " OTHERS",
  " PETITIONER",
  " COURT",
  " RESPONDENT",
  " JUDGE",
  " OTHER_PERSON",
  " LAWYER",
  " DATE",
  " ORG",
  " GPE",
  " STATUTE",
  " PROVISION",
  " PRECEDENT",
  " CASE_NUMBER",
  " WITNESS",
]
# Identity mapping over the label ids (0..14).
label_encoding_dict = {label_id: label_id for label_id in range(len(labels_list))}
# id -> label-name lookup, derived from the list order.
label_list_encoding_dict = dict(enumerate(labels_list))

# ERROR ANALYSIS: Demonstrating Retrained Gold+legalBART Model Results on Indian Legal Examples

In [6]:
# pandas is already imported in the imports cell above; it is imported again here.
import pandas as pd

# Mount Google Drive inside the Colab runtime at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')
# Make the project folder on Drive the working directory. The checkpoint is later
# loaded through the relative path 'indian_legal_ner_retrain.model', so it
# presumably lives in this folder — TODO confirm.
%cd /content/drive/My Drive/Data_Augmentation_for_Low_Resource_Indian_Legal_NER

Mounted at /content/drive/
/content/drive/My Drive/Data_Augmentation_for_Low_Resource_Indian_Legal_NER


In [7]:
# Restore the tokenizer and the token-classification model from the saved checkpoint
# directory (path is relative to the current working directory).
tokenizer = AutoTokenizer.from_pretrained('indian_legal_ner_retrain.model')
model = AutoModelForTokenClassification.from_pretrained(
  'indian_legal_ner_retrain.model',
  num_labels=len(labels_list),
)
# Echo the loaded architecture as the cell output.
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [11]:
# Tokenizer/model pairs already read from disk, keyed by (checkpoint directory,
# number of labels), so repeated assign_labels() calls do not reload the checkpoint.
_NER_COMPONENT_CACHE = {}


def _load_ner_components(checkpoint_dir, num_labels):
  """Return the (tokenizer, model) pair for `checkpoint_dir`, loading it at most once."""
  cache_key = (checkpoint_dir, num_labels)
  if cache_key not in _NER_COMPONENT_CACHE:
    _NER_COMPONENT_CACHE[cache_key] = (
      AutoTokenizer.from_pretrained(checkpoint_dir),
      AutoModelForTokenClassification.from_pretrained(checkpoint_dir, num_labels=num_labels),
    )
  return _NER_COMPONENT_CACHE[cache_key]


def assign_labels(sentence, tokenizer=None, model=None, labels=None):
  """Tag every token of `sentence` and print one aligned row per token.

  Each printed row contains the decoded token, the predicted label id and the
  label name. Nothing is returned.

  Args:
    sentence: Raw text to tag.
    tokenizer: Optional tokenizer to use. When omitted, the one stored in
      'indian_legal_ner_retrain.model/' is loaded once and then reused.
    model: Optional token-classification model to use. When omitted, the one
      stored in 'indian_legal_ner_retrain.model/' is loaded once and then reused.
      A model passed in by the caller is used as-is (its train/eval mode is not changed).
    labels: Optional sequence mapping label id -> label name. Defaults to the
      notebook-level `labels_list`.
  """
  if labels is None:
    labels = labels_list
  if tokenizer is None or model is None:
    cached_tokenizer, cached_model = _load_ner_components('indian_legal_ner_retrain.model/', len(labels))
    tokenizer = cached_tokenizer if tokenizer is None else tokenizer
    model = cached_model if model is None else model

  # Tokenize once and reuse both fields. `truncation=True` keeps over-long inputs
  # within the tokenizer's configured maximum length instead of failing inside the model.
  encoding = tokenizer(sentence, truncation=True)
  input_ids = encoding['input_ids']
  attention_mask = encoding['attention_mask']

  # Build the batch-of-one tensors on the model's device when the model exposes one.
  target_device = getattr(model, 'device', None)
  with torch.no_grad():  # inference only: no autograd graph is needed
    outputs = model(
      input_ids=torch.tensor([input_ids], device=target_device),
      attention_mask=torch.tensor([attention_mask], device=target_device),
    )
  # Drop only the batch dimension, then pick the highest-scoring label per token.
  tag_ids = outputs.logits[0].argmax(dim=-1).tolist()

  tokens = tokenizer.batch_decode(input_ids)
  # Keep the original 12-character token column, but widen it when a token would
  # otherwise run into the tag-id column.
  token_width = max([12] + [len(token) + 1 for token in tokens])
  for token, tag_id in zip(tokens, tag_ids):
    print(f"{token:<{token_width}}{tag_id:<12}{labels[tag_id]}")

# EXAMPLE 1

In [12]:
# {
#         "id": "924b401e4ef841478ff133b278daf3d3",
#         "annotations": [
#             {
#                 "result": [
#                     {
#                         "value": {
#                             "start": 40,
#                             "end": 45,
#                             "text": "Patil",
#                             "labels": [
#                                 "OTHER_PERSON"
#                             ]
#                         },
#                         "id": "AO73DB1J",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 73,
#                             "end": 92,
#                             "text": "WP No. 2126 of 2018",
#                             "labels": [
#                                 "CASE_NUMBER"
#                             ]
#                         },
#                         "id": "AKX7H23K",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 122,
#                             "end": 134,
#                             "text": "Section 4(3)",
#                             "labels": [
#                                 "PROVISION"
#                             ]
#                         },
#                         "id": "E00WAY3S",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 172,
#                             "end": 197,
#                             "text": "Backward Class Commission",
#                             "labels": [
#                                 "ORG"
#                             ]
#                         },
#                         "id": "50JMW1BY",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     }
#                 ]
#             }
#         ],
#         "data": {
#             "text": "We have also extensively heard Advocate Patil, counsel for petitioner in WP No. 2126 of 2018 who has posed a challenge to Section 4(3) of the Act and would submit that the Backward Class Commission cannot create a separate class."
#         },
#         "meta": {
#             "source": "tax_bombay_high_court judgement https://indiankanoon.org/doc/153458629"
#         }
#     }

In [13]:
# Example 1: run the tagger on the sentence whose gold annotation is listed in the previous cell.
sentence = (
  "We have also extensively heard Advocate Patil, counsel for petitioner in "
  "WP No. 2126 of 2018 who has posed a challenge to Section 4(3) of the Act "
  "and would submit that the Backward Class Commission cannot create a separate class."
)
assign_labels(sentence)

[CLS]       0            OTHERS
we          0            OTHERS
have        0            OTHERS
also        0            OTHERS
extensive   0            OTHERS
##ly        0            OTHERS
heard       0            OTHERS
advocate    0            OTHERS
pat         5            OTHER_PERSON
##il        5            OTHER_PERSON
,           0            OTHERS
counsel     0            OTHERS
for         0            OTHERS
petitioner  0            OTHERS
in          0            OTHERS
w           13           CASE_NUMBER
##p         13           CASE_NUMBER
no          13           CASE_NUMBER
.           13           CASE_NUMBER
212         13           CASE_NUMBER
##6         13           CASE_NUMBER
of          13           CASE_NUMBER
2018        13           CASE_NUMBER
who         0            OTHERS
has         0            OTHERS
posed       0            OTHERS
a           0            OTHERS
challenge   0            OTHERS
to          0            OTHERS
section     11      

Note: per the gold annotation, "Backward Class Commission" is an ORG.

# EXAMPLE 2

In [None]:
# {
#         "id": "c139ad698d804749975ca5a6578789f4",
#         "annotations": [
#             {
#                 "result": [
#                     {
#                         "value": {
#                             "start": 82,
#                             "end": 92,
#                             "text": "02.09.2019",
#                             "labels": [
#                                 "DATE"
#                             ]
#                         },
#                         "id": "KYOV4HYL",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 99,
#                             "end": 109,
#                             "text": "Section 14",
#                             "labels": [
#                                 "PROVISION"
#                             ]
#                         },
#                         "id": "0F7JQXTL",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 117,
#                             "end": 219,
#                             "text": "Securitisation and Reconstruction of Financial Assets and Enforcement of Securities Interest Act, 2002",
#                             "labels": [
#                                 "STATUTE"
#                             ]
#                         },
#                         "id": "8NAWX1FY",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 225,
#                             "end": 237,
#                             "text": "SARFAESI Act",
#                             "labels": [
#                                 "STATUTE"
#                             ]
#                         },
#                         "id": "YHLZB1RD",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 274,
#                             "end": 283,
#                             "text": "Prayagraj",
#                             "labels": [
#                                 "GPE"
#                             ]
#                         },
#                         "id": "Y6GUIKHH",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 319,
#                             "end": 329,
#                             "text": "13.10.2020",
#                             "labels": [
#                                 "DATE"
#                             ]
#                         },
#                         "id": "VAJZODY0",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 381,
#                             "end": 390,
#                             "text": "Prayagraj",
#                             "labels": [
#                                 "GPE"
#                             ]
#                         },
#                         "id": "8M0RX7PR",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     }
#                 ]
#             }
#         ],
#         "data": {
#             "text": "Petitioners have filed the present writ petition praying to quash the order dated 02.09.2019 under Section 14 of The Securitisation and Reconstruction of Financial Assets and Enforcement of Securities Interest Act, 2002 (the SARFAESI Act) passed by the District Magistrate, Prayagraj and the consequential letter dated 13.10.2020 issued by the Additional District Magistrate (II), Prayagraj."
#         },
#         "meta": {
#             "source": "financial_allahabad_high_court judgement https://indiankanoon.org/doc/196086243"
#         }
#     }

In [15]:
# Example 2: run the tagger on the sentence whose gold annotation is listed in the previous cell.
sentence = (
  "Petitioners have filed the present writ petition praying to quash the order "
  "dated 02.09.2019 under Section 14 of The Securitisation and Reconstruction of "
  "Financial Assets and Enforcement of Securities Interest Act, 2002 (the SARFAESI Act) "
  "passed by the District Magistrate, Prayagraj and the consequential letter "
  "dated 13.10.2020 issued by the Additional District Magistrate (II), Prayagraj."
)
assign_labels(sentence)

[CLS]       0            OTHERS
petitioners 0            OTHERS
have        0            OTHERS
filed       0            OTHERS
the         0            OTHERS
present     0            OTHERS
writ        0            OTHERS
petition    0            OTHERS
pray        0            OTHERS
##ing       0            OTHERS
to          0            OTHERS
quash       0            OTHERS
the         0            OTHERS
order       0            OTHERS
dated       0            OTHERS
02          7            DATE
.           7            DATE
09          7            DATE
.           7            DATE
2019        7            DATE
under       0            OTHERS
section     11           PROVISION
14          11           PROVISION
of          0            OTHERS
the         0            OTHERS
securit     10           STATUTE
##isation   10           STATUTE
and         10           STATUTE
reconstruction10           STATUTE
of          10           STATUTE
financial   10           STATUTE
asse

"District Magistrate" is not a COURT; it is unlabelled in the gold annotation.

# EXAMPLE 3

In [None]:
# {
#         "id": "22858d06fa06422884f8c8ba93e94e9f",
#         "annotations": [
#             {
#                 "result": [
#                     {
#                         "value": {
#                             "start": 31,
#                             "end": 44,
#                             "text": "UTTAR PRADESH",
#                             "labels": [
#                                 "GPE"
#                             ]
#                         },
#                         "id": "RNUJBZA1",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 46,
#                             "end": 55,
#                             "text": "ALLAHABAD",
#                             "labels": [
#                                 "GPE"
#                             ]
#                         },
#                         "id": "723AXM3U",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 98,
#                             "end": 107,
#                             "text": "Allahabad",
#                             "labels": [
#                                 "GPE"
#                             ]
#                         },
#                         "id": "5VNHG4VI",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 116,
#                             "end": 126,
#                             "text": "11.07.2006",
#                             "labels": [
#                                 "DATE"
#                             ]
#                         },
#                         "id": "2PTN5AN3",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 137,
#                             "end": 160,
#                             "text": "McDowell & Company Ltd.",
#                             "labels": [
#                                 "ORG"
#                             ]
#                         },
#                         "id": "52DN6U0G",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 162,
#                             "end": 166,
#                             "text": "Rosa",
#                             "labels": [
#                                 "GPE"
#                             ]
#                         },
#                         "id": "2RJICME9",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 177,
#                             "end": 188,
#                             "text": "Shahjhanpur",
#                             "labels": [
#                                 "GPE"
#                             ]
#                         },
#                         "id": "6D44VB66",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     }
#                 ]
#             }
#         ],
#         "data": {
#             "text": "OFFICE OF EXCISE COMMISSIONER, UTTAR PRADESH, ALLAHABAD No. 7244/9-Alcohol/131/Rosa/Fire Incident Allahabad Dated \u2013 11.07.2006 ORDER M/s McDowell & Company Ltd., Rosa, District Shahjhanpur is a PD-2 Licensed distillery."
#         },
#         "meta": {
#             "source": "motorvehicles_supremecourts judgement https://indiankanoon.org/doc/16920944"
#         }
#     }

In [17]:
# Example 3: run the tagger on the sentence whose gold annotation is listed in the previous cell.
sentence = (
  "OFFICE OF EXCISE COMMISSIONER, UTTAR PRADESH, ALLAHABAD "
  "No. 7244/9-Alcohol/131/Rosa/Fire Incident Allahabad Dated \u2013 11.07.2006 "
  "ORDER M/s McDowell & Company Ltd., Rosa, District Shahjhanpur "
  "is a PD-2 Licensed distillery."
)
assign_labels(sentence)

[CLS]       0            OTHERS
office      0            OTHERS
of          0            OTHERS
excise      0            OTHERS
commissioner0            OTHERS
,           0            OTHERS
u           9            GPE
##tta       9            GPE
##r         9            GPE
pra         9            GPE
##des       9            GPE
##h         9            GPE
,           0            OTHERS
all         0            OTHERS
##a         0            OTHERS
##hab       0            OTHERS
##a         0            OTHERS
##d         0            OTHERS
no          0            OTHERS
.           0            OTHERS
724         0            OTHERS
##4         0            OTHERS
/           0            OTHERS
9           0            OTHERS
-           0            OTHERS
alcohol     0            OTHERS
/           0            OTHERS
131         0            OTHERS
/           0            OTHERS
rosa        0            OTHERS
/           0            OTHERS
fire        0            O

The model classified "ALLAHABAD" as OTHERS instead of GPE.

# EXAMPLE 4

In [None]:
# {
#         "id": "479ac75959604053992ee9d7db10a190",
#         "annotations": [
#             {
#                 "result": [
#                     {
#                         "value": {
#                             "start": 199,
#                             "end": 204,
#                             "text": "MoHUA",
#                             "labels": [
#                                 "ORG"
#                             ]
#                         },
#                         "id": "TXQBD8N7",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     }
#                 ]
#             }
#         ],
#         "data": {
#             "text": "The minutes of the meeting read thus:\n \u201cItem No.18/2020 Regarding proposed change of land use of Plot Nos.1,2,3,4,5,6,7 and 8.F.20(12)2019/MP  a) The proposal was presented by Joint Secretary (L&E), MoHUA, In-charge of Central Vista Development/Redevelopment Project, who was present as Special Invitee."
#         },
#         "meta": {
#             "source": "motorvehicles_supremecourts judgement https://indiankanoon.org/doc/126137620"
#         }
#     }

In [18]:
# Example 4: run the tagger on the sentence whose gold annotation is listed in the previous cell.
sentence = (
  "The minutes of the meeting read thus:\n \u201cItem No.18/2020 Regarding proposed "
  "change of land use of Plot Nos.1,2,3,4,5,6,7 and 8.F.20(12)2019/MP  a) "
  "The proposal was presented by Joint Secretary (L&E), MoHUA, In-charge of "
  "Central Vista Development/Redevelopment Project, who was present as Special Invitee."
)
assign_labels(sentence)

[CLS]       0            OTHERS
the         0            OTHERS
minutes     0            OTHERS
of          0            OTHERS
the         0            OTHERS
meeting     0            OTHERS
read        0            OTHERS
thus        0            OTHERS
:           0            OTHERS
[UNK]       0            OTHERS
item        0            OTHERS
no          0            OTHERS
.           0            OTHERS
18          0            OTHERS
/           0            OTHERS
2020        0            OTHERS
regarding   0            OTHERS
proposed    0            OTHERS
change      0            OTHERS
of          0            OTHERS
land        0            OTHERS
use         0            OTHERS
of          0            OTHERS
plot        0            OTHERS
no          0            OTHERS
##s         0            OTHERS
.           0            OTHERS
1           0            OTHERS
,           0            OTHERS
2           0            OTHERS
,           0            OTHERS
3       

The model misclassified MoHUA as a geopolitical entity (GPE) when it is an ORG, and misclassified Central Vista as ORG although it is unlabelled in the gold annotation.

# EXAMPLE 5

In [None]:
# {
#         "id": "930f3680406c4b1bba867cd85871d70f",
#         "annotations": [
#             {
#                 "result": [
#                     {
#                         "value": {
#                             "start": 1,
#                             "end": 30,
#                             "text": "Writ Tax Petition No.524/2021",
#                             "labels": [
#                                 "CASE_NUMBER"
#                             ]
#                         },
#                         "id": "N3LF9HS7",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 39,
#                             "end": 49,
#                             "text": "30.09.2021",
#                             "labels": [
#                                 "DATE"
#                             ]
#                         },
#                         "id": "0RRV0N3C",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 63,
#                             "end": 79,
#                             "text": "2021(10) TMI 517",
#                             "labels": [
#                                 "RESPONDENT"
#                             ]
#                         },
#                         "id": "FBC6G9RU",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     },
#                     {
#                         "value": {
#                             "start": 143,
#                             "end": 154,
#                             "text": "Section 148",
#                             "labels": [
#                                 "PROVISION"
#                             ]
#                         },
#                         "id": "OQUBION7",
#                         "from_name": "label",
#                         "to_name": "text",
#                         "type": "labels"
#                     }
#                 ]
#             }
#         ],
#         "data": {
#             "text": "(Writ Tax Petition No.524/2021), dated 30.09.2021, reported in 2021(10) TMI 517, the learned Single Judge had quashed the notices issued under Section 148 of the Act."
#         },
#         "meta": {
#             "source": "tax_rajasthan_high_court judgement https://indiankanoon.org/doc/33425696"
#         }
#     }

In [20]:
# Example 5: run the tagger on the sentence whose gold annotation is listed in the previous cell.
sentence = (
  "(Writ Tax Petition No.524/2021), dated 30.09.2021, reported in 2021(10) TMI 517, "
  "the learned Single Judge had quashed the notices issued under Section 148 of the Act."
)
assign_labels(sentence)

[CLS]       0            OTHERS
(           0            OTHERS
writ        13           CASE_NUMBER
tax         13           CASE_NUMBER
petition    13           CASE_NUMBER
no          13           CASE_NUMBER
.           13           CASE_NUMBER
524         13           CASE_NUMBER
/           13           CASE_NUMBER
2021        13           CASE_NUMBER
)           0            OTHERS
,           0            OTHERS
dated       0            OTHERS
30          7            DATE
.           7            DATE
09          7            DATE
.           7            DATE
2021        7            DATE
,           0            OTHERS
reported    0            OTHERS
in          0            OTHERS
2021        13           CASE_NUMBER
(           13           CASE_NUMBER
10          13           CASE_NUMBER
)           13           CASE_NUMBER
t           13           CASE_NUMBER
##mi        13           CASE_NUMBER
517         13           CASE_NUMBER
,           0            OTHERS
the    

The model classified "2021(10) TMI 517" as CASE_NUMBER, but the gold annotation labels it RESPONDENT.