In [1]:
import json
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy import displacy
import random
import os

In [2]:
# Helper: parse a JSON document stored on disk
def load_json_data(filepath):
    """Read the UTF-8 encoded file at ``filepath`` and return its parsed JSON content.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The Python object produced by decoding the file's JSON text.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [17]:
# Helper: reshape loaded annotation data into spaCy-style training tuples
def convert_data(data):
    """Turn a loaded annotation document into ``(text, {"entities": [...]})`` pairs.

    Args:
        data: Mapping with an ``"annotations"`` entry. Each item in it must
            unpack into ``(text, annotations)``, and ``annotations["entities"]``
            must yield ``(start, end, label)`` triples.

    Returns:
        A list with one ``(text, {"entities": [(start, end, label), ...]})``
        tuple per item in ``data["annotations"]``, in the same order.
    """
    def _as_training_pair(item):
        # Each item is a two-element sequence: the raw text and its annotations.
        text, annotations = item
        spans = []
        for start, end, label in annotations["entities"]:
            spans.append((start, end, label))
        return (text, {"entities": spans})

    return [_as_training_pair(item) for item in data["annotations"]]

# Directory containing JSON files
# NOTE(review): this is a machine-specific absolute path; the notebook will only
# run elsewhere after this value is edited to point at a local copy of the dataset.
data_directory ="/Users/ATHEETHA M SURESH/Desktop/NLP Project/dataset/dataset"


In [18]:
# Collect all training data from every *.json file in the data directory.
# os.listdir() returns names in arbitrary order, so they are sorted here to make
# the order of the collected examples the same on every run and every machine.
all_training_data = []
for filename in sorted(os.listdir(data_directory)):
    if filename.endswith('.json'):
        file_path = os.path.join(data_directory, filename)
        data = load_json_data(file_path)
        training_data = convert_data(data)
        all_training_data.extend(training_data)


In [19]:
# Quick sanity check: show the first two collected (text, annotations) pairs.
print("Training data examples:", all_training_data[:2])

Training data examples: [('als Nos.\r\n56 and 57 of 1949.\r\nAppeals from the orders of the High Court of Judica ture at Madras (Wadsworth and Patanjali Sastri JJ.) dated 24th October, 1945, in A.A.O. Nos.\r\n372 of 1943 and 634 of 1944 which were appeals from the orders of the Subordinate Judge of Ellore in E.A. No. 440 of 1937 and C.M.P. No. 152 of 1943 in O.S. No. 87 of 1923.\r\nP. Somasundaram (V. V. Choudhry, with him) for the appellant.\r\nV. Rangachari (K. Mangachari, with him) for the respond ents. 1950.\r\nOctober 17.\r\nThe Court delivered judgment as follows.\r\nFAZL ALI J.\r\nThese appeals arise out of an execution proceeding, and the main point to be decided in them is what is the effect of certain provisions of the Madras Agricul turists \' Relief Act (Madras Act IV of 1938, which will hereinafter be referred to as "the Madras Act"), on the rights of the parties.\r\nHow this point arises will be clear from a brief statement of the facts of the case.\r\nIt appears that in 

In [20]:
# Start from a blank English pipeline
nlp = spacy.blank("en")

# Attach a new named-entity recognizer component to that pipeline
ner = nlp.add_pipe("ner")

# Register every entity label that appears in the collected annotations
for _text, annotation in all_training_data:
    for span in annotation.get("entities"):
        # Each span is (start, end, label); only the label is needed here.
        ner.add_label(span[2])

# Names of every pipeline component other than the NER component
pipe_exceptions = ["ner"]
unaffected_pipes = [name for name in nlp.pipe_names if name not in pipe_exceptions]

In [21]:
# Training the NER model
N_ITERATIONS = 300  # number of full passes over the training examples
DROPOUT = 0.5       # dropout rate passed to every update step
SEED = 0            # fixed seed so shuffling (and seeded RNG use inside spaCy) is repeatable

# Seed the random number generators up front so repeated runs are comparable.
spacy.util.fix_random_seed(SEED)

# Build the Example objects once: tokenization and annotation alignment do not
# change between iterations, so redoing them for every batch is wasted work.
# Shuffling this separate list also leaves `all_training_data` untouched.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in all_training_data
]

# select_pipes()/initialize() are the spaCy v3 replacements for the deprecated
# disable_pipes()/begin_training() calls.
with nlp.select_pipes(disable=unaffected_pipes):
    optimizer = nlp.initialize()
    for iteration in range(N_ITERATIONS):
        random.shuffle(train_examples)
        losses = {}
        # Batch size grows from 4 towards 32 within each iteration.
        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # Pass the optimizer explicitly so the one created above is the one used.
            nlp.update(batch, drop=DROPOUT, sgd=optimizer, losses=losses)
        print(f"Iteration {iteration + 1}, Losses: {losses}")

# Save the trained model
nlp.to_disk("trained_model")

Appeal from a Judgm..." with entities "[(0, 29, 'CASE_NUMBER'), (71, 92, 'DATE'), (101, 1...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Appeal against the J..." with entities "[(0, 28, 'CASE_NUMBER'), (79, 93, 'DATE'), (101, 1...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Appeal from the judgm..." with entities "[(1, 27, 'CASE_NUMBER'), (71, 92, 'COURT'), (93, 1...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Appeal from the Judg..." with entities "[(5, 28, 'CASE_NUMBER'), (73, 90, 'COURT'), (128, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during tr

Iteration 1, Losses: {'ner': 225420.5956954956}
Iteration 2, Losses: {'ner': 32780.80654710531}
Iteration 3, Losses: {'ner': 11298.483091655187}
Iteration 4, Losses: {'ner': 10325.472730308771}
Iteration 5, Losses: {'ner': 11360.60239739716}
Iteration 6, Losses: {'ner': 8918.797103732824}
Iteration 7, Losses: {'ner': 9087.844554472715}
Iteration 8, Losses: {'ner': 8976.33122252766}
Iteration 9, Losses: {'ner': 9137.378659403883}
Iteration 10, Losses: {'ner': 9342.786753237247}
Iteration 11, Losses: {'ner': 7792.38045019377}
Iteration 12, Losses: {'ner': 14300.178966104984}
Iteration 13, Losses: {'ner': 9034.544362336397}
Iteration 14, Losses: {'ner': 7940.3845972446725}
Iteration 15, Losses: {'ner': 9081.067364349961}
Iteration 16, Losses: {'ner': 8200.016674626851}
Iteration 17, Losses: {'ner': 7828.38159047626}
Iteration 18, Losses: {'ner': 8356.60676702857}
Iteration 19, Losses: {'ner': 7130.881564474665}
Iteration 20, Losses: {'ner': 7316.3590707977855}
Iteration 21, Losses: {'ner'

In [22]:
# Test the model


# Load the trained model
# Rebinds `nlp` to the pipeline read back from the "trained_model" directory.
nlp = spacy.load("trained_model")

# Helper: show the entities the model finds in a piece of text
def visualize_ner(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and render its entities.

    Args:
        text: The string to process.

    The rendering is done by ``displacy.render`` with ``style="ent"`` and
    ``jupyter=True``; nothing is returned.
    """
    processed = nlp(text)
    displacy.render(processed, style="ent", jupyter=True)



In [23]:
# Location of the text file to run the model on
file_path = "/Users/ATHEETHA M SURESH/Desktop/dataset/dataset/IN-Abs/test-data/judgement/6276.txt"

# Load the entire file into a single string
with open(file_path, mode='r', encoding='utf-8') as source_file:
    text = source_file.read()

# Render the entities found in the loaded text
visualize_ner(text)