In [None]:
# The SLM helps tailor the model's responses to the specific use case.

# Install necessary libraries
# !pip install transformers datasets torch

from transformers import DistilBertTokenizer, DistilBertForTokenClassification
from transformers import pipeline
import torch

# Load the pre-trained DistilBERT model and tokenizer for NER
# NOTE(review): confirm this checkpoint id is actually published on the Hugging Face Hub.
model_name = "dbmdz/distilbert-base-uncased-finetuned-conll03-english"  # Pre-trained DistilBERT model fine-tuned for NER
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForTokenClassification.from_pretrained(model_name)

# Initialize NER pipeline
# aggregation_strategy="simple" groups word pieces into whole entities and makes every
# result carry the 'entity_group' key that the printing loop in this cell reads.
# Without it the results only expose 'entity', and that loop raises KeyError.
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Sample legal text used to exercise the NER pipeline
legal_document = """
The plaintiff, John Doe, filed a lawsuit against XYZ Corporation for breach of contract on January 15, 2023. The complaint alleges that XYZ Corporation, based in New York, failed to fulfill its contractual obligations, leading to financial damages. The legal team representing John Doe is headed by Sarah Johnson, an attorney at Law Group LLC.
"""

# Run the document through the pipeline; the result is one dict per detected entity
entities = nlp_ner(legal_document)

# Report each detection as "text, label, confidence"
print("Named Entities in Legal Document:")
for hit in entities:
    surface, label, confidence = hit['word'], hit['entity_group'], hit['score']
    print(f"Entity: {surface}, Label: {label}, Confidence: {confidence:.4f}")


In [None]:
!pip install transformers datasets torch sentencepiece

In [None]:
from transformers import ElectraTokenizerFast, ElectraForTokenClassification
from transformers import pipeline

# Fetch the ELECTRA large discriminator checkpoint together with its fast tokenizer.
# NOTE(review): the checkpoint id looks like the base discriminator, not an NER fine-tune,
# so the token-classification head is presumably untrained here — confirm before
# relying on the entities this pipeline reports.
model_name = "google/electra-large-discriminator"
tokenizer = ElectraTokenizerFast.from_pretrained(model_name)
model = ElectraForTokenClassification.from_pretrained(model_name)

# Build the token-classification pipeline with "simple" aggregation of token predictions
nlp_ner = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

In [None]:
# Sample legal text fed to the pipeline built in the previous cell
legal_document = """
The plaintiff, John Doe, filed a lawsuit against XYZ Corporation for breach of contract on January 15, 2023. The complaint alleges that XYZ Corporation, based in New York, failed to fulfill its contractual obligations, leading to financial damages. The legal team representing John Doe is headed by Sarah Johnson, an attorney at Law Group LLC.
"""

# Extract entities: one dict per aggregated entity
entities = nlp_ner(legal_document)

# Show every detection with its label and confidence score
print("Named Entities in Legal Document:")
for detection in entities:
    surface = detection['word']
    label = detection['entity_group']
    confidence = detection['score']
    print(f"Entity: {surface}, Label: {label}, Confidence: {confidence:.4f}")

In [None]:

from transformers import ElectraTokenizerFast, ElectraForTokenClassification, TrainingArguments, Trainer
from transformers import pipeline
import torch
import time

# Sample dataset (expanded with diverse examples)
# "start"/"end" are character offsets into "text" (end exclusive), so
# text[start:end] is exactly the entity text listed in the comment above each row.
train_dataset = [
    # "John Doe", "XYZ Corporation"
    {"text": "John Doe filed a lawsuit against XYZ Corporation.", "entities": [{"start": 0, "end": 8, "entity": "PERSON"}, {"start": 33, "end": 48, "entity": "ORG"}]},
    # "Sarah Johnson", "Law Group LLC"
    {"text": "Sarah Johnson is an attorney at Law Group LLC.", "entities": [{"start": 0, "end": 13, "entity": "PERSON"}, {"start": 32, "end": 45, "entity": "ORG"}]},
    # "January 15, 2023"
    {"text": "The contract was signed on January 15, 2023.", "entities": [{"start": 27, "end": 43, "entity": "DATE"}]},
    # "breach of contract"
    {"text": "The breach of contract caused significant financial damages.", "entities": [{"start": 4, "end": 22, "entity": "LEGAL_ISSUE"}]},
    # "court", "plaintiff"
    {"text": "The court ruled in favor of the plaintiff.", "entities": [{"start": 4, "end": 9, "entity": "ORG"}, {"start": 32, "end": 41, "entity": "LEGAL_ROLE"}]},
    # ... add even more examples
]


# Load pre-trained ELECTRA model and tokenizer
model_name = "google/electra-large-discriminator"
tokenizer = ElectraTokenizerFast.from_pretrained(model_name)

# Get unique entity labels
unique_entities = {entity["entity"] for example in train_dataset for entity in example["entities"]}

# Fix the label order so ids are identical on every run (iterating a set of strings is
# not order-stable across interpreter sessions). "O" is the class for tokens outside
# any entity; it is placed first so it always gets id 0.
label_names = ["O"] + sorted(unique_entities - {"O"})
num_labels = len(label_names)

# Create label mapping
label2id = {label: i for i, label in enumerate(label_names)}
id2label = {i: label for label, i in label2id.items()}

# Load model with correct number of labels; passing the mappings stores them in the
# model config so predictions are reported with these names instead of generic ones.
model = ElectraForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Function to tokenize and format data
def tokenize_and_align_labels(examples):
    """Tokenize a batch of texts and derive per-token label ids from entity spans.

    Args:
        examples: Batch dict with "text" (list of strings) and "entities" (one list
            per text of dicts with "start", "end" and "entity"). "start"/"end" are
            treated as character offsets into the text, end exclusive.

    Returns:
        The tokenizer output with an added "labels" entry: one list of label ids per
        text. Special/padding tokens and every sub-token after the first of a word
        get -100. The first sub-token of a word gets the id of the entity whose span
        contains the token's start character; otherwise it gets label2id["O"] when
        that label exists, or -100 when it does not.
    """
    tokenized_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        return_offsets_mapping=True,  # character span of every token, needed for alignment
    )
    # The offsets are only needed here; drop them so they are not stored as a dataset column.
    offset_mappings = tokenized_inputs.pop("offset_mapping")
    outside_id = label2id.get("O", -100)

    labels = []
    for i, entity_spans in enumerate(examples["entities"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx, (token_start, _token_end) in zip(word_ids, offset_mappings[i]):
            if word_idx is None or word_idx == previous_word_idx:
                # Special/padding token, or a continuation sub-token of the current word
                label_ids.append(-100)
            else:
                # Compare character offsets with character offsets (the entity spans
                # are not word indices) and take the first span containing the token.
                label_ids.append(
                    next(
                        (
                            label2id[span["entity"]]
                            for span in entity_spans
                            if span["start"] <= token_start < span["end"]
                        ),
                        outside_id,
                    )
                )
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs




# Wrap the raw examples in a Hugging Face Dataset and attach token-level labels
import datasets
train_dataset = datasets.Dataset.from_list(train_dataset)
tokenized_datasets = train_dataset.map(function=tokenize_and_align_labels, batched=True)

# Settings for the fine-tuning run
training_args = TrainingArguments(
    run_name="my_legal_ner_run",  # Set a distinct run name
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    # ... other training arguments
)

# Fine-tune on the tokenized examples (no eval_dataset is supplied)
trainer = Trainer(
    train_dataset=tokenized_datasets,
    args=training_args,
    model=model,
)
trainer.train()

# Persist the trained weights, then load them back from disk as a separate model object
trainer.save_model("./fine_tuned_model")
fine_tuned_model = ElectraForTokenClassification.from_pretrained("./fine_tuned_model")


# Quantization (dynamic)
# Quantize the copy reloaded from "./fine_tuned_model" rather than `model`: `model` is
# the object handed to the Trainer (which may have moved it to an accelerator), and
# dynamic quantization of Linear layers is meant for a CPU model.
quantized_model = torch.quantization.quantize_dynamic(
    fine_tuned_model, {torch.nn.Linear}, dtype=torch.qint8
)

# Only the state dict is saved; to load it back, first rebuild the same quantized
# module structure (quantize_dynamic on a float model) and then call load_state_dict.
torch.save(quantized_model.state_dict(), "./quantized_model.pth")



In [None]:

# Inference and time measurement
def measure_inference_time(model, text):
    """Return the seconds taken by one NER pipeline call on `text` with `model`.

    The pipeline is built before the clock starts, so pipeline construction is not
    counted as inference time. Uses the notebook-level `tokenizer`.

    Args:
        model: Token-classification model to wrap in the pipeline.
        text: Input string to run through the pipeline.

    Returns:
        Elapsed time in seconds (float) for a single pipeline call.
    """
    nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    # perf_counter is a monotonic, high-resolution clock intended for timing short intervals
    start_time = time.perf_counter()
    nlp_ner(text)
    return time.perf_counter() - start_time

legal_document = "John Doe filed a lawsuit against XYZ Corporation."

# Before quantization
# NOTE(review): `model` is the object that was passed to the Trainer, so if the training
# cell has run it presumably already holds fine-tuned weights — this baseline then
# measures "before quantization" rather than "before fine-tuning". Confirm intent.
before_time = measure_inference_time(model, legal_document)

# After fine-tuning and quantization
# The saved file holds the state dict of a dynamically quantized model, whose keys do
# not match a float model. Rebuild the quantized module structure first, then load.
fine_tuned_model = ElectraForTokenClassification.from_pretrained("./fine_tuned_model")
quantized_model = torch.quantization.quantize_dynamic(
    fine_tuned_model, {torch.nn.Linear}, dtype=torch.qint8
)
quantized_model.load_state_dict(torch.load("./quantized_model.pth"))

after_time = measure_inference_time(quantized_model, legal_document)

# Inference time delta
delta = before_time - after_time
print(f"Inference time delta: {delta:.4f} seconds")
