In [1]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, BartTokenizer, BartForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load tokenizers and models.
#
# The two classification checkpoints below are base (pre-trained only) models:
# `from_pretrained(..., num_labels=2)` attaches a freshly, randomly initialized
# 2-way classification head on top of them. Seed the torch RNG first so those
# random head weights are the same on every Restart & Run All. The heads still
# have to be fine-tuned on labelled data before their predictions mean anything.
SEED = 42
torch.manual_seed(SEED)

# 1. RoBERTa for classification (base checkpoint, untrained 2-label head)
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# 2. LegalBERT for classification (pre-trained on legal text, untrained 2-label head)
legalbert_tokenizer = AutoTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')
legalbert_model = AutoModelForSequenceClassification.from_pretrained('nlpaueb/legal-bert-base-uncased', num_labels=2)

# 3. BART for summarization (checkpoint already fine-tuned on CNN/DailyMail)
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

print("Models and tokenizers loaded successfully!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Models and tokenizers loaded successfully!


In [7]:
# Load the legal text file.
# An explicit encoding keeps the read independent of the platform's default
# locale encoding (which differs between Colab/Linux and e.g. Windows).
with open('legal_document.txt', 'r', encoding='utf-8') as f:
    legal_text = f.read()

# Print the whole document; the previous `legal_text[:]` slice only made a
# redundant copy of the same string.
print(legal_text)

Case No. 12345
In the matter of Jane Doe vs. John Smith

The Plaintiff, Jane Doe, alleges that the Defendant, John Smith, violated the terms of the contract signed on January 5th, 2020. The contract pertains to the lease of commercial property located at 123 Business Ave, where the Plaintiff asserts that the Defendant has failed to make the agreed-upon monthly payments for six consecutive months.

The Defendant contends that the payment was withheld due to significant maintenance issues that were not addressed by the Plaintiff, including structural damage and flooding, which made the property unsafe for conducting business.

Upon review of the presented evidence, the court finds that while the Defendant experienced hardship due to the condition of the property, the contract's terms do not provide a clause for withholding rent without prior notification to the Plaintiff. Therefore, the Defendant's failure to notify the Plaintiff and continue withholding payments constitutes a breach of 

In [8]:
# Summarization using BART.
# The document is truncated to the first 1024 tokens before being summarized.
inputs = bart_tokenizer([legal_text], max_length=1024, return_tensors='pt', truncation=True)

# Beam search (4 beams) producing a summary of 40-150 tokens.
# The attention mask is passed alongside the input ids so that padding tokens
# would be ignored if this call is ever fed a padded batch.
summary_ids = bart_model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=150,
    min_length=40,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)

# Decode and print summary
summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Summary:", summary)

Summary: The court rules in favor of the Plaintiff and orders the Defendant to pay the outstanding amount of $18,000, covering the past due rent and associated late fees. The Defendant is advised to submit a formal complaint regarding the property conditions for further legal action concerning maintenance disputes.


In [9]:
# Tokenize the text for RoBERTa (input is truncated to at most 512 tokens)
inputs = roberta_tokenizer(legal_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Perform classification. This is inference only, so disable gradient tracking
# to avoid building an autograd graph for the forward pass.
with torch.no_grad():
    outputs = roberta_model(**inputs)
logits = outputs.logits
predicted_class = logits.argmax(dim=-1).item()

# Display the result: class index 1 is reported as plaintiff, anything else as defendant.
# NOTE(review): this index-to-label mapping is only a convention chosen here. The
# classification head was randomly initialized when the model was loaded and has
# not been fine-tuned, so the printed label is not a meaningful prediction yet.
roberta_label = "In Favor of Plaintiff" if predicted_class == 1 else "In Favor of Defendant"
print("RoBERTa Prediction:", roberta_label)

RoBERTa Prediction: In Favor of Plaintiff


In [10]:
# Tokenize the text for LegalBERT (input is truncated to at most 512 tokens)
inputs = legalbert_tokenizer(legal_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Perform classification. This is inference only, so disable gradient tracking
# to avoid building an autograd graph for the forward pass.
with torch.no_grad():
    outputs = legalbert_model(**inputs)
logits = outputs.logits
predicted_class = logits.argmax(dim=-1).item()

# Display the result: class index 1 is reported as plaintiff, anything else as defendant.
# NOTE(review): this index-to-label mapping is only a convention chosen here. The
# classification head was randomly initialized when the model was loaded and has
# not been fine-tuned, so the printed label is not a meaningful prediction yet.
legalbert_label = "In Favor of Plaintiff" if predicted_class == 1 else "In Favor of Defendant"
print("LegalBERT Prediction:", legalbert_label)



LegalBERT Prediction: In Favor of Plaintiff


In [12]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load DistilBERT for classification.
# 'distilbert-base-uncased' has no fine-tuned classification head, so the head
# created here is randomly initialized. Seed torch right before loading so the
# head weights are identical on every run of this cell.
torch.manual_seed(42)
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Tokenize the text for DistilBERT (input is truncated to at most 512 tokens)
inputs = distilbert_tokenizer(legal_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Perform classification. This is inference only, so disable gradient tracking
# to avoid building an autograd graph for the forward pass.
with torch.no_grad():
    outputs = distilbert_model(**inputs)
logits = outputs.logits
predicted_class = logits.argmax(dim=-1).item()

# Display the result: class index 1 is reported as plaintiff, anything else as defendant.
# NOTE(review): this index-to-label mapping is only a convention chosen here. The
# head has not been fine-tuned, so the printed label is not a meaningful
# prediction yet.
distilbert_label = "In Favor of Plaintiff" if predicted_class == 1 else "In Favor of Defendant"
print("DistilBERT Prediction:", distilbert_label)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBERT Prediction: In Favor of Defendant
