In [3]:
import fitz
import torch
from transformers import AutoTokenizer, AutoModelForPreTraining, pipeline
import spacy

In [4]:
# Absolute path of the contract PDF that is passed to extract_text_from_pdf below.
# NOTE(review): this is a machine-specific Windows path — adjust per environment.
file_name = r'D:\Project\LegalDocumentAnalyzer\contract.pdf'

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at *pdf_path*.

    Pages are read in document order and each page's text is followed by
    a newline.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A single string: one leading space followed by the text of all
        pages, each terminated by ``"\\n"``.
    """
    # Use the document as a context manager so the underlying file handle
    # is released even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") + "\n" for page in doc]
    # The leading single space is kept so the returned string stays
    # byte-identical for existing callers.
    return " " + "".join(page_texts)

# Read the whole contract once; the analysis calls further down all take this string.
pdf_text = extract_text_from_pdf(file_name)
# Legal-BERT tokenizer and pre-training model.
# NOTE(review): neither `tokenizer` nor `model` is referenced by the code
# visible here — confirm they are still needed before keeping the download.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModelForPreTraining.from_pretrained("nlpaueb/legal-bert-base-uncased")
# spaCy pipeline ("en_core_web_sm") used by extract_entities.
nlp = spacy.load("en_core_web_sm")
# Hugging Face summarization pipeline (facebook/bart-large-cnn) used by generate_summary.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def generate_summary(text):
    """Return an abstractive summary of *text* from the module-level ``summarizer``.

    Args:
        text: The document text to summarize.

    Returns:
        The summary string, or ``""`` when *text* is empty or only
        whitespace.
    """
    # With min_length=50 the model would be forced to invent a summary
    # for blank input, so return early instead.
    if not text or not text.strip():
        return ""
    # truncation=True clips input that exceeds the model's maximum input
    # length; without it a long contract fails inside the pipeline.
    result = summarizer(
        text,
        max_length=150,
        min_length=50,
        do_sample=False,
        truncation=True,
    )
    return result[0]["summary_text"]

def extract_entities(text):
    """Group the named entities that spaCy finds in *text* by label.

    Args:
        text: The text to run through the module-level ``nlp`` pipeline.

    Returns:
        A dict mapping each entity label to the list of entity texts
        carrying that label, in order of appearance. Repeated entities
        are kept.
    """
    grouped = {}
    for entity in nlp(text).ents:
        # setdefault creates the label's list on first sight, so labels
        # appear in the dict in order of first occurrence.
        grouped.setdefault(entity.label_, []).append(entity.text)
    return grouped

def extract_clauses(text):
    """Sort the lines of *text* into clause categories by keyword.

    Each line is matched case-insensitively against the keyword rules in
    priority order and is filed under the first category that matches.
    Lines that match no keyword are dropped.

    Args:
        text: The document text; it is split on ``"\\n"``.

    Returns:
        A dict with the keys "Payment Terms", "Confidentiality",
        "Termination" and "Governing Law", each mapping to the list of
        matching lines (unmodified, in document order).
    """
    # Ordered (category, keywords) rules; earlier entries take priority.
    keyword_rules = (
        ("Payment Terms", ("pay", "compensation")),
        ("Confidentiality", ("confidential", "disclose")),
        ("Termination", ("terminate",)),
        ("Governing Law", ("law", "jurisdiction")),
    )
    buckets = {category: [] for category, _ in keyword_rules}
    for raw_line in text.split("\n"):
        lowered = raw_line.lower()
        for category, keywords in keyword_rules:
            if any(keyword in lowered for keyword in keywords):
                buckets[category].append(raw_line)
                # First match wins: a line belongs to one category only.
                break
    return buckets

def analyze_risks(clauses):
    """Flag clauses that contain risk-related keywords.

    Args:
        clauses: Mapping of category name to a list of clause strings,
            as returned by ``extract_clauses``.

    Returns:
        A list of "Possible Risk in <category>: <clause>" strings, one
        per clause that contains any risk keyword (case-insensitive), in
        mapping order.
    """
    risk_keywords = ("breach", "liability", "penalty", "damages", "termination without cause")
    risks = []
    for category, clause_list in clauses.items():
        # The loop variable deliberately does not reuse the name of the
        # `clauses` parameter, which would rebind it mid-iteration.
        for clause in clause_list:
            lowered = clause.lower()
            if any(keyword in lowered for keyword in risk_keywords):
                risks.append(f"Possible Risk in {category}: {clause}")
    return risks

# Run the analysis steps on the extracted PDF text. The risk analysis
# consumes the clause dict produced by extract_clauses.
summary = generate_summary(pdf_text)
entities = extract_entities(pdf_text)
clauses = extract_clauses(pdf_text)
risks = analyze_risks(clauses)

# Report: summary first, then entities, clauses and risks.
print("\n Summary of the Legal Document")
print(summary)

print("\n Named Entities (Parties, Dates, Amounts, etc)")
for key, values in entities.items():
    if values:
        print(f"{key}: {', '.join(values)}")
print("\n Extracted Clauses")
for key, values in clauses.items():
    # Categories with no matching lines are skipped entirely.
    if values:
        print(f"{key}:")
        for clause in values:
            print(f"-{clause}")
print("\n Risk Analysis")
if risks:
    for risk in risks:
        print(risk)
else:
    # Printed when analyze_risks returned an empty list.
    print("No High Risk Clauses Detected")

Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['model.shared.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Summary of the Legal Document
This Service Contract ("Agreement") is made and entered into as of March 31, 2025. The Service Provider agrees to provide AI-driven agricultural analytics and data management solutions. The Client agrees to pay the Service Provider INR 10,00,000 for the services rendered, payable in three installments.

 Named Entities (Parties, Dates, Amounts, etc)
DATE: March 31, 2025, 10 days, years, 30-day, 30 days, 1996
WORK_OF_ART: Effective Date, this Agreement
ORG: TechSolutions Pvt. Ltd., GreenFarm Technologies, COMPENSATION & PAYMENT, LIABILITY & RISK MANAGEMENT, INR 5,00,000, PENALTIES & BREACH CONSEQUENCES, INR 50,000 per week, the Service Provider, GOVERNING LAW & DISPUTE, Arbitration, Effective Date
GPE: India, Bangalore, India, India, Green Street, Mumbai, India, AI, Client, India, Bangalore, India
MONEY: 123 Tech Park
CARDINAL: 456, 1, 2.1, three, 2.2, 3, 3.1, 3.2, five, 4, 4.1, 4.2, 5, 5.1, 5.2, 6, 6.1, 6.2, 7, 7.1, 7.2, 8, 8.2
PERCENT: 30%, 40%, 30%, 2%