# **AI system that can extract and summarize key legal information from complex legal documents**

In [2]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/232.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [5]:
import spacy
import re
from PyPDF2 import PdfReader
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Load summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text

def preprocess_text(text):
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Add more preprocessing steps as needed
    return text

def extract_entities(text):
    doc = nlp(text)
    entities = {ent.label_: ent.text for ent in doc.ents}
    return entities

def extract_key_clauses(text):
    # Implement logic to identify key clauses
    # This could involve keyword matching, machine learning, etc.
    # For simplicity, let's just split by newlines for now
    clauses = text.split('\n')
    return [clause for clause in clauses if len(clause) > 50]  # arbitrary length threshold

def summarize_text(text, max_length=150):
    summary = summarizer(text, max_length=max_length, min_length=30, do_sample=False)
    return summary[0]['summary_text']

def generate_report(text):
    preprocessed_text = preprocess_text(text)
    entities = extract_entities(preprocessed_text)
    key_clauses = extract_key_clauses(preprocessed_text)
    summary = summarize_text(preprocessed_text)

    report = f"Summary:\n{summary}\n\n"
    report += "Key Entities:\n"
    for label, entity in entities.items():
        report += f"{label}: {entity}\n"
    report += "\nKey Clauses:\n"
    for clause in key_clauses[:5]:  # Limit to first 5 for brevity
        report += f"- {clause}\n"

    return report

# Main execution
pdf_path = "R.pdf"
legal_text = extract_text_from_pdf(pdf_path)
report = generate_report(legal_text)
print(report)

Your max_length is set to 150, but your input_length is only 3. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=1)


Summary:
CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery. Please submit your best shots of New York for next week. Visit CNN.com/Travel next Wednesday for a new gallery of snapshots.

Key Entities:

Key Clauses:

