In [4]:
import re
import torch
import joblib
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pdfplumber
from docx import Document


In [5]:
def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        str: The text of each page followed by a newline. A page for which
        no text could be extracted contributes only the newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None for a page without extractable text
        # (e.g. a scanned image) in some pdfplumber versions; treat that as
        # an empty page instead of failing on `None + "\n"`.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    return "".join(page_texts)

def extract_text_from_docx(docx_path):
    """Return the non-blank paragraphs of a .docx file joined by newlines.

    Args:
        docx_path: Path of the Word document, passed to ``Document``.

    Returns:
        str: Paragraph texts (unmodified) separated by a single newline;
        paragraphs that are empty or whitespace-only are left out.
    """
    kept_paragraphs = []
    for paragraph in Document(docx_path).paragraphs:
        content = paragraph.text
        if content.strip():
            kept_paragraphs.append(content)
    return "\n".join(kept_paragraphs)


In [6]:
def split_into_clauses(text):
    """Cut contract text into clause-sized fragments.

    A new fragment starts wherever a newline is followed by a number and
    "." or ")", by a bullet ("•") or by a dash, and at every blank line.
    The marker itself is consumed by the split.

    Args:
        text: Full contract text.

    Returns:
        list[str]: Whitespace-trimmed fragments, keeping only those longer
        than 20 characters after trimming.
    """
    boundary = r'\n\d+\.|\n\d+\)|\n•|\n-|\n\n'
    kept = []
    for fragment in re.split(boundary, text):
        trimmed = fragment.strip()
        # Fragments of 20 characters or fewer are treated as noise
        # (headings, stray numbers) and dropped.
        if len(trimmed) > 20:
            kept.append(trimmed)
    return kept


In [7]:
# Local directory of the fine-tuned checkpoint; the tokenizer and the
# sequence-classification model are both loaded from it.
model_path = "./legalbert_finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
# Put the model in evaluation mode; in this notebook it is only used for prediction.
model.eval()

# Load the LabelEncoder that maps predicted class ids back to label names
# (presumably the encoder fitted at training time — TODO confirm).
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# "label_encoder.pkl" from a trusted source.
le = joblib.load("label_encoder.pkl")


In [8]:
def predict_clause_label(clause):
    """Predict the label of a single clause with the loaded classifier.

    Uses the notebook-level ``tokenizer``, ``model`` and ``le`` objects.

    Args:
        clause: Clause text; tokenized with truncation at 512 tokens.

    Returns:
        tuple: ``(label, pred_id)`` where ``pred_id`` is the index of the
        highest logit and ``label`` is that id decoded by the label encoder.
    """
    encoded = tokenizer(
        clause,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        logits = model(**encoded).logits
    class_id = logits.argmax(dim=-1).item()
    label = le.inverse_transform([class_id])[0]
    return label, class_id


In [9]:
def classify_contract(file_path, output_path="classified_contract.csv"):
    """Classify every clause of a contract file and save the result as CSV.

    The file is read with the PDF or DOCX extractor depending on its
    extension (matched case-insensitively), split into clauses, and each
    clause is labelled with ``predict_clause_label``. The predicted class
    id is then mapped to a tier number.

    Args:
        file_path: Path of a ``.pdf`` or ``.docx`` contract.
        output_path: Where the CSV is written. Defaults to
            ``"classified_contract.csv"`` in the working directory.

    Returns:
        pandas.DataFrame: One row per clause with the columns
        ``predicted_class_id``, ``Predicted Label``, ``Tier`` and ``Clause``.
        The columns are present even when no clause was found.

    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor ``.docx``.
    """
    # Step 1: Extract text. Lower-case the name so ".PDF" / ".Docx" are accepted.
    lowered_name = str(file_path).lower()
    if lowered_name.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
    elif lowered_name.endswith(".docx"):
        text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file type")

    # Step 2: Split into clauses
    clauses = split_into_clauses(text)

    # Step 3: Define Label ID -> Tier mapping (tiers range from 1 to 5; what
    # each tier stands for is not defined in this notebook).
    label_id_to_tier = {
        0: 5, 1: 5, 2: 5, 3: 5, 4: 1, 5: 1, 6: 1, 7: 3, 8: 2, 9: 2,
        10: 5, 11: 5, 12: 5, 13: 1, 14: 5, 15: 5, 16: 1, 17: 1, 18: 2,
        19: 2, 20: 2, 21: 2, 22: 1, 23: 2, 24: 2, 25: 2, 26: 2, 27: 1,
        28: 2, 29: 2, 30: 4, 31: 4, 32: 5, 33: 5, 34: 2, 35: 2, 36: 4,
        37: 3, 38: 3, 39: 2, 40: 2, 41: 1, 42: 4, 43: 1, 44: 2, 45: 2, 46: 3
    }

    # Step 4: Predict label & assign tier for each clause
    results = []
    for clause in clauses:
        label, pred_id = predict_clause_label(clause)
        tier = label_id_to_tier.get(pred_id, 5)  # default Tier 5 if missing
        results.append({
            "predicted_class_id": pred_id,
            "Predicted Label": label,
            "Tier": tier,
            "Clause": clause
        })

    # Step 5: Save CSV. Passing the columns explicitly keeps the schema (and
    # the CSV header) stable when the contract yielded no clauses.
    columns = ["predicted_class_id", "Predicted Label", "Tier", "Clause"]
    df = pd.DataFrame(results, columns=columns)
    df.to_csv(output_path, index=False, encoding="utf-8")
    print(f"✅ Classification complete. Saved to '{output_path}' with Tiers")
    return df


In [None]:
# Contract to analyse.
# NOTE(review): hard-coded absolute Windows path — adjust it to your machine.
# `file_path` is also read by the summarization cell further down, so this
# cell has to be run first (otherwise that cell raises NameError).
file_path = r"D:\AI\Projects\Contract_NLP\PfHospitalityGroupInc_20150923_10-12G_EX-10.1_9266710_EX-10.1_Franchise Agreement1.pdf"
classified_df = classify_contract(file_path)
classified_df.head()

In [None]:
import openai
import os

# Read the OpenAI API key from the OPENAI_API_KEY environment variable so it
# is never stored in the notebook.
# NOTE: os.getenv returns None when the variable is not set; no check is made here.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Prompt template for summarizing one chunk of contract text. The
# `{chunk_text}` placeholder is filled in with str.format by
# summarize_chunk_openai.
SUMMARIZATION_PROMPT = """
You are a legal summarization assistant. Given the contract clause(s) below, create a concise, abstractive legal summary.
Focus on: parties, effective/expiration dates, termination rights, payment/compensation obligations, liability caps, indemnities, IP ownership/licensing, exclusivity, and any unusual risks.
Keep the answer concise (about 3-6 sentences) and use plain language but preserve legal facts and numeric values.

Clause(s):
{chunk_text}

Provide:
1) A short 1-2 sentence overview.
2) Bullet list of top 4 obligations / risks with short tags (e.g., TERMINATION: either party may..., LIABILITY CAP: $X...).
3) If present, list any key dates or numeric amounts found.
"""

def summarize_chunk_openai(chunk_text: str, model: str = "gpt-4o-mini", temperature: float = 0.0, max_tokens: int = 400):
    """Summarize one chunk of contract text with an OpenAI chat model.

    The chunk is inserted into ``SUMMARIZATION_PROMPT`` and sent as the
    user message of a chat completion.

    Args:
        chunk_text: Contract text to summarize.
        model: Name of the chat model to call.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on the length of the generated summary.

    Returns:
        str: The model's reply with surrounding whitespace removed, or an
        empty string if the reply carried no text content.
    """
    prompt = SUMMARIZATION_PROMPT.format(chunk_text=chunk_text)

    # openai>=1.0 removed `openai.ChatCompletion`; the replacement is
    # `openai.chat.completions.create`. Prefer the new entry point and fall
    # back to the legacy one so older installations keep working.
    chat_api = getattr(openai, "chat", None)
    if chat_api is not None:
        create_completion = chat_api.completions.create
    else:
        create_completion = openai.ChatCompletion.create

    resp = create_completion(
        model=model,
        messages=[{"role": "system", "content": "You are a helpful legal assistant."},
                  {"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=max_tokens
    )
    # The message content may be None (e.g. a filtered reply); avoid
    # calling .strip() on None.
    content = resp.choices[0].message.content
    return (content or "").strip()


In [None]:
def _chunk_text(text, chunk_size_chars, overlap_chars):
    """Cut ``text`` into windows of ``chunk_size_chars`` that overlap by ``overlap_chars``."""
    step = chunk_size_chars - overlap_chars
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size_chars])
        # Stop once a window reaches the end of the text: a further window
        # would only repeat characters already covered by this one.
        if start + chunk_size_chars >= len(text):
            break
        start += step
    return chunks


def hierarchical_summary_openai(text: str, chunk_size_chars: int = 3000, overlap_chars: int = 200, model="gpt-4o-mini"):
    """Summarize a long contract in two passes: per chunk, then overall.

    The text is cut into overlapping character windows, each window is
    summarized with ``summarize_chunk_openai``, and the intermediate
    summaries are then condensed into one final summary plus a short
    human-review checklist.

    Args:
        text: Full contract text.
        chunk_size_chars: Window length in characters; must be positive.
        overlap_chars: Number of characters shared by consecutive windows;
            must be non-negative and smaller than ``chunk_size_chars``.
        model: Name of the chat model used for both passes.

    Returns:
        tuple[str, list[str]]: ``(final_summary, chunk_summaries)``. For
        text that is empty after cleaning, ``("", [])`` is returned without
        calling the API.

    Raises:
        ValueError: If the chunk size / overlap combination is invalid
            (it would otherwise never advance through the text).
    """
    if chunk_size_chars <= 0:
        raise ValueError("chunk_size_chars must be positive")
    if overlap_chars < 0 or overlap_chars >= chunk_size_chars:
        raise ValueError("overlap_chars must be >= 0 and smaller than chunk_size_chars")

    # Clean text: collapse runs of blank lines into a single blank line
    text = re.sub(r'\n{2,}', '\n\n', text).strip()
    if not text:
        # Nothing to summarize; skip the API calls.
        return "", []

    # Chunk contract text
    chunks = _chunk_text(text, chunk_size_chars, overlap_chars)

    # Summarize each chunk
    chunk_summaries = [summarize_chunk_openai(ch, model=model) for ch in chunks]

    # Combine summaries and summarize again for a final summary
    combined = "\n\n".join(chunk_summaries)
    final_prompt = (
        "You are a legal summarization assistant. The following are intermediate summaries "
        "of parts of a contract. Produce a single concise abstractive summary of the whole contract, "
        "emphasizing obligations, risks, and important dates and numeric values. "
        "Also produce a short (4-item) prioritized checklist of clauses that require human review.\n\n"
        f"{combined}"
    )

    # openai>=1.0 removed `openai.ChatCompletion`; prefer the new entry
    # point and fall back to the legacy one for older installations.
    chat_api = getattr(openai, "chat", None)
    if chat_api is not None:
        create_completion = chat_api.completions.create
    else:
        create_completion = openai.ChatCompletion.create

    resp = create_completion(
        model=model,
        messages=[{"role": "system", "content": "You are a helpful legal assistant."},
                  {"role": "user", "content": final_prompt}],
        temperature=0.0,
        max_tokens=600
    )
    # The message content may be None; avoid calling .strip() on None.
    final_summary = (resp.choices[0].message.content or "").strip()
    return final_summary, chunk_summaries


In [None]:
# Use the original extracted contract text.
# NOTE: `file_path` comes from the classification cell earlier in the
# notebook; run that cell first, otherwise this cell fails with
# "NameError: name 'file_path' is not defined".
if file_path.lower().endswith(".pdf"):
    text = extract_text_from_pdf(file_path)
elif file_path.lower().endswith(".docx"):
    text = extract_text_from_docx(file_path)
else:
    # Same contract as classify_contract: do not hand other file types to
    # the DOCX reader.
    raise ValueError("Unsupported file type")

final_summary, chunk_summaries = hierarchical_summary_openai(text, chunk_size_chars=2500, overlap_chars=200, model="gpt-4o-mini")

# Print & save
print("=== Final Abstractive Summary ===")
print(final_summary)

with open("contract_abstractive_summary.txt", "w", encoding="utf-8") as f:
    f.write(final_summary)

print("✅ Abstractive summary saved to 'contract_abstractive_summary.txt'")


NameError: name 'file_path' is not defined