EDA

In [1]:
import pandas as pd

In [47]:
# Load the legal-text classification CSV into a DataFrame.
df = pd.read_csv("/content/legal_text_classification.csv")

In [48]:
import re
from typing import Callable, Dict, List, Optional

import pandas as pd
from bs4 import BeautifulSoup


# ---------------------------
# 1. Base text preprocessing
# ---------------------------

def preprocess_text(text: str) -> str:
    """Normalize raw document text before pattern matching.

    Anything that is not a ``str`` yields an empty string. Otherwise HTML
    markup is stripped and an ordered list of regex substitutions is applied;
    each rule sees the output of the previous one, so the order matters.

    Args:
        text: Raw document text (may contain HTML).

    Returns:
        The cleaned text with collapsed whitespace and no leading/trailing
        spaces.
    """
    if not isinstance(text, str):
        return ""

    # Strip HTML tags, joining the remaining text fragments with spaces.
    cleaned = BeautifulSoup(text, "html.parser").get_text(" ")

    substitutions = (
        # E-mail addresses -> placeholder token.
        (r'\b[\w\.-]+@[\w\.-]+\.\w+\b', ' <EMAIL> '),
        # http(s):// and www. links -> placeholder token.
        (r'https?://\S+|www\.\S+', ' <URL> '),
        # Dotted-quad IP addresses -> placeholder token.
        (r'\b(?:\d{1,3}\.){3}\d{1,3}\b', ' <IP> '),
        # Identifiers: 2+ uppercase Latin/Cyrillic letters, optional hyphen,
        # then 3+ digits -> placeholder token.
        (r'\b[A-ZА-Я]{2,}-?\d{3,}\b', ' <ID> '),
        # Any character repeated 4+ times is cut down to three (aaaaa -> aaa).
        (r"(.)\1{3,}", r"\1\1\1"),
        # Words of 30 or more word characters are replaced by a space.
        (r"\b\w{30,}\b", " "),
        # Runs of ! ? . , collapse to the last character of the run.
        (r'([!?.,]){2,}', r'\1'),
        # Three or more identical underscores/hyphens collapse to one.
        (r"([_\-])\1{2,}", r"\1"),
        # Runs of ! ? collapse to one character.
        (r"([!?]){2,}", r"\1"),
        # Drop the leading '#' of hashtags but keep the word itself.
        (r"#(\w+)", r"\1"),
        # Collapse every whitespace run into a single space.
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()


# ---------------------------
# 2. Block patterns
# ---------------------------

# All patterns are written in lowercase and are meant to be matched
# case-insensitively (against lowercased text or with re.IGNORECASE).

# Company/bank detail keywords.
REQUISITES_PATTERNS_EN = [
    r"\bvat\b",
    r"\btax id\b",
    r"\bregistration number\b",
    r"\bcompany number\b",
    r"\baccount (?:no|number)\b",
    r"\biban\b",
    r"\bswift\b",
    r"\bbic\b",
    r"\bregistered address\b",
]
# Misspelled original name, kept as an alias so existing references still work.
REQUISTES_PATTERNS_EN = REQUISITES_PATTERNS_EN


# Payment amount keywords.
PAYMENT_PATTERNS_EN = [
    r"\btotal amount\b",
    r"\bamount due\b",
    r"\bsubtotal\b",
    r"\bvat amount\b",
    r"\bgrand total\b",
    r"\bbalance due\b",
]

# Date expressions and date-related keywords.
DATE_PATTERNS_EN = [
    # Numeric dates such as 01/02/2020 or 1-2-20.
    r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",
    # Month-name dates: "12 May 2005", "May 2005", "May 12, 2005",
    # "May 12th 2005". The year must follow the month (and optional day)
    # directly; a bare month name followed *somewhere* by four digits would
    # also match the modal verb "may" plus any later number.
    (
        r"\b(?:\d{1,2}(?:st|nd|rd|th)?\s+)?"
        r"(?:january|february|march|april|may|june|july|august|"
        r"september|october|november|december)"
        r"(?:\s+\d{1,2}(?:st|nd|rd|th)?)?,?\s+\d{4}\b"
    ),
    r"\binvoice date\b",
    r"\bdue date\b",
]

# Signature-related keywords.
SIGNATURE_PATTERNS_EN = [
    r"\bauthorized signature\b",
    r"\bsigned by\b",
    r"\bon behalf of\b",
    r"\bdirector\b",
]


# Block name -> list of regexes that signal the presence of that block.
BLOCK_PATTERNS_EN = {
    "requisites": REQUISITES_PATTERNS_EN,
    "payment": PAYMENT_PATTERNS_EN,
    "dates": DATE_PATTERNS_EN,
    "signatures": SIGNATURE_PATTERNS_EN,
}



# ---------------------------
# 3. Extract blocks
# ---------------------------

def extract_blocks(
    text: str,
    block_patterns: Optional[Dict[str, List[str]]] = None,
    window: int = 150,
) -> dict:
    """Collect the text surrounding every pattern match, grouped by block.

    For each block name, every regex is searched case-insensitively in
    ``text``; each match contributes a snippet covering the match plus
    ``window`` characters on either side. Identical snippets are kept once,
    in first-seen order, and joined with a single space.

    Args:
        text: Text to search. Non-string input is treated as empty text.
        block_patterns: Mapping of block name to regex list. Defaults to the
            module-level ``BLOCK_PATTERNS_EN``.
        window: Number of context characters kept before and after a match.

    Returns:
        Dict mapping each block name to its joined snippets ("" if no match).
    """
    if block_patterns is None:
        block_patterns = BLOCK_PATTERNS_EN
    if not isinstance(text, str):
        text = ""

    text_len = len(text)
    blocks = {}

    for block, patterns in block_patterns.items():
        snippets = []
        for pattern in patterns:
            # Match on the original text with IGNORECASE rather than on a
            # lowercased copy: lower() can change the string length for some
            # characters, which would shift the offsets used for slicing.
            for match in re.finditer(pattern, text, flags=re.IGNORECASE):
                start = max(0, match.start() - window)
                end = min(text_len, match.end() + window)
                snippets.append(text[start:end])

        # dict.fromkeys removes duplicates while preserving order, so the
        # result is reproducible between runs (a set has no stable order).
        blocks[block] = " ".join(dict.fromkeys(snippets))

    return blocks


# ---------------------------
# 4. High-level document processing
# ---------------------------

def process_document(text: str) -> Dict:
    """Clean one document and report which block types were detected.

    Args:
        text: Raw document text.

    Returns:
        Dict with the cleaned text, the extracted block snippets, and one
        0/1 flag per block type.
    """
    cleaned = preprocess_text(text)
    found = extract_blocks(cleaned)

    def flag(name: str) -> int:
        # 1 when any snippet was extracted for the block, otherwise 0.
        return 1 if found[name] else 0

    record = {"clean_text": cleaned, "blocks": found}
    record["has_requisites"] = flag("requisites")
    record["has_payment"] = flag("payment")
    record["has_dates"] = flag("dates")
    record["has_signatures"] = flag("signatures")
    return record


# ---------------------------
# 5. Batch processing DataFrame
# ---------------------------

def process_dataframe(
    df: pd.DataFrame,
    text_col: str,
    processor: Optional[Callable[[str], Dict]] = None,
) -> pd.DataFrame:
    """Run a per-document processor over a text column and append the results.

    Args:
        df: Input frame; it is not modified.
        text_col: Name of the column holding the document text.
        processor: Callable mapping one text to a dict of result fields.
            Defaults to ``process_document`` (looked up at call time).

    Returns:
        A new frame with the original columns followed by one column per
        result field. Columns of ``df`` whose names clash with a result field
        are replaced, so running the function again on its own output does
        not create duplicate column names.
    """
    if processor is None:
        processor = process_document

    # Build the result frame once from the list of dicts; this avoids
    # constructing a separate Series for every row.
    records = df[text_col].apply(processor).tolist()
    results = pd.DataFrame(records, index=df.index)

    overlapping = [col for col in results.columns if col in df.columns]
    return pd.concat([df.drop(columns=overlapping), results], axis=1)

In [49]:
# Process every "case_text" entry and append the results as new columns.
# df is rebound, so re-running this cell feeds the already-augmented frame
# back into process_dataframe.
df = process_dataframe(df, text_col="case_text")

In [60]:
def score_block(block_text: str, patterns: List[str]) -> float:
    """Return the fraction of ``patterns`` found in ``block_text``.

    Matching is done against the lowercased text, so patterns are expected
    to be written in lowercase.

    Args:
        block_text: Extracted snippet text for one block.
        patterns: Regexes that characterise the block.

    Returns:
        A value in [0.0, 1.0]; 0.0 when the text or the pattern list is empty.
    """
    # An empty pattern list would otherwise divide by zero.
    if not block_text or not patterns:
        return 0.0

    lowered = block_text.lower()  # lowercase once, not once per pattern
    hits = sum(1 for pattern in patterns if re.search(pattern, lowered))
    return hits / len(patterns)

def compute_scores(text: str, blocks: Dict[str, str]) -> Dict[str, float]:
    """Score each extracted block against its own pattern list.

    Args:
        text: Cleaned document text; accepted but not used by this function.
        blocks: Mapping of block name to extracted snippet text.

    Returns:
        Dict with one ``*_score`` entry per block type.
    """
    # Output key -> block name, in the order the scores are reported.
    score_keys = {
        "requisites_score": "requisites",
        "payment_score": "payment",
        "dates_score": "dates",
        "signatures_score": "signatures",
    }
    return {
        key: score_block(blocks[name], BLOCK_PATTERNS_EN[name])
        for key, name in score_keys.items()
    }


# Router (LLM fallback decision)

def should_call_llm(scores: dict) -> bool:
    """Decide whether a document should be routed to the LLM fallback.

    Args:
        scores: Dict with "payment_score", "requisites_score" and
            "dates_score" entries.

    Returns:
        True when a critical block scores below 0.5, or when the weighted
        overall score is below 0.5.
    """
    critical_keys = ("payment_score", "requisites_score")

    # Fall back when a critical block was not found.
    if any(scores[key] < 0.5 for key in critical_keys):
        return True

    # Otherwise check the overall weighted score.
    weighted_score = (
        0.4 * scores["payment_score"]
        + 0.4 * scores["requisites_score"]
        + 0.2 * scores["dates_score"]
    )
    return weighted_score < 0.5


# Process single document

def process_document(text: str) -> Dict:
    """Score one document and decide whether it needs the LLM fallback.

    Args:
        text: Raw document text.

    Returns:
        Dict with the per-block scores and the boolean routing decision.
    """
    cleaned = preprocess_text(text)
    scores = compute_scores(cleaned, extract_blocks(cleaned))
    decision = should_call_llm(scores)
    return {"scores": scores, "call_llm": decision}


# Process DataFrame of documents

def process_dataframe(
    df: pd.DataFrame,
    text_col: str,
    processor: Optional[Callable[[str], Dict]] = None,
) -> pd.DataFrame:
    """Run a per-document processor over a text column and append the results.

    Args:
        df: Input frame; it is not modified.
        text_col: Name of the column holding the document text.
        processor: Callable mapping one text to a dict of result fields.
            Defaults to ``process_document`` (looked up at call time).

    Returns:
        A new frame with the original columns followed by one column per
        result field. Columns of ``df`` whose names clash with a result field
        are replaced, so running the function again on its own output does
        not create duplicate column names.
    """
    if processor is None:
        processor = process_document

    # Build the result frame once from the list of dicts; this avoids
    # constructing a separate Series for every row.
    records = df[text_col].apply(processor).tolist()
    results = pd.DataFrame(records, index=df.index)

    overlapping = [col for col in results.columns if col in df.columns]
    return pd.concat([df.drop(columns=overlapping), results], axis=1)


In [61]:
    # Process the frame again with the definitions from the previous cell
    # (scores + LLM routing) and preview the routing-related columns; the
    # "blocks" column is already present in df from the earlier run.
    df_processed = process_dataframe(df, "case_text")
    print("\nDataFrame result:")
    print(df_processed[["case_id","call_llm","scores","blocks"]])


DataFrame result:
         case_id  call_llm                                             scores  \
0          Case1      True  {'requisites_score': 0.0, 'payment_score': 0.0...   
1          Case2      True  {'requisites_score': 0.0, 'payment_score': 0.0...   
2          Case3      True  {'requisites_score': 0.0, 'payment_score': 0.0...   
3          Case4      True  {'requisites_score': 0.0, 'payment_score': 0.0...   
4          Case5      True  {'requisites_score': 0.0, 'payment_score': 0.0...   
...          ...       ...                                                ...   
24980  Case25203      True  {'requisites_score': 0.0, 'payment_score': 0.0...   
24981  Case25204      True  {'requisites_score': 0.0, 'payment_score': 0.0...   
24982  Case25205      True  {'requisites_score': 0.0, 'payment_score': 0.0...   
24983  Case25206      True  {'requisites_score': 0.0, 'payment_score': 0.0...   
24984  Case25207      True  {'requisites_score': 0.0, 'payment_score': 0.0...   

        

In [62]:
# Display the full processed frame.
df_processed

Unnamed: 0,case_id,case_outcome,case_title,case_text,clean_text,blocks,has_requisites,has_payment,has_dates,has_signatures,scores,call_llm
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...,Ordinarily that discretion will be exercised s...,"{'requisites': '', 'payment': '', 'dates': '',...",0,0,0,0,"{'requisites_score': 0.0, 'payment_score': 0.0...",True
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...,The general principles governing the exercise ...,"{'requisites': '', 'payment': '', 'dates': ' I...",0,0,1,0,"{'requisites_score': 0.0, 'payment_score': 0.0...",True
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...,Ordinarily that discretion will be exercised s...,"{'requisites': '', 'payment': '', 'dates': '',...",0,0,0,0,"{'requisites_score': 0.0, 'payment_score': 0.0...",True
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...,The general principles governing the exercise ...,"{'requisites': '', 'payment': '', 'dates': ' I...",0,0,1,0,"{'requisites_score': 0.0, 'payment_score': 0.0...",True
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...,The preceding general principles inform the ex...,"{'requisites': '', 'payment': '', 'dates': '',...",0,0,0,0,"{'requisites_score': 0.0, 'payment_score': 0.0...",True
...,...,...,...,...,...,...,...,...,...,...,...,...
24980,Case25203,cited,Reches Pty Ltd v Tadiran Pty Ltd (1998) 85 FCR...,That is not confined to persons who control th...,That is not confined to persons who control th...,"{'requisites': '', 'payment': '', 'dates': 'pa...",0,0,1,0,"{'requisites_score': 0.0, 'payment_score': 0.0...",True
24981,Case25204,cited,Sir Lindsay Parkinson &amp; Co Ltd v Triplan L...,Once the threshold prescribed by s 1335 is sat...,Once the threshold prescribed by s 1335 is sat...,"{'requisites': '', 'payment': '', 'dates': '',...",0,0,0,0,"{'requisites_score': 0.0, 'payment_score': 0.0...",True
24982,Case25205,cited,Spiel v Commodity Brokers Australia Pty Ltd (I...,Once the threshold prescribed by s 1335 is sat...,Once the threshold prescribed by s 1335 is sat...,"{'requisites': '', 'payment': '', 'dates': '',...",0,0,0,0,"{'requisites_score': 0.0, 'payment_score': 0.0...",True
24983,Case25206,distinguished,"Tullock Ltd v Walker (Unreported, Supreme Cour...",Given the extent to which Deumer stands to gai...,Given the extent to which Deumer stands to gai...,"{'requisites': '', 'payment': '', 'dates': 'o ...",0,0,1,0,"{'requisites_score': 0.0, 'payment_score': 0.0...",True


In [64]:
# Export the processed frame to Excel without the index column.
# The file name is plain ASCII on purpose: a look-alike Cyrillic "с" (U+0441)
# in place of the Latin "c" produces a file that cannot be found by typing
# "clean_text.xlsx".
df_processed.to_excel('clean_text.xlsx', index=False)