In [None]:
# Install the notebook's third-party dependencies. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel, so the imports in the
# next cell see the packages that were just installed.
%pip install pdfplumber spacy pandas


Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import pdfplumber
import json
import spacy
import pandas as pd
from difflib import SequenceMatcher
from pathlib import Path
import re
import pprint

# === CONFIG ===
# Input/output locations used by the pipeline functions in this cell.
# NOTE(review): the /content/drive/MyDrive prefix looks like a Colab Google
# Drive mount -- confirm Drive is mounted before running this cell.
MODEL_PATH = "/content/drive/MyDrive/citation_ner_model_v6"  # passed to load_ner_model() by process_pdf()
CITATION_CSV = "/content/drive/MyDrive/Citation Data For Fine-Tuning - Sheet1 up.csv"  # passed to load_citation_phrases() by process_pdf()
OUTPUT_DIR = "/content/drive/MyDrive/Extracted citations_v6"  # directory save_output() writes result JSON files into

# Create the output directory if it does not exist yet. `parents` is left at its
# default (False), so the parent directory must already exist; otherwise this
# raises FileNotFoundError.
Path(OUTPUT_DIR).mkdir(exist_ok=True)

# === REGEX PATTERNS ===
# Patterns for exhibit/docket-style citations that are matched directly against
# the page text (case-insensitively) in addition to the NER model.
#
# Conventions:
#   * A pattern either has no capturing group at all, or its FIRST capturing
#     group spans the whole match. Optional sub-parts that are not the first
#     group use non-capturing groups `(?:...)`; otherwise `re.findall` would
#     return only the sub-group (e.g. "-15") instead of the citation.
#   * The abbreviated "Ex" patterns start with `\b` so they cannot fire in the
#     middle of a word ("complex N", "index 12") when matched case-insensitively.
regex_patterns = [
    # "Ex. 12", "Ex 3a", optionally followed by one or more quoted titles
    r"(\bEx\.?\s*\d+[a-zA-Z]?(,?\s*[\“\"][^\"”]+[\”\"])*\.?)",
    # "Exhibit A", "Exhibit 4 at 12", with an optional trailing period
    r"(Exhibit\s+[A-Z0-9]+(?: at \d+)?\.?)",
    # "Dkt. No. 21" optionally followed by "; Sep. 27, 2016"
    r"(Dkt\. No\. \d+(?:; [A-Za-z]{3}\. \d{1,2}, \d{4})?)",
    # "Group Ex. A, 12" or "Group Ex. A, 12-15" (page range is non-capturing)
    r"Group Ex\. [A-Z0-9]+,? \d+(?:-\d+)?",
    # Bare "Exhibit X" / "Schedule X" as whole words
    r"\bExhibit\s+[A-Z0-9]+\b",
    r"\bSchedule\s+[A-Z0-9]+\b",
    # Lettered exhibits: "Ex. B" and "Ex. B at 7"
    r"(\bEx\.?\s+[A-Z])",
    r"(\bEx\.?\s+[A-Z] at \d+)",
    # "see also <Name> v." / "see also <Name>,"
    r"(see also [A-Z][^\n;,]+(?:v\.|,))",
]

# === HELPER FUNCTIONS ===

def load_citation_phrases(csv_path):
    """Read the known citation phrases from a CSV file.

    The file is decoded as ISO-8859-1. Column headers are stripped of
    surrounding whitespace, and the "Citation Language" column is exposed
    under the name "citation".

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A list of the distinct non-null values of the citation column, in
        order of first appearance.
    """
    table = pd.read_csv(csv_path, encoding='ISO-8859-1')
    # Headers may carry stray spaces; normalise them before looking one up.
    table.columns = table.columns.str.strip()
    renamed = table.rename(columns={"Citation Language": "citation"})
    phrase_column = renamed['citation']
    distinct_phrases = phrase_column.dropna().unique()
    return distinct_phrases.tolist()

def convert_pdf_to_json(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        A dict mapping ``"page_<n>"`` (1-based, in page order) to that page's
        text. A page whose ``extract_text()`` result is falsy (e.g. ``None``)
        is stored as the empty string.
    """
    with pdfplumber.open(pdf_path) as document:
        return {
            f"page_{number}": page.extract_text() or ""
            for number, page in enumerate(document.pages, start=1)
        }

def load_ner_model(model_path):
    """Load the spaCy pipeline stored at ``model_path`` and return it."""
    pipeline = spacy.load(model_path)
    return pipeline

def is_close_match(text, phrases, threshold=0.85):
    """Tell whether ``text`` is approximately equal to any known phrase.

    Both strings are lower-cased and compared with
    ``difflib.SequenceMatcher``; the first phrase whose similarity ratio
    reaches ``threshold`` decides the result.

    Args:
        text: Candidate string.
        phrases: Iterable of reference strings.
        threshold: Minimum ratio (inclusive) that counts as a match.

    Returns:
        True if at least one phrase is similar enough, otherwise False
        (including when ``phrases`` is empty).
    """
    for candidate in phrases:
        similarity = SequenceMatcher(None, text.lower(), candidate.lower()).ratio()
        if similarity >= threshold:
            return True
    return False

def extract_regex_citations(text, page_key):
    """Find citations on one page using the module-level ``regex_patterns``.

    Every pattern is searched case-insensitively. The recorded citation is
    always the *whole* match (``match.group(0)``), regardless of how many
    capturing groups the pattern contains; ``re.findall`` would instead return
    only the group for a pattern with a single optional group, producing
    citations such as ``""`` or ``"-15"``.

    When several patterns match exactly the same span of text, that span is
    reported once (the first pattern wins), so overlapping patterns do not
    create duplicate rows. Separate occurrences of the same citation text at
    different positions are all kept.

    Args:
        text: Text of the page; an empty or ``None`` value yields no matches.
        page_key: Page identifier copied into every result (e.g. ``"page_3"``).

    Returns:
        A list of dicts with keys ``"page"``, ``"citation"`` and ``"source"``
        (always ``"regex"``), ordered by pattern and then by position.
    """
    if not text:
        return []

    matches = []
    seen_spans = set()  # (start, end) offsets already reported for this page
    for pattern in regex_patterns:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            span = match.span()
            citation = match.group(0).strip()
            # Skip blank hits and spans another pattern has already produced.
            if not citation or span in seen_spans:
                continue
            seen_spans.add(span)
            matches.append({
                "page": page_key,
                "citation": citation,
                "source": "regex"
            })
    return matches

def extract_citations(text_by_page, nlp, known_phrases):
    """Collect model-based and regex-based citations for every page.

    For each page the text is run through ``nlp``; entities labelled
    ``"CITATION"`` are stripped and kept only when ``is_close_match`` accepts
    them against ``known_phrases``. The regex matches for the same page are
    appended right after that page's model matches.

    Args:
        text_by_page: Mapping of page key to page text.
        nlp: Callable returning an object with an ``ents`` iterable whose
            items expose ``label_`` and ``text``.
        known_phrases: Reference phrases forwarded to ``is_close_match``.

    Returns:
        A list of dicts with keys ``"page"``, ``"citation"`` and ``"source"``
        (``"model"`` or ``"regex"``).
    """
    collected = []
    for page_key, page_text in text_by_page.items():
        parsed = nlp(page_text)
        candidates = (
            entity.text.strip()
            for entity in parsed.ents
            if entity.label_ == "CITATION"
        )
        collected += [
            {"page": page_key, "citation": candidate, "source": "model"}
            for candidate in candidates
            if is_close_match(candidate, known_phrases)
        ]
        # Regex hits for this page follow its model hits.
        collected += extract_regex_citations(page_text, page_key)
    return collected

def save_output(data, pdf_path):
    """Write ``data`` as pretty-printed JSON next to the other results.

    The file is named ``<pdf stem>_matched_citations.json`` and is created
    inside ``OUTPUT_DIR``. It is written as UTF-8 with a two-space indent and
    non-ASCII characters left unescaped.

    Args:
        data: JSON-serialisable object to store.
        pdf_path: Path of the source PDF; only its stem is used for naming.

    Returns:
        The path of the written JSON file, as a string.
    """
    file_name = f"{Path(pdf_path).stem}_matched_citations.json"
    destination = os.path.join(OUTPUT_DIR, file_name)
    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)
    return destination

# === MAIN FUNCTION FOR SINGLE PDF ===
def process_pdf(pdf_path, nlp=None, phrases=None):
    """Run the full citation-extraction pipeline on a single PDF.

    Steps: extract the page text, collect model and regex citations, write
    them to a JSON file via ``save_output`` and print a short summary.

    Args:
        pdf_path: Path of the PDF to process.
        nlp: Optional, already-loaded NER pipeline. When ``None`` (default)
            the model at ``MODEL_PATH`` is loaded for this call.
        phrases: Optional list of known citation phrases. When ``None``
            (default) they are read from ``CITATION_CSV`` for this call.
            An empty list is used as given.

    Returns:
        The list of matched citation dicts, or an empty list if extraction or
        saving raised an exception (the error is printed, not re-raised).
        Failures while loading the CSV or the model are not caught.
    """
    # Loading the model is expensive; callers that process many PDFs can load
    # it (and the phrases) once and pass them in instead of reloading per file.
    if phrases is None:
        phrases = load_citation_phrases(CITATION_CSV)
    if nlp is None:
        nlp = load_ner_model(MODEL_PATH)
    print(f"\n📄 Processing PDF: {pdf_path}")
    try:
        text_by_page = convert_pdf_to_json(pdf_path)
        matched = extract_citations(text_by_page, nlp, phrases)
        save_path = save_output(matched, pdf_path)
        print(f"\n✅ Total citations found: {len(matched)}")
        print(f"📄 Saved to: {save_path}")
        print("\n📌 Sample citations:")
        pprint.pprint(matched[:5])
        return matched
    except Exception as e:
        # Best-effort by design: report the failure and return no results so a
        # batch run over several PDFs can continue.
        print(f"❌ Error processing {pdf_path}: {e}")
        return []

# === EXAMPLE USAGE ===
# Update this with the path of the PDF you want to process:
pdf_path = "/content/drive/MyDrive/Narayani/MSJ.pdf"
# Runs the whole pipeline for that PDF. `results` is the list of citation dicts
# returned by process_pdf (an empty list if processing raised an exception).
results = process_pdf(pdf_path)





📄 Processing PDF: /content/drive/MyDrive/Narayani/MSJ.pdf

✅ Total citations found: 17
📄 Saved to: /content/drive/MyDrive/Extracted citations_v6/MSJ_matched_citations.json

📌 Sample citations:
[{'citation': 'Ex. 1', 'page': 'page_2', 'source': 'regex'},
 {'citation': 'Exhibit A', 'page': 'page_2', 'source': 'regex'},
 {'citation': 'Exhibit A', 'page': 'page_2', 'source': 'regex'},
 {'citation': 'Compl. at Ex. 1', 'page': 'page_3', 'source': 'model'},
 {'citation': 'Affidavit of J. Noon, ¶ 10',
  'page': 'page_3',
  'source': 'model'}]


In [None]:

import json
import pprint

# Saved citation file to inspect (edit to point at another ..._matched_citations.json)
json_file_path = "/content/citation_outputs/1_23-cv-13207_22_PRIMARY DOCUMENT_matched_citations.json"

# Parse the file, then dump every record (the output can be long)
with open(json_file_path, mode="r", encoding="utf-8") as f:
    data = json.load(f)
pprint.pprint(data)


[{'citation': 'Serlin v. Arthur Andersen &\n'
              'Co., 3 F.3d 221, 223 (7th Cir. 1993)',
  'page': 'page_4',
  'source': 'model'},
 {'citation': 'Tavistock Rest. Grp., LLC v. Zurich Am. Ins.\n'
              'Co., 2021 WL 1614519, at *9 (N.D. Ill. Apr. 26, 2021)',
  'page': 'page_4',
  'source': 'model'},
 {'citation': 'Id. ¶ 13', 'page': 'page_5', 'source': 'model'},
 {'citation': 'Id. ¶¶ 27, 28', 'page': 'page_5', 'source': 'model'},
 {'citation': 'Exhibit A.', 'page': 'page_5', 'source': 'regex'},
 {'citation': 'Exhibit B.', 'page': 'page_5', 'source': 'regex'},
 {'citation': 'Exhibit A', 'page': 'page_5', 'source': 'regex'},
 {'citation': 'Exhibit B', 'page': 'page_5', 'source': 'regex'},
 {'citation': 'Huertas v. Bayer U.S., LLC, 2023 WL\n'
              '3773139 (D.N.J. May 23, 2023)',
  'page': 'page_6',
  'source': 'model'},
 {'citation': 'Circuit. Huertas v. Bayer U.S.,\nLLC, No. 23-2178 (3d Cir.)',
  'page': 'page_6',
  'source': 'model'},
 {'citation': 'Exhibit C.

In [None]:

import json
import pprint

# Saved citation file to inspect (edit to point at another ..._matched_citations.json)
json_file_path = "/content/citation_outputs/Affidavit_matched_citations.json"

# Parse the file, then dump every record (the output can be long)
with open(json_file_path, mode="r", encoding="utf-8") as f:
    data = json.load(f)
pprint.pprint(data)


[{'citation': 'Exhibit 1', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Exhibit 2', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Exhibit 3', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Exhibit 4', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Dkt. No. 1; Aug. 15, 2016', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Dkt. No. 21; Sep. 27, 2016',
  'page': 'page_1',
  'source': 'regex'},
 {'citation': 'Exhibit 1', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Exhibit 2', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Exhibit 3', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Exhibit 4', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Exhibit 5', 'page': 'page_2', 'source': 'regex'},
 {'citation': 'Dkt. No. 10; Oct. 13, 2016',
  'page': 'page_2',
  'source': 'regex'},
 {'citation': 'Exhibit 5', 'page': 'page_2', 'source': 'regex'}]


In [None]:

import json
import pprint

# Saved citation file to inspect (edit to point at another ..._matched_citations.json)
json_file_path = "/content/citation_outputs/Dkt. No. 30_matched_citations.json"

# Parse the file, then dump every record (the output can be long)
with open(json_file_path, mode="r", encoding="utf-8") as f:
    data = json.load(f)
pprint.pprint(data)


[{'citation': '(N.D. of Ga.,\nNo. 1:16-cv-02970-LMM, ECF No. 1 ¶¶ 12, 27.)\n29',
  'page': 'page_4',
  'source': 'model'},
 {'citation': '(Id. ECF Nos.\n4, 8. 18.)\n30',
  'page': 'page_4',
  'source': 'model'},
 {'citation': '(Id. ECF No. 105.)\n32', 'page': 'page_5', 'source': 'model'},
 {'citation': '(Id. ECF No. 133-9 at 4, 5 of 11.)\n34',
  'page': 'page_5',
  'source': 'model'},
 {'citation': '(Id. ECF No. 133-\n9 at 10 of 11.)\n35',
  'page': 'page_5',
  'source': 'model'},
 {'citation': '(Id. at 6 of 11.)\n36', 'page': 'page_5', 'source': 'model'},
 {'citation': '(Id. ECF No. 133-9 at\n10 of 11.)\n5',
  'page': 'page_5',
  'source': 'model'},
 {'citation': '35 U.S.C. §§ 102 or 103', 'page': 'page_14', 'source': 'model'},
 {'citation': 'U.S.C. § 101, 102, 103, 112, and/or 116',
  'page': 'page_15',
  'source': 'model'},
 {'citation': '35 U.S.C. §§ 102 or 103', 'page': 'page_15', 'source': 'model'},
 {'citation': 'U.S.C. § 101, 102, 103, 112, and/or 116',
  'page': 'page_16',
  '

In [None]:

import json
import pprint

# Saved citation file to inspect (edit to point at another ..._matched_citations.json)
json_file_path = "/content/citation_outputs/Primary Doc_matched_citations.json"

# Parse the file, then dump every record (the output can be long)
with open(json_file_path, mode="r", encoding="utf-8") as f:
    data = json.load(f)
pprint.pprint(data)


[{'citation': 'Dkt. No. 30', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Dkt. No. 20', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Dkt. No. 25', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Dkt. No. 19', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Dkt. No. 30', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'Dkt. No. 31', 'page': 'page_1', 'source': 'regex'},
 {'citation': 'ex N', 'page': 'page_3', 'source': 'regex'},
 {'citation': 'ex C', 'page': 'page_4', 'source': 'regex'},
 {'citation': 'Exhibit 1', 'page': 'page_6', 'source': 'regex'},
 {'citation': 'Exhibit 1', 'page': 'page_6', 'source': 'regex'},
 {'citation': 'See Exhibit 2 (AerSale’s Answer and Counterclaims)',
  'page': 'page_7',
  'source': 'model'},
 {'citation': 'See Exhibit 3 (Docket), Dkt. No. 105',
  'page': 'page_7',
  'source': 'model'},
 {'citation': '’s Defamation claim.\n'
              'See Exhibit 4, at Dkt. No. 133-9, at p. 7-8 (Notice of '
              'Arbitration)',
 