# ESG Pipeline Notebook
Run all cells to produce the cross-company summary.


## PDF Parsing Upgrade: Docling

We now use **Docling** as the primary parser for higher-fidelity text extraction from PDFs (tables/columns often parse better).
If Docling isn't available, we transparently fall back to PyPDF2.

**Setup (outside this environment if needed):**
```bash
pip install docling docling-core pypdfium2
```


In [2]:

import re

# Prefer Docling if available; fall back to PyPDF2.
def read_pdf_text(pdf_path: str):
    """
    Returns: List[{"page": int, "text": str}]
    """
    pages = []
    # Try Docling
    try:
        from docling.document_converter import DocumentConverter
        converter = DocumentConverter()
        result = converter.convert(pdf_path)
        doc = result.document

        # Try different ways to get page text, since docling APIs can vary by version
        for i, page in enumerate(getattr(doc, "pages", [])):
            txt = ""
            try:
                # Some versions expose export_text()
                if hasattr(page, "export_text"):
                    txt = page.export_text() or ""
                elif hasattr(page, "to_text"):
                    txt = page.to_text() or ""
                elif hasattr(page, "content"):
                    # As a last resort, join textual content
                    try:
                        txt = " ".join([c.text for c in page.content if getattr(c, "text", "")])
                    except Exception:
                        txt = ""
            except Exception:
                txt = ""
            # Normalize docling text newlines/hyphenation similar to earlier logic
            txt = re.sub(r"(\w)-\n(\w)", r"\1\2", txt or "")
            txt = txt.replace("\r", "\n")
            txt = re.sub(r"[ \t]+", " ", txt)
            txt = re.sub(r"\n{3,}", "\n\n", txt)
            pages.append({"page": i+1, "text": (txt or "").strip()})
        # If we collected any pages via Docling, return them
        if pages:
            return pages
    except Exception as e:
        # Docling not installed or failed; fall back below
        pass

    # Fall back to PyPDF2
    try:
        import PyPDF2
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for i, page in enumerate(reader.pages):
                try:
                    raw = page.extract_text() or ""
                except Exception:
                    raw = ""
                raw = re.sub(r"(\w)-\n(\w)", r"\1\2", raw or "")
                raw = raw.replace("\r", "\n")
                raw = re.sub(r"[ \t]+", " ", raw)
                raw = re.sub(r"\n{3,}", "\n\n", raw)
                pages.append({"page": i+1, "text": (raw or "").strip()})
    except Exception:
        pass
    return pages


In [3]:

import os, pandas as pd
from pathlib import Path

# If running in a new environment, ensure dependencies:
# pip install pandas numpy PyPDF2



PDFS_DIR = r"B:\mandg\ESG_Deliverables\ESG_Deliverables_with_Summary\data"
OUT_DIR  = r"B:\mandg\ESG_Deliverables\ESG_Deliverables_with_Summary\outputs"

# Reuse the pipeline from this notebook cell

import re, math
import numpy as np
try:
    import PyPDF2
    HAS_PYPDF2 = True
except:
    HAS_PYPDF2 = False

def read_pdf_text(pdf_path):
    pages = []
    if not HAS_PYPDF2:
        return pages
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for i, page in enumerate(reader.pages):
                try:
                    raw = page.extract_text() or ""
                except Exception:
                    raw = ""
                pages.append({"page": i+1, "text": raw})
    except Exception:
        return []
    return pages

def clean_text(t):
    import re
    if not t: return ""
    t = re.sub(r"(\w)-\n(\w)", r"\1\2", t)
    t = t.replace("\r", "\n")
    t = re.sub(r"[ \t]+", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()

def segment_text(text, max_chars=700):
    import re
    segments = []
    paras = re.split(r"\n\s*\n", text) if text else []
    for p in paras:
        p = p.strip()
        if not p: 
            continue
        if len(p) <= max_chars:
            segments.append(p)
        else:
            words = p.split()
            buf = []
            cur_len = 0
            for w in words:
                buf.append(w)
                cur_len += len(w) + 1
                if cur_len > max_chars:
                    segments.append(" ".join(buf).strip())
                    buf, cur_len = [], 0
            if buf:
                segments.append(" ".join(buf).strip())
    return segments

ESG_KEYWORDS = {
  "GHG Emissions": [
    "\\bghg\\b",
    "scope\\s*1",
    "scope\\s*2",
    "scope\\s*3",
    "\\bcarbon\\b",
    "co2e",
    "emission(s)?",
    "net\\s*zero",
    "decarboni[sz]e",
    "\\btcf(d)?\\b"
  ],
  "Water & Effluents": [
    "\\bwater\\b",
    "effluent(s)?",
    "wastewater",
    "m3",
    "withdrawal",
    "discharge"
  ],
  "Waste Management": [
    "\\bwaste\\b",
    "landfill",
    "recycl(e|ing)",
    "circular",
    "hazardous"
  ],
  "Energy Management": [
    "\\benergy\\b",
    "renewable",
    "electricity",
    "kwh",
    "megawatt|mwh"
  ],
  "Biodiversity & Land Use": [
    "\\bbiodiversit(y|ies)\\b",
    "habitat",
    "deforest",
    "land use"
  ],
  "Labor Practices": [
    "\\bsupply\\s*chain\\b",
    "\\blabor\\b",
    "collective bargaining",
    "working hours",
    "wages?"
  ],
  "Diversity, Equity & Inclusion": [
    "\\bdiversit(y|ies)\\b",
    "inclusion",
    "gender",
    "women in leadership",
    "underrepresented"
  ],
  "Data Security & Privacy": [
    "data\\s+(security|privacy|protection)",
    "cyber",
    "breach",
    "gdpr"
  ],
  "Product Quality & Safety": [
    "product\\s+(safety|quality|recall)"
  ],
  "Business Ethics & Compliance": [
    "\\bethics?\\b",
    "anti[-\\s]?corruption",
    "bribery",
    "compliance",
    "whistleblow"
  ]
}

def classify_topics(text):
    t = text.lower()
    hits = []
    import re
    for topic, patterns in ESG_KEYWORDS.items():
        if any(re.search(p, t) for p in patterns):
            hits.append(topic)
    return hits

CLAIM_PATTERNS = ["\\b(we|the company|our)\\s+(will|plan|aim|target|commit|committed|intend)\\b", "\\bby\\s+20\\d{2}\\b", "\\b(increase|reduce|cut|lower|improve|achieve|reach)\\b", "\\b\\d+(\\.\\d+)?\\s*%", "\\bfrom a \\d{4} baseline\\b", "\\bsc(ope)?\\s*1\\b|\\bsc(ope)?\\s*2\\b|\\bsc(ope)?\\s*3\\b"]

def is_claim(text):
    import re
    t = text.lower()
    return any(re.search(p, t) for p in CLAIM_PATTERNS)

POS_WORDS = set(['leadership', 'achieve', 'reduction', 'renewable', 'reduced', 'delivered', 'met', 'exceeded', 'inclusive', 'improved', 'progress', 'improving', 'exceedance', 'strong', 'efficient', 'reduce', 'improve', 'target', 'exceed', 'robust', 'milestone'])
NEG_WORDS = set(['worsening', 'failure', 'delays', 'risks', 'violate', 'risk', 'negative', 'violation', 'concern', 'incident', 'noncompliance', 'incidents', 'penalty', 'failed', 'worsen', 'problematic', 'breaches', 'penalties', 'concerns', 'breach', 'delay', 'non-compliance'])

def sentiment_score(text):
    import re, math
    tokens = re.findall(r"[a-zA-Z]+", text.lower())
    if not tokens:
        return 0.5
    pos = sum(1 for w in tokens if w in POS_WORDS)
    neg = sum(1 for w in tokens if w in NEG_WORDS)
    raw = pos - neg
    return 1/(1 + math.exp(-raw))

HEDGE_WORDS = set(['could', 'as', 'ambition', 'planning', 'consider', 'might', 'may', 'appropriate', 'seek', 'strive', 'potentially', 'towards', 'work', 'encourage', 'intend', 'explore', 'aim', 'ambitioned', 'plan', 'where', 'feasible'])
METRIC_TERMS = set(['co2', 'kwh', 'tonnes', 'liters', 'gwh', 'co2e', 'm3', 'scope', 'baseline', 'tco2e', 'intensity', 'frequency', 'mwh', 'tons', 'water', 'emissions', 'rate', 'waste', 'energy'])

def specificity_score(text):
    import re
    t = text.lower()
    has_percent = bool(re.search(r"\d+(\.\d+)?\s*%", t))
    has_year    = bool(re.search(r"\b(19|20)\d{2}\b", t))
    has_numbers = bool(re.search(r"\b\d+(\.\d+)?\b", t))
    has_by_year = bool(re.search(r"\bby\s+(19|20)\d{2}\b", t))
    metrics     = sum(1 for m in METRIC_TERMS if m in t)
    hedges      = sum(1 for h in HEDGE_WORDS if h in t)

    base = 0.2
    if has_numbers: base += 0.2
    if has_percent: base += 0.2
    if has_year:    base += 0.15
    if has_by_year: base += 0.15
    base += min(metrics, 5) * 0.03
    base -= min(hedges, 5) * 0.05
    return float(max(0.0, min(1.0, base)))

def hedging_score(text):
    t = text.lower()
    hedges = sum(1 for h in HEDGE_WORDS if h in t)
    return float(min(1.0, hedges / 5.0))

def future_focus_score(text):
    import re
    t = text.lower()
    future_hits = len(re.findall(r"\b(will|aim|plan|target|by\s+20\d{2}|net zero|2030|2050)\b", t))
    past_hits   = len(re.findall(r"\b(achieved|delivered|completed|reached|reduced|improved|decreased)\b", t))
    total = future_hits + past_hits
    if total == 0: 
        return 0.5
    return float(min(1.0, future_hits / total))

def greenwashing_risk(sentiment, specificity, hedging, future_focus):
    w_pos = 0.25; w_low_spec = 0.35; w_hedge = 0.25; w_future = 0.15
    low_spec = 1 - specificity
    return float(max(0.0, min(1.0, w_pos*sentiment + w_low_spec*low_spec + w_hedge*hedging + w_future*future_focus)))

def derive_company_and_sector_from_filename(filename):
    import os, re
    base = os.path.basename(filename)
    m = re.match(r"([A-Za-z0-9&\-\s]+)_(Tech|Energy|Finance|Consumer|Industrial|Healthcare|Utilities|Materials)", base, re.I)
    if m: return (m.group(1).strip(), m.group(2).title())
    name = os.path.splitext(base)[0]
    return (name, "Unknown")

def process_pdf(pdf_path):
    company, sector = derive_company_and_sector_from_filename(pdf_path)
    pages = read_pdf_text(pdf_path)
    rows = []
    for p in pages:
        text = clean_text(p["text"])
        for seg in segment_text(text):
            topics = classify_topics(seg)
            if not topics:
                continue
            iscl = is_claim(seg)
            sent = sentiment_score(seg)
            spec = specificity_score(seg)
            hedge = hedging_score(seg)
            fut = future_focus_score(seg)
            green = greenwashing_risk(sent, spec, hedge, fut)
            rows.append({
                "Company": company, "Sector": sector, "Doc": os.path.basename(pdf_path),
                "Page": p["page"], "Text": seg, "Topics": topics, "IsClaim": iscl,
                "Sentiment": sent, "Specificity": spec, "Hedging": hedge, "FutureFocus": fut, "Greenwash": green
            })
    return rows

def run_pipeline(pdf_paths):
    import pandas as pd
    all_rows = []
    for pdf in pdf_paths:
        if os.path.exists(pdf):
            all_rows.extend(process_pdf(pdf))
    if not all_rows:
        return pd.DataFrame(columns=["Company","Sector","ESG Topic","Extracted Claim","Sentiment Score","Specificity Score","Greenwashing Risk Score","Doc","Page"])
    df = pd.DataFrame(all_rows)
    df_claims = df[(df["IsClaim"] == True) & (df["Topics"].map(lambda t: len(t)>0))].copy()
    if df_claims.empty:
        return pd.DataFrame(columns=["Company","Sector","ESG Topic","Extracted Claim","Sentiment Score","Specificity Score","Greenwashing Risk Score","Doc","Page"])
    df_claims["ESG Topic"] = df_claims["Topics"].map(lambda x: x[0] if x else "")
    out = df_claims.rename(columns={
        "Text":"Extracted Claim", "Sentiment":"Sentiment Score", "Specificity":"Specificity Score", "Greenwash":"Greenwashing Risk Score"
    })[["Company","Sector","ESG Topic","Extracted Claim","Sentiment Score","Specificity Score","Greenwashing Risk Score","Doc","Page"]]
    out = out.drop_duplicates(subset=["Company","Extracted Claim","Page"]).sort_values(by=["Company","ESG Topic","Specificity Score"], ascending=[True,True,False])
    return out

pdfs = sorted([os.path.join(PDFS_DIR, f) for f in os.listdir(PDFS_DIR) if f.lower().endswith(".pdf")])
df_summary = run_pipeline(pdfs)
df_summary.to_csv(os.path.join(OUT_DIR, "cross_company_esg_claim_summary.csv"), index=False)
df_summary.head(20)


Unnamed: 0,Company,Sector,ESG Topic,Extracted Claim,Sentiment Score,Specificity Score,Greenwashing Risk Score,Doc,Page



## Groq-powered Metadata Extraction (Report-level)

This section uses the **Groq Chat Completions API** to produce **structured metadata** for each PDF.
It sends a compact context from the document (first/last pages and high-signal segments) and asks the model to return JSON following a schema.

> **Setup**  
> 1. Create a Groq API key and export it as an environment variable:
> ```bash
> export GROQ_API_KEY=your_key_here
> ```
> 2. (Optional) Install dependencies if you run outside ChatGPT:
> ```bash
> pip install requests pandas PyPDF2
> ```
> 3. Put your PDFs into `pdfs/`, then run this section.


In [4]:

import os, json, re, time, math
from pathlib import Path
from typing import List, Dict, Any
import requests
import pandas as pd




PDFS_DIR = r"B:\mandg\ESG_Deliverables\ESG_Deliverables_with_Summary\data"
OUT_DIR  = r"B:\mandg\ESG_Deliverables\ESG_Deliverables_with_Summary\outputs"

# --- Minimal PDF text extraction (rely on previous section if already imported) ---
try:
    import PyPDF2
    HAS_PYPDF2 = True
except Exception:
    HAS_PYPDF2 = False

def read_pdf_text(pdf_path: str):
    pages = []
    if not HAS_PYPDF2:
        return pages
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for i, page in enumerate(reader.pages):
                try:
                    raw = page.extract_text() or ""
                except Exception:
                    raw = ""
                pages.append({"page": i+1, "text": raw})
    except Exception:
        return []
    return pages

def clean_text(t: str) -> str:
    if not t:
        return ""
    t = re.sub(r"(\w)-\n(\w)", r"\1\2", t)
    t = t.replace("\r", "\n")
    t = re.sub(r"[ \t]+", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()

# --- Compact context builder ---
def compact_context(pages: List[Dict[str,Any]], max_chars=6000) -> str:
    """Take first ~2 pages, last ~1 page, and up to 6 'dense' paragraphs from the middle."""
    if not pages:
        return ""
    txts = []
    # first 2 pages
    for p in pages[:2]:
        txts.append(clean_text(p["text"]))
    # middle dense paragraphs (longest paragraphs heuristic)
    mids = pages[2:-1] if len(pages) > 3 else []
    paras = []
    for p in mids:
        for para in re.split(r"\n\s*\n", clean_text(p["text"])):
            s = para.strip()
            if len(s) > 150:
                paras.append(s)
    paras = sorted(paras, key=lambda s: -len(s))[:6]
    txts.extend(paras)
    # last page
    txts.append(clean_text(pages[-1]["text"]))
    joined = "\n\n".join([t for t in txts if t])
    return joined[:max_chars]

# --- Groq Chat Completions wrapper (OpenAI-compatible) ---
def groq_chat_completion(messages, model="llama-3.1-70b-versatile", temperature=0.1, max_tokens=1000, retries=2, timeout=60):
    api_key = "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"
    if not api_key:
        print("⚠️ GROQ_API_KEY not set; skipping API call.")
        return Noneyyyyy
    
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "response_format": {"type": "json_object"}
    }
    for attempt in range(retries+1):
        try:
            r = requests.post(url, headers=headers, json=payload, timeout=timeout)
            if r.status_code == 200:
                data = r.json()
                content = data["choices"][0]["message"]["content"]
                return content
            else:
                print(f"Groq API status {r.status_code}: {r.text[:200]}")
        except Exception as e:
            print(f"Groq call failed (attempt {attempt+1}/{retries+1}): {e}")
        time.sleep(1 + attempt)
    return None

# --- JSON schema (report-level metadata) ---
SCHEMA = {
    "company_name": "string",
    "report_title": "string",
    "report_year": "string|int",
    "reporting_period": "string",
    "sector_guess": "string",
    "geography": "string",
    "standards_referenced": ["SASB|GRI|TCFD|CDP|UNGC|Other"],
    "scope_coverage": {"scope1": "bool", "scope2": "bool", "scope3": "bool"},
    "material_topics": ["string"],
    "targets": [{"topic": "string", "metric": "string", "baseline": "string", "target": "string", "target_year": "string"}],
    "kpis": [{"name": "string", "value": "string", "unit": "string", "year": "string"}],
    "assurance_audit": "string",
    "risks_highlights": ["string"],
    "notes": "string",
    "_confidence": 0.0
}

SYSTEM_PROMPT = """You are an ESG analyst assistant. Extract *concise, factual* report-level metadata from the provided context. 
Return strictly valid JSON following the provided schema. If uncertain, use empty strings, false, or []. Avoid hallucinating."""

USER_TEMPLATE = """SCHEMA:
{schema}

CONTEXT (excerpted from the PDF; may be partial):
---
{context}
---

Rules:
- Use short strings.
- Keep arrays small (≤8 items each).
- Prefer info explicitly stated in context.
- If the company name or year is ambiguous, leave it blank.
- Return only JSON (no markdown)."""

def extract_metadata_for_pdf(pdf_path: str) -> Dict[str,Any]:
    pages = read_pdf_text(pdf_path)
    ctx = compact_context(pages)
    if not ctx:
        return {"_error": "no_text"}
    user_msg = USER_TEMPLATE.format(schema=json.dumps(SCHEMA, indent=2), context=ctx)
    content = groq_chat_completion([
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_msg}
    ])
    if not content:
        return {"_error": "no_api_or_failed"}
    try:
        data = json.loads(content)
        return data
    except Exception as e:
        # If model returns non-JSON, try to salvage
        try:
            content_stripped = content[content.find("{"):content.rfind("}")+1]
            data = json.loads(content_stripped)
            return data
        except Exception:
            return {"_error": f"bad_json: {str(e)[:120]}", "_raw": content[:500]}

def run_groq_metadata():
    pdfs = sorted([str(p) for p in Path(PDFS_DIR).glob("*.pdf")])
    results = {}
    for p in pdfs:
        print(f"Processing with Groq: {Path(p).name}")
        md = extract_metadata_for_pdf(p)
        results[Path(p).name] = md
    # Save JSON
    out_json = Path(OUT_DIR) / "metadata_groq.json"
    out_json.write_text(json.dumps(results, indent=2))
    # Also normalize to CSV (best-effort flatten)
    rows = []
    for fname, md in results.items():
        base = {"file": fname}
        if isinstance(md, dict):
            for k, v in md.items():
                if isinstance(v, (str, int, float, bool)) or v is None:
                    base[k] = v
            # join list-like fields succinctly
            base["material_topics"] = ", ".join(md.get("material_topics", [])[:8]) if isinstance(md.get("material_topics"), list) else ""
            base["standards_referenced"] = ", ".join(md.get("standards_referenced", [])[:8]) if isinstance(md.get("standards_referenced"), list) else ""
            base["risks_highlights"] = ", ".join(md.get("risks_highlights", [])[:8]) if isinstance(md.get("risks_highlights"), list) else ""
            # summarize targets/kpis counts
            base["targets_count"] = len(md.get("targets", [])) if isinstance(md.get("targets"), list) else 0
            base["kpis_count"] = len(md.get("kpis", [])) if isinstance(md.get("kpis"), list) else 0
        rows.append(base)
    df = pd.DataFrame(rows)
    out_csv = Path(OUT_DIR) / "metadata_groq.csv"
    df.to_csv(out_csv, index=False)
    print("Saved:", out_json, out_csv)
    return results

_ = run_groq_metadata()


Processing with Groq: LSE_PAG_2024_business_esg.pdf
Processing with Groq: Samsung_Electronics_Sustainability_Report_2025.pdf
Processing with Groq: acer-incorporated_2023.pdf
Processing with Groq: deutsche_esg.pdf
Processing with Groq: intel_esg.pdf
Processing with Groq: nippon_esg.pdf
Processing with Groq: tata_steel_esg.pdf
Saved: B:\mandg\data\metadata_groq.json B:\mandg\data\metadata_groq.csv



### Parser robustness + diagnostics

We now try multiple backends in order: **Docling → pdfplumber → PyPDF2 → (optional OCR)**.
A quick diagnostics table will show per-file page and character counts, so you can see if a PDF is image-only.


In [8]:

import re, os, io, warnings

def _clean_text_basic(t: str) -> str:
    if not t: return ""
    t = re.sub(r"(\w)-\n(\w)", r"\1\2", t)
    t = t.replace("\r", "\n")
    t = re.sub(r"[ \t]+", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()

def _try_docling(pdf_path: str):
    pages = []
    try:
        from docling.document_converter import DocumentConverter
        converter = DocumentConverter()
        result = converter.convert(pdf_path)
        doc = result.document
        got = False
        for i, page in enumerate(getattr(doc, "pages", [])):
            txt = ""
            try:
                if hasattr(page, "export_text"):
                    txt = page.export_text() or ""
                elif hasattr(page, "to_text"):
                    txt = page.to_text() or ""
                elif hasattr(page, "content"):
                    try:
                        txt = " ".join([c.text for c in page.content if getattr(c, "text", "")])
                    except Exception:
                        txt = ""
            except Exception:
                txt = ""
            txt = _clean_text_basic(txt or "")
            pages.append({"page": i+1, "text": txt})
            if txt.strip(): got = True
        if got:
            return pages
    except Exception:
        pass
    return []

def _try_pdfplumber(pdf_path: str):
    pages = []
    try:
        import pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            for i, p in enumerate(pdf.pages):
                try:
                    txt = p.extract_text() or ""
                except Exception:
                    txt = ""
                pages.append({"page": i+1, "text": _clean_text_basic(txt)})
        if any(pg["text"] for pg in pages):
            return pages
    except Exception:
        pass
    return []

def _try_pypdf2(pdf_path: str):
    pages = []
    try:
        import PyPDF2
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for i, page in enumerate(reader.pages):
                try:
                    raw = page.extract_text() or ""
                except Exception:
                    raw = ""
                pages.append({"page": i+1, "text": _clean_text_basic(raw)})
        if any(pg["text"] for pg in pages):
            return pages
    except Exception:
        pass
    return []

def _try_ocr(pdf_path: str, max_pages=10):
    """Very slow; only used if pytesseract + pdf2image are installed and no other parser worked."""
    pages = []
    try:
        import pytesseract
        from pdf2image import convert_from_path
    except Exception:
        return []
    try:
        images = convert_from_path(pdf_path, dpi=200)
        for i, img in enumerate(images[:max_pages]):
            try:
                txt = pytesseract.image_to_string(img) or ""
            except Exception:
                txt = ""
            pages.append({"page": i+1, "text": _clean_text_basic(txt)})
        if any(pg["text"] for pg in pages):
            return pages
    except Exception:
        pass
    return []

def read_pdf_text(pdf_path: str):
    # Try in order: docling -> pdfplumber -> PyPDF2 -> OCR
    for fn in (_try_docling, _try_pdfplumber, _try_pypdf2, _try_ocr):
        pages = fn(pdf_path)
        if any(pg.get("text") for pg in pages):
            return pages
    return []

def parse_diagnostics(pdf_dir: str):
    from pathlib import Path
    rows = []
    for p in sorted(Path(pdf_dir).glob("*.pdf")):
        pages = read_pdf_text(str(p))
        total_chars = sum(len(pg.get("text","")) for pg in pages)
        rows.append({"file": p.name, "pages": len(pages), "total_chars": total_chars})
    import pandas as pd
    df = pd.DataFrame(rows)
    display(df)
    return df


PDFS_DIR = r"B:\mandg\ESG_Deliverables\ESG_Deliverables_with_Summary\data"
# OUT_DIR  = r"B:\mandg\ESG_Deliverables\ESG_Deliverables_with_Summary\outputs"

parse_diagnostics(PDFS_DIR);






Unnamed: 0,file,pages,total_chars
0,acer-incorporated_2023.pdf,153,611933
1,deutsche_esg.pdf,737,2766179
2,intel_esg.pdf,492,1224657
3,LSE_PAG_2024_business_esg.pdf,50,137411
4,nippon_esg.pdf,34,307887
5,Samsung_Electronics_Sustainability_Report_2025...,87,380782
6,tata_steel_esg.pdf,62,191166



### Fallback behavior (no claims found)

If a PDF has text but no explicit *claim* matches, we will still output the **top topic segments** by specificity
so the summary table is not empty and you have material to review.


In [9]:

import os, re, math, pandas as pd, numpy as np
from pathlib import Path

# Reuse classify_topics, sentiment_score, specificity_score, etc. from earlier cells.
# If they're not in memory (fresh kernel), you may need to re-run the earlier pipeline cells first.

def _collect_segments(pdf_path: str):
    pages = read_pdf_text(pdf_path)
    segs = []
    for p in pages:
        text = p["text"]
        # simple segmenter: split paragraphs
        paras = re.split(r"\n\s*\n", text) if text else []
        for para in paras:
            s = para.strip()
            if not s:
                continue
            segs.append((p["page"], s))
    return segs

def run_pipeline_with_fallback(pdf_dir: str, out_csv: str):
    pdfs = [str(p) for p in Path(pdf_dir).glob("*.pdf")]
    rows = []
    for pdf in pdfs:
        company, sector = ("Unknown", "Unknown")
        try:
            from os.path import basename, splitext
            base = basename(pdf)
            name = splitext(base)[0]
            company = name
        except Exception:
            pass

        # gather segments
        segs = _collect_segments(pdf)
        found_claim = False
        topic_rows = []

        for page, seg in segs:
            topics = classify_topics(seg) if 'classify_topics' in globals() else []
            if not topics:
                continue
            sent = sentiment_score(seg) if 'sentiment_score' in globals() else 0.5
            spec = specificity_score(seg) if 'specificity_score' in globals() else 0.2
            iscl = is_claim(seg) if 'is_claim' in globals() else False
            green = greenwashing_risk(sent, spec, hedging_score(seg), future_focus_score(seg)) if 'greenwashing_risk' in globals() else 0.5

            if iscl:
                found_claim = True
                rows.append({
                    "Company": company, "Sector": sector, "ESG Topic": topics[0],
                    "Extracted Claim": seg[:600],
                    "Sentiment Score": round(sent,3), "Specificity Score": round(spec,3),
                    "Greenwashing Risk Score": round(green,3),
                    "Doc": os.path.basename(pdf), "Page": page
                })
            else:
                topic_rows.append({
                    "Company": company, "Sector": sector, "ESG Topic": topics[0],
                    "Extracted Claim": seg[:600],
                    "Sentiment Score": round(sent,3), "Specificity Score": round(spec,3),
                    "Greenwashing Risk Score": round(green,3),
                    "Doc": os.path.basename(pdf), "Page": page
                }) 

        if not found_claim and topic_rows:
            # take top N by specificity as fallback surface
            topic_rows.sort(key=lambda r: (-r["Specificity Score"], r["ESG Topic"]))
            rows.extend(topic_rows[:10])

    df = pd.DataFrame(rows).drop_duplicates(subset=["Company","Extracted Claim","Page"])
    if not df.empty:
        df = df.sort_values(by=["Company","ESG Topic","Specificity Score"], ascending=[True, True, False])
    df.to_csv(out_csv, index=False)
    display(df.head(20))
    return df

OUT_CSV = r"B:\mandg\ESG_Deliverables\ESG_Deliverables_with_Summary\outputs\cross_company_esg_claim_summary.csv"
_ = run_pipeline_with_fallback(PDFS_DIR, OUT_CSV)
print("Wrote:", OUT_CSV)






Unnamed: 0,Company,Sector,ESG Topic,Extracted Claim,Sentiment Score,Specificity Score,Greenwashing Risk Score,Doc,Page
609,LSE_PAG_2024_business_esg,Unknown,Biodiversity & Land Use,Home Introduction About Paragon Environment So...,0.269,0.35,0.645,LSE_PAG_2024_business_esg.pdf,12
632,LSE_PAG_2024_business_esg,Unknown,Business Ethics & Compliance,Home Introduction About Paragon Environment So...,0.007,0.53,0.545,LSE_PAG_2024_business_esg.pdf,47
633,LSE_PAG_2024_business_esg,Unknown,Data Security & Privacy,Home Introduction About Paragon Environment So...,0.0,0.53,0.565,LSE_PAG_2024_business_esg.pdf,48
630,LSE_PAG_2024_business_esg,Unknown,"Diversity, Equity & Inclusion",Home Introduction About Paragon Environment So...,0.998,0.75,0.637,LSE_PAG_2024_business_esg.pdf,40
629,LSE_PAG_2024_business_esg,Unknown,"Diversity, Equity & Inclusion",Home Introduction About Paragon Environment So...,0.998,0.65,0.622,LSE_PAG_2024_business_esg.pdf,39
626,LSE_PAG_2024_business_esg,Unknown,"Diversity, Equity & Inclusion",Home Introduction About Paragon Environment So...,0.881,0.63,0.65,LSE_PAG_2024_business_esg.pdf,33
628,LSE_PAG_2024_business_esg,Unknown,"Diversity, Equity & Inclusion",Home Introduction About Paragon Environment So...,0.993,0.53,0.713,LSE_PAG_2024_business_esg.pdf,38
618,LSE_PAG_2024_business_esg,Unknown,GHG Emissions,Home Introduction About Paragon Environment So...,0.731,0.97,0.363,LSE_PAG_2024_business_esg.pdf,21
613,LSE_PAG_2024_business_esg,Unknown,GHG Emissions,Home Introduction About Paragon Environment So...,0.881,0.94,0.441,LSE_PAG_2024_business_esg.pdf,16
611,LSE_PAG_2024_business_esg,Unknown,GHG Emissions,Home Introduction About Paragon Environment So...,0.999,0.9,0.575,LSE_PAG_2024_business_esg.pdf,14


Wrote: B:\mandg\data\cross_company_esg_claim_summary.csv



## BRSR / SEBI-Core Flow (Structure-aware parsing for Indian CSR/ESG reports)

This step adds structure-aware rules for **Business Responsibility and Sustainability Report (BRSR)** style PDFs.
It extracts company/FY, **assurance provider**, reporting boundary, plants/offices, export %, key employee totals,
POSH complaints, **material issues**, and standards referenced.  
Outputs: `out/brsr_metadata.json` and `out/brsr_metadata.csv`.


In [11]:

import os, re, json
from pathlib import Path
from typing import Dict, Any, List
import pandas as pd

PDFS_DIR = r"B:\mandg\ESG_Deliverables\ESG_Deliverables_with_Summary\data"
OUT_DIR  = r"B:\mandg\ESG_Deliverables\ESG_Deliverables_with_Summary\outputs"

try:
    read_pdf_text
except NameError:
    try:
        import PyPDF2
        def read_pdf_text(pdf_path: str):
            pages = []
            with open(pdf_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                for i, page in enumerate(reader.pages):
                    try:
                        raw = page.extract_text() or ""
                    except Exception:
                        raw = ""
                    pages.append({"page": i+1, "text": raw})
            return pages
    except Exception:
        def read_pdf_text(pdf_path: str): return []

def _join(pages: List[Dict[str, Any]]) -> str:
    return "\n\n".join([p.get("text","") for p in pages])

def _search(pattern, text, flags=re.IGNORECASE|re.MULTILINE|re.DOTALL, group=1, default=""):
    m = re.search(pattern, text, flags)
    if not m: return default
    try:
        return m.group(group).strip()
    except Exception:
        return default

def extract_brsr_structured(pdf_path: str) -> Dict[str, Any]:
    pages = read_pdf_text(pdf_path)
    txt = _join(pages)

    data: Dict[str, Any] = {
        "file": os.path.basename(pdf_path),
        "company_name": "",
        "report_title": "",
        "financial_year": "",
        "assurance_provider": "",
        "assurance_type": "",
        "plants_india": "",
        "offices_india": "",
        "plants_outside_india": "",
        "offices_outside_india": "",
        "exports_pct_standalone": "",
        "employees_total_consolidated": "",
        "employees_perm_consolidated": "",
        "workers_perm_consolidated": "",
        "complaints_posh_total_standalone": "",
        "standards_referenced": [],
        "material_issues": [],
        "policies_weblink": ""
    }

    data["report_title"] = _search(r"(?i)(BUSINESS RESPONSIBILITY AND SUSTAINABILITY REPORT)", txt, group=1, default="").title()
    data["financial_year"] = _search(r"(?i)Financial Year\s+([0-9\-–/ ]{7,})", txt, group=1, default="")
    data["company_name"] = _search(r"(?i)\bName of the Listed Entity\s+([^\n]+)", txt)

    data["assurance_provider"] = _search(r"(?i)Name of Assurance Provider\s+.*?\n(.+?)\n", txt)
    data["assurance_type"] = _search(r"(?i)Type of Assurance Obtained\s+(.+?)\n", txt)

    data["policies_weblink"] = _search(r"(?i)Web-Link of the\s+Policies.*?Link:\s*(https?://\S+)", txt)

    plants_block = _search(r"(?i)Number of locations where plants.*?\n+Location.*?\n+(.*?)(?:\n{2,}|\Z)", txt, group=1, default="")
    if plants_block:
        def _get_num(label):
            m = re.search(rf"{label}\s+(\d+)\s+(\d+)\s+(\d+)", plants_block, re.IGNORECASE)
            return m.groups() if m else ("","","")
        ind = _get_num("India")
        out = _get_num("Outside India")
        if ind != ("","",""):
            data["plants_india"], data["offices_india"], _total = ind
        if out != ("","",""):
            data["plants_outside_india"], data["offices_outside_india"], _tot2 = out

    data["exports_pct_standalone"] = _search(r"(?i)% of exports in total revenue\s+(\d+)\s", txt)

    data["employees_perm_consolidated"] = _search(r"(?i)Permanent\s*\(E\)\s+(\d{2,})\s", txt)
    data["workers_perm_consolidated"] = _search(r"(?i)Workers.*?Permanent\s*\(G\)\s+(\d{2,})\s", txt)
    data["employees_total_consolidated"] = _search(r"(?i)Total Employees\s*\(E\+\s*F\)\s+(\d{2,})", txt)

    data["complaints_posh_total_standalone"] = _search(r"(?i)Total Complaints reported.*?Standalone.*?\n.*?\n.*?\n.*?\n.*?\n?(\d+)\s", txt)

    std_hits = set()
    for tag in ["SASB", "GRI", "TCFD", "CDP", "UNGC", "ResponsibleSteel"]:
        if re.search(rf"\b{tag}\b", txt, re.IGNORECASE):
            std_hits.add(tag)
    data["standards_referenced"] = sorted(std_hits)

    mat_block = _search(r"(?i)Material issues identified(.*?)(?:SECTION B|Governance, Leadership)", txt, group=1, default="")
    if not mat_block:
        mat_block = _search(r"(?i)Material issues.*?(A\.\s*STRATEGIC.*?)(?:\n\d+\.\s|SECTION B)", txt, group=1, default="")
    issues = []
    for m in re.finditer(r"(?m)^[ABC]\d+\.\s+([^\n]+)", mat_block):
        issues.append(m.group(1).strip())
    data["material_issues"] = issues[:12]

    return data

def run_brsr_batch(pdf_dir=PDFS_DIR):
    pdfs = sorted([str(p) for p in Path(pdf_dir).glob("*.pdf")])
    results = [extract_brsr_structured(p) for p in pdfs]
    out_json = Path(OUT_DIR) / "brsr_metadata.json"
    out_json.write_text(json.dumps(results, indent=2))
    rows = []
    for d in results:
        row = {k: (", ".join(v) if isinstance(v, list) else v) for k, v in d.items() if k != "material_issues"}
        row["material_issues"] = "; ".join(d.get("material_issues", []))
        rows.append(row)
    df = pd.DataFrame(rows)
    out_csv = Path(OUT_DIR) / "brsr_metadata_1.csv"
    df.to_csv(out_csv, index=False)
    display(df.head(20))
    print("Wrote:", out_json, out_csv)
    return df

df_brsr = run_brsr_batch()






Unnamed: 0,file,company_name,report_title,financial_year,assurance_provider,assurance_type,plants_india,offices_india,plants_outside_india,offices_outside_india,exports_pct_standalone,employees_total_consolidated,employees_perm_consolidated,workers_perm_consolidated,complaints_posh_total_standalone,standards_referenced,policies_weblink,material_issues
0,LSE_PAG_2024_business_esg.pdf,,,,,,,,,,,,,,,"CDP, TCFD",,
1,Samsung_Electronics_Sustainability_Report_2025...,,,,,,,,,,,,,,,"CDP, GRI, SASB, TCFD",,
2,acer-incorporated_2023.pdf,,,,,,,,,,,,,,,"CDP, GRI, SASB, TCFD",,
3,deutsche_esg.pdf,,,0 0 8 32 40,,,,,,,,,,,,"CDP, GRI, TCFD",,
4,intel_esg.pdf,,,,,,,,,,,,,,,"CDP, GRI, SASB, TCFD",,
5,nippon_esg.pdf,,,,,,,,,,,,,,,"GRI, TCFD",,
6,tata_steel_esg.pdf,Tata Steel Limited,Business Responsibility And Sustainability Report,2024-25,assurance on BRSR Core indicators and select i...,PW & Co CA LLP has undertaken reasonable assur...,76.0,143.0,39.0,20.0,6.0,75.0,,,2014.0,"ResponsibleSteel, TCFD",,Greenhouse Gas Emissions and Climate Change Ma...


Wrote: B:\mandg\data\brsr_metadata.json B:\mandg\data\brsr_metadata_1.csv


In [3]:
"""
Post-processing & Global Summarization for ESG Claim Analysis

This module expects a DataFrame with at least the following columns:
- Company (str)
- Sector (str)
- ESG Topic (str)
- Extracted Claim (str)
- Sentiment Score (float, -1..1 or 0..1)
- Specificity Score (float, 0..1)
- Greenwashing Risk Score (Optional) (float, 0..1)

Outputs:
- outputs/cross_company_summary.csv
- outputs/executive_summary.md
- outputs/topic_briefs.md
"""

from __future__ import annotations
import os
import math
import re
import pandas as pd
from collections import defaultdict, Counter
from typing import List, Dict

def _safe_mean(series):
    s = pd.to_numeric(series, errors="coerce").dropna()
    return float(s.mean()) if len(s) else float("nan")

def _normalize_topic(t: str) -> str:
    if not isinstance(t, str):
        return "Unknown"
    return re.sub(r"\s+", " ", t.strip()).title()

def _top_n_strings(strings: List[str], n=5, min_len=30):
    seen = set()
    uniq = []
    for s in strings:
        if not isinstance(s, str):
            continue
        s_clean = s.strip()
        if len(s_clean) < min_len:
            continue
        if s_clean in seen:
            continue
        seen.add(s_clean)
        uniq.append(s_clean)
    return uniq[:n]

def aggregate(df: pd.DataFrame) -> pd.DataFrame:
    """Compute cross-company aggregates by ESG Topic and Sector."""
    if "ESG Topic" not in df.columns:
        raise ValueError("Expected 'ESG Topic' column in DataFrame")
    tmp = df.copy()
    tmp["ESG Topic"] = tmp["ESG Topic"].map(_normalize_topic)

    group_cols = ["ESG Topic", "Sector"]
    agg = tmp.groupby(group_cols).agg(
        num_claims=("Extracted Claim", "count"),
        num_companies=("Company", lambda s: s.nunique()),
        avg_sentiment=("Sentiment Score", _safe_mean),
        avg_specificity=("Specificity Score", _safe_mean),
        avg_greenwash=("Greenwashing Risk Score (Optional)", _safe_mean) if "Greenwashing Risk Score (Optional)" in tmp.columns else ("Sentiment Score", lambda s: float("nan")),
    ).reset_index()

    # topic coverage (#companies per topic)
    coverage = tmp.groupby("ESG Topic")["Company"].nunique().rename("companies_per_topic").reset_index()

    # save aggregates
    out_path = os.path.join("outputs", "cross_company_summary.csv")
    os.makedirs("outputs", exist_ok=True)
    agg.to_csv(out_path, index=False)
    return agg.merge(coverage, on="ESG Topic", how="left")

def _quantize(series, q=(0.25, 0.75)):
    s = pd.to_numeric(series, errors="coerce").dropna()
    if len(s) < 3:
        return (None, None)
    return (float(s.quantile(q[0])), float(s.quantile(q[1])))

def _mk_bullets(items: List[str]) -> str:
    return "".join([f"- {it}\n" for it in items])

def _summarize_findings(agg: pd.DataFrame, df: pd.DataFrame) -> str:
    """Create a compact executive narrative with data-backed highlights."""
    low_q, high_q = _quantize(agg["avg_specificity"])
    low_s, high_s = _quantize(agg["avg_sentiment"])

    strengths = agg.sort_values(["avg_specificity","avg_sentiment"], ascending=[False, False])\
                   .head(5)["ESG Topic"].tolist()
    risks = agg.sort_values(["avg_specificity","avg_sentiment"], ascending=[True, True])\
               .head(5)["ESG Topic"].tolist()

    # Topic breadth
    topic_breadth = agg[["ESG Topic","companies_per_topic"]]\
        .sort_values("companies_per_topic", ascending=False).head(5)
    breadth_lines = [f"{r['ESG Topic']} ({int(r['companies_per_topic'])} companies)"
                     for _, r in topic_breadth.iterrows()]

    # Claim exemplars: pick a few representative claims with high specificity and polarity extremes
    pos_examples = df.sort_values("Sentiment Score", ascending=False)\
                     .dropna(subset=["Extracted Claim"])\
                     .groupby("ESG Topic")["Extracted Claim"].apply(lambda s: _top_n_strings(list(s), n=1))\
                     .explode().dropna().head(5).tolist()

    neg_examples = df.sort_values("Sentiment Score", ascending=True)\
                     .dropna(subset=["Extracted Claim"])\
                     .groupby("ESG Topic")["Extracted Claim"].apply(lambda s: _top_n_strings(list(s), n=1))\
                     .explode().dropna().head(5).tolist()

    lines = []
    lines.append("## Executive Summary")
    lines.append("This section synthesizes all parsed PDFs to highlight cross-company patterns and notable claims.")
    lines.append("")
    lines.append("### Where companies look strongest (high specificity & positive sentiment)")
    lines.append(_mk_bullets(strengths) if strengths else "- (insufficient data)\n")
    lines.append("")
    lines.append("### Potential risk/greenwashing hotspots (low specificity & positive tone or ambiguity)")
    lines.append(_mk_bullets(risks) if risks else "- (insufficient data)\n")
    lines.append("")
    lines.append("### Most commonly covered topics (by # of companies)")
    lines.append(_mk_bullets(breadth_lines) if breadth_lines else "- (insufficient data)\n")
    lines.append("")
    if pos_examples:
        lines.append("### Representative positive claims")
        lines.append(_mk_bullets(pos_examples))
        lines.append("")
    if neg_examples:
        lines.append("### Representative challenges/risks")
        lines.append(_mk_bullets(neg_examples))
        lines.append("")

    return "\n".join(lines).strip() + "\n"

def _topic_briefs(agg: pd.DataFrame) -> str:
    """Create concise briefs per topic with directional guidance."""
    lines = ["# Topic Briefs", ""]
    for _, r in agg.sort_values("ESG Topic").iterrows():
        topic = r["ESG Topic"]
        lines.append(f"## {topic}")
        lines.append(f"- Companies covering: **{int(r['companies_per_topic']) if not pd.isna(r['companies_per_topic']) else 0}**")
        lines.append(f"- Avg. Sentiment: **{r['avg_sentiment']:.2f}** | Avg. Specificity: **{r['avg_specificity']:.2f}**")
        if not pd.isna(r.get("avg_greenwash", float('nan'))):
            try:
                lines.append(f"- Avg. Greenwashing Risk (opt): **{r['avg_greenwash']:.2f}**")
            except Exception:
                pass
        lines.append("")
    return "\n".join(lines).strip() + "\n"

def run_global_summary(input_csv: str, output_dir: str = "outputs") -> Dict[str, str]:
    """
    Execute full post-processing:
    1) Load the per-claim CSV
    2) Aggregate across companies
    3) Write cross_company_summary.csv, executive_summary.md, topic_briefs.md
    Returns dict of generated paths.
    """
    os.makedirs(output_dir, exist_ok=True)
    df = pd.read_csv(input_csv)
    agg = aggregate(df)
    agg_path = os.path.join(output_dir, "cross_company_summary.csv")
    agg.to_csv(agg_path, index=False)

    exec_md = _summarize_findings(agg, df)
    exec_path = os.path.join(output_dir, "executive_summary.md")
    with open(exec_path, "w", encoding="utf-8") as f:
        f.write(exec_md)

    briefs_md = _topic_briefs(agg)
    briefs_path = os.path.join(output_dir, "topic_briefs.md")
    with open(briefs_path, "w", encoding="utf-8") as f:
        f.write(briefs_md)

    return {"aggregates": agg_path, "executive_summary": exec_path, "topic_briefs": briefs_path}


In [2]:
# from src.postprocess_summary import run_global_summary

# Replace with your pipeline’s claims table path:
INPUT_CSV = "B:\mandg\ESG_Deliverables\ESG_Deliverables_with_Summary\outputs\final_claims_table_1.csv"

paths = run_global_summary(INPUT_CSV, output_dir="outputs")
paths  # shows generated file paths


ModuleNotFoundError: No module named 'src'