# Green Terms Communication Analysis

## Overview
This module identifies and quantifies environmental terminology usage in sustainability reports, supporting the Green Communication Intensity dimension. It detects green terms across multiple categories including basic nouns ("sustainability", "renewable"), context-dependent terms ("reduced emissions"), and multi-word phrases ("low carbon economy").

## Term Detection Strategy  
- **Systematic overlap resolution**: Longer terms are processed first to prevent double-counting (e.g., "renewable energy generation" is matched before "renewable")
- **Negation filtering**: Excludes green terms in negative contexts ("lack of renewable energy investments")
- **Context-dependent classification**: Neutral words only count when paired with positive modifiers ("captured CO2", "improved efficiency")
- **Part-of-speech (POS)-aware extraction**: Handles nouns, adjectives, verbs, adverbs, and multi-word combinations

## Key Processing Steps
1. **Term identification**: Matches lemmatized tokens against comprehensive green vocabulary lists
2. **Overlap resolution**: Prevents double-counting through systematic precedence rules
3. **Negation detection**: Uses dependency parsing to identify negated green terms
4. **Frequency calculation**: (green words ÷ total content words) × 100, excluding stopwords and punctuation
5. **Vocabulary diversity**: Tracks unique environmental terms relative to total unique words

## Variables Produced for Communication Scoring
According to the analysis framework:
- **Green Term Frequency** → Green Communication Intensity dimension
- **Vocabulary Diversity** → Green Communication Intensity dimension

These variables measure environmental content coverage across climate, energy, and sustainability topics, providing quantitative assessment of how frequently companies discuss environmental themes relative to other content.

## Validation Features
Comprehensive term breakdown by POS type, negation analysis, and context verification ensure accurate green term identification across varied reporting styles and linguistic structures.

In [None]:
import spacy
from spacy_layout import spaCyLayout
from pathlib import Path
import pandas as pd
import numpy as np
import re
from collections import defaultdict, Counter

# Load the large English spaCy pipeline once; later cells reuse this `nlp` object.
nlp = spacy.load("en_core_web_lg")
# Raise spaCy's maximum input length so long reports can be parsed in a single call.
nlp.max_length = 1_500_000

In [None]:
from pathlib import Path

# Run mode: "test" selects a two-report sample, "actual" the full report set
MODE = "actual"

# Reject an unknown mode before any configuration is built
if MODE not in ("test", "actual"):
    raise ValueError("Invalid MODE. Use 'test' or 'actual'.")

# Each mode defines the report names and one input folder per reporting year
if MODE == "test":
    report_names = ["Axpo_Holding_AG", "NEOEN_SA"]
    folders = {
        year: Path("data/NLP/Testing/Reports/Clean") / year
        for year in ("2021", "2022")
    }
else:
    report_names = [
        "Akenerji_Elektrik_Uretim_AS",
        "Arendals_Fossekompani_ASA",
        "Atlantica_Sustainable_Infrastructure_PLC",
        "CEZ",
        "EDF",
        "EDP_Energias_de_Portugal_SA",
        "Endesa",
        "ERG_SpA",
        "Orsted",
        "Polska_Grupa_Energetyczna_PGE_SA",
        "Romande_Energie_Holding_SA",
        "Scatec_ASA",
        "Solaria_Energia_y_Medio_Ambiente_SA",
        "Terna_Energy_SA",
    ]
    folders = {
        year: Path("data/NLP/Reports/Cleanest") / year
        for year in ("2021", "2022")
    }

# Report, per file, whether the cleaned text exists in each year's folder
for name in report_names:
    file_name = f"{name}.txt"
    availability = " | ".join(
        f"{year}: {'YES' if (folder / file_name).exists() else 'NO'}"
        for year, folder in folders.items()
    )
    print(f"{file_name}: {availability}")


In [None]:
# Parsed spaCy documents, keyed by "<report name>_<year>"
documents = {}

# Parse every configured report for every year; a failure is printed and
# that file is skipped so the remaining reports still load
for version, folder_path in folders.items():
    for name in report_names:
        txt_path = folder_path / f"{name}.txt"
        try:
            text = txt_path.read_text(encoding="utf-8")
            doc_key = f"{name}_{version}"
            documents[doc_key] = nlp(text)
            print(f"Processed {doc_key}")
        except Exception as e:
            print(f"Error processing {txt_path.name}: {e}")

print(f"\nTotal documents loaded: {len(documents)}")

## Green word frequency

In [None]:
# Single-word green/sustainable nouns, written as lowercase lemmas (emissions-focused vocabulary).
# NOTE(review): the code that matches against this list is outside this section; presumably
# entries are compared with token.lemma_.lower(), as is done for neutral_nouns - confirm.
green_nouns = [
    "adaptation", "afforestation", "biodiversity", "biofuel", "biogas", "biomass", 
    "ccs", "ccus", "cogeneration", "decarbonisation", "decarbonization", "ecology", 
    "ecosystem", "electrification", "environment", "ess", "geothermal", "hydropower", 
    "improvement", "innovation", "mitigation", "optimization", "photovoltaic", 
    "preservation", "pv", "recycling", "reforestation", "regeneration", "renewable", 
    "renewables", "responsibility", "restoration", "solar", "sustainability", 
    "transition", "transparency", "wind"
]

# Multi-word green nouns (lemmas) - emissions-focused vocabulary.
# Layout: base word -> list of modifier words. Several bases are listed in both
# singular and plural form with identical modifier lists.
# NOTE(review): the consumer is not visible in this section; presumably a modifier
# followed by its base counts as one green phrase, the way find_context_dependent_terms
# pairs neutral_multiword_nouns - confirm.
green_multiword_nouns = {
    "abatement": ["carbon", "co2", "co2e", "emission", "ghg", "pollution"], 
    "bond": ["climate", "green", "sustainability"], 
    "bonds": ["climate", "green", "sustainability"], 
    "capture": ["carbon", "co2", "ghg", "methane"],
    "development": ["clean", "renewable", "sustainable"],
    "economy": ["circular", "green", "hydrogen", "sustainable"],
    "energy": ["alternative", "clean", "geothermal", "hydro", "renewable", "solar", "tidal", "wind"],
    "farm": ["offshore", "solar", "wind"],
    "farms": ["offshore", "solar", "wind"],
    "finance": ["climate", "green", "sustainable"],
    "financing": ["climate", "green", "sustainable"],
    "fuel": ["alternative", "bio", "clean", "hydrogen", "synthetic"],
    "fuels": ["alternative", "bio", "clean", "hydrogen", "synthetic"], 
    "fund": ["climate", "green", "sustainability"], 
    "funds": ["climate", "green", "sustainability"], 
    "generation": ["clean", "renewable"],
    "goal": ["carbon", "climate", "emission"], 
    "goals": ["carbon", "climate", "emission"],
    "growth": ["clean", "climate", "green", "renewable", "sustainable"],
    "hydrogen": ["blue", "clean", "green", "renewable"],
    "investment": ["clean", "climate", "green", "renewable", "sustainable"],
    "investments": ["clean", "climate", "green", "renewable", "sustainable"],
    "management": ["carbon", "energy", "environmental", "waste"],
    "neutral": ["carbon", "climate", "co2", "emission"],
    "neutrality": ["carbon", "climate", "co2", "emission"],
    "panel": ["photovoltaic", "pv", "solar"],
    "panels": ["photovoltaic", "pv", "solar"],
    "plant": ["biomass", "geothermal", "hydro", "renewable", "solar", "wind"],
    "plants": ["biomass", "geothermal", "hydro", "renewable", "solar", "wind"],
    "power": ["clean", "geothermal", "hydro", "renewable", "solar", "tidal", "wind"],
    "program": ["conservation", "efficiency", "renewable", "retrofit"],
    "project": ["clean", "efficiency", "green", "renewable"],
    "reduction": ["carbon", "co2", "emission", "ghg", "waste"],
    "reductions": ["carbon", "co2", "emission", "ghg", "waste"],
    "sequestration": ["carbon", "co2", "ghg"], 
    "solution": ["clean", "climate", "green", "renewable"],
    "solutions": ["clean", "climate", "green", "renewable"],
    "source": ["clean", "geothermal", "green", "hydro", "renewable", "solar", "wind"], 
    "sources": ["clean", "geothermal", "green", "hydro", "renewable", "solar", "wind"],
    "standard": ["efficiency", "environmental", "green", "performance"],
    "standards": ["efficiency", "environmental", "green", "performance"],
    "station": ["clean", "geothermal", "green", "hydro", "renewable", "solar", "wind"],
    "stations": ["clean", "geothermal", "green", "hydro", "renewable", "solar", "wind"],
    "storage": ["carbon", "co2"],
    "technology": ["carbon", "clean", "efficiency", "green", "renewable"],
    "technologies": ["carbon", "clean", "efficiency", "green", "renewable"],
    "transition": ["climate", "energy", "green"],
    "turbine": ["offshore", "onshore", "wind"],
    "turbines": ["offshore", "onshore", "wind"],
    "zero": ["carbon", "climate", "emission", "footprint", "ghg", "net", "pollution", "waste"]
}

# Single-word green adjectives (lemmas) - emissions-focused vocabulary.
# The list also carries verb-like and participle forms ("recycle", "recycled") and two
# adverbs ("environmentally", "sustainably") that appear in green_adverbs as well.
# NOTE(review): presumably this covers tokens the parser tags as ADJ regardless of
# form; the consumer is not visible here - confirm.
green_adjectives = [
    "circular", "clean", "decarbonise", "decarbonised", "decarbonising", "decarbonize", 
    "decarbonized", "decarbonizing", "durable", "ecological", "ecosystemic", "efficient", 
    "enriching", "environmental", "environmentally", "green", "hydroelectric", "innovative",
    "optimal", "proenvironmental", "recover", "recoverable", "recovered", "recyclable", 
    "recycle", "recycled", "recycling", "reforested", "refurbish", "refurbished", 
    "regenerable", "regenerate", "regenerated", "renewable", "renewables", "responsible", 
    "restore", "restored", "reusable", "reuse", "sustainable", "sustainably"
]

# Multi-word green adjectives (lemmas) - emissions-focused vocabulary.
# Layout: key word -> list of partner words that form a green adjective with it.
# Partners are not all on the same side of the key: under "carbon", "low" would
# precede it ("low carbon") while "free" would follow it ("carbon free").
# NOTE(review): how word order is handled is decided by the consumer, which is not
# visible in this section - confirm.
green_multiword_adjectives = {
    "based": ["biomass", "nature", "plant", "renewable"], 
    "bio": ["energy", "fuel", "gas", "mass"],
    "biomass": ["fired", "fueled", "powered"],
    "carbon": ["captured", "capturing", "free", "low", "lower", "negative", "neutral", "non", "sequestered", "zero"],
    "ccs": ["enabled", "equipped", "ready"], 
    "efficient": ["eco", "energy", "fuel", "high", "resource"],
    "efficiency": ["eco", "energy", "fuel", "high", "resource"],
    "electric": ["all", "geothermal", "hydro", "solar", "tidal", "wind"],
    "emission": ["free", "low", "zero"], 
    "emissions": ["free", "low", "zero"], 
    "emitting": ["non", "zero"],
    "energy": ["alternative", "clean", "efficient", "environmental", "renewable", "saved"], 
    "energies": ["alternative", "clean", "efficient", "environmental", "renewable", "saved"],
    "environmental": ["certified", "energy", "management", "responsible"],
    "free": ["carbon", "coal", "co2", "emission", "emissions", "fossil", "waste"],
    "friendly": ["climate", "eco", "environment", "environmental", "planet"], 
    "friendlier": ["carbon", "climate", "eco", "environment", "environmental", "planet"],
    "gas": ["bio", "renewable"],
    "intensity": ["low", "reduced"], 
    "mitigating": ["key"],
    "natural": ["protected"],
    "negative": ["carbon", "co2", "emission", "emissions"],
    "neutral": ["carbon", "climate", "co2", "emission"],
    "oriented": ["climate", "ecosystem", "sustainability"],
    "planting": ["forest", "tree"],
    "pollutant": ["anti", "controlling", "low", "preventing", "reduced", "zero"],
    "pollution": ["anti", "controlling", "low", "preventing", "reduced", "zero"],
    "production": ["clean", "green", "renewable", "responsible", "sustainable"],
    "proof": ["climate", "future"],
    "protected": ["environmental", "natural"],
    "reducing": ["carbon", "emission", "ghg", "pollution"],
    "related": ["climate", "environment", "sustainability"],
    "resilient": ["climate", "environment"],
    "responsible": ["climate", "eco", "environmental"],
    "saving": ["carbon", "energy", "fuel", "resource"],
    "sustainable": ["certified", "climate"],
    "zero": ["carbon", "emission", "net"]
}

# Single-word green verbs - emissions-focused vocabulary.
# Each verb is listed in its base form and its -ing form; both British and
# American spellings are included where they differ.
# NOTE(review): consumer not visible in this section; presumably matched on lowercase lemma.
green_verbs = [
    "afforeste", "afforesting", "conserve", "conserving", "decarbonize", "decarbonizing", 
    "decarbonise", "decarbonising", "electrify", "electrifying", "innovate", "innovating",
    "minimize", "minimizing", "minimise", "minimising", "mitigate", "mitigating", 
    "optimize", "optimizing", "optimise", "optimising", "preserve", "preserving", 
    "recover", "recovering", "recycle", "recycling", "remediate", "remediating", 
    "reforest", "reforesting", "regenerate", "regenerating", "restore", "restoring", 
    "reuse", "reusing", "transition", "transitioning", "upgrade", "upgrading"
]

# Single-word green adverbs (lemmas) - emissions-focused vocabulary.
# NOTE(review): consumer not visible in this section; presumably matched on lowercase lemma.
green_adverbs = [
    "cleanly", "consciously", "ecologically", "efficiently", "environmentally", 
    "optimally", "renewably", "responsibly", "sustainably"
]

# Multi-word green adverbial/participial phrases (lemmas) - emissions-focused vocabulary.
# Layout: key word -> list of partner words, e.g. "managed" with "sustainably".
# Some keys ("based", "efficient", "friendly", "neutral", "oriented", "responsible")
# also exist in green_multiword_adjectives with overlapping partner lists.
# NOTE(review): how the consumer avoids counting such overlaps twice is not visible here - confirm.
green_multiword_adverbs = {
    "aware": ["carbon", "climate", "eco", "environmentally"],
    "based": ["nature", "plant", "renewable", "sustainability"], 
    "compliant": ["climate", "emission", "environmentally"],
    "compatible": ["climate", "eco", "environmentally"],
    "conscious": ["carbon", "climate", "eco", "environmentally"],
    "designed": ["efficiently", "environmentally", "sustainably"],
    "driven": ["climate", "renewable", "sustainability"],
    "efficient": ["carbon", "energy", "fuel", "resource"],
    "focused": ["climate", "emission", "environmental", "sustainability"],
    "friendly": ["carbon", "climate", "co2", "eco", "environmentally"],
    "managed": ["environmentally", "responsibly", "sustainably"],
    "neutral": ["carbon", "climate", "emission"],
    "operated": ["cleanly", "efficiently", "sustainably"],
    "oriented": ["climate", "environment", "renewable", "sustainability"],
    "produced": ["cleanly", "renewably", "sustainably"],
    "responsible": ["carbon", "climate", "environmentally"],
    "safe": ["climate", "eco", "environmentally"],
    "sensitive": ["carbon", "climate", "environmentally"],
    "sound": ["carbon", "climate", "environmentally"]
}

# Neutral single-word nouns that only count as green when a context word is found for them.
# find_context_dependent_terms compares token.lemma_.lower() against this list and then
# asks find_context_for_neutral_term for a context word.
# NOTE(review): plural entries such as "emissions" only match if the lemmatizer leaves
# the plural unchanged - confirm whether they are needed.
neutral_nouns = [
    "co2", "co2e", "cooling", "emission", "emissions", "energy", "footprint", 
    "fuel", "ghg", "heating", "methane", "pollution", "transportation", "waste"
]

# Neutral two-part noun phrases that only count as green when a context word is found.
# Layout: base word -> list of modifiers. find_context_dependent_terms searches each pair
# as "modifier-base" in the lowercased document text and as two consecutive lowercase
# lemmas (modifier, then base).
neutral_multiword_nouns = {
    "consumption": ["coal", "electricity", "energy", "fuel", "gas", "oil", "power"],
    "economy": ["carbon"],
    "emission": ["annual", "baseline", "carbon", "co2", "co2e", "direct", "ghg", "indirect", "scope", "total"],
    "emissions": ["annual", "baseline", "carbon", "co2", "co2e", "direct", "ghg", "indirect", "scope", "total"],
    "footprint": ["carbon", "co2", "ecological", "emission", "environmental", "ghg"],
    "impact": ["carbon", "climate", "ecological", "environmental"],
    "intensity": ["carbon", "co2", "emission", "energy", "fuel", "ghg"],
    "usage": ["electricity", "energy", "fuel", "power", "resource"],
    "use": ["electricity", "energy", "fuel", "resource"]
}

# Adjectives signalling positive change - emissions-focused vocabulary.
# find_context_for_neutral_term merges this list with improvement_verbs and
# improvement_adverbs into one "improvement" vocabulary and tests token.lemma_.lower()
# against it.
improvement_adjectives = [
    "advanced", "best", "better", "boosted", "enhanced", "excellent", "exceptional", 
    "improved", "impressive", "optimized", "optimised", "outstanding", "positive", 
    "remarkable", "strengthened", "successful", "superior", "upgraded"
]

# Verbs signalling positive change, each in base form and -ing form.
# Part of the "improvement" vocabulary built in find_context_for_neutral_term.
improvement_verbs = [
    "advance", "advancing", "boost", "boosting", "deliver", "delivering", "enhance", 
    "enhancing", "improve", "improving", "optimize", "optimizing", "optimise", 
    "optimising", "outperform", "outperforming", "strengthen", "strengthening", 
    "upgrade", "upgrading"
]
# Adverbs signalling a positive manner.
# Part of the "improvement" vocabulary built in find_context_for_neutral_term.
improvement_adverbs = [
    "effectively", "efficiently", "excellently", "meaningfully", "positively", "successfully"
]
# Dependency-based green term patterns - emissions-focused vocabulary.
# Each named pattern holds:
#   head_words            - words for the syntactic head
#   dependent_words       - words for the dependent
#   dependency_relations  - spaCy dependency labels linking the two
#   description           - human-readable summary of the pattern
# NOTE(review): the matcher that applies these patterns is not visible in this section.
# was_used_in_green_pattern looks for found terms whose 'pos' starts with "dependency_",
# so matches are presumably recorded with such a label - confirm.
dependency_green_patterns = {
    "improvement_adjectives": {
        "head_words": ["efficiency", "generation", "performance", "process", "solution", "system", "technology"],
        "dependent_words": ["advanced", "better", "cleaner", "enhanced", "greener", "improved", "innovative", "optimized"],
        "dependency_relations": ["amod"],
        "description": "Improvement adjectives with green nouns"
    },
    "increase_positive": {
        "head_words": ["boost", "develop", "enhance", "expand", "grow", "improve", "increase", "scale"],
        "dependent_words": ["capture", "conservation", "efficiency", "recycling", "renewable", "sequestration", "sustainability"],
        "dependency_relations": ["dobj"],
        "description": "Positive action verbs with green objects"
    },
    "achieve_climate": {
        "head_words": ["achieve", "attain", "deliver", "reach", "realize"],
        "dependent_words": ["neutrality", "reduction", "transition", "zero"],
        "dependency_relations": ["dobj"],
        "description": "Achievement verbs with climate goals"
    },
    "transition_to": {
        "head_words": ["change", "migrate", "move", "shift", "switch", "transition"],
        "dependent_words": ["circular", "clean", "green", "renewable", "sustainable", "zero"],
        "dependency_relations": ["prep", "pobj"],
        "description": "Transition phrases"
    },
    "investment_in": {
        "head_words": ["allocate", "finance", "funding", "invest", "investment", "spend"],
        "dependent_words": ["carbon", "clean", "climate", "environmental", "green", "renewable", "sustainable"],
        "dependency_relations": ["prep", "pobj"],
        "description": "Investment in green areas"
    },
    "commitment_to": {
        "head_words": ["commitment", "dedication", "pledge", "promise"],
        "dependent_words": ["climate", "neutrality", "reduction", "sustainability", "zero"],
        "dependency_relations": ["prep", "pobj"],
        "description": "Commitments to climate goals"
    },
    "sustainable_adverbs": {
        "head_words": ["develop", "generate", "grow", "manage", "manufacture", "operate", "produce"],
        "dependent_words": ["cleanly", "efficiently", "environmentally", "responsibly", "sustainably"],
        "dependency_relations": ["advmod"],
        "description": "Sustainable manner of operations"
    }
}


In [None]:
# NEGATION DETECTION SYSTEM - COMPREHENSIVE WORD LISTS

# Direct negation words (roughly alphabetical; e.g. "beneath" precedes "below").
# NOTE(review): not referenced by any code visible in this section. "zero" is also listed in
# protected_green_context_words - verify the consumer does not treat "zero emissions" as negated.
direct_negation_words = [
    "absence", "barely", "beneath", "below", "deficit", "devoid", "empty", "exempt", 
    "excluded", "failed", "gap", "hardly", "impossible", "inadequate", "insufficient", 
    "lacking", "minus", "never", "neither", "nil", "no", "nobody", "none", "nor", 
    "not", "nothing", "nowhere", "rarely", "scarcely", "seldom", "short", "shortfall", 
    "unable", "void", "without", "zero"
]

# Negative descriptor words (roughly alphabetical; e.g. "blocks" precedes "blocked").
# find_context_for_neutral_term merges this list with protected_green_context_words and
# reduction_negative_verbs into its "negative" context vocabulary and tests
# token.lemma_.lower() against it.
# NOTE(review): inflected entries ("blocks", "hinders", "problems") only match if the
# lemmatizer leaves that form unchanged - confirm.
negative_descriptor_words = [
    "absence", "absent", "barrier", "blocks", "blocked", "cancel", "cancelled", 
    "cease", "ceased", "challenge", "concern", "constraint", "constraints", 
    "decline", "decrease", "deficit", "degrade", "degraded", "deteriorate", 
    "deteriorated", "difficulties", "difficulty", "drop", "downturn", "end", 
    "ended", "fail", "failed", "failure", "fall", "gap", "halt", "halted", 
    "harm", "hinders", "hindered", "impedes", "impeded", "inability", "incapable", 
    "inadequate", "ineffective", "inefficient", "insufficient", "issues", "lack", 
    "lacking", "lacks", "limited", "limiting", "loss", "missing", "obstacle", 
    "prevents", "prevented", "problem", "problems", "reduction", "reject", "rejected", 
    "refuse", "refused", "setback", "shortage", "shortfall", "shrinkage", "stop", 
    "stopped", "struggles", "struggling", "suspend", "suspended", "threat", 
    "unable", "unsuccessful", "weakening", "worse", "worsen", "worsening"
]

# Phrasal negation patterns, all lowercase (roughly alphabetical; e.g. "incapable of"
# precedes "inadequate to"). A few entries are single words ("cease", "never", "without").
# NOTE(review): the code that iterates this list is not visible in this section; presumably
# each pattern is passed as `phrase` to validate_phrasal_negation, which searches the
# lowercased sentence text - confirm.
phrasal_negation_patterns = [
    "absence of", "anything but", "are no plans", "are not", "aren't", "can not", 
    "can't", "cannot", "cease", "could not", "couldn't", "devoid of", "did not", 
    "didn't", "do not", "does not", "doesn't", "don't", "exempt from", "failed to", 
    "failing to", "far from", "free from", "had not", "hadn't", "has not", "hasn't", 
    "have not", "haven't", "inability to", "incapable of", "inadequate to", 
    "insufficient to", "instead of", "is no plan", "is not", "isn't", "lack of", 
    "may not", "might not", "must not", "mustn't", "need not", "needn't", 
    "never", "no attempt to", "no chance to", "no effort to", "no intention to", 
    "no longer", "no means to", "no opportunity to", "no plans to", "no way to", 
    "not enough", "not yet", "other than", "rather than", "scarcity of", 
    "short of", "shortage of", "should not", "shouldn't", "too few", "too few to", 
    "too little", "too little to", "unable to", "was not", "wasn't", "were not", 
    "weren't", "will not", "without", "without any", "without proper", 
    "without sufficient", "without the", "won't", "would not", "wouldn't"
]

# Negation prefixes (alphabetically sorted)
negation_prefixes = [
    "anti", "counter", "de", "dis", "false", "fake", "il", "im", "in", "ir", 
    "mis", "non", "pseudo", "un"
]

# Reduction-type words that should not count as negation in positive green contexts
# (e.g. "reduce", "low", "zero").
# Used in two visible places: was_used_in_green_pattern tests token.lemma_.lower() against
# this list, and find_context_for_neutral_term merges it into its "negative" context vocabulary.
protected_green_context_words = [
    "abate", "curb", "cut", "declining", "decrease", "decreasing", "eliminate", 
    "fewer", "less", "low", "lower", "minimal", "reduce", "reduction", "slash", "zero"
]

# Reduction verbs (base form and -ing form); when one of them is negated, green terms
# in its scope should be excluded.
# find_context_for_neutral_term also merges this list into its "negative" context vocabulary.
reduction_negative_verbs = [
    "abate", "abating", "control", "controlling", "curb", "curbing", "cut", "cutting", 
    "decrease", "decreasing", "eliminate", "eliminating", "limit", "limiting", 
    "lower", "lowering", "minimize", "minimizing", "minimise", "minimising", 
    "reduce", "reducing", "remove", "removing", "restrict", "restricting", 
    "slash", "slashing"
]

In [None]:
def find_context_for_neutral_term(doc, neutral_token_start, neutral_token_end, all_existing_terms):
    """
    Search around a neutral term for a word that gives it a green reading.

    Three token scopes are tried in order: the term's own subtree, its
    ancestors, and the part of its head's subtree lying within 8 tokens
    before and 4 tokens after the term. Tokens already belonging to a found
    green term are ignored. The first token whose lowercase lemma is in the
    "negative" or "improvement" vocabulary and that
    is_context_related_to_term accepts is returned.

    Returns: (context_found, context_word, context_type, context_relationship)
    where context_type is "negative" or "improvement" and
    context_relationship is the context token's dependency label; all but
    the first element are None when nothing is found.
    """
    # Anchor on the first token by default
    main_token = doc[neutral_token_start]

    # Multiword term: anchor on the first token whose head lies outside the
    # term (or which is its own head)
    if neutral_token_end > neutral_token_start:
        term_tokens = [doc[idx] for idx in range(neutral_token_start, neutral_token_end + 1)]
        main_token = next(
            (tok for tok in term_tokens if tok.head not in term_tokens or tok.head == tok),
            main_token,
        )

    # Keep only head-subtree tokens inside the distance window around the anchor
    window_low = main_token.i - 8
    window_high = main_token.i + 4
    nearby_head_subtree = [
        tok for tok in list(main_token.head.subtree)
        if window_low <= tok.i <= window_high
    ]

    scopes = (
        ("subtree", list(main_token.subtree)),
        ("ancestors", list(main_token.ancestors)),
        ("head_subtree", nearby_head_subtree),
    )

    # Vocabularies are checked in this order for every candidate token
    context_vocabularies = (
        ("negative", set(negative_descriptor_words)
                     | set(protected_green_context_words)
                     | set(reduction_negative_verbs)),
        ("improvement", set(improvement_adjectives)
                        | set(improvement_verbs)
                        | set(improvement_adverbs)),
    )

    for scope_name, scope_tokens in scopes:
        for candidate in scope_tokens:
            lemma = candidate.lemma_.lower()

            # Words already claimed by a green term cannot serve as context
            if is_word_part_of_green_terms(candidate, all_existing_terms):
                continue

            for label, vocabulary in context_vocabularies:
                if lemma in vocabulary and is_context_related_to_term(candidate, main_token, scope_name):
                    return True, candidate.text, label, candidate.dep_

    return False, None, None, None

def is_context_related_to_term(context_token, neutral_token, scope_name):
    """
    Decide whether a context word is syntactically tied to a neutral term.

    A direct head/child link in either direction always qualifies. Otherwise
    the answer depends on the scope the context word was found in:
    "head_subtree" requires both tokens to share a head, "ancestors" requires
    the context word among the term's first three ancestors, and "subtree"
    requires it inside the term's subtree. Any other scope gives False.
    """
    context_head = context_token.head
    neutral_head = neutral_token.head

    # Parent/child in either direction
    if context_head == neutral_token or neutral_head == context_token:
        return True

    # Siblings under the same head
    if scope_name == "head_subtree":
        return bool(context_head == neutral_head)

    # One of the three closest ancestors
    if scope_name == "ancestors":
        return context_token in list(neutral_token.ancestors)[:3]

    # Somewhere below the neutral term
    if scope_name == "subtree":
        return context_token in list(neutral_token.subtree)

    return False

def find_context_dependent_terms(doc, excluded_positions, all_existing_terms):
    """
    Find neutral terms that become green when paired with context words.

    Two passes are made over ``doc``:
      1. single tokens whose lowercase lemma is in ``neutral_nouns``;
      2. two-part phrases from ``neutral_multiword_nouns``, first in hyphenated
         form ("carbon-footprint", searched in the lowercased document text),
         then as two consecutive lowercase lemmas ("carbon footprint").
    A candidate is recorded only when ``find_context_for_neutral_term`` reports
    a context word for it.

    Args:
        doc: parsed spaCy document.
        excluded_positions: container of token indices already claimed
            elsewhere; a candidate whose first token is in it is skipped.
        all_existing_terms: previously found green-term dicts, passed through
            to ``find_context_for_neutral_term``.

    Returns: (found_terms, context_excluded_positions)
        found_terms is a list of term dicts; context_excluded_positions is the
        set of token indices covered by the terms found here.
    """
    from bisect import bisect_right  # local import: only this function needs it

    found_terms = []
    context_excluded_positions = set()

    def _is_free(token_idx):
        """True if the token index has not been claimed by any earlier term."""
        return (token_idx not in excluded_positions
                and token_idx not in context_excluded_positions)

    def _register_if_context(start_idx, end_idx, neutral_part, pos_label):
        """Record the span as a green term when a context word is found for it."""
        context_found, context_word, context_type, context_rel = find_context_for_neutral_term(
            doc, start_idx, end_idx, all_existing_terms
        )
        if not context_found:
            return
        found_terms.append({
            'term': f"{context_word} {neutral_part}",
            'pos': pos_label,
            'start_idx': start_idx,
            'end_idx': end_idx,
            'sentence': doc[start_idx].sent,
            'neutral_part': neutral_part,
            'context_word': context_word,
            'context_type': context_type,
            'context_relationship': context_rel,
            'negated': False
        })
        context_excluded_positions.update(range(start_idx, end_idx + 1))

    # Pass 1: single neutral terms
    for i, token in enumerate(doc):
        if not _is_free(i):
            continue
        if token.lemma_.lower() in neutral_nouns:
            _register_if_context(i, i, token.text, "context_dependent_noun")

    # Pass 2: multiword neutral terms.
    # Everything below that depends only on `doc` is computed once, not once
    # per (base_word, modifier) pair.
    lemmas = [token.lemma_.lower() for token in doc]
    lemma_pairs = list(zip(lemmas, lemmas[1:]))
    text_lower = doc.text.lower()
    token_starts = [token.idx for token in doc]
    token_ends = [token.idx + len(token.text) for token in doc]

    def _token_index_at(char_pos):
        """Index of the token whose text covers char_pos, or None (e.g. whitespace)."""
        candidate = bisect_right(token_starts, char_pos) - 1
        if candidate >= 0 and char_pos < token_ends[candidate]:
            return candidate
        return None

    # NOTE(review): only the FIRST token of a multiword candidate is checked
    # against the excluded positions, so a phrase whose later token was already
    # recorded in pass 1 can be recorded again here - confirm whether a later
    # overlap-resolution step handles this.
    for base_word, modifiers in neutral_multiword_nouns.items():
        for modifier in modifiers:
            hyphenated = f"{modifier}-{base_word}"   # e.g. "carbon-footprint"
            spaced = f"{modifier} {base_word}"       # e.g. "carbon footprint"

            # Hyphenated form: located by character offset, then mapped to tokens
            for match in re.finditer(re.escape(hyphenated), text_lower):
                start_token_idx = _token_index_at(match.start())
                end_token_idx = _token_index_at(match.end() - 1)
                if start_token_idx is None or end_token_idx is None:
                    continue
                if not _is_free(start_token_idx):
                    continue
                _register_if_context(
                    start_token_idx, end_token_idx, hyphenated,
                    "context_dependent_multiword_noun",
                )

            # Space-separated form: two consecutive lemmas
            for j, (first_lemma, second_lemma) in enumerate(lemma_pairs):
                if first_lemma == modifier and second_lemma == base_word and _is_free(j):
                    _register_if_context(
                        j, j + 1, spaced, "context_dependent_multiword_noun"
                    )

    return found_terms, context_excluded_positions

In [None]:
def is_word_part_of_green_terms(token, all_found_green_terms):
    """Return True when the token's index lies inside the span of any found green term."""
    position = token.i
    return any(
        position in range(entry['start_idx'], entry['end_idx'] + 1)
        for entry in all_found_green_terms
    )

def was_used_in_green_pattern(token, all_found_green_terms):
    """
    Return True when the token sits inside a dependency-pattern green term
    (a term whose 'pos' starts with 'dependency_') and its lowercase lemma is
    one of the protected green context words.
    """
    for entry in all_found_green_terms:
        # Only dependency-pattern terms are of interest
        if not entry['pos'].startswith('dependency_'):
            continue
        # The token must fall inside that term's span
        if token.i not in range(entry['start_idx'], entry['end_idx'] + 1):
            continue
        if token.lemma_.lower() in protected_green_context_words:
            return True
    return False

def is_negation_related_to_term(negation_token, green_term_token, all_found_green_terms):
    """
    Decide whether a negation word actually negates the given green term.

    The negation word is rejected when it is itself part of a found green term
    or was used to build a dependency-pattern green term. Otherwise it counts
    when the two tokens are directly linked (one is the other's head), or when
    they share a sentence and one of the first three entries of the negation
    chain (the token followed by its ancestors) also appears among the first
    three entries of the green term's chain.
    """
    # A word that belongs to green vocabulary is not a negator
    if is_word_part_of_green_terms(negation_token, all_found_green_terms):
        return False
    if was_used_in_green_pattern(negation_token, all_found_green_terms):
        return False

    # Direct head/child link in either direction
    if negation_token.head == green_term_token or green_term_token.head == negation_token:
        return True

    # Anything further requires both tokens to be in the same sentence
    if not (negation_token.sent == green_term_token.sent):
        return False

    # Compare the closest part of both ancestor chains
    negation_chain = [negation_token, *negation_token.ancestors][:3]
    green_chain = [green_term_token, *green_term_token.ancestors][:3]
    return any(node in green_chain for node in negation_chain)

def validate_phrasal_negation(sentence, phrase, green_term_token):
    """
    Validate that a phrasal negation pattern actually applies to the green term.

    The phrase applies when some occurrence of it in the lowercased sentence
    text ends before the green term starts and the green term begins fewer
    than 50 characters after the end of that occurrence.  Every occurrence is
    examined, so a phrase that appears early in a long sentence does not hide
    a second occurrence sitting right in front of the term.

    Args:
        sentence: object exposing ``text`` and ``start_char`` (the sentence's
            character offset in the document).
        phrase: negation phrase to look for; it is searched in the lowercased
            sentence text as given.
        green_term_token: object exposing ``idx``, the token's character
            offset in the document.

    Returns:
        True if a qualifying occurrence exists, otherwise False.  An empty
        ``phrase`` never validates.
    """
    if not phrase:
        # An empty pattern would "match" at every offset and negate nothing.
        return False

    sentence_text = sentence.text.lower()

    # Token offsets are document-relative; convert to sentence-relative.
    relative_green_start = green_term_token.idx - sentence.start_char

    max_gap = 50
    search_from = 0
    while True:
        phrase_start = sentence_text.find(phrase, search_from)
        # No (further) occurrence, or it starts at/after the term: any later
        # occurrence would also come after the term, so stop looking.
        if phrase_start == -1 or phrase_start >= relative_green_start:
            return False

        gap = relative_green_start - (phrase_start + len(phrase))
        if 0 < gap < max_gap:
            return True

        # Step by one character so overlapping occurrences are not skipped.
        search_from = phrase_start + 1

def find_negated_reduction_verb(main_token, all_found_green_terms):
    """
    Check if the green term is the object/target of a negated reduction verb.

    A candidate verb is any ancestor of ``main_token`` whose lowercase lemma is
    in ``reduction_negative_verbs`` and, as a second pass, the direct head of
    ``main_token`` when the token's dependency label is ``dobj``, ``pobj`` or
    ``attr``.  For each candidate verb three scopes are searched in order:

    * the verb's subtree, limited to the five tokens right before the verb;
    * all ancestors of the verb (no distance limit);
    * the subtree of the verb's head, limited to the same five-token window.

    A token in a scope counts as a negator when its dependency label is
    ``neg`` or its lemma is in ``direct_negation_words``, it is not inside any
    found green term, and its lemma is neither a context word already used by
    a context-dependent green term nor the context word of this very term.

    Args:
        main_token: spaCy-style token representing the green term.
        all_found_green_terms: list of term dicts with ``'start_idx'`` and
            ``'end_idx'`` (inclusive token positions) and, for
            context-dependent terms, ``'context_word'``.

    Returns:
        (is_negated, negation_type, negation_text, scope_found); all but the
        first element are None when no negated reduction verb is found.
    """
    # Context words already consumed by context-dependent green terms must not
    # be re-read as negators.  Keep the word itself plus a crude stem.
    used_context_words = set()
    for green_term in all_found_green_terms:
        if 'context_word' not in green_term:
            continue
        context_word = green_term['context_word'].lower()
        used_context_words.add(context_word)
        # str.rstrip() removes a *set of characters* ("avoided" -> "avoi"),
        # so the endings are removed as whole suffixes instead.
        stem = context_word
        for suffix in ('ed', 'ing', 's'):
            if stem.endswith(suffix):
                stem = stem[:-len(suffix)]
        used_context_words.add(stem)

    # Lemma of the context word of the term that contains main_token (if any).
    this_term_context_word_lemma = None
    for green_term in all_found_green_terms:
        term_span = range(green_term['start_idx'], green_term['end_idx'] + 1)
        if main_token.i in term_span and 'context_word' in green_term:
            context_word = green_term['context_word']
            context_doc = nlp(context_word)
            if len(context_doc) > 0:
                this_term_context_word_lemma = context_doc[0].lemma_.lower()
            else:
                this_term_context_word_lemma = context_word.lower()
            break

    def negator_for(verb, scope_prefix):
        """Return the result tuple for the first negator of ``verb``, or None."""
        def in_window(token):
            # Only the five tokens immediately preceding the verb qualify.
            return (verb.i - 5) <= token.i < verb.i

        verb_scopes = [
            (f"{scope_prefix}verb_subtree",
             [token for token in verb.subtree if in_window(token)]),
            (f"{scope_prefix}verb_ancestors", list(verb.ancestors)),
            (f"{scope_prefix}verb_head_subtree",
             [token for token in verb.head.subtree if in_window(token)]),
        ]

        for scope_name, scope_tokens in verb_scopes:
            for token in scope_tokens:
                lemma = token.lemma_.lower()
                if token.dep_ != "neg" and lemma not in direct_negation_words:
                    continue
                if is_word_part_of_green_terms(token, all_found_green_terms):
                    continue
                if lemma in used_context_words:
                    continue
                if (this_term_context_word_lemma is not None
                        and lemma == this_term_context_word_lemma):
                    continue
                return (
                    True,
                    f"negated_reduction_verb_{scope_name}",
                    f"'{token.text}' negating '{verb.text}'",
                    scope_name,
                )
        return None

    # Pass 1: any ancestor that is a reduction verb.
    for ancestor in main_token.ancestors:
        if ancestor.lemma_.lower() in reduction_negative_verbs:
            result = negator_for(ancestor, "")
            if result is not None:
                return result

    # Pass 2: the direct head when the green term is its object/attribute.
    # NOTE(review): for a non-root token the head is normally also its first
    # ancestor, so this pass appears to repeat pass 1 -- confirm before removing.
    if main_token.dep_ in ["dobj", "pobj", "attr"]:
        head_verb = main_token.head
        if head_verb.lemma_.lower() in reduction_negative_verbs:
            result = negator_for(head_verb, "direct_")
            if result is not None:
                return result

    return False, None, None, None

def find_negation_in_multiple_scopes(main_token, all_found_green_terms):
    """
    Enhanced negation detection that checks subtree, ancestors, and head.subtree.

    Three scopes around ``main_token`` are built: its own subtree and the
    subtree of its head (both limited to tokens from six positions before to
    three positions after ``main_token``), and all of its ancestors (no
    distance limit).  Three detection methods are then tried in priority
    order, each across all scopes before the next method starts:

    1. tokens whose dependency label is ``neg``;
    2. tokens whose lemma is in ``direct_negation_words``;
    3. tokens whose lemma is in ``negative_descriptor_words``.

    Candidates that are part of a found green term, or whose lemma is a
    context word already used by a context-dependent green term (or by this
    term itself), are ignored.  Methods 2 and 3 additionally require
    ``is_negation_related_to_term`` to confirm the link to ``main_token``.

    Args:
        main_token: spaCy-style token representing the green term.
        all_found_green_terms: list of term dicts with ``'start_idx'`` and
            ``'end_idx'`` (inclusive token positions) and, for
            context-dependent terms, ``'context_word'``.

    Returns:
        (is_negated, negation_type, negation_text, scope_found); all but the
        first element are None when nothing negates the term.
    """
    # Context words already consumed by context-dependent green terms must not
    # be re-read as negators.  Keep the word itself plus a crude stem.
    used_context_words = set()
    for green_term in all_found_green_terms:
        if 'context_word' not in green_term:
            continue
        context_word = green_term['context_word'].lower()
        used_context_words.add(context_word)
        # str.rstrip() removes a *set of characters* ("avoided" -> "avoi"),
        # so the endings are removed as whole suffixes instead.
        stem = context_word
        for suffix in ('ed', 'ing', 's'):
            if stem.endswith(suffix):
                stem = stem[:-len(suffix)]
        used_context_words.add(stem)

    # Lemma of the context word of the term that contains main_token (if any).
    this_term_context_word_lemma = None
    for green_term in all_found_green_terms:
        term_span = range(green_term['start_idx'], green_term['end_idx'] + 1)
        if main_token.i in term_span and 'context_word' in green_term:
            context_word = green_term['context_word']
            context_doc = nlp(context_word)
            if len(context_doc) > 0:
                this_term_context_word_lemma = context_doc[0].lemma_.lower()
            else:
                this_term_context_word_lemma = context_word.lower()
            break

    def in_window(token):
        # Distance filter applied to the two subtree-based scopes.
        return (main_token.i - 6) <= token.i <= (main_token.i + 3)

    def not_a_context_word(token):
        # Shared final filter: the candidate must not be a used context word.
        lemma = token.lemma_.lower()
        if lemma in used_context_words:
            return False
        return (this_term_context_word_lemma is None
                or lemma != this_term_context_word_lemma)

    scopes = [
        ("subtree", [token for token in main_token.subtree if in_window(token)]),
        ("ancestors", list(main_token.ancestors)),
        ("head_subtree", [token for token in main_token.head.subtree if in_window(token)]),
    ]

    # Method 1: spaCy's own "neg" dependency label.
    for scope_name, scope_tokens in scopes:
        for token in scope_tokens:
            if (token.dep_ == "neg"
                    and not is_word_part_of_green_terms(token, all_found_green_terms)
                    and not_a_context_word(token)):
                return True, "spacy_neg", token.text, scope_name

    # Method 2: explicit negation words that are syntactically tied to the term.
    for scope_name, scope_tokens in scopes:
        for token in scope_tokens:
            if (token.lemma_.lower() in direct_negation_words
                    and not is_word_part_of_green_terms(token, all_found_green_terms)
                    and is_negation_related_to_term(token, main_token, all_found_green_terms)
                    and not_a_context_word(token)):
                return True, "direct_negation", token.text, scope_name

    # Method 3: negative descriptor words that are syntactically tied to the term.
    for scope_name, scope_tokens in scopes:
        for token in scope_tokens:
            if (token.lemma_.lower() in negative_descriptor_words
                    and not is_word_part_of_green_terms(token, all_found_green_terms)
                    and not was_used_in_green_pattern(token, all_found_green_terms)
                    and is_negation_related_to_term(token, main_token, all_found_green_terms)
                    and not_a_context_word(token)):
                return True, "negative_descriptor", token.text, scope_name

    return False, None, None, None

def detect_negation_for_term(doc, green_term_start_idx, green_term_end_idx, all_found_green_terms):
    """
    Run every negation check for one green term and report the first hit.

    Checks, in order: negated reduction verbs, multi-scope negation (whose
    type label is suffixed with the scope name), a hyphenated negation prefix
    in front of any token of the term, and phrasal negation patterns in the
    sentence of the term's main token.

    Returns: (is_negated, negation_type, negation_text, scope_found)
    """
    term_indices = range(green_term_start_idx, green_term_end_idx + 1)

    # Single-token terms use their only token; multiword terms use the first
    # token whose head lies outside the term (or that is its own head).
    main_token = doc[green_term_start_idx]
    if green_term_end_idx > green_term_start_idx:
        term_tokens = [doc[idx] for idx in term_indices]
        main_token = next(
            (tok for tok in term_tokens
             if tok.head not in term_tokens or tok.head == tok),
            main_token,
        )

    # 1) Reduction verbs that are themselves negated.
    hit, kind, text, where = find_negated_reduction_verb(main_token, all_found_green_terms)
    if hit:
        return True, kind, text, where

    # 2) Negators in the subtree / ancestors / head subtree.
    hit, kind, text, where = find_negation_in_multiple_scopes(main_token, all_found_green_terms)
    if hit:
        return True, f"{kind}_{where}", text, where

    # 3) "<prefix>-<token>" where <prefix> is a known negation prefix.
    for idx in term_indices:
        token = doc[idx]
        if token.i < 2:
            continue
        if doc[token.i - 1].text != "-":
            continue
        prefix_token = doc[token.i - 2]
        if prefix_token.text.lower() in negation_prefixes:
            return True, "prefix_negation", f"{prefix_token.text}-", "prefix"

    # 4) Phrasal patterns found in the sentence and validated by position.
    sentence = main_token.sent
    lowered_sentence = sentence.text.lower()
    for phrase in phrasal_negation_patterns:
        if phrase in lowered_sentence and validate_phrasal_negation(sentence, phrase, main_token):
            return True, "phrasal_negation", phrase, "sentence"

    return False, None, None, None

def filter_negated_terms(all_found_terms, doc):
    """
    Split found terms into non-negated and negated lists.

    Every term dict is annotated in place: ``'negated'`` is always set, and
    negated terms additionally receive ``'negation_type'``,
    ``'negation_text'`` and ``'negation_scope'``.

    Returns: (valid_terms, negated_terms)
    """
    kept = []
    rejected = []

    for entry in all_found_terms:
        hit, kind, text, where = detect_negation_for_term(
            doc, entry['start_idx'], entry['end_idx'], all_found_terms
        )

        if not hit:
            entry['negated'] = False
            kept.append(entry)
            continue

        entry.update(
            negated=True,
            negation_type=kind,
            negation_text=text,
            negation_scope=where,
        )
        rejected.append(entry)

    return kept, rejected

def get_negation_statistics(negated_terms):
    """
    Summarise negated terms by negation type, POS label and scope.

    Returns an empty dict for an empty input.  Otherwise the result holds the
    total count, per-type / per-POS / per-scope tallies, a separate tally of
    negation types containing ``'reduction_verb'``, and up to eight example
    tuples ``(term, negation_type, negation_text, negation_scope)`` where a
    missing scope is reported as ``'unknown'``.
    """
    if not negated_terms:
        return {}

    type_tally = Counter(entry['negation_type'] for entry in negated_terms)
    pos_tally = Counter(entry['pos'] for entry in negated_terms)
    scope_tally = Counter(
        entry['negation_scope']
        for entry in negated_terms
        if 'negation_scope' in entry
    )
    reduction_tally = Counter(
        entry['negation_type']
        for entry in negated_terms
        if 'reduction_verb' in entry.get('negation_type', '')
    )

    samples = []
    for entry in negated_terms[:8]:
        samples.append((
            entry['term'],
            entry['negation_type'],
            entry['negation_text'],
            entry.get('negation_scope', 'unknown'),
        ))

    summary = {}
    summary['total_negated'] = len(negated_terms)
    summary['by_type'] = dict(type_tally)
    summary['by_pos'] = dict(pos_tally)
    summary['by_scope'] = dict(scope_tally)
    summary['reduction_verb_negations'] = dict(reduction_tally)
    summary['examples'] = samples
    return summary

In [None]:
def find_multiword_terms(doc, multiword_dict, pos_tag):
    """
    Find multiword terms in document and return positions to exclude from single word counting.

    For every ``base_word -> modifiers`` entry two surface forms are searched:

    * hyphenated ``"modifier-base"``: matched as a substring of the lowercased
      raw document text, then mapped back to the tokens covering the match;
    * space-separated ``"modifier base"``: matched on two adjacent tokens
      whose lowercase lemmas equal the modifier and the base word.

    Args:
        doc: spaCy-style document (iterable and indexable by token position,
            with a ``text`` attribute; tokens expose ``lemma_``, ``idx``,
            ``text`` and ``sent``).
        multiword_dict: mapping of base word to an iterable of modifiers.
        pos_tag: label stored in the ``'pos'`` field of every term found.

    Returns:
        (found_terms, excluded_positions): the list of term dicts and the set
        of token positions covered by them.
    """
    found_terms = []
    excluded_positions = set()

    # Loop-invariant inputs: computed once instead of once per
    # modifier/base pair.
    text_lower = doc.text.lower()
    lemmas = [token.lemma_.lower() for token in doc]

    # Index every adjacent lemma pair by its first position so the
    # space-separated search is a dictionary lookup rather than a full scan
    # of the token list for each pair.  Positions stay in ascending order.
    bigram_positions = {}
    for i in range(len(lemmas) - 1):
        bigram_positions.setdefault((lemmas[i], lemmas[i + 1]), []).append(i)

    for base_word, modifiers in multiword_dict.items():
        for modifier in modifiers:
            # Pattern 1: modifier-base (e.g., "eco-friendly")
            pattern1 = f"{modifier}-{base_word}"
            # Pattern 2: modifier base (e.g., "eco friendly")
            pattern2 = f"{modifier} {base_word}"

            # Hyphenated version: search the raw text, then locate the tokens
            # that contain the first and last matched characters.
            for match in re.finditer(re.escape(pattern1), text_lower):
                start_char = match.start()
                end_char = match.end()

                start_token_idx = None
                end_token_idx = None

                for i, token in enumerate(doc):
                    if token.idx <= start_char < token.idx + len(token.text):
                        start_token_idx = i
                    if token.idx < end_char <= token.idx + len(token.text):
                        end_token_idx = i
                        break

                if start_token_idx is not None and end_token_idx is not None:
                    found_terms.append({
                        'term': pattern1,
                        'pos': pos_tag,
                        'start_idx': start_token_idx,
                        'end_idx': end_token_idx,
                        'sentence': doc[start_token_idx].sent,
                        'negated': False
                    })
                    excluded_positions.update(range(start_token_idx, end_token_idx + 1))

            # Space-separated version: adjacent lemmas "modifier", "base".
            for i in bigram_positions.get((modifier, base_word), ()):
                found_terms.append({
                    'term': pattern2,
                    'pos': pos_tag,
                    'start_idx': i,
                    'end_idx': i + 1,
                    'sentence': doc[i].sent,
                    'negated': False
                })
                excluded_positions.add(i)
                excluded_positions.add(i + 1)

    return found_terms, excluded_positions

def find_single_word_terms(doc, word_list, pos_tag, excluded_positions):
    """
    Collect single-token green terms whose lowercase lemma is in ``word_list``.

    Tokens at positions in ``excluded_positions`` are ignored.  Two special
    cases are skipped as well: "sustainability" directly followed by a token
    whose lemma is "report" or "reporting", and "pv" written directly between
    round brackets, i.e. "(PV)".
    """
    matches = []
    report_lemmas = {"report", "reporting"}

    for position, token in enumerate(doc):
        if position in excluded_positions:
            continue

        lemma = token.lemma_.lower()
        if lemma not in word_list:
            continue

        # "sustainability report(ing)" names a document, not a green claim.
        if lemma == "sustainability" and position + 1 < len(doc):
            if doc[position + 1].lemma_.lower() in report_lemmas:
                continue

        # "(PV)" is an abbreviation gloss; look at the raw characters that
        # immediately surround the token.
        if lemma == "pv":
            raw_text = doc.text
            before = token.idx - 1
            after = token.idx + len(token)
            opens = before >= 0 and raw_text[before] == "("
            closes = after < len(raw_text) and raw_text[after] == ")"
            if opens and closes:
                continue

        matches.append({
            'term': lemma,
            'pos': pos_tag,
            'start_idx': position,
            'end_idx': position,
            'sentence': token.sent,
            'negated': False
        })

    return matches

def find_dependency_patterns(doc, excluded_positions):
    """
    Find green terms formed by a head word and a dependent word.

    Each entry of ``dependency_green_patterns`` supplies ``head_words``,
    ``dependent_words`` and ``dependency_relations``.  A match is recorded
    when a head-word token has a token in its subtree (or a dependent-word
    token has a head) with a matching lemma and dependency label, provided
    neither position is already excluded or claimed by an earlier match.

    Returns: (found_terms, dependency_excluded_positions)
    """
    matches = []
    claimed = set()

    def is_taken(index):
        # Position already counted elsewhere or by an earlier dependency match.
        return index in excluded_positions or index in claimed

    def record(dependent, governor, sentence, pattern_name):
        # Store the match and claim both token positions.
        matches.append({
            'term': f"{dependent.text} {governor.text}",
            'pos': f"dependency_{pattern_name}",
            'start_idx': min(dependent.i, governor.i),
            'end_idx': max(dependent.i, governor.i),
            'sentence': sentence,
            'pattern': pattern_name,
            'dependency': dependent.dep_,
            'negated': False
        })
        claimed.add(dependent.i)
        claimed.add(governor.i)

    for pattern_name, spec in dependency_green_patterns.items():
        head_words = spec["head_words"]
        dependent_words = spec["dependent_words"]
        relations = spec["dependency_relations"]

        for token in doc:
            if is_taken(token.i):
                continue

            lemma = token.lemma_.lower()

            if lemma in head_words:
                # Token is a head word: scan its whole subtree for dependents.
                for node in token.subtree:
                    if (node.dep_ in relations
                            and node.lemma_.lower() in dependent_words
                            and not is_taken(node.i)):
                        record(node, token, token.sent, pattern_name)
            elif lemma in dependent_words:
                # Token is a dependent word: check its syntactic head.
                governor = token.head
                if (token.dep_ in relations
                        and governor.lemma_.lower() in head_words
                        and not is_taken(governor.i)):
                    record(token, governor, token.sent, pattern_name)

    return matches, claimed

def get_context_words(doc, start_idx, end_idx, context_size=5):
    """
    Render the term with up to ``context_size`` tokens on each side.

    The window never leaves the sentence that contains the term's first
    token.  The term is wrapped in ``**`` markers: one before its first token
    and one after its last token.  Tokens are joined with single spaces.
    """
    sentence = doc[start_idx].sent

    # Clamp the window to the sentence boundaries.
    window_start = max(sentence.start, start_idx - context_size)
    window_stop = min(sentence.end, end_idx + context_size + 1)

    rendered = []
    for position in range(window_start, window_stop):
        opening = "**" if position == start_idx else ""
        closing = "**" if position == end_idx else ""
        rendered.append(f"{opening}{doc[position].text}{closing}")

    return " ".join(rendered)

def analyze_green_terms_with_negation(doc, doc_name):
    """
    Analyze all green terms in a document with enhanced multi-scope negation detection and context-dependent terms.
    Returns: (valid_counts, valid_terms, negated_terms, original_counts)
    """
    print(f"\n{'='*60}")
    print(f"ANALYZING: {doc_name}")
    print(f"{'='*60}")
    
    all_found_terms = []
    all_excluded_positions = set()
    
    # Step 1: Find multiword terms first (all types together)
    multiword_noun_terms, excluded_noun_pos = find_multiword_terms(doc, green_multiword_nouns, "multiword_noun")
    multiword_adj_terms, excluded_adj_pos = find_multiword_terms(doc, green_multiword_adjectives, "multiword_adjective")
    multiword_adv_terms, excluded_adv_pos = find_multiword_terms(doc, green_multiword_adverbs, "multiword_adverb")
    
    # Combine all multiword terms and check for overlaps
    all_multiword_terms = multiword_noun_terms + multiword_adj_terms + multiword_adv_terms
    
    # Remove duplicates based on position overlap
    filtered_multiword_terms = []
    used_positions = set()
    
    # Sort by start position to process in order
    all_multiword_terms.sort(key=lambda x: x['start_idx'])
    
    for term in all_multiword_terms:
        # Check if this term overlaps with any already used positions
        term_positions = set(range(term['start_idx'], term['end_idx'] + 1))
        if not term_positions.intersection(used_positions):
            filtered_multiword_terms.append(term)
            used_positions.update(term_positions)
            all_excluded_positions.update(term_positions)
    
    all_found_terms.extend(filtered_multiword_terms)
    
    # Step 2: Find dependency patterns (excluding already counted positions)
    dependency_terms, dependency_excluded_pos = find_dependency_patterns(doc, all_excluded_positions)
    
    # Filter dependency terms to avoid overlap with multiword terms
    filtered_dependency_terms = []
    for dep_term in dependency_terms:
        dep_positions = set(range(dep_term['start_idx'], dep_term['end_idx'] + 1))
        if not dep_positions.intersection(all_excluded_positions):
            filtered_dependency_terms.append(dep_term)
            all_excluded_positions.update(dep_positions)
    
    all_found_terms.extend(filtered_dependency_terms)
    
    # Step 3: Find context-dependent terms (excluding already counted positions)
    context_dependent_terms, context_excluded_pos = find_context_dependent_terms(doc, all_excluded_positions, all_found_terms)
    
    # Filter context-dependent terms to avoid overlap
    filtered_context_terms = []
    for ctx_term in context_dependent_terms:
        ctx_positions = set(range(ctx_term['start_idx'], ctx_term['end_idx'] + 1))
        if not ctx_positions.intersection(all_excluded_positions):
            filtered_context_terms.append(ctx_term)
            all_excluded_positions.update(ctx_positions)
    
    all_found_terms.extend(filtered_context_terms)
    
    # Step 4: Find single word terms with position tracking to prevent double counting
    # Process in priority order: verbs > nouns > adjectives > adverbs
    
    # Priority 1: Verbs (actions are often most important)
    single_verb_terms = find_single_word_terms(doc, green_verbs, "verb", all_excluded_positions)
    for term in single_verb_terms:
        all_excluded_positions.add(term['start_idx'])
    all_found_terms.extend(single_verb_terms)
    
    # Priority 2: Nouns (concrete green concepts)
    single_noun_terms = find_single_word_terms(doc, green_nouns, "noun", all_excluded_positions)
    for term in single_noun_terms:
        all_excluded_positions.add(term['start_idx'])
    all_found_terms.extend(single_noun_terms)
    
    # Priority 3: Adjectives (descriptive green terms)
    single_adj_terms = find_single_word_terms(doc, green_adjectives, "adjective", all_excluded_positions)
    for term in single_adj_terms:
        all_excluded_positions.add(term['start_idx'])
    all_found_terms.extend(single_adj_terms)
    
    # Priority 4: Adverbs (manner of green actions)
    single_adv_terms = find_single_word_terms(doc, green_adverbs, "adverb", all_excluded_positions)
    for term in single_adv_terms:
        all_excluded_positions.add(term['start_idx'])
    all_found_terms.extend(single_adv_terms)
    
    # Step 5: Apply enhanced multi-scope negation detection
    print(f"Found {len(all_found_terms)} green terms before negation filtering...")
    valid_terms, negated_terms = filter_negated_terms(all_found_terms, doc)
    print(f"After negation filtering: {len(valid_terms)} valid terms, {len(negated_terms)} negated terms")
    
    # Count by type for original, valid, and negated terms
    original_type_counts = Counter()
    valid_type_counts = Counter()
    negated_type_counts = Counter()
    
    for term_info in all_found_terms:
        original_type_counts[term_info['pos']] += 1
    
    for term_info in valid_terms:
        valid_type_counts[term_info['pos']] += 1
    
    for term_info in negated_terms:
        negated_type_counts[term_info['pos']] += 1
    
    # Print comprehensive counts
    print(f"\nGREEN TERMS COUNTS (ORIGINAL | NEGATED | VALID):")
    print(f"Nouns: {original_type_counts['noun']} | {negated_type_counts['noun']} | {valid_type_counts['noun']}")
    print(f"Multiword Nouns: {original_type_counts['multiword_noun']} | {negated_type_counts['multiword_noun']} | {valid_type_counts['multiword_noun']}")
    print(f"Adjectives: {original_type_counts['adjective']} | {negated_type_counts['adjective']} | {valid_type_counts['adjective']}")  
    print(f"Multiword Adjectives: {original_type_counts['multiword_adjective']} | {negated_type_counts['multiword_adjective']} | {valid_type_counts['multiword_adjective']}")
    print(f"Verbs: {original_type_counts['verb']} | {negated_type_counts['verb']} | {valid_type_counts['verb']}")
    print(f"Adverbs: {original_type_counts['adverb']} | {negated_type_counts['adverb']} | {valid_type_counts['adverb']}")
    print(f"Multiword Adverbs: {original_type_counts['multiword_adverb']} | {negated_type_counts['multiword_adverb']} | {valid_type_counts['multiword_adverb']}")
    print(f"Context-Dependent Nouns: {original_type_counts['context_dependent_noun']} | {negated_type_counts['context_dependent_noun']} | {valid_type_counts['context_dependent_noun']}")
    print(f"Context-Dependent Multiword Nouns: {original_type_counts['context_dependent_multiword_noun']} | {negated_type_counts['context_dependent_multiword_noun']} | {valid_type_counts['context_dependent_multiword_noun']}")
    
    # Count dependency patterns
    original_dependency_counts = Counter()
    valid_dependency_counts = Counter()
    negated_dependency_counts = Counter()
    
    for term_info in all_found_terms:
        if term_info['pos'].startswith('dependency_'):
            original_dependency_counts[term_info['pos']] += 1
    
    for term_info in valid_terms:
        if term_info['pos'].startswith('dependency_'):
            valid_dependency_counts[term_info['pos']] += 1
            
    for term_info in negated_terms:
        if term_info['pos'].startswith('dependency_'):
            negated_dependency_counts[term_info['pos']] += 1
    
    print(f"Dependency Patterns: {sum(original_dependency_counts.values())} | {sum(negated_dependency_counts.values())} | {sum(valid_dependency_counts.values())}")
    
    for dep_type in original_dependency_counts.keys():
        pattern_name = dep_type.replace('dependency_', '')
        orig_count = original_dependency_counts[dep_type]
        neg_count = negated_dependency_counts[dep_type]
        val_count = valid_dependency_counts[dep_type]
        print(f"  {pattern_name}: {orig_count} | {neg_count} | {val_count}")
    
    print(f"TOTAL: {sum(original_type_counts.values())} | {sum(negated_type_counts.values())} | {sum(valid_type_counts.values())}")
    
    # Generate negation statistics
    negation_stats = get_negation_statistics(negated_terms)
    if negation_stats:
        print(f"\nENHANCED NEGATION ANALYSIS:")
        print(f"Total negated terms: {negation_stats['total_negated']}")
        print(f"Negation types: {negation_stats['by_type']}")
        print(f"Negation scopes: {negation_stats['by_scope']}")
        print(f"Examples of negated terms (with scope):")
        for term, neg_type, neg_text, scope in negation_stats['examples']:
            print(f"  - '{term}' (negated by: {neg_text}, type: {neg_type}, scope: {scope})")
    
    # Sort valid terms by their position in the text
    valid_terms_sorted = sorted(valid_terms, key=lambda x: x['start_idx'])
    
    # Print valid terms in order they appear in the text
    print(f"\nVALID TERMS IN TEXT ORDER:")
    print("-" * 40)
    for i, term_info in enumerate(valid_terms_sorted[:20], 1):
        # Format the term type for display
        if term_info['pos'].startswith('dependency_'):
            pattern_name = term_info['pos'].replace('dependency_', '').upper()
            term_type = f"DEPENDENCY {pattern_name} TERM"
        elif term_info['pos'].startswith('context_dependent_'):
            ctx_type = term_info.get('context_type', 'unknown').upper()
            base_type = term_info['pos'].replace('context_dependent_', '').upper().replace('_', ' ')
            term_type = f"CONTEXT-DEPENDENT {base_type} ({ctx_type})"
        else:
            term_type = term_info['pos'].upper().replace('_', ' ') + " TERM"
        
        context = get_context_words(doc, term_info['start_idx'], term_info['end_idx'])
        print(f"{i}. {term_type}: {term_info['term']}")
        
        # Add dependency info if it's a dependency pattern
        if 'dependency' in term_info:
            print(f"   (Dependency: {term_info['dependency']})")
        
        # Add context info if it's a context-dependent term
        if 'context_word' in term_info:
            print(f"   (Context: '{term_info['context_word']}' -> Neutral: '{term_info['neutral_part']}')")
        
        print(f"   Context: {context}")
        print()
    
    # If there are negated terms, show some examples
    if negated_terms:
        print(f"\nEXAMPLES OF NEGATED TERMS (EXCLUDED FROM COUNT):")
        print("-" * 40)
        negated_terms_sorted = sorted(negated_terms, key=lambda x: x['start_idx'])
        for i, term_info in enumerate(negated_terms_sorted[:10], 1):
            if term_info['pos'].startswith('dependency_'):
                pattern_name = term_info['pos'].replace('dependency_', '').upper()
                term_type = f"DEPENDENCY {pattern_name} TERM"
            elif term_info['pos'].startswith('context_dependent_'):
                ctx_type = term_info.get('context_type', 'unknown').upper()
                base_type = term_info['pos'].replace('context_dependent_', '').upper().replace('_', ' ')
                term_type = f"CONTEXT-DEPENDENT {base_type} ({ctx_type})"
            else:
                term_type = term_info['pos'].upper().replace('_', ' ') + " TERM"
            
            context = get_context_words(doc, term_info['start_idx'], term_info['end_idx'])
            print(f"{i}. {term_type}: {term_info['term']}")
            scope_info = f" in {term_info.get('negation_scope', 'unknown')} scope" if 'negation_scope' in term_info else ""
            print(f"   Negated by: {term_info['negation_text']} (type: {term_info['negation_type']}{scope_info})")
            
            # Add context info if it's a context-dependent term
            if 'context_word' in term_info:
                print(f"   (Context: '{term_info['context_word']}' -> Neutral: '{term_info['neutral_part']}')")
            
            print(f"   Context: {context}")
            print()
    
    return valid_type_counts, valid_terms, negated_terms, original_type_counts

In [None]:
# Run the negation-aware green-term analysis on every document and collect
# the per-document outputs, keyed by document name.
all_results = {}
for doc_name, doc in documents.items():
    (valid_counts,
     valid_terms,
     negated_terms,
     original_counts) = analyze_green_terms_with_negation(doc, doc_name)

    # Store the four analysis outputs together with basic document sizes.
    doc_summary = dict(
        valid_counts=valid_counts,
        valid_terms=valid_terms,
        negated_terms=negated_terms,
        original_counts=original_counts,
        total_tokens=len(doc),
        total_sentences=len(list(doc.sents)),
    )
    all_results[doc_name] = doc_summary

# Cell 10: Print Comprehensive Summary
#
# Reads `all_results` and prints five report sections: original, negated and
# valid counts per POS type, a context-dependent term analysis, and the
# negation impact per document.

# (column label, POS key) pairs shared by the three count tables.
_SUMMARY_POS_COLUMNS = (
    ('Nouns', 'noun'),
    ('M-Nouns', 'multiword_noun'),
    ('Adj', 'adjective'),
    ('M-Adj', 'multiword_adjective'),
    ('Verbs', 'verb'),
    ('Adv', 'adverb'),
    ('M-Adv', 'multiword_adverb'),
    ('Ctx-N', 'context_dependent_noun'),
    ('Ctx-MN', 'context_dependent_multiword_noun'),
)


def _print_pos_count_table(title, counts_by_doc):
    """Print one per-document table with a column for each POS type.

    Args:
        title: Section heading printed above the table.
        counts_by_doc: Iterable of (document name, mapping of POS type to
            count). All keys starting with 'dependency_' are summed into the
            single 'Dep-Pat' column; 'Total' is the sum of every value.
    """
    print(title)
    header_cells = [f"{'Document':<25}"]
    header_cells += [f"{label:<8}" for label, _ in _SUMMARY_POS_COLUMNS]
    header_cells += [f"{'Dep-Pat':<8}", f"{'Total':<8}"]
    print(" ".join(header_cells))
    print("-" * 140)

    for name, counts in counts_by_doc:
        dependency_total = sum(
            count for pos_type, count in counts.items()
            if pos_type.startswith('dependency_')
        )
        total = sum(counts.values())
        # .get(..., 0) keeps the table printable when a POS type is absent,
        # whether the mapping is a Counter or a plain dict.
        row_cells = [f"{name:<25}"]
        row_cells += [f"{counts.get(key, 0):<8}" for _, key in _SUMMARY_POS_COLUMNS]
        row_cells += [f"{dependency_total:<8}", f"{total:<8}"]
        print(" ".join(row_cells))


print(f"\n{'='*140}")
print("COMPREHENSIVE SUMMARY - GREEN TERMS ANALYSIS WITH CONTEXT-DEPENDENT TERMS + MULTI-SCOPE NEGATION")
print(f"{'='*140}")

_print_pos_count_table(
    "\n1. ORIGINAL COUNTS (Before Negation Filtering)",
    [(name, res['original_counts']) for name, res in all_results.items()],
)

# Negated terms are stored as a list of term dicts, so tally them by POS type.
_print_pos_count_table(
    "\n2. NEGATED TERMS (Filtered Out)",
    [(name, Counter(t['pos'] for t in res['negated_terms']))
     for name, res in all_results.items()],
)

_print_pos_count_table(
    "\n3. FINAL VALID COUNTS (After Multi-Scope + Reduction Verb Negation Filtering)",
    [(name, res['valid_counts']) for name, res in all_results.items()],
)

print("\n4. CONTEXT-DEPENDENT TERMS ANALYSIS")
print("-" * 80)

# Analyze context-dependent terms by type
all_context_types = Counter()
all_context_words = Counter()
context_examples = []

for doc_name, results in all_results.items():
    doc_context_negative = 0
    doc_context_improvement = 0

    for term in results['valid_terms']:
        if not term['pos'].startswith('context_dependent_'):
            continue

        # Use the same defaults for every optional field so a term lacking
        # one of them cannot raise KeyError while collecting examples.
        context_type = term.get('context_type', 'unknown')
        context_word = term.get('context_word', 'unknown')
        neutral_part = term.get('neutral_part', 'unknown')
        all_context_types[context_type] += 1
        all_context_words[context_word.lower()] += 1

        if context_type == 'negative':
            doc_context_negative += 1
        elif context_type == 'improvement':
            doc_context_improvement += 1

        # Keep only the first ten examples across all documents.
        if len(context_examples) < 10:
            context_examples.append((doc_name, term['term'], context_word, neutral_part, context_type))

    total_context = doc_context_negative + doc_context_improvement
    if total_context > 0:
        print(f"{doc_name}: {total_context} context-dependent terms ({doc_context_negative} negative, {doc_context_improvement} improvement)")

print(f"\nOverall context type distribution: {dict(all_context_types)}")
print(f"Top context words: {dict(all_context_words.most_common(10))}")

print(f"\nExamples of context-dependent terms:")
# Distinct loop names: reusing `doc` / `term` here would overwrite the
# notebook-level variables of the same name with plain strings.
for example_doc, example_term, example_context, example_neutral, example_type in context_examples:
    print(f"  - '{example_term}' = '{example_context}' + '{example_neutral}' ({example_type} context) [{example_doc}]")

print("\n5. NEGATION IMPACT SUMMARY")
print(f"{'Document':<25} {'Original':<10} {'Negated':<10} {'Valid':<10} {'Negation %':<12}")
print("-" * 70)

for doc_name, results in all_results.items():
    original_total = sum(results['original_counts'].values())
    negated_total = len(results['negated_terms'])
    valid_total = sum(results['valid_counts'].values())
    # Guard against documents in which no green term was found at all.
    negation_pct = (negated_total / original_total * 100) if original_total > 0 else 0

    print(f"{doc_name:<25} {original_total:<10} {negated_total:<10} {valid_total:<10} {negation_pct:<11.1f}%")

print(f"\n{'='*140}")
print("ANALYSIS COMPLETE - Enhanced with Context-Dependent Terms + Multi-Scope Negation Detection")
print("Key features:")
print("1. Context-dependent terms: neutral terms that become green with negative/improvement context")
print("2. Multi-scope negation detection: subtree, ancestors, head.subtree, and sentence-level")
print("3. Reduction verb negation: detects when positive verbs are negated")
print("4. Comprehensive term classification and overlap prevention")
print(f"{'='*140}")

## Green term density

In [None]:
def get_term_specificity_weight(pos_type):
    """Return the overlap-resolution priority for a term's POS label.

    Labels starting with 'dependency_' always get priority 3. Any other
    label receives the weight of the first pattern below that occurs in it
    as a substring; a label that matches nothing gets 0.
    """
    if pos_type.startswith('dependency_'):
        return 3

    # Order matters: matching is by substring, so the longer, more specific
    # patterns have to be tried before the short ones they contain
    # (e.g. 'multiword_noun' before 'noun').
    ranked_patterns = (
        ('context_dependent_multiword_noun', 5),
        ('multiword_noun', 4),
        ('multiword_adjective', 4),
        ('multiword_adverb', 4),
        ('context_dependent_noun', 3),
        ('dependency_', 3),
        ('noun', 1),
        ('adjective', 1),
        ('verb', 1),
        ('adverb', 1),
    )
    return next(
        (weight for fragment, weight in ranked_patterns if fragment in pos_type),
        0,
    )

def resolve_overlapping_terms(all_terms, doc):
    """
    Keep a non-overlapping subset of terms, preferring higher-priority ones.

    Terms are ranked by specificity weight first and by span length
    (end_idx - start_idx) second, both descending. Each term claims the
    token positions start_idx..end_idx inclusive; a term that touches any
    position already claimed is rejected and logged.

    Args:
        all_terms: Term dicts with 'term', 'pos', 'start_idx' and 'end_idx'.
        doc: Not used by this function.

    Returns:
        (kept_terms, rejection_log) where each log entry records the rejected
        term text, its POS type and the positions it collided on.
    """
    def priority(candidate):
        span_length = candidate['end_idx'] - candidate['start_idx']
        return get_term_specificity_weight(candidate['pos']), span_length

    ranked = sorted(all_terms, key=priority, reverse=True)

    claimed = set()
    kept = []
    rejected = []

    for candidate in ranked:
        span_positions = set(range(candidate['start_idx'], candidate['end_idx'] + 1))
        clash = span_positions.intersection(claimed)

        if clash:
            rejected.append({
                'rejected_term': candidate['term'],
                'pos_type': candidate['pos'],
                'overlapping_positions': list(clash)
            })
            continue

        kept.append(candidate)
        claimed.update(span_positions)

    return kept, rejected

def extract_green_word_tokens(term_info, doc, excluded_positions):
    """
    List the token indices of a green term that count as green words.

    Walks the positions start_idx..end_idx (inclusive) of the term, skips
    positions found in excluded_positions, and keeps only tokens that are not
    punctuation, not whitespace, not stop words and have non-blank text.
    Stop words are dropped so the result is comparable to a total word count
    that also leaves them out.
    """
    def is_countable(tok):
        if tok.is_punct or tok.is_space or tok.is_stop:
            return False
        return bool(tok.text.strip())

    span = range(term_info['start_idx'], term_info['end_idx'] + 1)
    return [
        position
        for position in span
        if position not in excluded_positions and is_countable(doc[position])
    ]

def count_total_content_words(doc, exclude_stop_words=True):
    """
    Tally words, stop words and punctuation tokens in a document.

    Punctuation tokens are counted separately; whitespace tokens and tokens
    with blank text are ignored. Stop words are always tallied in their own
    counter and are added to the word total only when exclude_stop_words is
    False.

    Returns (total_words, stop_words_count, punctuation_count)
    """
    word_total = 0
    stop_total = 0
    punct_total = 0

    for tok in doc:
        if tok.is_punct:
            punct_total += 1
        elif tok.is_space or not tok.text.strip():
            # Whitespace / blank tokens contribute to no counter.
            pass
        elif tok.is_stop:
            stop_total += 1
            if not exclude_stop_words:
                word_total += 1
        else:
            word_total += 1

    return word_total, stop_total, punct_total

In [None]:
def analyze_green_term_frequency(all_results, documents, exclude_stop_words=True):
    """
    Compute green-word frequency per document and print a progress line for each.

    For every document the valid terms are de-overlapped, the surviving terms
    are expanded to token positions, and the number of distinct green
    positions is divided by the document's word total (as a percentage).

    Args:
        all_results: Mapping of document name to a result dict; only its
            'valid_terms' entry is read here.
        documents: Mapping of document name to the parsed document.
        exclude_stop_words: Passed to count_total_content_words to decide
            whether stop words are part of the word total.

    Returns:
        Mapping of document name to a dict of core metrics, document stats,
        green-term details, quality-control data and detailed breakdowns.
    """
    frequency_results = {}

    print(f"\n{'='*80}")
    print("GREEN TERM FREQUENCY ANALYSIS - DETAILED WORD COUNTING")
    print(f"{'='*80}")
    print(f"Stop words {'EXCLUDED' if exclude_stop_words else 'INCLUDED'} from total word count")
    print()

    for doc_name, term_results in all_results.items():
        print(f"Analyzing frequency for: {doc_name}")
        doc = documents[doc_name]

        # Drop lower-priority terms that share token positions with others.
        candidate_terms = term_results['valid_terms']
        final_terms, overlap_log = resolve_overlapping_terms(candidate_terms, doc)

        # Expand every surviving term into the token positions it contributes.
        counted_positions = set()
        word_breakdown = {}
        term_contributions = []

        for term_info in final_terms:
            indices = extract_green_word_tokens(term_info, doc, counted_positions)
            if not indices:
                continue

            counted_positions.update(indices)
            surface_forms = [doc[i].text for i in indices]
            n_words = len(indices)

            # Keyed by term text, so a repeated term keeps its latest entry.
            word_breakdown[term_info['term']] = {
                'tokens': surface_forms,
                'token_indices': indices,
                'pos_type': term_info['pos'],
                'word_count': n_words
            }
            term_contributions.append({
                'term': term_info['term'],
                'pos_type': term_info['pos'],
                'word_count': n_words,
                'tokens': surface_forms
            })

        total_words, stop_words_count, punctuation_count = count_total_content_words(doc, exclude_stop_words)

        green_word_count = len(counted_positions)

        # Quality control: list any stop words that ended up counted as green.
        green_stop_words = [doc[pos].text for pos in counted_positions if doc[pos].is_stop]
        stop_words_in_green = len(green_stop_words)

        if total_words > 0:
            green_term_frequency = green_word_count / total_words * 100
        else:
            green_term_frequency = 0

        # Distinct surface forms and lemmas among the green tokens.
        unique_green_words = {doc[pos].text.lower() for pos in counted_positions}
        unique_green_lemmas = {doc[pos].lemma_.lower() for pos in counted_positions}

        frequency_results[doc_name] = {
            # Core metrics
            'green_word_count': green_word_count,
            'total_words': total_words,
            'green_term_frequency': round(green_term_frequency, 3),

            # Document stats
            'total_tokens': len(doc),
            'total_sentences': len(list(doc.sents)),
            'stop_words_in_doc': stop_words_count,
            'punctuation_tokens': punctuation_count,

            # Green term details
            'unique_green_words': len(unique_green_words),
            'unique_green_lemmas': len(unique_green_lemmas),
            'total_green_terms': len(final_terms),
            'original_terms_before_overlap_resolution': len(candidate_terms),

            # Quality control
            'stop_words_in_green_terms': stop_words_in_green,
            'green_stop_words': green_stop_words,
            'overlapping_terms_resolved': len(candidate_terms) - len(final_terms),
            'overlap_log': overlap_log,

            # Detailed breakdown
            'word_breakdown': word_breakdown,
            'term_contributions': term_contributions,
            'green_word_positions': sorted(counted_positions),

            # Validation data
            'unique_words_list': sorted(unique_green_words),
            'unique_lemmas_list': sorted(unique_green_lemmas)
        }

        print(f"  → {green_word_count} green words out of {total_words} total words = {green_term_frequency:.3f}%")
        if overlap_log:
            print(f"  → Resolved {len(overlap_log)} overlapping terms")

    return frequency_results

def validate_frequency_results(frequency_results, documents):
    """Run consistency checks on the output of the frequency analysis.

    Four checks are performed per document:
      1. 'green_word_positions' contains no duplicates.
      2. Every position lies inside the document (0 <= pos < len(doc)).
      3. The per-term word counts add up to 'green_word_count'.
      4. 'green_term_frequency' matches green_word_count / total_words * 100
         within a tolerance of 0.001.

    Args:
        frequency_results: Mapping of document name to a frequency result dict.
        documents: Mapping of document name to the document; only len() is
            taken of each document here.

    Returns:
        Mapping of document name to {'passed_validation': bool,
        'issues': list of messages, 'checks_performed': 4}.
    """
    validation_report = {}

    for doc_name, results in frequency_results.items():
        doc = documents[doc_name]
        positions = results['green_word_positions']
        validation_issues = []

        # Check 1: Verify no double counting
        if len(positions) != len(set(positions)):
            validation_issues.append("Duplicate positions found in green_word_positions")

        # Check 2: Verify positions are valid. Skipped when there are no
        # positions at all, so an empty document is not reported as having an
        # "invalid position 0"; negative indices are flagged too because they
        # would silently index from the end of the document.
        if positions:
            doc_length = len(doc)
            max_pos = max(positions)
            min_pos = min(positions)
            if max_pos >= doc_length:
                validation_issues.append(f"Invalid position {max_pos} (doc length: {doc_length})")
            if min_pos < 0:
                validation_issues.append(f"Invalid position {min_pos} (doc length: {doc_length})")

        # Check 3: Verify word count consistency
        calculated_words = sum(term['word_count'] for term in results['term_contributions'])
        if calculated_words != results['green_word_count']:
            validation_issues.append(f"Word count mismatch: {calculated_words} vs {results['green_word_count']}")

        # Check 4: Verify frequency calculation (the tolerance absorbs the
        # rounding applied to the stored frequency).
        expected_freq = (results['green_word_count'] / results['total_words'] * 100) if results['total_words'] > 0 else 0
        if abs(expected_freq - results['green_term_frequency']) > 0.001:
            validation_issues.append(f"Frequency calculation error: {expected_freq} vs {results['green_term_frequency']}")

        validation_report[doc_name] = {
            'passed_validation': not validation_issues,
            'issues': validation_issues,
            'checks_performed': 4
        }

    return validation_report

In [None]:
# Execute the frequency analysis
frequency_results = analyze_green_term_frequency(all_results, documents, exclude_stop_words=True)

# Perform validation
validation_report = validate_frequency_results(frequency_results, documents)

# Print validation results
print(f"\nVALIDATION REPORT:")
print("-" * 50)
all_passed = True
for doc_name, validation in validation_report.items():
    status = "PASSED" if validation['passed_validation'] else "FAILED"
    print(f"{doc_name}: {status}")
    for issue in validation['issues']:
        print(f"  - {issue}")
    all_passed = all_passed and validation['passed_validation']

print(f"\nOverall validation: {'ALL PASSED' if all_passed else 'ISSUES DETECTED'}")

# Calculate summary statistics
print(f"\n{'='*80}")
print("FREQUENCY ANALYSIS SUMMARY")
print(f"{'='*80}")

# These three lists are reused by the summary export in a later cell.
frequencies = [result['green_term_frequency'] for result in frequency_results.values()]
word_counts = [result['green_word_count'] for result in frequency_results.values()]
total_words = [result['total_words'] for result in frequency_results.values()]

print(f"Frequency Statistics:")
if frequencies:
    # min()/max() with a key return the first document holding the extreme
    # value, without rescanning the whole list for every element.
    min_freq_doc = min(frequency_results.items(), key=lambda item: item[1]['green_term_frequency'])
    max_freq_doc = max(frequency_results.items(), key=lambda item: item[1]['green_term_frequency'])
    print(f"  Mean: {np.mean(frequencies):.3f}%")
    print(f"  Std Dev: {np.std(frequencies):.3f}%")
    print(f"  Min: {min_freq_doc[1]['green_term_frequency']:.3f}% ({min_freq_doc[0]})")
    print(f"  Max: {max_freq_doc[1]['green_term_frequency']:.3f}% ({max_freq_doc[0]})")
else:
    # min()/max() raise on an empty sequence, so report the situation instead.
    print("  No documents analyzed")

print(f"\nWord Count Statistics:")
print(f"  Total green words across all docs: {sum(word_counts)}")
print(f"  Total content words across all docs: {sum(total_words)}")
# Same zero guard as the 'overall_frequency' entry of the summary export.
overall_frequency = (sum(word_counts) / sum(total_words) * 100) if sum(total_words) > 0 else 0
print(f"  Overall frequency: {overall_frequency:.3f}%")

# Document ranking by frequency
print(f"\nDOCUMENT RANKING BY GREEN TERM FREQUENCY:")
print("-" * 60)
ranked_docs = sorted(frequency_results.items(), key=lambda x: x[1]['green_term_frequency'], reverse=True)
for i, (doc_name, result) in enumerate(ranked_docs, 1):
    print(f"{i}. {doc_name}")
    print(f"   Frequency: {result['green_term_frequency']:.3f}%")
    print(f"   Green words: {result['green_word_count']} / {result['total_words']} total words")
    print(f"   Green terms: {result['total_green_terms']} terms")
    print(f"   Unique words: {result['unique_green_words']} unique")
    print()

In [None]:
# Detailed breakdown by document
print(f"\n{'='*100}")
print("DETAILED FREQUENCY BREAKDOWN BY DOCUMENT")
print(f"{'='*100}")

for doc_name, result in frequency_results.items():
    green_words = result['green_word_count']
    contributions = result['term_contributions']
    overlaps = result['overlapping_terms_resolved']
    green_stops = result['stop_words_in_green_terms']

    print(f"\n{doc_name.upper()}")
    print("-" * 80)

    # Core metrics
    print(f"GREEN TERM FREQUENCY: {result['green_term_frequency']:.3f}%")
    print(f"Green words: {green_words:,} | Total words: {result['total_words']:,}")
    print(f"Total tokens in doc: {result['total_tokens']:,} | Sentences: {result['total_sentences']:,}")
    print(f"Unique green words: {result['unique_green_words']} | Unique lemmas: {result['unique_green_lemmas']}")

    # Quality metrics are printed only when there is something to report.
    if overlaps > 0:
        print(f"Overlapping terms resolved: {overlaps}")
    if green_stops > 0:
        print(f"Stop words in green terms: {green_stops} ({result['green_stop_words']})")

    # The ten terms contributing the most words.
    top_contributors = sorted(contributions, key=lambda entry: entry['word_count'], reverse=True)[:10]
    print(f"\nTop 10 terms by word contribution:")
    for rank, entry in enumerate(top_contributors, 1):
        readable_type = entry['pos_type'].replace('_', ' ').title()
        print(f"  {rank:2d}. {entry['term']} ({readable_type}) - {entry['word_count']} words")

    # Words contributed per term type.
    type_distribution = Counter()
    for entry in contributions:
        type_distribution[entry['pos_type']] += entry['word_count']

    print(f"\nWord contribution by term type:")
    for pos_type, type_words in type_distribution.most_common():
        if green_words > 0:
            share = type_words / green_words * 100
        else:
            share = 0
        readable_type = pos_type.replace('_', ' ').title()
        print(f"  {readable_type}: {type_words} words ({share:.1f}%)")

# Create final integrated summary table
print(f"\n{'='*120}")
print("INTEGRATED SUMMARY - GREEN TERMS ANALYSIS + FREQUENCY ANALYSIS")
print(f"{'='*120}")

print(f"{'Document':<25} {'Terms':<6} {'Words':<6} {'Total':<8} {'Freq%':<7} {'Unique':<7} {'Negated':<8} {'Neg%':<6}")
print("-" * 120)

for doc_name in all_results:
    term_results = all_results[doc_name]
    freq_data = frequency_results[doc_name]

    # Term counts; the negation share is taken over valid + negated terms.
    valid_total = sum(term_results['valid_counts'].values())
    negated_total = len(term_results['negated_terms'])
    detected_total = valid_total + negated_total
    negation_pct = (negated_total / detected_total * 100) if detected_total > 0 else 0

    print(f"{doc_name:<25} {valid_total:<6} {freq_data['green_word_count']:<6} {freq_data['total_words']:<8} {freq_data['green_term_frequency']:<7.2f} {freq_data['unique_green_words']:<7} {negated_total:<8} {negation_pct:<6.1f}")

# Export summary for further analysis
frequency_summary = {
    'analysis_metadata': {
        'exclude_stop_words': True,
        'overlap_resolution': True,
        'validation_passed': all_passed,
        'analysis_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    },
    'document_results': frequency_results,
    'overall_statistics': {
        'mean_frequency': np.mean(frequencies),
        'std_frequency': np.std(frequencies),
        'min_frequency': min(frequencies),
        'max_frequency': max(frequencies),
        'total_documents': len(frequency_results),
        'total_green_words': sum(word_counts),
        'total_content_words': sum(total_words),
        'overall_frequency': (sum(word_counts) / sum(total_words) * 100) if sum(total_words) > 0 else 0
    },
    'validation_report': validation_report
}

print(f"\n{'='*120}")
print("FREQUENCY ANALYSIS COMPLETE")
print(f"Analysis stored in 'frequency_summary' variable")
print(f"Individual document results in 'frequency_results' variable")
print(f"Validation {'PASSED' if all_passed else 'ISSUES DETECTED'}")
print(f"{'='*120}")


def _frequency_of(item):
    """Return the green term frequency of a (doc_name, result) pair."""
    return item[1]['green_term_frequency']


# Key findings summary
print(f"\nKEY FINDINGS:")
highest_freq_doc = max(frequency_results.items(), key=_frequency_of)
lowest_freq_doc = min(frequency_results.items(), key=_frequency_of)
print(f"Highest frequency: {highest_freq_doc[0]} ({highest_freq_doc[1]['green_term_frequency']:.3f}%)")
print(f"Lowest frequency: {lowest_freq_doc[0]} ({lowest_freq_doc[1]['green_term_frequency']:.3f}%)")
freq_range = _frequency_of(highest_freq_doc) - _frequency_of(lowest_freq_doc)
print(f"Frequency range: {freq_range:.3f} percentage points")

In [None]:
def create_communication_score_dataframe(frequency_results, all_results):
    """
    Build one DataFrame row of communication-score metrics per document.

    The document name is split at its last underscore: the text before it
    becomes 'organization' and the text after it is converted with int() into
    'year'. Frequency metrics come from frequency_results, term counts from
    the matching all_results entry.
    """
    # (output column, POS key) for the term types reported one-to-one.
    single_type_columns = (
        ('noun_terms', 'noun'),
        ('multiword_noun_terms', 'multiword_noun'),
        ('adj_terms', 'adjective'),
        ('multiword_adj_terms', 'multiword_adjective'),
        ('verb_terms', 'verb'),
        ('adv_terms', 'adverb'),
        ('multiword_adv_terms', 'multiword_adverb'),
    )

    def build_row(doc_name, freq_data):
        term_data = all_results[doc_name]
        valid_counts = term_data['valid_counts']

        org_name, _, year = doc_name.rpartition('_')

        negated_terms_count = len(term_data['negated_terms'])
        detected_before_negation = sum(term_data['original_counts'].values())
        if detected_before_negation > 0:
            negation_rate = negated_terms_count / detected_before_negation * 100
        else:
            negation_rate = 0

        sentence_count = freq_data['total_sentences']
        if sentence_count > 0:
            avg_words = round(freq_data['total_words'] / sentence_count, 1)
        else:
            avg_words = 0

        row = {
            'organization': org_name,
            'year': int(year),
            'gt_freq_pct': round(freq_data['green_term_frequency'], 3),
            'gt_words': freq_data['green_word_count'],
            'total_words': freq_data['total_words'],
            'unique_gt_words': freq_data['unique_green_words'],
            'unique_gt_lemmas': freq_data['unique_green_lemmas'],
            'gt_terms_total': sum(valid_counts.values()),
            'gt_terms_negated': negated_terms_count,
            'negation_rate_pct': round(negation_rate, 2),
        }
        for column, pos_key in single_type_columns:
            row[column] = valid_counts.get(pos_key, 0)

        # Both context-dependent variants share one column; every
        # 'dependency_*' pattern is summed into another.
        row['context_dep_terms'] = (
            valid_counts.get('context_dependent_noun', 0)
            + valid_counts.get('context_dependent_multiword_noun', 0)
        )
        row['dependency_terms'] = sum(
            count for pos_type, count in valid_counts.items()
            if pos_type.startswith('dependency_')
        )
        row['total_sentences'] = sentence_count
        row['avg_words_per_sent'] = avg_words
        row['overlaps_resolved'] = freq_data['overlapping_terms_resolved']
        return row

    rows = [build_row(name, freq) for name, freq in frequency_results.items()]
    return pd.DataFrame(rows)

# Build the communication score DataFrame, ordered by organization and year
# so that each organization's reports sit together.
comm_score_df = (
    create_communication_score_dataframe(frequency_results, all_results)
    .sort_values(['organization', 'year'])
    .reset_index(drop=True)
)

print("COMMUNICATION SCORE DATAFRAME CREATED")
print("="*80)
print(comm_score_df.to_string(index=False))

# Export to Excel
import os
excel_path = "data/NLP/Results/Communication_Score_df_Density.xlsx"

# The target folder may not exist yet; create it (and any parents) first.
results_dir = os.path.dirname(excel_path)
os.makedirs(results_dir, exist_ok=True)

comm_score_df.to_excel(excel_path, index=False, sheet_name='Communication_Score')

row_count, column_count = comm_score_df.shape
print(f"\nExported to Excel: {excel_path}")
print(f"DataFrame shape: {row_count} organizations × {column_count} metrics")

# Human-readable description of every exported column, printed for reference.
print(f"\nCOLUMN DESCRIPTIONS:")
column_descriptions = {
    'organization': 'Organization',
    'year': 'Report year',
    'gt_freq_pct': 'Green term frequency (%)',
    'gt_words': 'Count of green words',
    'total_words': 'Count total words',
    'unique_gt_words': 'Unique green words',
    'unique_gt_lemmas': 'Unique green lemmas',
    'gt_terms_total': 'Total green terms found',
    'gt_terms_negated': 'Green terms that were negated',
    'negation_rate_pct': 'Negation rate (%)',
    'noun_terms': 'Green noun terms',
    'multiword_noun_terms': 'Multiword green noun terms',
    'adj_terms': 'Green adjective terms',
    'multiword_adj_terms': 'Multiword green adjective terms',
    'verb_terms': 'Green verb terms',
    'adv_terms': 'Green adverb terms',
    'multiword_adv_terms': 'Multiword green adverb terms',
    'context_dep_terms': 'Context-dependent terms',
    'dependency_terms': 'Dependency pattern terms',
    'total_sentences': 'Total sentences in document',
    'avg_words_per_sent': 'Average words per sentence',
    'overlaps_resolved': 'Overlapping terms resolved'
}

for column_name, description in column_descriptions.items():
    print(f"  {column_name:<20}: {description}")

print(f"\nData saved as: {excel_path}")
print(f"Variable available as: comm_score_df")

In [None]:
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import PatternFill
from openpyxl import load_workbook

# Define file path and output path
output_path = "data/NLP/Results/Communication_Score_df_Density.xlsx"

# Save the DataFrame to Excel. The sheet name is given explicitly so this
# re-export keeps the 'Communication_Score' sheet written by the earlier
# export cell instead of falling back to the default sheet name.
comm_score_df.to_excel(output_path, index=False, engine="openpyxl", sheet_name='Communication_Score')

# Load the workbook and sheet
wb = load_workbook(output_path)
ws = wb.active  # There's only one sheet since we saved just one DataFrame

# Auto-adjust column widths based on the longest string in each column
for col in ws.columns:
    col_letter = get_column_letter(col[0].column)
    # Compare against None rather than truthiness so numeric 0 cells are
    # measured as well; empty cells contribute nothing.
    max_length = max(
        (len(str(cell.value)) for cell in col if cell.value is not None),
        default=0,
    )
    ws.column_dimensions[col_letter].width = max_length + 3  # Add padding

# Define grey fill for alternating rows
grey_fill = PatternFill(start_color="D9D9D9", end_color="D9D9D9", fill_type="solid")

# Alternate row colors by company: the shading flips each time the value in
# column A changes, so all rows of one company share the same background.
prev_company = None
use_grey = False
for row in range(2, ws.max_row + 1):  # row 1 is the header
    current_company = ws[f"A{row}"].value  # Column A has the company names
    if current_company != prev_company:
        use_grey = not use_grey
        prev_company = current_company

    if use_grey:
        for col in range(1, ws.max_column + 1):
            ws.cell(row=row, column=col).fill = grey_fill

# Save the final cleaned and formatted workbook
wb.save(output_path)
