# In [1]:
import re
import nltk
import json
from nltk.tokenize import sent_tokenize
from collections import OrderedDict, defaultdict
import time
import pickle

# ================ Pre-compiled Regular Expressions ================
# Compiled once at import time; all of them are case-sensitive (no flags).
# One or more whitespace characters.
RE_WHITESPACE = re.compile(r'\s+')
# A lower-case conjunction (plus following whitespace) at the very start.
RE_LEADING_CONJ = re.compile(r'^(?:and|but|or|then|also|so)\s+')
# A run of commas, semicolons or periods at the very end.
RE_TRAILING_PUNCT = re.compile(r'[,;.]+$')
# Any single comma or semicolon, wherever it occurs.
RE_MIDDLE_PUNCT = re.compile(r'[,;]')
# A dangling lower-case connector (with its leading whitespace) at the very end.
RE_END_CONJ = re.compile(r'\s+(?:while|as|when|and|but|or|after|before|since|because)$')

# ================ Pre-defined Dictionaries ================
# Load from pickle file
# The path is relative, so it is resolved against the current working
# directory at import time; importing this module fails if the file is missing.
# NOTE(review): pickle.load can execute arbitrary code while unpickling.
# Only load this file from a trusted source; consider a data-only format
# such as JSON if the dictionaries are ever distributed.
with open('FAVOR_Dictionary/action_extraction_dicts.pkl', 'rb') as f:
    dict_data = pickle.load(f)

# Extract various dictionaries
# Each lookup raises KeyError at import time if the pickle lacks that key.
ANIMATE_NOUNS = dict_data["ANIMATE_NOUNS"]
SUBJECT_SYNONYMS = dict_data["SUBJECT_SYNONYMS"]
PLURAL_FORMS = dict_data["PLURAL_FORMS"]
COMMON_VERBS = dict_data["COMMON_VERBS"]
PHRASAL_VERBS = dict_data["PHRASAL_VERBS"]
IRREGULAR_VERB_FORMS = dict_data["IRREGULAR_VERB_FORMS"]
CAMERA_TERMS = dict_data["CAMERA_TERMS"]
DESCRIPTIVE_ADJECTIVES = dict_data["DESCRIPTIVE_ADJECTIVES"]
COLOR_TERMS = dict_data["COLOR_TERMS"]
CLOTHING_TERMS = dict_data["CLOTHING_TERMS"]
SETTING_TERMS = dict_data["SETTING_TERMS"]
POSITION_TERMS = dict_data["POSITION_TERMS"]
EMOTION_TERMS = dict_data["EMOTION_TERMS"]

# Pre-compile sets for efficient lookup
# (membership tests on these two collections happen per word / per sentence)
COMMON_VERBS_SET = set(COMMON_VERBS)
CAMERA_TERMS_SET = set(CAMERA_TERMS)


# ================ Helper Functions ================
def _restore_stem(stem):
    """Turn what is left after cutting -ing/-ed off a word into a base form."""
    if len(stem) > 1 and stem[-1] == stem[-2] and stem[-1] not in 'aeiou':
        # A doubled final consonant either belongs to the verb itself
        # ("call", "pass") or was added by the inflection ("stopp" -> "stop").
        return stem if stem in COMMON_VERBS_SET else stem[:-1]
    if stem + 'e' in COMMON_VERBS_SET:
        # The inflection dropped a silent e ("hop(e)d", "us(e)ing").
        return stem + 'e'
    return stem


def normalize_verb(verb_form):
    """Convert a verb form to its base form.

    Resolution order:
      1. irregular forms listed in IRREGULAR_VERB_FORMS;
      2. words that already are base forms in COMMON_VERBS_SET, which are
         returned unchanged so "need", "bring" or "focus" are not stripped;
      3. suffix heuristics for -ing, -ed and third person -s, which consult
         COMMON_VERBS_SET wherever the spelling alone is ambiguous.

    Args:
        verb_form: A single word, in any letter case.

    Returns:
        The lower-cased base form, or the lower-cased input if no rule applies.
    """
    verb = verb_form.lower()

    # Check irregular verb mappings
    for base_form, forms in IRREGULAR_VERB_FORMS.items():
        if verb in forms:
            return base_form

    # A known base form must not be suffix-stripped
    if verb in COMMON_VERBS_SET:
        return verb

    # Handle -ing forms
    if verb.endswith('ing'):
        return _restore_stem(verb[:-3])

    # Handle -ed forms
    if verb.endswith('ed'):
        # y -> ied ("tried" -> "try")
        if verb.endswith('ied') and len(verb) > 3:
            return verb[:-3] + 'y'
        return _restore_stem(verb[:-2])

    # Handle third person singular -s forms
    if verb.endswith('s') and not verb.endswith('ss'):
        if verb.endswith('es'):
            # "uses" -> "use", "ties" -> "tie", "focuses" -> "focus":
            # prefer whichever candidate is a known verb.
            for candidate in (verb[:-1], verb[:-2]):
                if candidate in COMMON_VERBS_SET:
                    return candidate
        # -ies form
        if verb.endswith('ies'):
            return verb[:-3] + 'y'
        # -es form after a sibilant
        if verb.endswith(('ches', 'shes', 'sses', 'xes', 'zes')):
            return verb[:-2]
        # Simple -s form
        return verb[:-1]

    # If no rules match, return as is
    return verb

def is_potential_verb(word):
    """Cheap test for whether *word* may be a verb.

    A word qualifies when, after lower-casing, it is a known base verb, an
    -s/-ed/-ing form whose normalized base is a known verb, or one of the
    listed irregular forms.
    """
    candidate = word.lower()

    # Known base form
    if candidate in COMMON_VERBS_SET:
        return True

    # Regular inflection of a known base form
    looks_inflected = candidate.endswith(('s', 'ed', 'ing'))
    if looks_inflected and normalize_verb(candidate) in COMMON_VERBS_SET:
        return True

    # Irregular form
    return any(candidate in forms for forms in IRREGULAR_VERB_FORMS.values())

# Pre-compile complex regex patterns and templates
def _regular_inflections(verb):
    """Return the regular inflected spellings of the base verb *verb*.

    Covers the third person singular, the past tense and the present
    participle. Whether a final consonant doubles depends on stress, which
    spelling does not reveal ("stop" -> "stopped" but "open" -> "opened"),
    so for consonant-vowel-consonant endings both spellings are returned.
    Final w, x and y never double ("play" -> "played").
    """
    vowels = 'aeiou'
    forms = set()
    ends_consonant_y = len(verb) > 1 and verb.endswith('y') and verb[-2] not in vowels
    may_double = (
        len(verb) > 2
        and verb[-1] not in 'aeiouwxy'
        and verb[-2] in vowels
        and verb[-3] not in vowels
    )

    # 3rd person singular
    if verb.endswith(('s', 'x', 'z', 'ch', 'sh')):
        forms.add(f"{verb}es")
    elif ends_consonant_y:
        forms.add(f"{verb[:-1]}ies")
    else:
        forms.add(f"{verb}s")

    # Past tense
    if verb.endswith('e'):
        forms.add(f"{verb}d")
    elif ends_consonant_y:
        forms.add(f"{verb[:-1]}ied")
    else:
        forms.add(f"{verb}ed")
        if may_double:
            forms.add(f"{verb}{verb[-1]}ed")

    # Present participle
    if verb.endswith('ie'):
        forms.add(f"{verb[:-2]}ying")  # tie -> tying
    elif verb.endswith('e') and not verb.endswith('ee'):
        forms.add(f"{verb[:-1]}ing")
    else:
        forms.add(f"{verb}ing")
        if may_double:
            forms.add(f"{verb}{verb[-1]}ing")

    return forms


def compile_all_patterns():
    """Pre-compile all commonly used regex patterns.

    Returns:
        dict with these keys:
          camera_patterns      list of compiled regexes (case-sensitive)
          descriptive_patterns list of compiled regexes (IGNORECASE)
          scene_patterns       list of compiled regexes (IGNORECASE)
          subject_indicators   list of compiled regexes (IGNORECASE)
          passive_patterns     list of compiled regexes (IGNORECASE)
          state_patterns       list of (compiled regex, formatter(match) -> str)
          reaction_patterns    list of (compiled regex, formatter(match) -> str)
          verb_patterns        list of compiled whole-word regexes (IGNORECASE),
                               ordered longest word first, then alphabetically
          specific_actions     list of (compiled regex, formatter(match) -> str)
          action_indicators    list of (keyword, compiled regex)
          ing_compounds        set of "-ing" noun compounds
          ing_compounds_set    an independent copy of that set
          attribute_patterns   list of (compiled regex, formatter(match) -> str)
    """
    patterns = {}
    
    # 1. Specific camera motion patterns
    specific_camera_patterns = [
        r'[Tt]he camera then pans',
        r'[Tt]he camera shifts',
        r'[Tt]he camera follows',
        r'[Tt]he camera pans',
        r'[Tt]he camera then shifts',
        r'[Tt]he scene transitions',
        r'[Tt]he camera (?:slowly|quickly|gradually)',
        r'[Tt]he camera moves',
        r'[Tt]he camera (?:remains|stays)',
        r'[Tt]he camera (?:is|appears to be)',
        r'[Tt]he camera continues',
        r'[Tt]he camera zooms',
        r'[Tt]he camera (?:tracks|follows)',
        r'[Aa] (?:tracking|panning|zoom|dolly) shot',
        r'[Tt]he (?:shot|frame|video|scene) transitions',
        r'[Tt]he (?:shot|frame|video|scene) changes',
        r'[Tt]he (?:focus|angle|perspective) shifts',
        r'[Tt]he video starts with',
        r'[Tt]he video begins with',
        r'[Tt]he video shows',
        r'[Tt]he scene begins with',
        r'[Tt]he scene starts with',
        r'[Cc]amera (?:then )?cuts to',
        r'[Tt]he static shot',
        r'[Ss]tationary shot',
        r'[Ss]tatic (?:shot|camera)',
        r'[Cc]apturing the.*from',
        r'[Oo]verhead (?:shot|view)',
        r'[Ff]irst-person perspective',
    ]
    
    # 2. General camera motion patterns
    general_camera_patterns = [
        r'(?:the\s+)?camera\s+(?:pan|pans|panned|panning)',
        r'(?:the\s+)?camera\s+(?:zoom|zooms|zoomed|zooming)',
        r'(?:the\s+)?camera\s+(?:follow|follows|followed|following)',
        r'(?:the\s+)?camera\s+(?:shift|shifts|shifted|shifting)',
        r'(?:the\s+)?camera\s+(?:tilt|tilts|tilted|tilting)',
        r'(?:the\s+)?camera\s+(?:track|tracks|tracked|tracking)',
        r'(?:the\s+)?camera\s+(?:shake|shakes|shook|shaking)',
        r'(?:the\s+)?camera\s+(?:remain|remains|remained|keep|keeps|kept)\s+(?:static|stationary|fixed|steady)',
        r'(?:the\s+)?camera\s+(?:move|moves|moved|moving)',
        r'(?:the\s+)?camera\s+(?:is|was)\s+(?:positioned|placed|located)',
        r'(?:the\s+)?camera\s+(?:capture|captures|captured|capturing)',
        r'(?:the\s+)?camera\s+(?:focus|focuses|focused|focusing)',
        r'(?:a|the)\s+(?:static|stationary|moving|panning|tracking|zoom)\s+shot',
        r'(?:a|the)\s+(?:close-up|medium|wide|aerial|overhead)\s+shot',
        r'[Tt]he scene (?:remains|stays) static',
        r'[Tt]he scene (?:is|appears) static',
        r'stationary (?:throughout|camera)',
        r'no significant camera movement',
        r'no camera movement',
        r'steady camera',
        r'stable camera',
        r'fixed camera',
    ]
    
    # Compile camera patterns (deliberately without IGNORECASE, as before)
    patterns['camera_patterns'] = [re.compile(p) for p in specific_camera_patterns + general_camera_patterns]
    
    # 3. Descriptive statement patterns
    descriptive_patterns = [
        r'^(?:is|are|was|were)\s+(?:visible|present|shown|displayed)',
        r'^(?:seems|seemed|appears|appeared)\s+to\s+be',
        r'^(?:looks|looked)\s+(?:like|similar)',
        r'^(?:suggesting|indicating|showing|depicting)',
        r'^(?:can\s+be\s+seen)',
        r'^(?:wearing|carrying|holding)\b',
        r'^(?:stands|standing|seated|sitting|lying)\s+in\s',
        r'what appears to be',
        r'appears to be',
        r'seems to be'
    ]
    patterns['descriptive_patterns'] = [re.compile(p, re.IGNORECASE) for p in descriptive_patterns]
    
    # 4. Scene description patterns
    scene_patterns = [
        r'^the\s+(?:scene|background|setting|video|clip)\s+',
        r'^the\s+video\s+(?:shows|depicts|begins|starts|opens)\s+',
        r'^in\s+the\s+(?:background|foreground|scene|setting)\s+',
        r'^the\s+(?:room|area|space|environment)\s+',
        r'\b(?:appears|appears\s+to\s+be|looks\s+like)\b',
        r'\b(?:suggesting|indicating|depicting)\b'
    ]
    patterns['scene_patterns'] = [re.compile(p, re.IGNORECASE) for p in scene_patterns]
    
    # 5. Subject indicator patterns
    subject_indicators = [
        r'\bthe\s+(?:\w+\s+)*(?:man|woman|person|boy|girl|child|player|individual)',
        r'\ba\s+(?:\w+\s+)*(?:man|woman|person|boy|girl|child|player|individual)',
        r'\ban\s+(?:\w+\s+)*(?:man|woman|person|boy|girl|child|player|individual)'
    ]
    patterns['subject_indicators'] = [re.compile(p, re.IGNORECASE) for p in subject_indicators]
    
    # 6. Passive voice patterns
    passive_constructions = [
        r'is\s+\w+ed', r'was\s+\w+ed', r'are\s+\w+ed', r'were\s+\w+ed',
        r'is\s+being\s+\w+ed', r'was\s+being\s+\w+ed', 
        r'has\s+been\s+\w+ed', r'have\s+been\s+\w+ed',
        r'had\s+been\s+\w+ed',
        r'is\s+\w+en', r'was\s+\w+en', r'are\s+\w+en', r'were\s+\w+en'
    ]
    patterns['passive_patterns'] = [re.compile(p, re.IGNORECASE) for p in passive_constructions]
    
    # 7. State and emotion change patterns
    state_patterns = [
        (r'expression\s+changes?\s+from\s+([^.,;:]+)\s+to\s+([^.,;:]+)', 
         lambda m: f"expression changes from {m.group(1)} to {m.group(2)}"),
        (r'expression\s+changes?\s+to\s+([^.,;:]+)', 
         lambda m: f"expression changes to {m.group(1)}"),
        (r'(?:indicating|suggesting)\s+(?:he|she|they)\s+is\s+([^.,;:]+)', 
         lambda m: f"is {m.group(1)}"),
        (r'appears\s+to\s+be\s+([^.,;:]+)', 
         lambda m: f"appears to be {m.group(1)}"),
        (r'seems\s+to\s+be\s+([^.,;:]+)', 
         lambda m: f"seems to be {m.group(1)}"),
        (r'continues\s+to\s+([^.,;:]+)', 
         lambda m: f"continues to {m.group(1)}"),
        (r'remains\s+([^.,;:]+)', 
         lambda m: f"remains {m.group(1)}")
    ]
    patterns['state_patterns'] = [(re.compile(p, re.IGNORECASE), f) for p, f in state_patterns]
    
    # 8. Reaction and interaction patterns
    reaction_patterns = [
        (r'reacting\s+to\s+([^.,;:]+)', 
         lambda m: f"reacting to {m.group(1)}"),
        (r'responding\s+to\s+([^.,;:]+)', 
         lambda m: f"responding to {m.group(1)}"),
        (r'engaged\s+with\s+([^.,;:]+)', 
         lambda m: f"engaged with {m.group(1)}"),
        (r'looks?\s+(?:at|towards|to)\s+([^.,;:]+)\s+with\s+interest', 
         lambda m: f"looks at {m.group(1)} with interest")
    ]
    patterns['reaction_patterns'] = [(re.compile(p, re.IGNORECASE), f) for p, f in reaction_patterns]
    
    # 9. Pre-compile verb patterns
    verb_pattern_parts = set()
    
    # Add all common verbs and their regular inflections
    for verb in COMMON_VERBS:
        verb_pattern_parts.add(verb)
        verb_pattern_parts.update(_regular_inflections(verb))
    
    # Add irregular verb forms
    for forms in IRREGULAR_VERB_FORMS.values():
        verb_pattern_parts.update(forms)
    
    # Add phrasal verbs
    verb_pattern_parts.update(PHRASAL_VERBS)
    
    # Convert to regex patterns and compile. Set iteration order for strings
    # changes between interpreter runs (hash randomization), so sort to keep
    # "first matching pattern" consumers reproducible. Longest first lets a
    # phrasal verb or a longer inflection win over a shorter word inside it.
    ordered_verb_words = sorted(verb_pattern_parts, key=lambda word: (-len(word), word))
    patterns['verb_patterns'] = [re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE) for word in ordered_verb_words]
    
    # 10. Specific action patterns
    specific_actions = [
        (r'searching\s+through\s+[^.,;:]+', lambda m: m.group(0)),
        (r'searching\s+for\s+[^.,;:]+', lambda m: m.group(0)),
        (r'looking\s+through\s+[^.,;:]+', lambda m: m.group(0)),
        (r'looking\s+for\s+[^.,;:]+', lambda m: m.group(0)),
        (r'walks\s+to\s+[^.,;:]+', lambda m: m.group(0)),
        (r'walking\s+to\s+[^.,;:]+', lambda m: m.group(0)),
        (r'runs\s+to\s+[^.,;:]+', lambda m: m.group(0)),
        (r'running\s+to\s+[^.,;:]+', lambda m: m.group(0)),
        (r'moves\s+to\s+[^.,;:]+', lambda m: m.group(0)),
        (r'moving\s+to\s+[^.,;:]+', lambda m: m.group(0)),
        (r'looks\s+(?:at|to|towards)\s+[^.,;:]+', lambda m: m.group(0)),
        (r'looking\s+(?:at|to|towards)\s+[^.,;:]+', lambda m: m.group(0)),
        (r'bends\s+down\s+[^.,;:]*', lambda m: m.group(0)),
        (r'picks\s+up\s+[^.,;:]+', lambda m: m.group(0)),
        (r'puts\s+(?:down|away|on|in)\s+[^.,;:]+', lambda m: m.group(0)),
        (r'reacting\s+to\s+[^.,;:]+', lambda m: m.group(0)),
        (r'speaking\s+(?:to|with)\s+[^.,;:]+', lambda m: m.group(0)),
        (r'ensures\s+[^.,;:]+', lambda m: m.group(0)),
        (r'adjusts\s+[^.,;:]+', lambda m: m.group(0)),
        (r'handles\s+[^.,;:]+', lambda m: m.group(0)),
        (r'examines\s+[^.,;:]+', lambda m: m.group(0)),
        (r'interacts\s+with\s+[^.,;:]+', lambda m: m.group(0)),
        (r'gestures\s+(?:to|towards|at)\s+[^.,;:]+', lambda m: m.group(0)),
        (r'engaged\s+in\s+[^.,;:]+', lambda m: m.group(0)),
        (r'reaches\s+(?:for|out to|towards)\s+[^.,;:]+', lambda m: m.group(0)),
        (r'hands\s+over\s+[^.,;:]+', lambda m: m.group(0)),
        (r'passes\s+[^.,;:]+', lambda m: m.group(0)),
        (r'continues\s+to\s+[^.,;:]+', lambda m: m.group(0))
    ]
    patterns['specific_actions'] = [(re.compile(p, re.IGNORECASE), f) for p, f in specific_actions]
    
    # 11. Additional verb patterns
    action_indicators = [
        ('ensures', r'\bensures\s+([^.,;:]+)'),
        ('moves', r'\bmoves\s+([^.,;:]+)'),
        ('adjusts', r'\badjusts\s+([^.,;:]+)'),
        ('observes', r'\bobserves\s+([^.,;:]+)'),
        ('watches', r'\bwatches\s+([^.,;:]+)'),
        ('continues', r'\bcontinues\s+to\s+([^.,;:]+)'),
        ('begins', r'\bbegins\s+to\s+([^.,;:]+)'),
        ('starts', r'\bstarts\s+to\s+([^.,;:]+)'),
        ('tries', r'\btries\s+to\s+([^.,;:]+)'),
        ('attempts', r'\battempts\s+to\s+([^.,;:]+)')
    ]
    patterns['action_indicators'] = [(kw, re.compile(p, re.IGNORECASE)) for kw, p in action_indicators]
    
    # 12. Create a set of -ing noun compounds
    ing_compounds = {
        'cutting board', 'serving plate', 'serving dish', 'serving tray', 'serving spoon',
        'cooking pot', 'cooking pan', 'cooking utensil', 'cooking oil', 'cooking spray',
        'baking dish', 'baking sheet', 'baking pan', 'baking powder',
        'rolling pin', 'shopping cart', 'shopping bag', 'shopping basket',
        'running water', 'drinking water', 'cooking water', 'cooking wine',
        'bottled water', 'sparkling water', 'cleaning solution', 'cleaning agent',
        'packing material', 'wrapping paper', 'writing paper',
        'dining room', 'dining table', 'dining area', 'dining chair',
        'living room', 'living area', 'living space',
        'waiting room', 'meeting room', 'dressing room',
        'cooking show', 'cooking class', 'cooking lesson', 'cooking skill',
        'cooking technique', 'cooking time', 'cooking temperature',
        'serving size', 'serving style', 'serving method',
        'serving motion', 'serving technique', 'serving position',
        'training session', 'opening ceremony', 'closing ceremony',
        'sporting event', 'running track', 'swimming pool',
        'running shoes', 'swimming suit', 'diving board',
        'starting position', 'ending position', 'turning point',
        'breaking point', 'meeting point', 'resting place',
        'turning mechanism', 'operating system', 'starting line',
        'finishing line', 'measuring device', 'counting machine',
        'serving line', 'playing field', 'scoring position',
        'bowling alley', 'racing track', 'jumping obstacle',
        'putting green', 'shooting range', 'batting cage',
        'serving position', 'playing court', 'training ground'
    }
    patterns['ing_compounds'] = ing_compounds
    
    # Kept as a separate copy under its own key for existing callers
    patterns['ing_compounds_set'] = set(ing_compounds)
    
    # 13. Attribute patterns
    attribute_patterns = [
        # Clothing
        (r'wearing\s+([^.,;:?!]+)', lambda m: "wearing " + m.group(1).strip()),
        (r'dressed\s+in\s+([^.,;:?!]+)', lambda m: "dressed in " + m.group(1).strip()),
        
        # Held items
        (r'holding\s+([^.,;:?!]+)', lambda m: "holding " + m.group(1).strip()),
        (r'carrying\s+([^.,;:?!]+)', lambda m: "carrying " + m.group(1).strip()),
        
        # Features with "with"
        (r'with\s+([^.,;:?!]+)', lambda m: "with " + m.group(1).strip()),
        
        # Location or place
        (r'in\s+([^.,;:?!]+(?:shirt|dress|jacket|uniform|clothing|clothes|outfit))', lambda m: "in " + m.group(1).strip()),
        (r'in\s+([^.,;:?!]+)', lambda m: "in " + m.group(1).strip()),  # More general "in" pattern
        
        # Appositive attributes in commas
        (r',\s+([^.,;:?!]*(?:wearing|dressed|holding|carrying|with)[^.,;:?!]*)', lambda m: m.group(1).strip())
    ]
    patterns['attribute_patterns'] = [(re.compile(p, re.IGNORECASE), f) for p, f in attribute_patterns]
    
    return patterns

# Ensure NLTK resources are downloaded
def nltk_ensure_downloaded():
    """Download the NLTK resources this module needs if they are not installed.

    Each resource is looked up locally first; only a failed lookup triggers a
    quiet download of the corresponding package.
    """
    required = (
        ('tokenizers/punkt', 'punkt'),
        ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    )
    for resource_path, package_name in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package_name, quiet=True)

# Get all possible subject nouns and their plural forms
def get_subject_nouns():
    """Return all possible subject nouns and their plural forms.

    The result contains every entry of ANIMATE_NOUNS and every synonym listed
    in SUBJECT_SYNONYMS, each together with its plural. Plurals come from
    PLURAL_FORMS when the noun is listed there, otherwise from regular English
    rules.

    Returns:
        list of str, longest first so that multi-word nouns are tried before
        their shorter parts; equal lengths are ordered alphabetically so the
        result is the same on every run.
    """
    all_nouns = set()

    def add_with_plural(noun):
        """Add *noun* and, where one can be derived, its plural form."""
        all_nouns.add(noun)

        # Irregular plural forms take precedence
        if noun in PLURAL_FORMS:
            all_nouns.add(PLURAL_FORMS[noun])
            return

        # A single trailing "s" is taken to mean the noun is already plural;
        # a double "ss" ("actress", "boss") is a singular ending.
        if noun.endswith('s') and not noun.endswith('ss'):
            return

        # Regular plural forms
        if noun.endswith(('s', 'x', 'z', 'ch', 'sh')):
            all_nouns.add(f"{noun}es")
        elif len(noun) > 1 and noun.endswith('y') and noun[-2] not in 'aeiou':
            all_nouns.add(f"{noun[:-1]}ies")  # lady -> ladies
        else:
            all_nouns.add(f"{noun}s")

    for noun in ANIMATE_NOUNS:
        add_with_plural(noun)

    # Also add synonyms and their plural forms
    for synonyms in SUBJECT_SYNONYMS.values():
        for synonym in synonyms:
            add_with_plural(synonym)

    return sorted(all_nouns, key=lambda noun: (-len(noun), noun))

# ================ Initialize Common Resources ================
# Pre-compiled regular expressions and patterns
# Built once at import time and shared by every parser instance.
COMPILED_PATTERNS = compile_all_patterns()
# List of subject nouns, longest first (see get_subject_nouns).
SUBJECT_NOUNS = get_subject_nouns()
SUBJECT_NOUNS_SET = set(SUBJECT_NOUNS)  # For fast lookups

class VideoDescriptionParser:
    """Class for parsing video descriptions to extract structured information"""
    
    def __init__(self):
        """Prepare a parser: NLTK data, shared patterns and empty caches."""
        # NLTK resources first, so a failure here leaves no half-built state
        nltk_ensure_downloaded()

        # Shared, module-level pattern table (not copied per instance)
        self.patterns = COMPILED_PATTERNS

        # Start with empty per-description caches
        self.reset_cache()
        
    def reset_cache(self):
        """Reset all caches to prepare for parsing a new description"""
        # Cache for verb checks
        self.verb_check_cache = {}
        
        # Caches for subject identification
        self.cached_subjects = None
        self.cached_subject_refs = None
        
        # Camera motion caches
        self.cached_camera_motions = []
        self.cached_camera_texts = []
        
        # Action extraction cache
        self.action_extraction_cache = {}
        
        # Caches for subject and pronoun lookups
        self.subject_lookup_cache = {}
        self.pronoun_cache = {}

    def contains_verb(self, text):
        """Check if text contains any verb, using cache for performance"""
        # Check cache
        if text in self.verb_check_cache:
            return self.verb_check_cache[text]
        
        # Quick check for common verbs
        words = text.lower().split()
        if any(word in COMMON_VERBS_SET for word in words):
            self.verb_check_cache[text] = True
            return True
        
        # Use regex for detailed checking
        has_verb = any(pattern.search(text) for pattern in self.patterns['verb_patterns'])
        self.verb_check_cache[text] = has_verb
        return has_verb
    
    def extract_main_verb(self, text):
        """Return the main verb found in *text*, or None when there is none.

        Known phrasal verbs are tried first and returned as listed. Otherwise
        the first verb pattern that matches supplies the verb; unless it is a
        form of "be", a particle or preposition directly following it is
        appended (e.g. "picks" + "up").
        """
        # Known phrasal verbs win outright
        for candidate in PHRASAL_VERBS:
            phrasal_rx = r'\b' + re.escape(candidate) + r'\b'
            if re.search(phrasal_rx, text, re.IGNORECASE):
                return candidate

        particles = ["up", "down", "in", "out", "on", "off", "over", "under", 
                     "through", "away", "back", "forward", "backward", "to", "at"]
        be_forms = ['is', 'are', 'was', 'were', 'be', 'been', 'being']

        for verb_rx in self.patterns['verb_patterns']:
            hit = verb_rx.search(text)
            if hit is None:
                continue

            found = hit.group(0)
            if found.lower() in be_forms:
                # Forms of "be" never take a particle
                return found

            # Look for a particle right after the verb (like "pick up")
            remainder = text[hit.end():].lstrip()
            particle = next(
                (p for p in particles if re.match(r'^\s*' + p + r'\b', remainder)),
                None,
            )
            return found if particle is None else found + f" {particle}"

        return None
    
    def is_descriptive_statement(self, text):
        """Check if the text is descriptive rather than action-oriented"""
        # Use precompiled patterns to check
        for pattern in self.patterns['descriptive_patterns']:
            if pattern.search(text):
                return True
        
        # Check for be-verb followed by adjective
        be_verb_adj = re.search(r'^(?:is|are|was|were)\s+(\w+)', text.lower())
        if be_verb_adj:
            adj = be_verb_adj.group(1)
            if adj in DESCRIPTIVE_ADJECTIVES:
                return True
        
        return False
    
    def is_ing_nominal_compound(self, text):
        """Check if text might be an -ing form noun compound rather than action"""
        text_lower = text.lower()
        
        # Quick check against predefined compounds set
        for compound in self.patterns['ing_compounds_set']:
            if compound in text_lower:
                return True
        
        # Check patterns that might indicate noun compounds
        nominal_patterns = [
            # -ing words modifying nouns
            r'\b(?:the|a|an|this|that|these|those|my|your|his|her|their|our)\s+(\w+ing)\s+(\w+)\b',
            
            # Preposition followed by -ing form (likely gerund)
            r'\b(?:on|in|at|by|with|from|to|through|about|after|before)\s+(?:the\s+)?(\w+ing)\b',
            
            # Possessive followed by -ing form (clearly gerund)
            r"\b(?:his|her|their|its|one's)\s+(\w+ing)\b",
            
            # -ing form followed by common nouns that often form compounds
            r'\b(\w+ing)\s+(?:room|table|board|surface|line|position|style|technique|machine|device|system|method|process|approach)\b'
        ]
        
        for pattern in nominal_patterns:
            match = re.search(pattern, text_lower)
            if match:
                # Make sure this isn't part of a progressive verb structure
                if not re.search(r'\bis\s+\w+ing\b', text_lower) and not re.search(r'\bwas\s+\w+ing\b', text_lower):
                    return True
        
        # Process-related ing forms when used as nouns
        process_ing_forms = [
            'cooking', 'baking', 'frying', 'boiling', 'washing', 'cleaning',
            'swimming', 'running', 'jumping', 'serving', 'riding', 'playing',
            'training', 'learning', 'teaching', 'writing', 'reading', 'speaking',
            'working', 'traveling', 'studying', 'thinking', 'computing', 'measuring'
        ]
        
        for ing_form in process_ing_forms:
            # If ing form appears as noun (with determiner)
            if re.search(rf'\b(?:the|a|an|this|that|some|any|all)\s+{ing_form}\b', text_lower):
                return True
        
        return False
    
    def clean_action_text(self, action):
        """Tidy an extracted action phrase.

        Punctuation is removed, whitespace collapsed, and - unless the phrase
        contains a passive construction, which is kept whole - the phrase is
        cut at the first connector found (connectors are tried in list
        order). Dangling conjunctions at either end are removed last.
        """
        # Punctuation and whitespace normalisation
        cleaned = RE_TRAILING_PUNCT.sub('', action).strip()
        cleaned = RE_MIDDLE_PUNCT.sub(' ', cleaned).strip()
        cleaned = RE_WHITESPACE.sub(' ', cleaned).strip()

        # Passive structures are kept intact; active ones are truncated
        passive = any(rx.search(cleaned) for rx in self.patterns['passive_patterns'])
        if not passive:
            for connector in ('while', 'as', 'when', 'and', 'but', 'or',
                              'after', 'before', 'since', 'because'):
                boundary = re.search(r'\s+' + connector + r'\s+', cleaned)
                if boundary:
                    # Keep only what precedes the connector
                    cleaned = cleaned[:boundary.start()].strip()
                    break

        # No conjunction may be left dangling at the end or at the start
        cleaned = RE_END_CONJ.sub('', cleaned).strip()
        return RE_LEADING_CONJ.sub('', cleaned).strip()
    
    def extract_camera_motions_from_sentence(self, sentence):
        """Collect the camera motion descriptions contained in *sentence*.

        Returns:
            (camera_motions, text_to_remove): the normalized motion strings
            without duplicates, and the matching raw fragments of the
            sentence, in the same order.
        """
        # Cheap pre-filter: no camera term, nothing to extract
        lowered = sentence.lower()
        if not any(term in lowered for term in CAMERA_TERMS_SET):
            return [], []

        motions, raw_fragments = [], []
        claimed_spans = set()  # (start, end) ranges already turned into a motion

        for camera_rx in self.patterns['camera_patterns']:
            for hit in camera_rx.finditer(sentence):
                begin = hit.start()

                # A match starting inside an earlier span was already handled
                if any(lo <= begin < hi for lo, hi in claimed_spans):
                    continue

                finish = self.find_camera_motion_end(sentence, hit.end())
                claimed_spans.add((begin, finish))

                fragment = sentence[begin:finish].strip()
                label = self.normalize_camera_motion(fragment)
                if label and label not in motions:
                    motions.append(label)
                    raw_fragments.append(fragment)

        return motions, raw_fragments
    
    def find_camera_motion_end(self, sentence, start_idx):
        """Return the index in *sentence* where a camera motion phrase ends.

        Scanning starts at *start_idx*. The phrase ends at the first of
        '.', ',', ':' or ';' that precedes any subject marker; failing that,
        at the earliest subject marker; failing that, at the end of the
        sentence.
        """
        # Markers announcing a subject description or a subject action
        subject_markers = (
            r'\ba\s+(?:man|woman|person|boy|girl)', 
            r'\bthe\s+(?:man|woman|person|boy|girl)',
            r'\bhe\b', r'\bshe\b', r'\bthey\b',
            r'\bshowing\b', r'\bdisplaying\b', r'\brevealing\b',
            r'\bas\b', r'\bwhile\b',
        )

        tail = sentence[start_idx:]
        marker_hits = (re.search(marker, tail, re.IGNORECASE) for marker in subject_markers)
        candidates = [start_idx + hit.start() for hit in marker_hits if hit]

        # Earliest marker, or the end of the sentence when there is none
        boundary = min(candidates + [len(sentence)])

        # Punctuation before that boundary ends the phrase even earlier
        first_punct = next(
            (pos for pos in range(start_idx, boundary) if sentence[pos] in '.,:;'),
            None,
        )
        return boundary if first_punct is None else first_punct
    
    def normalize_camera_motion(self, motion_text):
        """Rewrite a camera motion fragment so that it starts with "Camera".

        A leading "the camera" / "a camera" (any case) is replaced by
        "Camera"; text not starting with "camera" gets "Camera " put in
        front. The first letter is upper-cased and a single trailing
        punctuation character is removed.
        """
        stripped = motion_text.strip()
        lowered = stripped.lower()

        for article_form in ("the camera", "a camera"):
            if lowered.startswith(article_form):
                # Drop the article, keep whatever followed "camera"
                result = "Camera" + stripped[len(article_form):]
                break
        else:
            result = stripped if lowered.startswith("camera") else "Camera " + stripped

        # Capitalize first letter (result is never empty at this point)
        result = result[0].upper() + result[1:]

        # Remove one ending punctuation character
        if result and result[-1] in '.,:;!?':
            return result[:-1]
        return result
    
    def remove_camera_parts(self, text, camera_texts):
        """Return *text* with every fragment listed in *camera_texts* removed.

        With no fragments the text is returned untouched. Otherwise each
        fragment is deleted verbatim, whitespace is collapsed, and a leading
        connector word or leading punctuation left behind is removed.
        """
        if not camera_texts:
            return text

        remaining = text
        for fragment in camera_texts:
            remaining = remaining.replace(fragment, "").strip()

        # Clean up what the removals left behind
        remaining = RE_WHITESPACE.sub(' ', remaining).strip()
        for leftover_rx in (r'^(?:as|while|when|and|then)\s+', r'^\s*[,;:]\s*'):
            remaining = re.sub(leftover_rx, '', remaining).strip()

        return remaining
    
    def identify_all_subjects(self, sentences, camera_text_to_remove):
        """First pass: identify all subjects across the entire text.

        Args:
            sentences: Iterable of sentence strings, in document order.
            camera_text_to_remove: Camera fragments passed on to
                remove_camera_parts for every sentence.

        Returns:
            (subjects, subject_refs). `subjects` maps "Subject N" to a dict
            with the keys "base_noun", "attributes", "actions" (an empty
            list) and "refs". `subject_refs` is the flat list of reference
            dicts found, each tagged with its "sentence_idx".

        The result is cached on the instance; until reset_cache() is called,
        later calls return the cached pair and ignore their arguments.
        """
        # If we've already cached results, return them
        if self.cached_subjects is not None and self.cached_subject_refs is not None:
            return self.cached_subjects, self.cached_subject_refs
        
        subjects = {}  # Final subject dictionary
        subject_refs = []  # Track all subject references
        
        # First, find all potential subject mentions
        for sent_idx, sent in enumerate(sentences):
            # Try to find subjects in both original and cleaned text
            clean_sent = self.remove_camera_parts(sent, camera_text_to_remove)
            texts_to_check = [sent, clean_sent] if clean_sent != sent else [sent]
            
            for text in texts_to_check:
                if not text.strip():
                    continue
                    
                # Skip sentences that purely describe scene/environment
                # (as decided by is_scene_description, defined outside this view)
                if self.is_scene_description(text):
                    continue
                    
                # Find subjects in this text
                found_subjects = self.find_subjects_in_sentence(text)
                
                for subj in found_subjects:
                    # Add sentence position (mutates the dict returned by the helper)
                    subj["sentence_idx"] = sent_idx
                    
                    # Check if this exact subject reference already exists
                    # NOTE(review): "start_idx" presumably is an offset into the
                    # text that was searched; if so, the same mention found in
                    # both the original and the cleaned sentence can carry
                    # different offsets and be kept twice - confirm against
                    # find_subjects_in_sentence.
                    if not any(existing["start_idx"] == subj["start_idx"] and 
                              existing["sentence_idx"] == sent_idx for existing in subject_refs):
                        subject_refs.append(subj)
        
        # Consolidate subjects (group references to same subject)
        subject_groups = self.group_subject_references(subject_refs)
        
        # Create final subject dictionary
        for i, refs in enumerate(subject_groups):
            # Numbering follows the group index, so a group skipped below
            # leaves a gap in the "Subject N" sequence.
            subj_id = f"Subject {i+1}"
            base_ref = refs[0]  # Use first reference as base
            
            # Check if it's too generic a subject term to include
            if base_ref["noun"].lower() in ["characters", "individuals", "subjects"] and len(subject_groups) > 1:
                # Skip this generic subject if we have other more specific ones
                continue
            
            # Find most detailed attributes
            attributes = self.get_best_attributes(refs)
            
            subjects[subj_id] = {
                "base_noun": base_ref["noun"],
                "attributes": attributes,
                "actions": [],
                "refs": refs  # Keep all references for action extraction
            }
        
        # Cache results for future use
        self.cached_subjects = subjects
        self.cached_subject_refs = subject_refs
        
        return subjects, subject_refs
    
    def identify_subjects_generic(self, sentences, camera_text_to_remove):
        """Identify subjects generically when specific identification fails.

        Three strategies are tried in order; the first one that yields
        anything wins:

        1. ``<a|an|the> <subject noun> in <color> <clothing>`` phrases found
           in the raw sentences (camera text included).
        2. Pronoun counts over the camera-free sentences: one generic subject
           (man / woman / people) per pronoun family seen, most frequent first.
        3. A single generic ``person`` subject.

        Args:
            sentences: Sequence of sentence strings.
            camera_text_to_remove: Camera-motion phrases stripped before
                pronouns are counted.

        Returns:
            ``(subjects, subject_refs)`` in the same shape produced by
            ``identify_all_subjects``; each subject carries exactly one ref.
        """
        subjects = {}  # Final subject dictionary
        subject_refs = []  # Track all subject references

        def add_subject(base_noun, attributes, full_description,
                        start_idx, end_idx, sentence_idx, has_definite_article):
            """Register one subject together with its single reference."""
            ref = {
                "noun": base_noun,
                "full_description": full_description,
                "attributes": attributes,
                "start_idx": start_idx,
                "end_idx": end_idx,
                "sentence_idx": sentence_idx,
                "has_definite_article": has_definite_article,
            }
            subjects[f"Subject {len(subjects) + 1}"] = {
                "base_noun": base_noun,
                "attributes": attributes,
                "actions": [],
                "refs": [ref],
            }
            subject_refs.append(ref)

        def add_generic_subject(base_noun):
            """Register a placeholder subject anchored at the first sentence."""
            add_subject(base_noun, "", base_noun, 0, len(base_noun), 0, False)

        # Generic subjects based on pronouns and actions
        pronoun_subjects = {
            "he/him": {"nouns": ["man", "boy", "male"], "pronouns": ["he", "him", "his"]},
            "she/her": {"nouns": ["woman", "girl", "female"], "pronouns": ["she", "her", "hers"]},
            "they/them": {"nouns": ["people", "individuals", "characters"], "pronouns": ["they", "them", "their"]}
        }

        # --- Strategy 1: explicit "<noun> in <color> <clothing>" phrases ---
        for sent_idx, sent in enumerate(sentences):
            sent_lower = sent.lower()

            # Only terms that literally occur in the sentence can match, so
            # filter first instead of building a regex for every combination.
            nouns_present = [n for n in SUBJECT_NOUNS if n.lower() in sent_lower]
            if not nouns_present:
                continue
            colors_present = [c for c in COLOR_TERMS if c.lower() in sent_lower]
            clothing_present = [c for c in CLOTHING_TERMS if c.lower() in sent_lower]

            for subj_type in nouns_present:
                for color in colors_present:
                    for clothing in clothing_present:
                        # The article is captured so definiteness is read from
                        # the article itself, not from a substring of the whole
                        # match ("mother", "clothes" both contain "the").
                        pattern = (
                            rf'\b(a|an|the)\s+{re.escape(subj_type)}\s+in\s+'
                            rf'{re.escape(color)}\s+{re.escape(clothing)}\b'
                        )
                        for match in re.finditer(pattern, sent, re.IGNORECASE):
                            add_subject(
                                subj_type,
                                f"in {color} {clothing}",
                                match.group(0),
                                match.start(),
                                match.end(),
                                sent_idx,
                                match.group(1).lower() == 'the',
                            )

        # If we found subjects using specific patterns, return them
        if subjects:
            return subjects, subject_refs

        # --- Strategy 2: infer subject types from pronoun usage ---
        pronoun_counts = {family: 0 for family in pronoun_subjects}
        for sent in sentences:
            clean_sent = self.remove_camera_parts(sent, camera_text_to_remove)
            if not clean_sent.strip():
                continue

            for family, info in pronoun_subjects.items():
                for pronoun in info["pronouns"]:
                    pattern = r'\b' + pronoun + r'\b'
                    pronoun_counts[family] += len(re.findall(pattern, clean_sent, re.IGNORECASE))

        # Most frequent family first; the stable sort keeps table order on ties
        for family, count in sorted(pronoun_counts.items(), key=lambda item: item[1], reverse=True):
            if count > 0:
                # Use first noun of the family as base noun
                add_generic_subject(pronoun_subjects[family]["nouns"][0])

        # --- Strategy 3: nothing at all was found ---
        if not subjects:
            add_generic_subject("person")

        return subjects, subject_refs
    
    def find_subjects_in_sentence(self, sentence):
        """Find all subject mentions in a sentence, with their attributes.

        Patterns are tried in a fixed order and a match is ignored when its
        start already lies inside a previously recorded subject:

        1. per noun and color: ``<noun> in <color> <garment>`` and
           ``<color>[-clothed|-dressed] <noun>``;
        2. per noun: article + adjectives, demonstrative + adjectives,
           adjectives without determiner, and finally the bare noun.

        Args:
            sentence: Sentence text to scan.

        Returns:
            List of subject dicts (as built by ``extract_subject``) sorted by
            ``start_idx``. Empty when no subject noun occurs in the sentence.

        ``has_definite_article`` is true when the matched phrase contains
        the/this/that/these/those as a whole word. A plain substring test
        would also fire on words such as "another", "mother" or "clothes".
        """
        subjects = []

        # Optimization: cache lowercase version of sentence
        sentence_lower = sentence.lower()

        # Cheap pre-check: bail out when no subject term occurs at all
        if not any(noun.lower() in sentence_lower for noun in SUBJECT_NOUNS_SET):
            return subjects

        # Compiled patterns are memoised on the instance across calls
        if not hasattr(self, 'subject_pattern_cache'):
            self.subject_pattern_cache = {}
        pattern_cache = self.subject_pattern_cache

        definite_word = re.compile(r'\b(?:the|this|that|these|those)\b', re.IGNORECASE)
        separators = " \t\n.,;:?!()[]{}\"'"

        def cached_pattern(key, source):
            """Return the compiled, case-insensitive pattern stored under key."""
            compiled = pattern_cache.get(key)
            if compiled is None:
                compiled = pattern_cache[key] = re.compile(source, re.IGNORECASE)
            return compiled

        def collect(pattern, noun, need_separator=False):
            """Record every match of pattern that is not inside a known subject."""
            for match in pattern.finditer(sentence):
                start = match.start()
                # For the bare-noun pattern: require a separator (or the
                # sentence start) right before the noun
                if need_separator and start > 0 and sentence[start - 1] not in separators:
                    continue
                if self.is_part_of_existing_subject(start, subjects):
                    continue
                self.extract_subject(
                    match, noun, sentence, subjects,
                    definite_word.search(match.group()) is not None,
                )

        # Only terms that occur in the sentence can match. The order of
        # SUBJECT_NOUNS is kept as-is.
        # NOTE(review): matching "tennis player" before "player" relies on
        # SUBJECT_NOUNS already being ordered longest-first - confirm where
        # it is defined.
        nouns_present = [noun for noun in SUBJECT_NOUNS if noun.lower() in sentence_lower]
        colors_present = [color for color in COLOR_TERMS if color.lower() in sentence_lower]

        # First look for subjects with specific color descriptions
        # (e.g., "man in cyan clothing")
        for noun in nouns_present:
            noun_rx = re.escape(noun)
            for color in colors_present:
                color_rx = re.escape(color)

                # "X in [color] clothing/clothes/outfit/..."
                collect(cached_pattern(
                    f"color_{noun}_{color}",
                    rf'\b(?:A|An|The|a|an|the)?\s*{noun_rx}\s+in\s+{color_rx}\s+(?:clothing|clothes|outfit|shirt|dress|suit|jacket|pants|top)\b',
                ), noun)

                # "[color]-clothed X" or "[color] X"
                collect(cached_pattern(
                    f"color_adj_{noun}_{color}",
                    rf'\b(?:A|An|The|a|an|the)?\s*{color_rx}(?:-clothed|-dressed)?\s+{noun_rx}\b',
                ), noun)

        # Then subjects with various determiners and adjectives
        for noun in nouns_present:
            noun_rx = re.escape(noun)

            # "a/an/the [adjectives] noun"
            collect(cached_pattern(
                f"det_{noun}",
                rf'\b(?:A|An|The|a|an|the)\s+(?:[a-z]+[-\s]+)*{noun_rx}\b',
            ), noun)

            # "this/that/these/those [adjectives] noun"
            collect(cached_pattern(
                f"demo_{noun}",
                rf'\b(?:This|That|These|Those|this|that|these|those)\s+(?:[a-z]+[-\s]+)*{noun_rx}\b',
            ), noun)

            # adjectives + noun without determiner
            collect(cached_pattern(
                f"adj_{noun}",
                rf'\b(?:[A-Z][a-z]+\s+)+{noun_rx}\b',
            ), noun)

            # the bare noun
            collect(cached_pattern(f"noun_{noun}", rf'\b{noun_rx}\b'), noun, need_separator=True)

        # Sort by position in sentence
        subjects.sort(key=lambda x: x["start_idx"])
        return subjects
    
    def is_part_of_existing_subject(self, pos, subjects):
        """Tell whether ``pos`` falls in the half-open ``[start_idx, end_idx)`` span of any subject."""
        for entry in subjects:
            if entry["start_idx"] <= pos < entry["end_idx"]:
                return True
        return False
    
    def extract_subject(self, match, noun, text, subjects, has_definite_article):
        """Build a subject record for ``match`` and append it to ``subjects``.

        Args:
            match: Regex match whose start marks where the subject begins.
            noun: Subject noun the match was found for.
            text: Sentence the match came from.
            subjects: List that receives the new record (mutated in place).
            has_definite_article: Flag stored verbatim on the record.
        """
        begin = match.start()

        # Full phrase (noun plus trailing attributes) and where it ends
        phrase, finish = self.extract_complete_subject(text, begin, noun)

        record = {
            "noun": noun,
            "full_description": phrase,
            "attributes": self.extract_subject_attributes(phrase, noun),
            "start_idx": begin,
            "end_idx": finish,
            "has_definite_article": has_definite_article,
        }
        subjects.append(record)
    
    def extract_complete_subject(self, text, start_idx, noun):
        """Return the subject phrase starting at ``start_idx`` and its end offset.

        The phrase runs from ``start_idx`` through the first occurrence of
        ``noun`` and is then extended with attribute phrases ("with ...",
        "wearing ...", ...) that directly follow the noun.

        Args:
            text: Sentence being analysed.
            start_idx: Offset where the subject mention begins.
            noun: Subject noun expected at or after ``start_idx``.

        Returns:
            ``(phrase, end_idx)``. When ``noun`` is not found after
            ``start_idx`` the phrase is just ``len(noun)`` characters of text.
        """
        # Locate the noun (case-insensitively) relative to start_idx
        offset = text[start_idx:].lower().find(noun.lower())
        if offset == -1:  # Safety net: noun not present
            fallback_end = start_idx + len(noun)
            return text[start_idx:fallback_end].strip(), fallback_end

        noun_end = start_idx + offset + len(noun)

        pieces = [text[start_idx:noun_end]]  # phrase fragments, joined by single spaces
        end_pos = noun_end
        rest = text[noun_end:].lstrip()  # text still to be examined

        # Stop right away when an action follows rather than an attribute.
        # NOTE(review): `rest` has been left-stripped, so the leading `^\s+`
        # cannot match and this guard looks unreachable - confirm whether
        # `\s*` was intended.
        if re.match(r'^\s+(?:is|was|are|were)\s+(?:walking|running|jumping|sitting|standing|lying|looking|reaching)', rest):
            return pieces[0].strip(), end_pos

        # Words that introduce an attribute phrase; each is tried once, in order
        markers = (
            r'\bwith\b', r'\bin\b', r'\bwearing\b', r'\bholding\b', r'\bcarrying\b',
            r'\bwho\s+is\b', r'\bdressed\s+in\b', r'\bhaving\b', r'\bclad\s+in\b',
            r'\bequipped\s+with\b', r'\bfeaturing\b', r'\bdisplaying\b',
        )

        for marker in markers:
            if not re.search(f"^\\s*{marker}\\s+", rest):
                continue

            boundary = self.find_attribute_boundary(rest)
            if boundary > 0:
                # Take the attribute and keep scanning what follows it
                phrase = rest[:boundary].strip()
                pieces.append(phrase)
                end_pos += len(phrase) + 1  # +1 for the joining space
                rest = rest[boundary:].lstrip()
                continue

            # No usable boundary: fall back to the next punctuation mark
            punct = re.search(r'[.,;:]', rest)
            if punct:
                phrase = rest[:punct.start()].strip()
                pieces.append(phrase)
                end_pos += len(phrase) + 1

        # Relative clause describing the subject ("who/that/which is ...").
        # NOTE(review): same `^\s+` versus left-stripped text mismatch as the
        # action guard - this branch also looks unreachable.
        if re.search(r'^\s+(?:who|that|which)\s+(?:is|are|was|were)\s+(?!walking|running|jumping|sitting|standing)', rest):
            clause_end = self.find_attribute_boundary(rest)
            if clause_end > 0:
                clause = rest[:clause_end].strip()
                pieces.append(clause)
                end_pos += len(clause) + 1

        return " ".join(pieces).strip(), end_pos
    
    def find_attribute_boundary(self, text):
        """Return the offset at which an attribute phrase in ``text`` ends.

        The boundary is the earliest of:

        * the first ``.``, ``,``, ``;`` or ``:``;
        * the earliest match of any pattern in ``self.patterns['verb_patterns']``;
        * the earliest connector word (and/while/then/as/but/or,
          case-insensitive).

        All patterns are consulted and the smallest start offset is used.
        Stopping at the first pattern that happens to match would return a
        later boundary whenever an earlier-listed pattern matches further
        right in the text.

        Args:
            text: Text that starts with the attribute phrase.

        Returns:
            Offset of the boundary, or ``len(text)`` when none is found.
        """
        limit = len(text)

        # Punctuation
        punct_match = re.search(r'[.,;:]', text)
        punct_pos = punct_match.start() if punct_match else limit

        # Verbs: smallest start over every verb pattern
        verb_hits = (pattern.search(text) for pattern in self.patterns['verb_patterns'])
        verb_pos = min((hit.start() for hit in verb_hits if hit), default=limit)

        # Connectors: one alternation finds the leftmost connector directly
        connector_match = re.search(r'\b(?:and|while|then|as|but|or)\b', text, re.IGNORECASE)
        connector_pos = connector_match.start() if connector_match else limit

        # Earliest boundary wins
        return min(punct_pos, verb_pos, connector_pos)
    
    def extract_subject_attributes(self, description, noun):
        """Distil a clean attribute string from a subject description.

        Collects up to three descriptive words directly before ``noun`` plus
        every phrase produced by ``self.patterns['attribute_patterns']`` that
        does not contain an action verb, then removes duplicates.

        Args:
            description: Full subject phrase (any casing).
            noun: Subject noun contained in the phrase.

        Returns:
            Comma-separated attribute string (lower-cased), possibly empty.
        """
        desc = description.lower().strip()

        # Drop one leading article
        leading_article = next((a for a in ('the ', 'a ', 'an ') if desc.startswith(a)), None)
        if leading_article is not None:
            desc = desc[len(leading_article):]

        # Strip scene-level boilerplate that is not about the subject itself
        scene_phrases = (
            "video sequence begins with", "video shows", "video begins with",
            "the scene shows", "in the scene", "in the video", "in the frame",
            "appears to be", "what appears to be",
        )
        for phrase in scene_phrases:
            if phrase in desc:
                desc = desc.replace(phrase, "").strip()

        attributes = []

        # 1. Adjectives/descriptors in front of the noun (last three words at most)
        before_noun = re.search(rf'(.*?)\b{re.escape(noun.lower())}\b', desc)
        if before_noun:
            function_words = ['the', 'a', 'an', 'this', 'that', 'these', 'those', 'and', 'or']
            kept = [
                word for word in before_noun.group(1).strip().split()[-3:]
                if word.lower() not in function_words
            ]
            if kept:
                attributes.append(" ".join(kept))

        # 2. Attribute phrases after the noun, via the precompiled patterns
        for pattern, formatter in self.patterns['attribute_patterns']:
            for found in pattern.finditer(desc):
                rendered = formatter(found)
                # Phrases carrying an action verb are actions, not attributes
                if not self.contains_action_verb(rendered):
                    attributes.append(rendered)

        # 3. Merge, re-split on commas and keep the first spelling of each
        #    attribute (compared in normalized form)
        merged = ", ".join(attr for attr in attributes if attr)
        unique_parts = []
        seen_normalized = []
        for piece in (chunk.strip() for chunk in merged.split(',')):
            canonical = self.normalize_attribute(piece)
            if canonical and canonical not in seen_normalized:
                seen_normalized.append(canonical)
                unique_parts.append(piece)

        return ", ".join(unique_parts)
    
    def contains_action_verb(self, text):
        """Check whether ``text`` contains an action verb unfit for an attribute.

        Finite forms ("walks", "smiles", ...) always count. ``-ing`` forms
        count unless the occurrence is directly preceded by the determiner
        the/a/an as a whole word ("the walking stick"); every occurrence is
        examined. The determiner test is word-bounded so that words merely
        ending in "a" or "an" ("man walking", "pizza walking") do not mask
        the verb.

        Results are memoised in ``self.subject_lookup_cache`` under
        ``"action_verb_<text>"``.

        Args:
            text: Candidate attribute text.

        Returns:
            ``True`` when an action verb is present, else ``False``.
        """
        cache_key = f"action_verb_{text}"
        if cache_key in self.subject_lookup_cache:
            return self.subject_lookup_cache[cache_key]

        finite_verbs = (
            'walks', 'runs', 'jumps', 'sits', 'stands', 'moves', 'turns',
            'looks', 'stares', 'gazes', 'speaks', 'talks', 'says',
            'reaches', 'grabs', 'takes', 'puts', 'places', 'throws',
            'gestures', 'smiles', 'frowns', 'laughs', 'cries',
        )
        ing_verbs = (
            'walking', 'running', 'jumping', 'sitting', 'standing',
            'looking', 'speaking', 'talking',
            'reaching', 'grabbing', 'taking', 'putting', 'placing',
            'gesturing', 'smiling', 'frowning', 'laughing', 'crying',
        )

        lowered = text.lower()

        # Finite forms: any whole-word hit is enough
        found = re.search(r'\b(?:' + '|'.join(finite_verbs) + r')\b', lowered) is not None

        if not found:
            # -ing forms: a verb unless used as a noun/adjective after a determiner
            ing_pattern = r'\b(?:' + '|'.join(ing_verbs) + r')\b'
            for match in re.finditer(ing_pattern, lowered):
                preceding = lowered[:match.start()]
                if not re.search(r'\b(?:the|a|an)\s+$', preceding):
                    found = True
                    break

        self.subject_lookup_cache[cache_key] = found
        return found
    
    def normalize_attribute(self, attribute):
        """Reduce an attribute string to a canonical form for duplicate detection.

        Lower-cases and trims the text, removes one leading filler word
        (with/in/having/featuring/displaying) and rewrites a leading
        "dressed in a/the" or "clad in a/the" as "wearing a/the".

        Args:
            attribute: Attribute text; falsy values yield ``""``.

        Returns:
            The canonical string.
        """
        if not attribute:
            return ""

        text = attribute.lower().strip()

        # Remove the first matching filler prefix (at most one)
        filler = next(
            (p for p in ("with ", "in ", "having ", "featuring ", "displaying ")
             if text.startswith(p)),
            None,
        )
        if filler is not None:
            text = text[len(filler):].strip()

        # Unify clothing phrasings at the start of the text
        rewrites = (
            ("dressed in a", "wearing a"),
            ("dressed in the", "wearing the"),
            ("clad in a", "wearing a"),
            ("clad in the", "wearing the"),
        )
        for old_lead, new_lead in rewrites:
            if text.startswith(old_lead):
                text = new_lead + text[len(old_lead):]

        return text.strip()
    
    def get_best_attributes(self, refs):
        """Pick the most descriptive attribute string among ``refs``.

        Preference order: references mentioning appearance/held items, then
        references mentioning a color or an age word, then any reference with
        attributes. Within the chosen tier the longest attribute string wins
        (the earliest one on ties) and duplicate comma-separated parts are
        removed.

        Args:
            refs: Reference dicts, each with an ``"attributes"`` string.

        Returns:
            The selected, de-duplicated attribute string, or ``""``.
        """
        def longest_deduped(candidates):
            """Longest attribute string of candidates with duplicate parts dropped."""
            attributes = max(candidates, key=lambda ref: len(ref["attributes"]))["attributes"]
            kept = []
            kept_normalized = []
            for piece in (chunk.strip() for chunk in attributes.split(',')):
                canonical = self.normalize_attribute(piece)
                if piece and canonical and canonical not in kept_normalized:
                    kept_normalized.append(canonical)
                    kept.append(piece)
            return ", ".join(kept)

        # Tier 1: clothing / appearance / held items
        appearance_words = ["wearing", "dressed", "holding", "carrying", "with", "glasses", "shirt", "hat"]
        with_appearance = [
            ref for ref in refs
            if any(word in ref["attributes"].lower() for word in appearance_words)
        ]
        if with_appearance:
            return longest_deduped(with_appearance)

        # Tier 2: color or age
        with_color_or_age = [
            ref for ref in refs
            if any(word in ref["attributes"].lower()
                   for word in COLOR_TERMS + ["young", "old", "elderly", "middle-aged"])
        ]
        if with_color_or_age:
            return longest_deduped(with_color_or_age)

        # Tier 3: anything non-empty
        with_any = [ref for ref in refs if ref["attributes"]]
        if with_any:
            return longest_deduped(with_any)

        return ""
    
    def group_subject_references(self, subject_refs):
        """Partition references into groups that denote the same subject.

        The first still-ungrouped reference seeds a group; every later
        ungrouped reference that ``references_same_subject_strict`` accepts
        against that seed joins it. Matching is against the seed only, not
        against other members of the group.

        Args:
            subject_refs: Reference dicts in order of discovery.

        Returns:
            List of groups (lists of references); input order is preserved
            both across and within groups.
        """
        groups = []
        pending = list(subject_refs)

        while pending:
            seed, others = pending[0], pending[1:]
            members = [seed]
            unmatched = []

            for candidate in others:
                # Strict matching keeps distinct subjects (e.g. different colors) apart
                if self.references_same_subject_strict(seed, candidate):
                    members.append(candidate)
                else:
                    unmatched.append(candidate)

            groups.append(members)
            pending = unmatched

        return groups
    
    def references_same_subject_strict(self, ref1, ref2):
        """Decide whether two references clearly denote the same subject.

        The nouns must be equal, be related through ``SUBJECT_SYNONYMS``, or
        one must be a multi-word compound ending in the other as a separate
        word ("tennis player" / "player"). A bare suffix test is not used
        because it would equate "woman" with "man"; as a consequence
        single-token compounds such as "policeman" no longer match "man".

        After that, color and clothing words found in the attributes are
        compared; conflicting colors mean different subjects. Anything still
        undecided is delegated to ``attributes_are_consistent``.

        Args:
            ref1: Reference dict with ``"noun"`` and ``"attributes"``.
            ref2: Reference dict with ``"noun"`` and ``"attributes"``.

        Returns:
            ``True`` when the references may be merged, else ``False``.
        """
        def is_compound_of(phrase, head):
            """True when phrase ends with head as its own word (space or hyphen before it)."""
            return phrase.endswith((" " + head, "-" + head))

        def whole_terms(terms, attr):
            """Terms that occur in attr delimited by spaces or the string edges."""
            padded = f" {attr} "
            return {term for term in terms if f" {term} " in padded}

        noun1, noun2 = ref1["noun"], ref2["noun"]

        if noun1 != noun2:
            # Related through the predefined synonym dictionary?
            same_meaning = any(
                (noun1 == base_noun or noun1 in synonyms) and
                (noun2 == base_noun or noun2 in synonyms)
                for base_noun, synonyms in SUBJECT_SYNONYMS.items()
            )

            # Or one a compound built on the other (e.g. "tennis player" / "player")?
            if not same_meaning:
                same_meaning = is_compound_of(noun2, noun1) or is_compound_of(noun1, noun2)

            if not same_meaning:
                return False

        attr1 = ref1["attributes"].lower()
        attr2 = ref2["attributes"].lower()

        # Color words
        color_words1 = whole_terms(COLOR_TERMS, attr1)
        color_words2 = whole_terms(COLOR_TERMS, attr2)

        # If both have colors and they differ, they're different subjects
        if color_words1 and color_words2 and not color_words1.intersection(color_words2):
            return False

        # Clothing words
        clothing1 = whole_terms(CLOTHING_TERMS, attr1)
        clothing2 = whole_terms(CLOTHING_TERMS, attr2)

        if color_words1 and clothing1 and color_words2 and clothing2:
            # Both describe colored clothing: the color sets must agree exactly
            return color_words1 == color_words2

        # If one has "in [color] clothing" and the other doesn't mention color
        # or mentions a different color, they're different subjects
        if (("in" in attr1 and color_words1 and clothing1) or
                ("in" in attr2 and color_words2 and clothing2)):
            if color_words1 != color_words2:
                return False

        # In other cases, use more relaxed matching
        return self.attributes_are_consistent(ref1["attributes"], ref2["attributes"])
    
    def attributes_are_consistent(self, attrs1, attrs2):
        """Return True unless the two attribute strings contradict each other.

        Each string is lower-cased and split on whitespace; surrounding
        punctuation is trimmed from every token and a fixed stopword list is
        dropped. For each category (colors, age, hair, clothing) the two
        descriptions contradict when both mention at least one term of the
        category but have no term of that category in common.
        """
        ignored = {'in', 'with', 'a', 'an', 'the', 'and', 'of', 'on', 'at', 'by', 'to', 'for', 'who'}
        trim_chars = " ,.;:"

        def keywords(raw):
            # Plain string methods only; no regex involved.
            trimmed = (token.strip(trim_chars) for token in raw.lower().split())
            return {token for token in trimmed if token not in ignored}

        first = keywords(attrs1)
        second = keywords(attrs2)

        # One vocabulary per attribute category: colors, age, hair, clothing.
        vocabularies = (
            set(COLOR_TERMS),
            {'young', 'old', 'older', 'elderly', 'middle-aged', 'teenage', 'adult'},
            {'black-haired', 'blonde', 'white-haired', 'gray-haired', 'red-haired', 'bald'},
            set(CLOTHING_TERMS),
        )

        for vocabulary in vocabularies:
            seen_first = first & vocabulary
            seen_second = second & vocabulary
            # Both sides use the category, yet agree on nothing: contradiction.
            if seen_first and seen_second and seen_first.isdisjoint(seen_second):
                return False

        return True
    
    def is_scene_description(self, sentence):
        """Check if sentence primarily describes scene/environment rather than actions"""
        # Use precompiled patterns
        for pattern in self.patterns['scene_patterns']:
            if pattern.search(sentence):
                # Only mark as scene description if it doesn't contain clear action verbs
                return not self.contains_clear_action(sentence)
        
        return False
    
    def contains_clear_action(self, text):
        """Check if text contains clear action verbs (not just state/descriptive verbs)"""
        cache_key = f"clear_action_{text[:50]}"
        if cache_key in self.action_extraction_cache:
            return self.action_extraction_cache[cache_key]
            
        action_verbs = [
            'walk', 'run', 'jump', 'pick', 'grab', 'take', 'put', 'place', 'throw', 
            'push', 'pull', 'lift', 'turn', 'twist', 'open', 'close', 'hold', 'shake',
            'wave', 'point', 'raise', 'lower', 'enter', 'exit', 'stand', 'sit', 'move',
            'serve', 'toss', 'hit', 'strike', 'swing', 'prepare', 'position', 'adjust',
            'touch', 'reach', 'carry', 'bring', 'approach', 'step', 'pass', 'deliver',
            'focus', 'concentrate', 'aim', 'play', 'perform', 'dance', 'spin', 'twist',
            'bend', 'stretch', 'lean', 'bite', 'eat', 'drink', 'sip', 'swallow',
            'search', 'interact'
        ]
        
        text_lower = text.lower()
        
        for verb in action_verbs:
            # Check various forms of verb
            if (f" {verb} " in f" {text_lower} " or 
                f" {verb}s " in f" {text_lower} " or 
                f" {verb}ed " in f" {text_lower} " or 
                f" {verb}ing " in f" {text_lower} "):
                # Make sure it's not part of a nominal compound
                if not re.search(rf'\b(?:the|a|an)\s+{verb}ing', text_lower):
                    self.action_extraction_cache[cache_key] = True
                    return True
        
        self.action_extraction_cache[cache_key] = False
        return False
    
    def contains_potential_subject(self, text):
        """Check if text might contain a new subject"""
        # Use precompiled patterns
        for pattern in self.patterns['subject_indicators']:
            if pattern.search(text):
                return True
        
        return False
    
    def contains_pronoun(self, text):
        """Check if text contains pronouns"""
        # Cache common check results
        cache_key = f"pronoun_{text[:50]}"
        if cache_key in self.pronoun_cache:
            return self.pronoun_cache[cache_key]
            
        text_lower = f" {text.lower()} "
        pronouns = [' he ', ' she ', ' they ', ' it ', ' him ', ' her ', ' them ']
        has_pronoun = any(p in text_lower for p in pronouns)
        
        self.pronoun_cache[cache_key] = has_pronoun
        return has_pronoun
    
    def find_subject_for_reference(self, ref, subjects):
        """Find which subject a reference belongs to"""
        cache_key = f"subj_ref_{ref['sentence_idx']}_{ref['start_idx']}"
        if cache_key in self.subject_lookup_cache:
            return self.subject_lookup_cache[cache_key]
            
        for subj_id, info in subjects.items():
            if any(r["start_idx"] == ref["start_idx"] and 
                   r["sentence_idx"] == ref["sentence_idx"] 
                   for r in info["refs"]):
                self.subject_lookup_cache[cache_key] = subj_id
                return subj_id
                
        self.subject_lookup_cache[cache_key] = None
        return None
    
    def extract_subject_actions(self, sentence, subject_ref, all_subj_refs=None):
        """Extract actions related to a specific subject, avoiding actions by other subjects"""
        start_idx = subject_ref["end_idx"]
        sentence_length = len(sentence)
        
        # Determine range of possible actions for this subject (until next subject or end)
        end_idx = sentence_length
        
        if all_subj_refs:
            # Find next subject in sentence
            for other_ref in all_subj_refs:
                if other_ref["start_idx"] > start_idx and other_ref["start_idx"] < end_idx:
                    end_idx = other_ref["start_idx"]
        
        # Extract only text between current subject and next subject as potential actions
        action_text = sentence[start_idx:end_idx].strip()
        
        # Clean up text
        if action_text.startswith(',') or action_text.startswith('.'):
            action_text = action_text[1:].strip()
        
        # Remove leading conjunctions
        action_text = re.sub(r'^(?:and|then|also)\s+', '', action_text)
        
        if not action_text:
            return []
        
        # Handle potential "while" clauses that might introduce new subjects
        while_parts = action_text.split(" while ", 1)
        if len(while_parts) > 1 and self.contains_potential_subject(while_parts[1]):
            # Only process part before "while"
            action_text = while_parts[0].strip()
        
        # Handle "as" clauses that might contain subject actions
        as_parts = action_text.split(" as ", 1)
        as_clause_actions = []
        if len(as_parts) > 1:
            # Process main part
            main_action_text = as_parts[0].strip()
            
            # Check if "as" clause contains actions by current subject
            as_clause = as_parts[1].strip()
            if as_clause.startswith(('he ', 'she ', 'they ')):
                # Extract actions from "as" clause
                as_clause_actions = self.extract_clause_actions(as_clause, " as ")
        else:
            main_action_text = action_text
        
        # Split text into separate actions (by conjunctions and commas)
        action_parts = re.split(r'\s+and\s+|\s*,\s*|\s*;\s*', main_action_text)
        actions = []
        
        for part in action_parts:
            part = part.strip()
            if part and self.contains_verb(part):
                actions.append(part)
        
        # Add actions extracted from "as" clause
        actions.extend(as_clause_actions)
        
        # Try to capture additional reactions and state changes
        for pattern, formatter in self.patterns['state_patterns']:
            matches = list(pattern.finditer(action_text))
            if matches:
                for match in matches:
                    state_desc = formatter(match)
                    if state_desc not in actions:
                        actions.append(state_desc)
        
        # Capture "reacting to" patterns
        for pattern, formatter in self.patterns['reaction_patterns']:
            matches = list(pattern.finditer(action_text))
            if matches:
                for match in matches:
                    reaction_desc = formatter(match)
                    if reaction_desc not in actions:
                        actions.append(reaction_desc)
        
        # If no actions found, try more comprehensive action extraction 
        if not actions and self.contains_verb(action_text):
            return self.extract_actions(action_text)
        
        return actions
    
    def extract_clause_actions(self, clause_text, clause_type):
        """Extract actions from a clause, with special handling for as, while, when clauses"""
        clause_text = clause_text.strip()
        
        # If clause starts with pronoun, it's likely an action by the subject
        if clause_text.lower().startswith(('he ', 'she ', 'they ', 'it ')):
            # Remove pronoun
            pronoun_match = re.match(r'\b(he|she|they|it)\s+', clause_text, re.IGNORECASE)
            if pronoun_match:
                action_text = clause_text[pronoun_match.end():].strip()
                
                # Check if it contains a verb
                if self.contains_verb(action_text):
                    # Try to extract actions
                    actions = self.extract_actions(action_text)
                    return actions
        
        # When clause contains key verb patterns like ensures, moves, observes
        for keyword, pattern in self.patterns['action_indicators']:
            if keyword in clause_text.lower():
                match = pattern.search(clause_text)
                if match:
                    return [match.group(0)]
        
        # When all else fails, try to find most obvious action
        words = clause_text.split()
        for i, word in enumerate(words):
            if is_potential_verb(word):
                # Try to extract action fragment, max 5 words
                end_idx = min(i + 6, len(words))
                action = " ".join(words[i:end_idx])
                return [action]
        
        return []
    
    def extract_actions(self, text):
        """Extract discrete actions from text, using POS tagging if available"""
        # Cache computed results
        cache_key = f"extract_{text[:50]}"
        if cache_key in self.action_extraction_cache:
            return self.action_extraction_cache[cache_key]
            
        try:
            # Try using NLTK's POS tagger
            from nltk import word_tokenize, pos_tag
            
            tokens = word_tokenize(text)
            pos_tags = pos_tag(tokens)
            
            # Find verb chunks
            verb_chunks = []
            i = 0
            while i < len(pos_tags):
                # Look for verbs
                if pos_tags[i][1].startswith('VB'):
                    # Found a verb, collect verb phrase
                    start = i
                    verb = pos_tags[i][0]
                    i += 1
                    
                    # Collect auxiliary verbs (if any)
                    aux_verbs = []
                    while i < len(pos_tags) and (pos_tags[i][1].startswith('VB') or 
                                                pos_tags[i][0].lower() in ['to', 'not', "n't"]):
                        aux_verbs.append(pos_tags[i][0])
                        i += 1
                    
                    # Handle passive voice structures
                    is_passive = False
                    if verb.lower() in ['is', 'are', 'was', 'were', 'be', 'been', 'being'] and i < len(pos_tags):
                        # Check if followed by past participle
                        if pos_tags[i][1] == 'VBN' or (pos_tags[i][0].lower().endswith('ed') and not pos_tags[i][1].startswith('JJ')):
                            is_passive = True
                    
                    # Collect objects and adverbs that follow
                    objects = []
                    while i < len(pos_tags) and not pos_tags[i][1].startswith('VB'):
                        # Stop at sentence-ending punctuation or conjunctions
                        if pos_tags[i][0] in ['.', ',', ';', ':'] or (pos_tags[i][1] == 'CC' and pos_tags[i][0].lower() in ['and', 'or', 'but']):
                            break
                        objects.append(pos_tags[i][0])
                        i += 1
                    
                    # Build verb phrase
                    verb_phrase = verb
                    if aux_verbs:
                        verb_phrase += " " + " ".join(aux_verbs)
                    if objects:
                        verb_phrase += " " + " ".join(objects)
                    
                    # Skip cases that might be nominal compounds
                    if self.is_ing_nominal_compound(verb_phrase):
                        continue
                    
                    verb_chunks.append(verb_phrase)
                else:
                    i += 1
            
            # Filter for meaningful actions
            actions = [self.clean_action_text(chunk) for chunk in verb_chunks if chunk and not self.is_descriptive_statement(chunk)]
            actions = [action for action in actions if action]  # Remove any empty strings after cleaning
            
            if actions:
                self.action_extraction_cache[cache_key] = actions
                return actions
            
            # If POS tagging didn't find actions, use standard method
            fallback_actions = self.extract_actions_fallback(text)
            self.action_extraction_cache[cache_key] = fallback_actions
            return fallback_actions
            
        except Exception:
            # If NLTK fails, return regex-based method
            fallback_actions = self.extract_actions_fallback(text)
            self.action_extraction_cache[cache_key] = fallback_actions
            return fallback_actions
    
    def extract_actions_fallback(self, text):
        """Fallback method to extract actions using regex patterns"""
        # First clean up text
        text = text.strip()
        
        # Look for specific action patterns
        for pattern, extractor in self.patterns['specific_actions']:
            match = pattern.search(text)
            if match:
                return [extractor(match)]
        
        # Split at common separators
        raw_actions = []
        separators = r',\s+|\s+and\s+|\s+then\s+|\.\s+|;\s+'
        parts = re.split(separators, text)
        
        for part in parts:
            part = part.strip()
            if part:
                raw_actions.append(part)
        
        # Process actions
        actions = []
        for action in raw_actions:
            # Clean up
            action = self.clean_action_text(action)
            if not action:
                continue
                
            # Skip if it's an -ing noun compound
            if self.is_ing_nominal_compound(action):
                continue
                
            # Add if it contains a verb
            if self.contains_verb(action) and not self.is_descriptive_statement(action):
                actions.append(action)
        
        # Handle compound actions
        if len(actions) > 1:
            result = []
            prev_verb = None
            
            for action in actions:
                if self.contains_verb(action):
                    result.append(action)
                    prev_verb = self.extract_main_verb(action)
                elif prev_verb:
                    # Add verb to this action
                    result.append(f"{prev_verb} {action}")
            
            if result:
                return result
        
        return actions
    
    def extract_pronoun_actions(self, text):
        """Extract actions related to pronoun references"""
        # Find pronouns
        pronoun_match = re.search(r'\b(?:he|she|they|it|him|her|them)\b', text.lower())
        
        if not pronoun_match:
            return []
        
        # Extract text after pronoun
        action_text = text[pronoun_match.end():].strip()
        
        # Clean up
        if action_text.startswith(',') or action_text.startswith('.'):
            action_text = action_text[1:].strip()
        
        if not action_text:
            return []
        
        # Check for "as" or "while" clauses
        clause_splits = {
            " as ": action_text.split(" as ", 1),
            " while ": action_text.split(" while ", 1)
        }
        
        for clause_type, parts in clause_splits.items():
            if len(parts) > 1:
                main_actions = self.extract_actions(parts[0])
                clause_actions = self.extract_clause_actions(parts[1], clause_type)
                return main_actions + clause_actions
        
        # Split text into separate actions by conjunctions and commas
        action_parts = re.split(r'\s+and\s+|\s*,\s*|\s*;\s*', action_text)
        actions = []
        
        for part in action_parts:
            part = part.strip()
            if part and self.contains_verb(part):
                actions.append(part)
        
        # Try to capture state and emotion descriptions
        state_patterns = [
            r'(?:expression|face|mood|emotion)\s+(?:changes|changed)',
            r'appears\s+to\s+be\s+([^.,;:]+)',
            r'seems\s+to\s+be\s+([^.,;:]+)',
            r'looks\s+(?:at|to|towards)\s+([^.,;:]+)\s+with\s+interest',
            r'engaged\s+with\s+([^.,;:]+)'
        ]
        
        for pattern in state_patterns:
            matches = list(re.finditer(pattern, action_text, re.IGNORECASE))
            if matches:
                for match in matches:
                    state_desc = match.group(0)
                    if state_desc not in actions:
                        actions.append(state_desc)
        
        return actions
    
    def extract_actions_without_subject(self, text):
        """Extract actions even with no explicit subject mentioned"""
        actions = []
        
        # Split by conjunctions and punctuation
        parts = re.split(r',\s+|\s+and\s+|\s+then\s+|;\s+', text)
        
        for part in parts:
            part = part.strip()
            if not part:
                continue
                
            # Check if this part contains a verb
            if self.contains_verb(part):
                # Clean up action text
                action = part
                action = re.sub(r'^\s*(?:also|then|finally|next|afterwards|subsequently)\s+', '', action)
                action = action.strip()
                
                if action:
                    actions.append(action)
        
        return actions
    
    def extract_actions_from_as_clause(self, text):
        """Extract actions from text after 'as' that might contain subject+action"""
        actions = []
        
        # Look for subject at start of "as" clause
        subject_pattern = r'^\s*(?:a|an|the)\s+(?:[a-z\s]*(?:man|woman|person|boy|girl))\b'
        subj_match = re.search(subject_pattern, text, re.IGNORECASE)
        
        if subj_match:
            # Get text after this subject
            after_subj = text[subj_match.end():].strip()
            
            # Split by conjunctions and commas
            parts = re.split(r'\s+and\s+|\s*,\s*|\s*;\s*', after_subj)
            
            for part in parts:
                part = part.strip()
                if part and self.contains_verb(part):
                    actions.append(part)
        else:
            # Check if starts with pronoun
            pronoun_match = re.search(r'^\s*(?:he|she|they|it)\b', text, re.IGNORECASE)
            if pronoun_match:
                # Get text after pronoun
                after_pronoun = text[pronoun_match.end():].strip()
                
                # Split by conjunctions and commas
                parts = re.split(r'\s+and\s+|\s*,\s*|\s*;\s*', after_pronoun)
                
                for part in parts:
                    part = part.strip()
                    if part and self.contains_verb(part):
                        actions.append(part)
            else:
                # No explicit subject, just extract verbs
                parts = re.split(r'\s+and\s+|\s*,\s*|\s*;\s*', text)
                
                for part in parts:
                    part = part.strip()
                    if part and self.contains_verb(part):
                        actions.append(part)
        
        return actions
    
    def extract_indirect_actions(self, text, subject_ref):
        """Extract indirectly described actions like state changes, emotional reactions, etc."""
        actions = []
        
        # Optimization: if text is too short, return immediately
        if len(text) < 10:
            return actions
            
        # Check for matches in text
        start_pos = subject_ref["end_idx"] if subject_ref else 0
        text_to_check = text[start_pos:] if start_pos < len(text) else text
        
        # Use precompiled patterns
        for pattern, formatter in self.patterns['state_patterns']:
            matches = list(pattern.finditer(text_to_check))
            for match in matches:
                action_text = formatter(match)
                if action_text and action_text not in actions:
                    actions.append(action_text)
                    
        # Use precompiled reaction patterns
        for pattern, formatter in self.patterns['reaction_patterns']:
            matches = list(pattern.finditer(text_to_check))
            for match in matches:
                action_text = formatter(match)
                if action_text and action_text not in actions:
                    actions.append(action_text)
        
        return actions
    
    def extract_all_actions(self, sentences, subjects, subject_refs, camera_text_to_remove):
        """Extract all actions for identified subjects, improved version to avoid duplicates and misattribution"""
        all_actions = []
        action_idx = 0
        current_subject = None
        
        # Preprocessing: organize subject references by sentence index
        subject_refs_by_sentence = defaultdict(list)
        for ref in subject_refs:
            subject_refs_by_sentence[ref["sentence_idx"]].append(ref)
        
        # Track already extracted actions for each subject to avoid duplicates
        extracted_actions = {subj_id: set() for subj_id in subjects}
        
        # Process each sentence
        for sent_idx, sent in enumerate(sentences):
            # Cleanup - create a fully cleaned version
            clean_sent = self.remove_camera_parts(sent, camera_text_to_remove)
            
            # Skip if nothing remains after cleaning
            if not clean_sent.strip():
                continue
                
            # Skip purely descriptive sentences
            if self.is_scene_description(clean_sent):
                continue
            
            # Get all subject references in this sentence
            sentence_refs = subject_refs_by_sentence[sent_idx]
            
            # Sort by position in sentence
            sentence_refs.sort(key=lambda ref: ref["start_idx"])
            
            if sentence_refs:
                # Process each subject in order
                for subj_ref in sentence_refs:
                    # Find which subject this reference belongs to
                    subj_id = self.find_subject_for_reference(subj_ref, subjects)
                    if not subj_id:
                        continue
                        
                    current_subject = subj_id
                    
                    # Extract actions for this subject, passing all subject refs for better boundary detection
                    actions = self.extract_subject_actions(clean_sent, subj_ref, sentence_refs)
                    
                    # Try to capture more indirectly described actions
                    indirect_actions = self.extract_indirect_actions(clean_sent, subj_ref)
                    for action in indirect_actions:
                        if action not in actions:
                            actions.append(action)
                    
                    # Add to subject and chronological list
                    for action in actions:
                        if action.strip() and not self.is_descriptive_statement(action) and not self.is_ing_nominal_compound(action):
                            # Clean up any punctuation
                            action = self.clean_action_text(action)
                            # Avoid adding duplicate actions for same subject
                            if action and action not in extracted_actions[subj_id]:
                                extracted_actions[subj_id].add(action)
                                subjects[subj_id]["actions"].append(action)
                                all_actions.append((action_idx, subj_id, action))
                                action_idx += 1
            
                    # Special handling for full sentences that might contain clause actions
                    action_idx = self.process_complex_clauses(clean_sent, current_subject, subjects, extracted_actions, all_actions, action_idx)
            
            # Check for potential "as" clauses with implicit subject actions
            elif current_subject and ' as ' in clean_sent:
                as_parts = clean_sent.split(' as ', 1)
                if len(as_parts) > 1 and self.contains_clear_action(as_parts[1]):
                    # Extract actions from part after "as"
                    actions = self.extract_actions_from_as_clause(as_parts[1])
                    
                    for action in actions:
                        if action.strip() and not self.is_descriptive_statement(action) and not self.is_ing_nominal_compound(action):
                            action = self.clean_action_text(action)
                            if action and action not in extracted_actions[current_subject]:
                                extracted_actions[current_subject].add(action)
                                subjects[current_subject]["actions"].append(action)
                                all_actions.append((action_idx, current_subject, action))
                                action_idx += 1
            
            # If no explicit subject but we have current subject, check for pronoun references
            elif current_subject and self.contains_pronoun(clean_sent):
                # Extract actions for pronoun
                actions = self.extract_pronoun_actions(clean_sent)
                
                for action in actions:
                    if action.strip() and not self.is_descriptive_statement(action) and not self.is_ing_nominal_compound(action):
                        # Clean up any punctuation
                        action = self.clean_action_text(action)
                        if action and action not in extracted_actions[current_subject]:
                            extracted_actions[current_subject].add(action)
                            subjects[current_subject]["actions"].append(action)
                            all_actions.append((action_idx, current_subject, action))
                            action_idx += 1
            
            # When no explicit mention but action exists, check for implicit subject
            elif self.contains_clear_action(clean_sent) and current_subject:
                # Extract actions without explicit subject mentions
                actions = self.extract_actions_without_subject(clean_sent)
                
                for action in actions:
                    if action.strip() and not self.is_descriptive_statement(action) and not self.is_ing_nominal_compound(action):
                        # Clean up any punctuation
                        action = self.clean_action_text(action)
                        if action and action not in extracted_actions[current_subject]:
                            extracted_actions[current_subject].add(action)
                            subjects[current_subject]["actions"].append(action)
                            all_actions.append((action_idx, current_subject, action))
                            action_idx += 1
        
        return all_actions
    
    def process_complex_clauses(self, sentence, current_subject, subjects, extracted_actions, all_actions, action_idx):
        """Process clause actions in complex sentences"""
        if not current_subject:
            return action_idx
        
        # 1. Process clause markers (as, while, when, etc.)
        clause_markers = [' as ', ' while ', ' when ']
        
        for marker in clause_markers:
            if marker in sentence:
                parts = sentence.split(marker, 1)
                if len(parts) == 2:
                    clause = parts[1].strip()
                    
                    # Case 1: Pronoun-led actions (he, she, they)
                    if clause.lower().startswith(('he ', 'she ', 'they ')):
                        clause_actions = self.extract_clause_actions(clause, marker.strip())
                        for action in clause_actions:
                            action_idx = self._process_and_add_action(
                                action, current_subject, subjects, extracted_actions, all_actions, action_idx
                            )
                    
                    # Case 2: Adverb+ing form actions (occasionally watching)
                    adverb_ing_match = re.search(r'\b(\w+ly\s+\w+ing[^.,;:]*)', clause)
                    if adverb_ing_match:
                        action = adverb_ing_match.group(1)
                        action_idx = self._process_and_add_action(
                            action, current_subject, subjects, extracted_actions, all_actions, action_idx
                        )
        
        # 2. Check for ensuring behaviors (ensuring, adjusting, making sure)
        assurance_patterns = [
            r'\bensuring\s+[^.,;:]+',
            r'\badjusting\s+[^.,;:]+',
            r'\bmaking\s+sure\s+[^.,;:]+',
            r'\bensures\s+that\s+[^.,;:]+',
            r'\bensures\s+the\s+[^.,;:]+' 
        ]
        
        for pattern in assurance_patterns:
            matches = re.finditer(pattern, sentence, re.IGNORECASE)
            for match in matches:
                action = match.group(0)
                action_idx = self._process_and_add_action(
                    action, current_subject, subjects, extracted_actions, all_actions, action_idx
                )
        
        return action_idx
    
    def _process_and_add_action(self, action, subject, subjects, extracted_actions, all_actions, action_idx):
        """Process and add action to subject, including all necessary checks"""
        if action and action.strip() and not self.is_descriptive_statement(action) and not self.is_ing_nominal_compound(action):
            # Clean up any punctuation
            action = self.clean_action_text(action)
            if action and action not in extracted_actions[subject]:
                extracted_actions[subject].add(action)
                subjects[subject]["actions"].append(action)
                all_actions.append((action_idx, subject, action))
                action_idx += 1
        return action_idx
    
    def parse_video_caption(self, caption_data):
        """Parse video captions to extract structured information"""
        # Reset all caches
        self.reset_cache()
        
        for key in caption_data:
            caption = caption_data[key]

        # caption = caption_data["caption"]
        
        # Initialize result structure
        result = {
            f"{key}": {
                "model_caption": f"{caption}",
                "camera_motion": "",
                "num_subjects": "",
                "motion_list": "",
                "chronological_motion_list": ""
            }
        }
        
        # result[f"{key}"]["model_caption"] = ", ".join(caption)

        # Split caption into sentences
        sentences = sent_tokenize(caption)
        
        # Extract camera motions, maintaining order
        camera_motions = []
        camera_text_to_remove = []
        
        for sent in sentences:
            motions, text_to_remove = self.extract_camera_motions_from_sentence(sent)
            if motions:  # Only add non-empty results
                camera_motions.extend(motions)
                camera_text_to_remove.extend(text_to_remove)
        
        if camera_motions:
            result[f"{key}"]["camera_motion"] = ", ".join(camera_motions)
        
        # Cache camera text for reuse
        self.cached_camera_motions = camera_motions
        self.cached_camera_texts = camera_text_to_remove
        
        # First pass: identify all subjects across text (both original and camera-cleaned text)
        subjects, subject_refs = self.identify_all_subjects(sentences, camera_text_to_remove)
        
        # Second pass: process actions for identified subjects
        all_actions = self.extract_all_actions(sentences, subjects, subject_refs, camera_text_to_remove)
        
        # If no subjects identified, try a more relaxed approach
        if not subjects:
            # Use more generic subject identification
            subjects, subject_refs = self.identify_subjects_generic(sentences, camera_text_to_remove)
            all_actions = self.extract_all_actions(sentences, subjects, subject_refs, camera_text_to_remove)
        
        # Format output
        result[f"{key}"]["num_subjects"] = "Single subject" if len(subjects) == 1 else f"Multiple subjects ({len(subjects)})"
        
        # Format motion lists by subject
        motion_lists = []
        for subj_id, subj_info in subjects.items():
            base_type = subj_info["base_noun"].capitalize()
            actions = ', '.join(subj_info["actions"])
            motion_lists.append(f"{subj_id}: {base_type} [{actions}]")
        
        result[f"{key}"]["motion_list"] = "\n".join(motion_lists)
        
        # Add chronological action list
        all_actions.sort(key=lambda x: x[0])  # Sort by index
        chrono_actions = [f"{action} ({subj_id})" for _, subj_id, action in all_actions]
        
        result[f"{key}"]["chronological_motion_list"] = ", ".join(chrono_actions)
        
        return result

def parse_video_caption(caption_data):
    """Module-level convenience wrapper around ``VideoDescriptionParser``.

    A new ``VideoDescriptionParser`` is instantiated on every call, so no
    parser object is shared between invocations; ``caption_data`` is handed
    to its ``parse_video_caption`` method unchanged and that method's return
    value is returned as-is.
    """
    return VideoDescriptionParser().parse_video_caption(caption_data)

In [2]:
import os

def test_parser(test_case, output_path):
    """Run the caption parser over a batch of inputs and persist merged results.

    Args:
        test_case: Iterable of caption inputs. Each item is passed unchanged
            to ``parse_video_caption``; the value returned for it must be a
            mapping, because it is merged into the accumulated results with
            ``dict.update``.
        output_path: Path of the JSON results file. If the file already
            exists, its content is loaded first and the new results are
            merged into it (entries with the same key are overwritten).
            The file is rewritten once, after all cases have been processed.

    For every case the input, the processing time and the pretty-printed
    result are printed to stdout.
    """
    all_results = {}

    if os.path.exists(output_path):
        with open(output_path, 'r', encoding='utf-8') as f:
            try:
                all_results = json.load(f)
            except json.JSONDecodeError:
                print(f"Warning: Could not decode existing JSON file at {output_path}. Starting with an empty result set.")
                all_results = {}
        # Valid JSON that is not an object (e.g. a list) cannot be merged
        # into with ``update``; fall back to an empty result set instead of
        # failing later in the loop.
        if not isinstance(all_results, dict):
            print(f"Warning: Existing JSON file at {output_path} does not contain a JSON object. Starting with an empty result set.")
            all_results = {}

    # A distinct loop name keeps the ``test_case`` argument from being
    # rebound while it is being iterated.
    for case_number, case in enumerate(test_case, start=1):
        print(f"Test case {case_number}:")
        print(case)
        # perf_counter is monotonic and high-resolution, which suits
        # measuring short intervals better than wall-clock time.
        start_time = time.perf_counter()
        result = parse_video_caption(case)
        elapsed = time.perf_counter() - start_time
        print(f"Processing time: {elapsed:.4f} seconds")
        print(json.dumps(result, indent=4))
        print("\n" + "-"*80 + "\n")

        # Merge this case's result into everything collected so far.
        all_results.update(result)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=4)
    print(f"Results saved to {output_path}")

if __name__ == "__main__":
    # Destination of the merged JSON results; the directory is created on
    # demand so the run also works from a fresh checkout.
    output_path = "extract_results/LLM-free_case.json"
    os.makedirs('extract_results', exist_ok=True)

    # Sample inputs: each entry is a single-key dict whose key is the sample
    # identifier and whose value is the caption text handed to the parser.
    test_case = [
        {
            "7R8ZU": "The recorder moves forward, the girl closes the laptop with her right hand, stands up and walks backward, then turns around and walks forward, the recorder moves backward, the girl walks to the left, and stirs the food with a spatula in her right hand."
        },
        {
            "539EH": "The camera keeps stationary. The man picks up a box with both hands, then opens it with his left hand. He takes out a shoe from the box with his right hand and puts it on his foot. He then picks up another shoe with his right hand and puts it on his other foot. The man then picks up the box with both hands and turns around to sit on a chair. He picks up a phone with his right hand and makes a phone call. "
        },
        {
            "4_08_000048": "The girl speaks while playing with a frisbee in her hand, then tosses it aside, turns her head to look left, and the man blinks. "
        },
        {
            "0AYPZ": "The woman walks around, picks up the vacuum cleaner, turns it on, and starts cleaning. The light outside changes."
        },
        {
            "7O6FK": "The recorder moves forward, the man in blue shirt picks up a bag with his right hand and throws it down, then he turns around to pick up a picture frame with both hands and walks down the stairs."
        },
        {
            "26k-11-2-3|P1|3561|3966": "The recorder's right hand turns off the faucet, then picks up a cup and pours water into it. The left hand takes the cup while the right hand opens the cabinet door, placing the cup inside with the left hand. The right hand then closes the cabinet door, turns around, walks to the table, and picks up a bottle with the right hand. The left hand takes the bottle while the right hand opens the cap, then turns around again. The right hand places the cap on the table, then turns on the faucet. Both hands wash the bottle, and subsequently, the right hand turns off the faucet."
        },
    ]

    test_parser(test_case, output_path)

Test case 1:
{'7R8ZU': 'The recorder moves forward, the girl closes the laptop with her right hand, stands up and walks backward, then turns around and walks forward, the recorder moves backward, the girl walks to the left, and stirs the food with a spatula in her right hand.'}
Processing time: 0.0330 seconds
{
    "7R8ZU": {
        "model_caption": "The recorder moves forward, the girl closes the laptop with her right hand, stands up and walks backward, then turns around and walks forward, the recorder moves backward, the girl walks to the left, and stirs the food with a spatula in her right hand.",
        "camera_motion": "",
        "num_subjects": "Multiple subjects (2)",
        "motion_list": "Subject 1: Recorder [moves forward, moves backward]\nSubject 2: Girl [closes the laptop with her right hand, stands up, walks backward, turns around, walks forward, walks to the left, stirs the food with a spatula in her right hand]",
        "chronological_motion_list": "moves forward (S