In [9]:
import spacy
import pandas as pd
import json
from typing import List, Dict, Set, Tuple
from collections import defaultdict
import networkx as nx
from spacy.tokens import Doc, Span
from spacy.matcher import PhraseMatcher, Matcher
import re
from datetime import datetime

Advanced Entity Analysis:

Entity role detection (e.g., director, CEO)
Confidence scoring for entities
Multiple mention tracking
Entity disambiguation


Relationship Detection:

Entity pair analysis
Sentence-level relationship extraction
Confidence scoring for relationships
Action pattern recognition


Enhanced Information Extraction:

Context preservation
Role extraction for persons and organizations
Relationship confidence scoring
Detailed mention tracking

In [10]:
class AdvancedEntityAnalyzer:
    """
    Advanced entity analysis system specifically designed for processing
    criminal investigation documents and extracting detailed entity relationships.
    Incorporates both original and processed text for improved accuracy.
    """
    
    def __init__(self):
        # Load the larger spaCy model for better accuracy
        self.nlp = spacy.load("en_core_web_lg")
        
        # Initialize matchers and storage
        self.matcher = Matcher(self.nlp.vocab)
        self.phrase_matcher = PhraseMatcher(self.nlp.vocab)
        self.entity_registry = defaultdict(dict)
        
        # Initialize patterns for relationship detection
        self._initialize_patterns()
    
    def _initialize_patterns(self):
        """
        Initialize custom patterns for entity and relationship analysis.
        Includes role patterns, action patterns, and relationship indicators.
        """
        # Role patterns (positions and titles)
        self.role_patterns = [
            'director', 'ceo', 'chairman', 'manager', 'officer',
            'president', 'head', 'leader', 'founder', 'owner',
            'executive', 'administrator', 'supervisor', 'coordinator'
        ]
        
        # Action patterns indicating relationships
        self.action_patterns = [
            'collaborated', 'worked with', 'partnered', 'associated',
            'connected to', 'linked to', 'involved with', 'related to',
            'managed', 'supervised', 'reported to', 'directed'
        ]
        
        # Relationship indicators for crime context
        self.relationship_indicators = {
            'HIERARCHICAL': [
                'reports to', 'supervised by', 'managed by', 'works under',
                'directed by', 'controlled by', 'overseen by'
            ],
            'COLLABORATION': [
                'worked with', 'partnered with', 'collaborated with',
                'assisted', 'supported', 'aided', 'helped'
            ],
            'CRIMINAL_ASSOCIATION': [
                'conspired with', 'colluded with', 'involved in',
                'participated in', 'engaged in', 'associated with'
            ]
        }
        
        # Add patterns to matcher
        for pattern_type, patterns in self.relationship_indicators.items():
            pattern = [{'LOWER': {'IN': patterns}}]
            self.matcher.add(pattern_type, [pattern])
    
    def process_document(self, original_text: str, processed_text: str, 
                        existing_entities: Dict[str, Set[str]], metadata: Dict) -> Dict:
        """
        Process a single document using both original and processed text.
        
        Args:
            original_text: Raw text content
            processed_text: Cleaned text from Step 1
            existing_entities: Entities already identified in Step 1
            metadata: Document metadata from Step 1
            
        Returns:
            Dict containing enhanced entity information and relationships
        """
        # Process both versions of text
        original_doc = self.nlp(original_text)
        processed_doc = self.nlp(processed_text)
        
        # Extract and combine entities from both versions
        original_entities = self._extract_entities(original_doc)
        processed_entities = self._extract_entities(processed_doc)
        
        # Compare and combine entity information
        enhanced_entities = self._compare_entity_versions(original_entities, 
                                                        processed_entities,
                                                        existing_entities)
        
        # Extract relationships between entities
        relationships = self._extract_relationships(original_doc, enhanced_entities)
        
        # Enhance relationships with metadata
        enhanced_relationships = self._incorporate_metadata(relationships, metadata)
        
        return {
            'entities': enhanced_entities,
            'relationships': enhanced_relationships,
            'confidence_scores': self._calculate_confidence_scores(enhanced_entities)
        }
    
    def _extract_entities(self, doc: Doc) -> Dict[str, Set[str]]:
        """
        Extract entities from a spaCy Doc object with enhanced categorization.
        """
        entities = defaultdict(set)
        
        # Enhanced entity label mapping
        label_mapping = {
            'GPE': 'LOC',
            'LOC': 'LOC',
            'FAC': 'LOC',
            'PERSON': 'PER',
            'ORG': 'ORG',
            'NORP': 'MISC',
            'PRODUCT': 'MISC',
            'EVENT': 'MISC',
            'LAW': 'MISC',
            'LANGUAGE': 'MISC'
        }
        
        for ent in doc.ents:
            category = label_mapping.get(ent.label_, 'MISC')
            # Clean and normalize entity text
            clean_text = self._clean_entity_text(ent.text)
            if clean_text:
                entities[category].add(clean_text)
        
        return dict(entities)
    
    def _clean_entity_text(self, text: str) -> str:
        """
        Clean and normalize entity text.
        """
        # Remove extra whitespace and standardize quotes
        text = ' '.join(text.split())
        text = text.replace('"', '"').replace('"', '"')
        return text.strip()
    
    def _compare_entity_versions(self, original_entities: Dict, 
                               processed_entities: Dict,
                               existing_entities: Dict) -> Dict:
        """
        Compare and combine entities from different sources.
        Assigns confidence scores based on occurrence in multiple sources.
        """
        combined_entities = defaultdict(lambda: {
            'entities': set(),
            'confidence_scores': defaultdict(float)
        })
        
        # Process each entity type
        for entity_type in set(original_entities) | set(processed_entities) | set(existing_entities):
            # Get entities from each source
            original = original_entities.get(entity_type, set())
            processed = processed_entities.get(entity_type, set())
            existing = existing_entities.get(entity_type, set())
            
            # Combine entities and calculate confidence scores
            all_entities = original | processed | existing
            for entity in all_entities:
                confidence = 0.0
                if entity in original: confidence += 0.3
                if entity in processed: confidence += 0.3
                if entity in existing: confidence += 0.4
                
                combined_entities[entity_type]['entities'].add(entity)
                combined_entities[entity_type]['confidence_scores'][entity] = min(1.0, confidence)
        
        return dict(combined_entities)
    
    def _extract_relationships(self, doc: Doc, entities: Dict) -> List[Dict]:
        """
        Extract relationships between entities with enhanced context.
        """
        relationships = []
        entity_pairs = self._generate_entity_pairs(entities)
        
        for entity1, entity2, entity_types in entity_pairs:
            relationship = self._analyze_relationship(doc, entity1, entity2, entity_types)
            if relationship:
                relationships.append(relationship)
        
        return relationships
    
    def _generate_entity_pairs(self, entities: Dict) -> List[Tuple]:
        """
        Generate meaningful entity pairs for relationship analysis.
        """
        pairs = []
        
        # Focus on PER-ORG, PER-PER, and ORG-ORG relationships
        priority_types = ['PER', 'ORG']
        
        for type1 in priority_types:
            for type2 in priority_types:
                if type1 in entities and type2 in entities:
                    entities1 = entities[type1]['entities']
                    entities2 = entities[type2]['entities']
                    
                    if type1 == type2:
                        # Avoid self-pairs for same type
                        pairs.extend([
                            (e1, e2, (type1, type2))
                            for i, e1 in enumerate(entities1)
                            for e2 in list(entities1)[i+1:]
                        ])
                    else:
                        # Cross-type pairs
                        pairs.extend([
                            (e1, e2, (type1, type2))
                            for e1 in entities1
                            for e2 in entities2
                        ])
        
        return pairs
    
    def _analyze_relationship(self, doc: Doc, entity1: str, 
                            entity2: str, entity_types: Tuple) -> Dict:
        """
        Analyze the relationship between two entities with enhanced context.
        """
        # Find sentences containing both entities
        relevant_sents = []
        for sent in doc.sents:
            if entity1.lower() in sent.text.lower() and entity2.lower() in sent.text.lower():
                relevant_sents.append(sent)
        
        if not relevant_sents:
            return None
        
        # Analyze the relationship
        relationship = {
            'entity1': {'text': entity1, 'type': entity_types[0]},
            'entity2': {'text': entity2, 'type': entity_types[1]},
            'contexts': [],
            'relationship_types': set(),
            'confidence': 0.0
        }
        
        # Analyze each relevant sentence
        for sent in relevant_sents:
            context = self._extract_relationship_context(sent, entity1, entity2)
            if context:
                relationship['contexts'].append(context)
                relationship['relationship_types'].update(context['relationship_types'])
                relationship['confidence'] = max(relationship['confidence'], 
                                              context['confidence'])
        
        if relationship['contexts']:
            relationship['relationship_types'] = list(relationship['relationship_types'])
            return relationship
        
        return None
    
    def _extract_relationship_context(self, sent: Span, entity1: str, entity2: str) -> Dict:
        """
        Extract detailed context for a relationship from a sentence.
        """
        matches = self.matcher(sent)
        relationship_types = set()
        
        # Analyze matches to determine relationship type
        for match_id, start, end in matches:
            rel_type = sent.vocab.strings[match_id]
            relationship_types.add(rel_type)
        
        # Calculate confidence based on clarity of relationship
        confidence = 0.5  # Base confidence
        if relationship_types:
            confidence += 0.3
        if len(sent) < 30:  # Shorter sentences often indicate clearer relationships
            confidence += 0.2
        
        return {
            'sentence': sent.text,
            'relationship_types': relationship_types,
            'confidence': min(1.0, confidence)
        }
    
    def _incorporate_metadata(self, relationships: List[Dict], metadata: Dict) -> List[Dict]:
        """
        Enhance relationships with temporal and document context from metadata.
        """
        enhanced_relationships = []
        
        for rel in relationships:
            # Add temporal context if available
            if 'mentioned_dates' in metadata:
                rel['temporal_context'] = metadata['mentioned_dates']
            
            # Add document metadata
            rel['document_context'] = {
                'doc_id': metadata.get('doc_id'),
                'timestamp': metadata.get('timestamp'),
                'document_type': metadata.get('document_type', 'unknown')
            }
            
            enhanced_relationships.append(rel)
        
        return enhanced_relationships
    
    def _calculate_confidence_scores(self, entities: Dict) -> Dict:
        """
        Calculate overall confidence scores for entity detection.
        """
        confidence_scores = {}
        
        for entity_type, data in entities.items():
            avg_confidence = sum(data['confidence_scores'].values()) / len(data['confidence_scores'])
            confidence_scores[entity_type] = avg_confidence
        
        return confidence_scores

In [11]:
def process_documents(input_file: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Process documents with enhanced entity recognition and relationship extraction.
    
    Args:
        input_file: Path to the CSV file from Step 1
        
    Returns:
        Tuple containing:
        - DataFrame with enhanced entity information
        - DataFrame with relationship information
    """
    print("Starting enhanced entity recognition and relationship extraction...")
    start_time = datetime.now()
    
    # Initialize analyzer
    analyzer = AdvancedEntityAnalyzer()
    
    # Read input data
    df = pd.read_csv(input_file)
    print(f"Loaded {len(df)} documents from {input_file}")
    
    # Process documents
    entities_data = []
    relationships_data = []
    
    for idx, row in df.iterrows():
        try:
            # Convert existing entities to sets
            existing_entities = {
                'LOC': set(row['LOC'].split('; ')) if pd.notna(row['LOC']) else set(),
                'PER': set(row['PER'].split('; ')) if pd.notna(row['PER']) else set(),
                'ORG': set(row['ORG'].split('; ')) if pd.notna(row['ORG']) else set(),
                'MISC': set(row['MISC'].split('; ')) if pd.notna(row['MISC']) else set()
            }
            
            # Process document
            result = analyzer.process_document(
                original_text=row['text'],
                processed_text=row['processed_text'],
                existing_entities=existing_entities,
                metadata=eval(row['metadata']) if pd.notna(row['metadata']) else {}
            )
            
            # Store entity information
            entities_data.append({
                'filename': row['filename'],
                'entities': result['entities'],
                'confidence_scores': result['confidence_scores']
            })
            
            # Store relationship information
            for rel in result['relationships']:
                rel['filename'] = row['filename']
                relationships_data.append(rel)
            
            if (idx + 1) % 100 == 0:
                print(f"Processed {idx + 1} documents...")
                
        except Exception as e:
            print(f"Error processing document {idx}: {str(e)}")
            continue
    
    # Create output DataFrames
    entities_df = pd.DataFrame(entities_data)
    relationships_df = pd.DataFrame(relationships_data)
    
    # Print processing statistics
    processing_time = datetime.now() - start_time
    print(f"\nProcessing completed in {processing_time}")
    print(f"Found {len(relationships_df)} relationships across {len(entities_df)} documents")
    
    return entities_df, relationships_df

In [13]:
import pandas as pd
import json
from datetime import datetime
import sys
from collections import defaultdict
from typing import Dict, List, Tuple
import numpy as np

def calculate_entity_statistics(df: pd.DataFrame) -> Tuple[Dict, Dict, Dict]:
    """
    Calculate detailed statistics about entities and their confidence scores.
    
    Returns:
        Tuple containing:
        - Entity counts by type
        - Average confidence scores by type
        - Most frequent entities by type
    """
    entity_counts = defaultdict(int)
    confidence_scores = defaultdict(list)
    frequent_entities = defaultdict(lambda: defaultdict(int))
    
    for _, row in df.iterrows():
        entities = row['entities']
        for entity_type, data in entities.items():
            # Count entities
            entity_counts[entity_type] += len(data['entities'])
            
            # Collect confidence scores
            confidence_scores[entity_type].extend(data['confidence_scores'].values())
            
            # Count entity frequencies
            for entity in data['entities']:
                frequent_entities[entity_type][entity] += 1
    
    # Calculate average confidence scores
    avg_confidence = {
        etype: np.mean(scores) if scores else 0
        for etype, scores in confidence_scores.items()
    }
    
    # Get top 5 most frequent entities for each type
    top_entities = {
        etype: dict(sorted(entities.items(), key=lambda x: x[1], reverse=True)[:5])
        for etype, entities in frequent_entities.items()
    }
    
    return entity_counts, avg_confidence, top_entities

def analyze_relationships(df: pd.DataFrame) -> Dict:
    """
    Analyze relationship patterns and statistics.
    """
    relationship_stats = {
        'total_count': len(df),
        'type_distribution': defaultdict(int),
        'confidence_distribution': [],
        'entity_type_pairs': defaultdict(int)
    }
    
    for _, row in df.iterrows():
        # Count relationship types
        for rel_type in row['relationship_types']:
            relationship_stats['type_distribution'][rel_type] += 1
        
        # Record confidence scores
        relationship_stats['confidence_distribution'].append(row['confidence'])
        
        # Count entity type pairs
        pair_key = f"{row['entity1']['type']}-{row['entity2']['type']}"
        relationship_stats['entity_type_pairs'][pair_key] += 1
    
    return relationship_stats

def main():
    try:
        print("Starting enhanced entity recognition and relationship extraction...")
        start_time = datetime.now()
        
        # Input and output files
        input_file = '/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/Dataset/Cleaned_data 29 Jan/process.csv'
        output_entities = '/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/Dataset/Cleaned_data 29 Jan/enhanced_entities.csv'
        output_relationships = '/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/Dataset/Cleaned_data 29 Jan/entity_relationships.csv'
        
        # Process documents
        print(f"Processing documents from {input_file}...")
        entities_df, relationships_df = process_documents(input_file)
        
        # Save results
        print(f"\nSaving enhanced entity information to {output_entities}")
        entities_df.to_csv(output_entities, index=False)
        
        print(f"Saving relationship information to {output_relationships}")
        relationships_df.to_csv(output_relationships, index=False)
        
        # Calculate statistics
        print("\nCalculating detailed statistics...")
        entity_counts, confidence_scores, frequent_entities = calculate_entity_statistics(entities_df)
        relationship_stats = analyze_relationships(relationships_df)
        
        # Print comprehensive statistics
        print("\n=== Processing Statistics ===")
        print(f"Total documents processed: {len(entities_df)}")
        
        print("\n--- Entity Statistics ---")
        print(f"Total entities found: {sum(entity_counts.values())}")
        print("\nEntities by type:")
        for entity_type, count in entity_counts.items():
            print(f"{entity_type}: {count} entities (Avg. confidence: {confidence_scores[entity_type]:.2f})")
        
        print("\nTop 5 most frequent entities by type:")
        for entity_type, entities in frequent_entities.items():
            print(f"\n{entity_type}:")
            for entity, count in entities.items():
                print(f"  - {entity}: {count} occurrences")
        
        print("\n--- Relationship Statistics ---")
        print(f"Total relationships found: {relationship_stats['total_count']}")
        
        print("\nRelationship types distribution:")
        for rel_type, count in relationship_stats['type_distribution'].items():
            print(f"  {rel_type}: {count}")
        
        print("\nEntity type pair distribution:")
        for pair, count in relationship_stats['entity_type_pairs'].items():
            print(f"  {pair}: {count}")
        
        avg_confidence = np.mean(relationship_stats['confidence_distribution'])
        print(f"\nAverage relationship confidence: {avg_confidence:.2f}")
        
        # Save statistics to file
        statistics = {
            'entity_statistics': {
                'counts': entity_counts,
                'confidence_scores': confidence_scores,
                'frequent_entities': frequent_entities
            },
            'relationship_statistics': relationship_stats,
            'processing_info': {
                'timestamp': datetime.now().isoformat(),
                'processing_time': str(datetime.now() - start_time),
                'document_count': len(entities_df)
            }
        }
        
        with open('processing_statistics.json', 'w') as f:
            json.dump(statistics, f, indent=2)
        
        # Print processing time
        processing_time = datetime.now() - start_time
        print(f"\nTotal processing time: {processing_time}")
        
    except Exception as e:
        print(f"Error during processing: {str(e)}", file=sys.stderr)
        print("\nFull error details:", file=sys.stderr)
        import traceback
        traceback.print_exc()
        raise

if __name__ == "__main__":
    main()

Starting enhanced entity recognition and relationship extraction...
Processing documents from /Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/Dataset/Cleaned_data 29 Jan/process.csv...
Starting enhanced entity recognition and relationship extraction...
Loaded 1518 documents from /Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/Dataset/Cleaned_data 29 Jan/process.csv
Processed 100 documents...
Processed 200 documents...
Processed 300 documents...
Processed 400 documents...
Processed 500 documents...
Processed 600 documents...
Processed 700 documents...
Processed 800 documents...
Processed 900 documents...
Processed 1000 documents...
Processed 1100 documents...
Processed 1200 documents...
Processed 1300 documents...
Processed 1400 documents...
Processed 1500 documents...

Processing completed in 0:00:32.566390
Found 11796 relationships across 1518 documents

Saving enhanced entity information to /Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/Dataset/Cleaned_data 29 Jan/enh