In [30]:
import pandas as pd
import spacy
import re
from datetime import datetime
import numpy as np
from typing import Dict, List, Tuple
from collections import defaultdict

The section below outlines the TextPreprocessor class, which helps us clean the data.
Here is the preprocessing pipeline explained in simple terms:

First, we created a TextPreprocessor class that handles all the cleaning and processing of our documents. 
The main components of our preprocessing pipeline are:
a. Text Cleaning (clean_text method):

Removes extra spaces and standardizes how lines break
Makes sure all quotes are in the same format
Removes strange characters while keeping important ones like periods and commas
Makes sure spacing around punctuation is consistent

b. Metadata Extraction (extract_metadata method):

Collects important information about each document
Records when we processed it
Figures out how long the document is
Identifies what language it's in
Determines what kind of document it is (like an investigation report or memo)
Finds any dates mentioned in the text

c. Document Segmentation (segment_document method):

Breaks the document into logical pieces
Identifies different parts like "allegation," "background," and "investigation"
This helps us later when we want to find specific types of information


The process_dataset function ties everything together:

Takes our spreadsheet of documents
Runs each document through the preprocessing pipeline
Creates a new, enhanced version of our data with all the processed information


What makes this preprocessing special for our criminal investigation data:

It's designed to handle investigative documents specifically, recognizing parts like "allegations" and "background information"
It preserves important elements like dates, monetary values, and company names
It creates a structured format that will make it easier to extract relationships between entities later
It maintains a clear record of how we processed each document

In [31]:
class EnhancedTextPreprocessor:
    """
    Enhanced version of TextPreprocessor with crime type detection and structured entity extraction.

    A document is cleaned (`clean_text`), parsed once with the spaCy pipeline,
    and then mined for named entities (`extract_entities`), crime-type keywords
    (`detect_crime_types`) and basic metadata (`extract_metadata`).
    `process_document` runs all of these steps for one document.
    """

    # Typographic (curly) quotes mapped to their plain ASCII equivalents.
    _QUOTE_TABLE = str.maketrans({
        '\u201c': '"',
        '\u201d': '"',
        '\u201e': '"',
        '\u2018': "'",
        '\u2019': "'",
    })

    # A punctuation mark together with any whitespace surrounding it.
    _PUNCT_SPACING = re.compile(r'\s*([.,!?;:])\s*')

    # Map spaCy entity labels to our desired categories
    _LABEL_MAPPING = {
        'GPE': 'LOC',
        'LOC': 'LOC',
        'PERSON': 'PER',
        'ORG': 'ORG',
        'FAC': 'LOC',  # Facilities are mapped to locations
        'NORP': 'MISC',  # Nationalities, religious or political groups
        'PRODUCT': 'MISC',
        'EVENT': 'MISC',
        'WORK_OF_ART': 'MISC',
        'LAW': 'MISC',
        'LANGUAGE': 'MISC',
    }

    def __init__(self):
        # Load the English language model from spaCy
        self.nlp = spacy.load("en_core_web_lg")

        # Storage for document metadata. No method of this class populates it;
        # it is kept so existing callers that read the attribute keep working.
        self.document_metadata = {}

        # Define crime types and their related keywords
        self.crime_patterns = {
            'FINANCIAL_CRIME': [
                'fraud', 'embezzlement', 'money laundering', 'bribery', 'corruption',
                'insider trading', 'tax evasion', 'ponzi scheme', 'financial crime',
                'misappropriation', 'illegal transaction'
            ],
            'CYBERCRIME': [
                'hacking', 'cyber attack', 'data breach', 'ransomware', 'phishing',
                'malware', 'identity theft', 'cyber crime', 'cyber security breach',
                'computer fraud', 'network intrusion'
            ],
            'VIOLENT_CRIME': [
                'assault', 'murder', 'homicide', 'robbery', 'kidnapping',
                'terrorism', 'shooting', 'violent crime', 'armed robbery',
                'physical assault', 'violent attack'
            ],
            'ORGANIZED_CRIME': [
                'trafficking', 'smuggling', 'cartel', 'syndicate', 'gang',
                'organized crime', 'criminal enterprise', 'criminal network',
                'criminal organization', 'illegal operation'
            ],
            'PROPERTY_CRIME': [
                'theft', 'burglary', 'vandalism', 'arson', 'shoplifting',
                'property damage', 'stolen property', 'breaking and entering',
                'property crime', 'larceny'
            ]
        }

        # Compile regex patterns for crime detection
        self.crime_regex = self._compile_crime_patterns()

    def _compile_crime_patterns(self) -> Dict[str, re.Pattern]:
        """
        Compile regex patterns for each crime type.

        Returns:
            Dict[str, re.Pattern]: One case-insensitive pattern per crime type,
            matching any of that type's keywords as a whole word or phrase.
        """
        compiled_patterns = {}
        for crime_type, keywords in self.crime_patterns.items():
            # Keywords are escaped so they are always matched literally.
            alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
            pattern = r'\b(' + alternatives + r')\b'
            compiled_patterns[crime_type] = re.compile(pattern, re.IGNORECASE)
        return compiled_patterns

    def detect_crime_types(self, text: str) -> List[str]:
        """
        Detect crime types mentioned in the text.

        Returns:
            List[str]: Each crime type whose pattern matches, at most once, in
            the order the types appear in `self.crime_regex`. The order is
            therefore the same on every run (building the list from a set made
            it depend on string hashing).
        """
        return [
            crime_type
            for crime_type, pattern in self.crime_regex.items()
            if pattern.search(text)
        ]

    def extract_entities(self, text: str, doc=None) -> Dict[str, List[str]]:
        """
        Extract entities by type (LOC, PER, MISC, ORG).

        Args:
            text (str): Text to analyse.
            doc: Optional result of `self.nlp(text)`. When given it is reused
                instead of parsing `text` again.

        Returns:
            Dict[str, List[str]]: Sorted, de-duplicated entity texts per
            category. Categories without entities are absent. Labels that are
            not in the mapping (for example DATE) fall into 'MISC'.
        """
        if doc is None:
            doc = self.nlp(text)

        entities = defaultdict(set)
        for ent in doc.ents:
            # Map the entity to our categories
            category = self._LABEL_MAPPING.get(ent.label_, 'MISC')
            entities[category].add(ent.text)

        # Convert sets to sorted lists
        return {k: sorted(v) for k, v in entities.items()}

    @staticmethod
    def _respace_punctuation(match: re.Match) -> str:
        """
        Replacement callback for `_PUNCT_SPACING`.

        Returns the punctuation mark followed by a single space, except when the
        mark sits directly between two digits with no whitespace (as in "3.5",
        "1,000" or "10:30"), where it is returned unchanged so numeric tokens
        are not split apart.
        """
        punct = match.group(1)
        source = match.string
        start, end = match.span()
        is_tight = match.group(0) == punct
        if (
            is_tight
            and 0 < start
            and end < len(source)
            and source[start - 1].isdigit()
            and source[end].isdigit()
        ):
            return punct
        return punct + ' '

    def clean_text(self, text: str) -> str:
        """
        Clean and standardize text content.

        Steps, in order: convert curly quotes to ASCII quotes, drop
        non-whitespace control characters, collapse every whitespace run to a
        single space, and normalize spacing around punctuation.

        Args:
            text (str): Raw text; missing values (NaN/None) are accepted.

        Returns:
            str: The cleaned text without leading or trailing whitespace, or ""
            for a missing value.
        """
        if pd.isna(text):
            return ""

        # Convert to string if not already
        text = str(text)

        # Standardize quotes
        text = text.translate(self._QUOTE_TABLE)

        # Remove control characters. This runs before whitespace is collapsed
        # so that dropping a character cannot leave a double space behind.
        text = ''.join(char for char in text if ord(char) >= 32 or char.isspace())

        # Remove extra whitespace and standardize newlines
        text = re.sub(r'\s+', ' ', text).strip()

        # Standardize spacing around punctuation
        text = self._PUNCT_SPACING.sub(self._respace_punctuation, text)

        # The punctuation pass appends a space after a final punctuation mark.
        return text.strip()

    def process_document(self, text: str, doc_id: str) -> Tuple[Dict[str, List[str]], List[str], Dict]:
        """
        Process a single document to extract entities and crime types.

        Args:
            text (str): Raw text content
            doc_id (str): Document identifier

        Returns:
            Tuple[Dict[str, List[str]], List[str], Dict]:
            - Extracted entities by category
            - Detected crime types
            - Document metadata
        """
        # Clean the text first
        cleaned_text = self.clean_text(text)

        # Parse once and share the result: entity and metadata extraction both
        # read the same parsed document.
        doc = self.nlp(cleaned_text)

        # Extract entities
        entities = self.extract_entities(cleaned_text, doc=doc)

        # Detect crime types
        crime_types = self.detect_crime_types(cleaned_text)

        # Extract metadata
        metadata = self.extract_metadata(cleaned_text, doc_id, doc=doc)

        return entities, crime_types, metadata

    def extract_metadata(self, text: str, doc_id: str, doc=None) -> Dict:
        """
        Extract metadata from the document text.

        Args:
            text (str): Text to analyse.
            doc_id (str): Document identifier.
            doc: Optional result of `self.nlp(text)`. When given it is reused
                instead of parsing `text` again.

        Returns:
            Dict: 'doc_id', 'timestamp' (ISO-formatted time of this call),
            'length' (number of characters in `text`) and, only when at least
            one DATE entity is found, 'mentioned_dates'.
        """
        metadata = {
            'doc_id': doc_id,
            'timestamp': datetime.now().isoformat(),
            'length': len(text)
        }

        # Extract dates mentioned in the text
        if doc is None:
            doc = self.nlp(text)
        dates = [ent.text for ent in doc.ents if ent.label_ == 'DATE']
        if dates:
            metadata['mentioned_dates'] = dates

        return metadata

In [32]:
def process_dataset(df: pd.DataFrame, preprocessor=None) -> pd.DataFrame:
    """
    Process the entire dataset using the EnhancedTextPreprocessor.

    Documents that raise during processing are reported on stdout and skipped,
    so one bad document does not abort the whole run.

    Args:
        df (pd.DataFrame): Input DataFrame with document text. Each row is read
            through its 'Extracted Text' and 'Filename' columns.
        preprocessor: Optional object providing
            `process_document(text, doc_id)` that returns
            `(entities, crime_types, metadata)`. When omitted, a new
            EnhancedTextPreprocessor is created.

    Returns:
        pd.DataFrame: One row per successfully processed document with the
        columns 'filename', 'LOC', 'PER', 'ORG', 'MISC' and 'CRIME_TYPES'.
        Multiple values are joined with '; '. The columns are present even when
        no document was processed.
    """
    output_columns = ['filename', 'LOC', 'PER', 'ORG', 'MISC', 'CRIME_TYPES']

    # Initialize the preprocessor
    if preprocessor is None:
        preprocessor = EnhancedTextPreprocessor()

    # Lists to store processed data
    all_data = []

    # `position` counts rows visited; `idx` is the index label, which is not
    # necessarily an integer and is only used to identify a failing row.
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        try:
            # Extract text and filename
            text = row['Extracted Text']
            filename = row['Filename']

            # Process the document
            entities, crime_types, _metadata = preprocessor.process_document(text, filename)

            # Prepare row data
            row_data = {
                'filename': filename,
                'LOC': '; '.join(entities.get('LOC', [])),
                'PER': '; '.join(entities.get('PER', [])),
                'ORG': '; '.join(entities.get('ORG', [])),
                'MISC': '; '.join(entities.get('MISC', [])),
                'CRIME_TYPES': '; '.join(crime_types)
            }
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Error processing document {idx}: {str(e)}")
            continue

        all_data.append(row_data)

        # Print progress
        if position % 100 == 0:
            print(f"Processed {position} documents...")

    # Create output DataFrame; explicit columns keep the schema stable when
    # `all_data` is empty.
    return pd.DataFrame(all_data, columns=output_columns)

In [None]:
# Implementation
import pandas as pd
import sys
from datetime import datetime

def _top_values(column: pd.Series, limit: int = 5) -> pd.Series:
    """Return counts of the `limit` most frequent non-empty '; '-separated values in `column`."""
    tokens = column.str.split('; ').explode()
    # Rows without any value hold an empty string, which would otherwise be
    # counted as if it were a real entry.
    tokens = tokens[tokens.notna() & (tokens != '')]
    return tokens.value_counts().head(limit)


def main(
    news_path='/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/Dataset/Unstructured csvs/news_excerpts_parsed.xlsx - Sheet1 (1).csv',
    wikileaks_path='/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/Dataset/Unstructured csvs/wikileaks_parsed.xlsx - Sheet1 copy.csv',
    output_path='/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/Dataset/Cleaned_data 29 Jan/process.csv',
):
    """
    Run the full pipeline: read both CSVs, process them, save and summarize.

    Args:
        news_path: CSV of news excerpts.
        wikileaks_path: CSV of wikileaks documents. Its 'PDF Path' and 'Text'
            columns are renamed to 'Filename' and 'Extracted Text' so that it
            can be concatenated with the news data.
        output_path: Destination CSV for the processed data.

    Raises:
        Exception: Any error is reported on stderr and then re-raised.
    """
    try:
        print("Starting data processing...")
        start_time = datetime.now()

        # Read the datasets
        print("Reading datasets...")
        news_df = pd.read_csv(news_path)
        wikileaks_df = pd.read_csv(wikileaks_path)

        print(f"Read {len(news_df)} news excerpts and {len(wikileaks_df)} wikileaks documents")

        # Rename columns in wikileaks_df to match news_df
        wikileaks_df = wikileaks_df.rename(columns={
            'PDF Path': 'Filename',
            'Text': 'Extracted Text'
        })

        # Combine datasets
        print("Combining datasets...")
        combined_df = pd.concat([news_df, wikileaks_df], ignore_index=True)

        # Process the combined dataset
        print("\nProcessing documents...")
        processed_df = process_dataset(combined_df)

        # Save the processed data
        print(f"\nSaving processed data to {output_path}")
        processed_df.to_csv(output_path, index=False)

        # Print statistics
        print("\nProcessing Statistics:")
        print(f"Total documents processed: {len(processed_df)}")

        if processed_df.empty:
            # Nothing to summarize; the result columns may not even exist.
            print("\nNo documents were processed; skipping statistics.")
        else:
            print("\nCrime type distribution:")
            print(_top_values(processed_df['CRIME_TYPES']))

            print("\nEntity statistics:")
            for col in ['LOC', 'PER', 'ORG']:
                print(f"\nTop 5 most mentioned {col}:")
                print(_top_values(processed_df[col]))

        # Calculate and print processing time
        processing_time = datetime.now() - start_time
        print(f"\nTotal processing time: {processing_time}")

    except Exception as e:
        print(f"Error during processing: {str(e)}", file=sys.stderr)
        raise


if __name__ == "__main__":
    main()

Starting data processing...
Reading datasets...
Read 1509 news excerpts and 9 wikileaks documents
Combining datasets...

Processing documents...


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x107a55c90>>
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/smubia/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


Processed 100 documents...
Processed 200 documents...
Processed 300 documents...
Processed 400 documents...
Processed 500 documents...
Processed 600 documents...
Processed 700 documents...
