In [16]:
import pandas as pd
import networkx as nx
from collections import defaultdict
import json

In [17]:
import pandas as pd
import networkx as nx
from collections import defaultdict
import json

class CrimeNetworkAnalyzer:
    """Build, clean, measure and export an entity-relationship crime network.

    The input frame is read row by row and must provide the columns
    ``subject``, ``subject_type``, ``object``, ``object_type``,
    ``predicate``, ``crime_type`` and ``evidence_strength``.

    Typical call order: ``build_network()`` -> ``clean_network_data()``
    (which also refreshes metrics and patterns) -> ``export_for_tableau()``.

    Attributes are documented inline in ``__init__``.
    """

    # Column order of the exported tables. Passing these to the DataFrame
    # constructor keeps the headers present even when a table has no rows.
    _NODE_COLUMNS = [
        'Entity', 'Type', 'Degree', 'DegreeCentrality',
        'BetweennessCentrality', 'EigenvectorCentrality',
        'NumConnections', 'NumCrimes', 'Crimes',
    ]
    _EDGE_COLUMNS = [
        'Source', 'Target', 'Relationship', 'CrimeType', 'EvidenceStrength',
    ]
    _PATTERN_COLUMNS = ['CrimeType', 'Entity', 'EntityType', 'Centrality']

    def __init__(self, relationships_df):
        # Raw relationship rows (one row per subject/predicate/object triple).
        self.relationships_df = relationships_df
        # Undirected simple graph: a later row for the same pair of entities
        # updates (overwrites) the attributes of the existing edge.
        self.G = nx.Graph()
        # node -> dict of per-entity metrics, filled by calculate_network_metrics().
        self.entity_metrics = {}
        # crime type -> list of entity summaries, filled by identify_crime_patterns().
        self.crime_patterns = defaultdict(list)

    @staticmethod
    def _is_missing(value):
        """Return True for falsy values and for float NaN (a pandas null)."""
        # NaN is truthy, so a plain ``not value`` check never catches the
        # nulls pandas produces for empty CSV cells; NaN is the only float
        # that is not equal to itself.
        if isinstance(value, float) and value != value:
            return True
        return not value

    def build_network(self):
        """Populate ``self.G`` with one node per entity and one edge per row.

        Every node carries its ``type`` (taken from the first row that
        mentions it) and a ``crimes`` set collecting the crime types of all
        rows the entity appears in.
        """
        for _, row in self.relationships_df.iterrows():
            subject, obj = row['subject'], row['object']

            # Create nodes on first sight only, so an existing crimes set
            # is never reset by a later row.
            if not self.G.has_node(subject):
                self.G.add_node(subject, type=row['subject_type'], crimes=set())
            if not self.G.has_node(obj):
                self.G.add_node(obj, type=row['object_type'], crimes=set())

            self.G.add_edge(subject, obj,
                            relationship=row['predicate'],
                            crime_type=row['crime_type'],
                            evidence_strength=row['evidence_strength'])

            # Record the crime type on both endpoints.
            self.G.nodes[subject]['crimes'].add(row['crime_type'])
            self.G.nodes[obj]['crimes'].add(row['crime_type'])

    def calculate_network_metrics(self):
        """Recompute ``self.entity_metrics`` for the current graph.

        The mapping is rebuilt from scratch on every call, so entities that
        were removed from the graph since the previous call do not linger.
        An empty graph yields an empty mapping.

        Raises:
            networkx.PowerIterationFailedConvergence: if eigenvector
                centrality does not converge within 1000 iterations.
        """
        self.entity_metrics = {}
        # eigenvector_centrality raises on a graph without nodes.
        if self.G.number_of_nodes() == 0:
            return

        degree_cent = nx.degree_centrality(self.G)
        betweenness_cent = nx.betweenness_centrality(self.G)
        eigenvector_cent = nx.eigenvector_centrality(self.G, max_iter=1000)

        for node, attrs in self.G.nodes(data=True):
            neighbors = list(self.G.neighbors(node))
            # Sets have no stable order; sort so exports are reproducible.
            crimes = sorted(attrs['crimes'], key=str)

            self.entity_metrics[node] = {
                'type': attrs['type'],
                'degree': self.G.degree(node),
                'degree_centrality': degree_cent[node],
                'betweenness_centrality': betweenness_cent[node],
                'eigenvector_centrality': eigenvector_cent[node],
                'num_connections': len(neighbors),
                'crimes': crimes,
                'num_crimes': len(crimes),
                'connected_entities': neighbors
            }

    def identify_crime_patterns(self):
        """Group entities by crime type into ``self.crime_patterns``.

        Reads ``self.entity_metrics``; the crime type ``'Unknown'`` is
        skipped. Previous results are discarded first, so calling this
        method repeatedly does not duplicate entries.
        """
        self.crime_patterns.clear()
        for node, metrics in self.entity_metrics.items():
            for crime in metrics['crimes']:
                if crime != 'Unknown':
                    self.crime_patterns[crime].append({
                        'entity': node,
                        'type': metrics['type'],
                        'centrality': metrics['degree_centrality']
                    })

    def clean_network_data(self):
        """Drop unknown-crime edges, fill in null attributes, refresh metrics.

        Steps, in order:
          1. remove edges whose ``crime_type`` is ``'Unknown'``;
          2. remove nodes left without any edge;
          3. strip ``'Unknown'`` from each node's crimes set (values are
             converted to ``str``), substituting ``{'Unspecified'}`` when
             nothing remains;
          4. replace missing node ``type`` with ``'Unspecified'`` and missing
             edge ``evidence_strength`` with ``'unspecified'``;
          5. recompute entity metrics and crime patterns.
        """
        unknown_edges = [(u, v) for u, v, data in self.G.edges(data=True)
                         if data.get('crime_type') == 'Unknown']
        self.G.remove_edges_from(unknown_edges)

        # Materialise first: the isolates iterator must not be consumed
        # while the graph is being modified.
        self.G.remove_nodes_from(list(nx.isolates(self.G)))

        for _, attrs in self.G.nodes(data=True):
            known_crimes = {str(crime) for crime in attrs['crimes']
                            if crime != 'Unknown'}
            attrs['crimes'] = known_crimes or {'Unspecified'}

            if self._is_missing(attrs['type']):
                attrs['type'] = 'Unspecified'

        for _, _, data in self.G.edges(data=True):
            if self._is_missing(data.get('evidence_strength')):
                data['evidence_strength'] = 'unspecified'

        # Metrics and patterns must reflect the cleaned graph.
        self.calculate_network_metrics()
        self.identify_crime_patterns()

    def export_for_tableau(self, output_path_prefix):
        """Write nodes, edges and crime patterns as CSV files for Tableau.

        Files written: ``<prefix>_nodes.csv``, ``<prefix>_edges.csv`` and
        ``<prefix>_patterns.csv`` (without the index column).

        Args:
            output_path_prefix: path prefix shared by the three files.

        Returns:
            Tuple ``(nodes_df, edges_df, patterns_df)`` holding the same
            data that was written to disk.
        """
        nodes_data = [
            {
                'Entity': node,
                'Type': metrics['type'],
                'Degree': metrics['degree'],
                'DegreeCentrality': metrics['degree_centrality'],
                'BetweennessCentrality': metrics['betweenness_centrality'],
                'EigenvectorCentrality': metrics['eigenvector_centrality'],
                'NumConnections': metrics['num_connections'],
                'NumCrimes': metrics['num_crimes'],
                # str() guards against non-string crime values (e.g. NaN)
                # when the network was not cleaned before exporting.
                'Crimes': ';'.join(str(crime) for crime in metrics['crimes'])
            }
            for node, metrics in self.entity_metrics.items()
        ]

        edges_data = [
            {
                'Source': source,
                'Target': target,
                'Relationship': data['relationship'],
                'CrimeType': data['crime_type'],
                'EvidenceStrength': data['evidence_strength']
            }
            for source, target, data in self.G.edges(data=True)
        ]

        patterns_data = [
            {
                'CrimeType': crime_type,
                'Entity': entity['entity'],
                'EntityType': entity['type'],
                'Centrality': entity['centrality']
            }
            for crime_type, entities in self.crime_patterns.items()
            for entity in entities
        ]

        # Build each frame once and reuse it for both the CSV and the return.
        nodes_df = pd.DataFrame(nodes_data, columns=self._NODE_COLUMNS)
        edges_df = pd.DataFrame(edges_data, columns=self._EDGE_COLUMNS)
        patterns_df = pd.DataFrame(patterns_data, columns=self._PATTERN_COLUMNS)

        nodes_df.to_csv(f'{output_path_prefix}_nodes.csv', index=False)
        edges_df.to_csv(f'{output_path_prefix}_edges.csv', index=False)
        patterns_df.to_csv(f'{output_path_prefix}_patterns.csv', index=False)

        return nodes_df, edges_df, patterns_df

    def get_summary_statistics(self):
        """Return whole-graph statistics as a dict of numbers.

        Keys: ``num_nodes``, ``num_edges``, ``avg_degree``, ``density``,
        ``num_components``, ``avg_clustering``. For a graph without nodes
        the two averages are reported as 0.0 instead of dividing by zero.
        """
        num_nodes = self.G.number_of_nodes()
        if num_nodes:
            avg_degree = sum(degree for _, degree in self.G.degree()) / num_nodes
            avg_clustering = nx.average_clustering(self.G)
        else:
            avg_degree = 0.0
            avg_clustering = 0.0

        return {
            'num_nodes': num_nodes,
            'num_edges': self.G.number_of_edges(),
            'avg_degree': avg_degree,
            'density': nx.density(self.G),
            'num_components': nx.number_connected_components(self.G),
            'avg_clustering': avg_clustering
        }

In [18]:
# Driver cell: load the relationship CSV, build and clean the network,
# export the Tableau tables and print a short report.

# Base locations, defined once so the long absolute path is not repeated.
# The 'Tablaeu' spelling is kept exactly as in the original output path.
DATA_DIR = '/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/FINAL FINAL PLEASE/Data'
TABLEAU_DIR = f'{DATA_DIR}/Tablaeu Data'

# Load the relationships data
relationships_df = pd.read_csv(f'{DATA_DIR}/process3_crime_relationships_enhanced.csv')

# Initialize the analyzer and build the raw network
analyzer = CrimeNetworkAnalyzer(relationships_df)
analyzer.build_network()

# Record the size before cleaning so the impact can be reported below
initial_nodes = analyzer.G.number_of_nodes()
initial_edges = analyzer.G.number_of_edges()

print("\nInitial network size:")
print(f"Nodes: {initial_nodes}")
print(f"Edges: {initial_edges}")

# Clean the network data
print("\nCleaning network data...")
analyzer.clean_network_data()

# Print cleaning impact
print("\nNetwork size after removing unknown crimes:")
print(f"Nodes: {analyzer.G.number_of_nodes()} ({initial_nodes - analyzer.G.number_of_nodes()} removed)")
print(f"Edges: {analyzer.G.number_of_edges()} ({initial_edges - analyzer.G.number_of_edges()} removed)")

# clean_network_data() has already recomputed the entity metrics and the
# crime patterns for the cleaned graph, so they are not recomputed here.

# Export data for Tableau. Both file prefixes receive the same cleaned data,
# so every file the earlier version of this cell produced is still written.
for export_name in ('crime_network_clean', 'crime_network'):
    nodes_df, edges_df, patterns_df = analyzer.export_for_tableau(f'{TABLEAU_DIR}/{export_name}')

# Print summary statistics
stats = analyzer.get_summary_statistics()
print("\nNetwork Summary Statistics:")
for key, value in stats.items():
    print(f"{key}: {value:.4f}")

# Print top entities by centrality
print("\nTop 10 Most Central Entities:")
nodes_df_sorted = nodes_df.sort_values('DegreeCentrality', ascending=False).head(10)
print(nodes_df_sorted[['Entity', 'Type', 'DegreeCentrality', 'NumCrimes']])

# Print most common crime patterns (number of entities per crime type)
print("\nTop Crime Patterns:")
crime_counts = patterns_df['CrimeType'].value_counts()
print(crime_counts.head(10))


Initial network size:
Nodes: 2152
Edges: 5528

Cleaning network data...

Network size after removing unknown crimes:
Nodes: 462 (1690 removed)
Edges: 1268 (4260 removed)

Network Summary Statistics:
num_nodes: 462.0000
num_edges: 1268.0000
avg_degree: 5.4892
density: 0.0119
num_components: 69.0000
avg_clustering: 0.6422

Top 10 Most Central Entities:
                           Entity Type  DegreeCentrality  NumCrimes
44                             Un  ORG          0.197397          4
273                           Itf  ORG          0.084599          4
361                          Unon  ORG          0.065076          2
271                         Unmik  ORG          0.062907          4
279                            Al  PER          0.056399          4
275                       Airport  ORG          0.054230          3
290  Air Traffic Control Services  ORG          0.054230          1
342                            Ed  PER          0.052061          4
344                            Wi 