In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
from scipy.stats import zscore
import networkx as nx

In [10]:
class RepeatOffenderPredictor:
    def __init__(self):
        self.label_encoders = {}
        self.model = None
        self.feature_importance = None
        
    def load_data(self, nodes_path, edges_path, patterns_path):
        """Load and merge network data"""
        self.nodes_df = pd.read_csv(nodes_path)
        self.edges_df = pd.read_csv(edges_path)
        self.patterns_df = pd.read_csv(patterns_path)
        
        # Create graph for network metrics
        self.G = nx.from_pandas_edgelist(self.edges_df, 'Source', 'Target')
    
    def engineer_features(self):
        """Create features for prediction"""
        features = pd.DataFrame()
        features['Entity'] = self.nodes_df['Entity']
        
        # Basic metrics from nodes
        features['Degree'] = self.nodes_df['Degree']
        features['NumCrimes'] = self.nodes_df['NumCrimes']
        features['DegreeCentrality'] = self.nodes_df['DegreeCentrality']
        features['BetweennessCentrality'] = self.nodes_df['BetweennessCentrality']
        
        # Network metrics
        features['PageRank'] = pd.Series(nx.pagerank(self.G))
        features['ClusteringCoeff'] = pd.Series(nx.clustering(self.G))
        
        # Crime type encoding
        crime_dummies = self.patterns_df.pivot_table(
            index='Entity', 
            columns='CrimeType',
            values='Centrality',
            aggfunc='count',
            fill_value=0
        )
        features = features.join(crime_dummies, on='Entity')
        
        # Connection patterns
        entity_connections = self.edges_df.groupby('Source')['Target'].count()
        features['ConnectionCount'] = features['Entity'].map(entity_connections).fillna(0)
        
        # Evidence strength patterns
        evidence_counts = self.edges_df.groupby('Source')['EvidenceStrength'].value_counts().unstack(fill_value=0)
        features = features.join(evidence_counts, on='Entity')
        
        # Define repeat offender (target variable)
        features['IsRepeatOffender'] = (features['NumCrimes'] > 1).astype(int)
        
        self.features = features.fillna(0)
        return self.features
    
    def prepare_model_data(self):
        """Prepare data for modeling"""
        # Separate features and target
        X = self.features.drop(['Entity', 'IsRepeatOffender'], axis=1)
        y = self.features['IsRepeatOffender']
        
        # Split data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        
        return self.X_train, self.X_test, self.y_train, self.y_test
    
    def train_model(self):
        """Train XGBoost classifier"""
        self.model = xgb.XGBClassifier(
            max_depth=4,
            learning_rate=0.1,
            n_estimators=100,
            objective='binary:logistic',
            random_state=42
        )
        
        self.model.fit(self.X_train, self.y_train)
        
        # Store feature importance
        self.feature_importance = pd.DataFrame({
            'feature': self.X_train.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        return self.model
    
    def evaluate_model(self):
        """Evaluate model performance"""
        y_pred = self.model.predict(self.X_test)
        
        # Calculate metrics
        report = classification_report(self.y_test, y_pred, output_dict=True)
        conf_matrix = confusion_matrix(self.y_test, y_pred)
        
        return {
            'classification_report': report,
            'confusion_matrix': conf_matrix,
            'feature_importance': self.feature_importance
        }
    
    def generate_risk_scores(self):
        """Generate risk scores for all entities"""
        # Get probability predictions
        X = self.features.drop(['Entity', 'IsRepeatOffender'], axis=1)
        probabilities = self.model.predict_proba(X)[:, 1]
        
        # Create risk scores dataframe
        risk_scores = pd.DataFrame({
            'Entity': self.features['Entity'],
            'RiskScore': probabilities
        })
        
        # Assign risk levels using percentile ranges to handle duplicates
        risk_scores['RiskPercentile'] = risk_scores['RiskScore'].rank(pct=True)
        risk_scores['RiskLevel'] = pd.cut(
            risk_scores['RiskPercentile'],
            bins=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
            labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'],
            include_lowest=True
        )
        
        return risk_scores.sort_values('RiskScore', ascending=False)


In [11]:
if __name__ == "__main__":
    # Initialize predictor
    predictor = RepeatOffenderPredictor()
    
    # Load data
    predictor.load_data(
        '/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/FINAL FINAL PLEASE/Data/Tablaeu Data/crime_network_clean_nodes.csv',
        '/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/FINAL FINAL PLEASE/Data/Tablaeu Data/crime_network_clean_edges.csv',
        '/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/FINAL FINAL PLEASE/Data/Tablaeu Data/crime_network_clean_patterns.csv'
    )
    
    # Engineer features
    features = predictor.engineer_features()
    print("\nFeature Engineering Complete:")
    print(f"Number of features: {features.shape[1]}")
    
    # Prepare data
    X_train, X_test, y_train, y_test = predictor.prepare_model_data()
    
    # Train model
    model = predictor.train_model()
    print("\nModel Training Complete")
    
    # Evaluate model
    evaluation = predictor.evaluate_model()
    print("\nModel Evaluation:")
    print("Classification Report:")
    print(pd.DataFrame(evaluation['classification_report']).T)
    
    # Generate risk scores
    risk_scores = predictor.generate_risk_scores()
    print("\nTop 10 Highest Risk Entities:")
    print(risk_scores.head(10))
    
    # Save outputs
    risk_scores.to_csv('/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/FINAL FINAL PLEASE/Data/entity_risk_scores.csv', index=False)
    predictor.feature_importance.to_csv('/Users/damienfoo/Desktop/SMUBIA Datathon Lunar Logic/FINAL FINAL PLEASE/Data/feature_importance.csv', index=False)


Feature Engineering Complete:
Number of features: 22

Model Training Complete

Model Evaluation:
Classification Report:
              precision  recall  f1-score  support
0                   1.0     1.0       1.0     84.0
1                   1.0     1.0       1.0      9.0
accuracy            1.0     1.0       1.0      1.0
macro avg           1.0     1.0       1.0     93.0
weighted avg        1.0     1.0       1.0     93.0

Top 10 Highest Risk Entities:
             Entity  RiskScore  RiskPercentile  RiskLevel
425        Un-Sacco   0.981855        0.948052  Very High
413           Airpo   0.981855        0.948052  Very High
433          Unicef   0.981855        0.948052  Very High
44               Un   0.981855        0.948052  Very High
431  Unicef/Somalia   0.981855        0.948052  Very High
310        Septembe   0.981855        0.948052  Very High
313        Procurem   0.981855        0.948052  Very High
426      Ed Nations   0.981855        0.948052  Very High
424           Sacco 