# Crime Hotspot Prediction Model Development

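This notebook develops and evaluates machine learning models for predicting crime hotspots in Roysambu ward. For demonstration purposes, it runs on a simulated grid dataset rather than recorded crime data, so the reported metrics illustrate the workflow only.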

## Objectives
- Prepare features for ML models
- Train multiple classification algorithms
- Evaluate model performance
- Generate predictions and hotspot maps

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import joblib
import sys
import os

# Extend the module search path with the directory two levels up
# (resolved relative to the current working directory).
_project_root = '../../'
sys.path.append(_project_root)

print("Libraries imported successfully!")

In [None]:
# Load and prepare data
print("Loading crime data...")

# No crime records are read here: for demonstration, a sample grid is
# simulated. Seeding the global NumPy RNG makes the draws reproducible.
np.random.seed(42)

# Create grid cells for Roysambu
lat_min, lat_max = -1.2200, -1.2000
lon_min, lon_max = 36.8900, 36.9100
grid_size = 0.005  # ~500m grid cells

# Derive the number of cells per axis as an integer first, then build the cell
# origins from integer offsets. np.arange with a float step sizes its output
# as ceil((stop - start) / step), so rounding error in the bounds can add an
# extra cell past the upper bound on one axis but not on the other.
n_lat_cells = int(round((lat_max - lat_min) / grid_size))
n_lon_cells = int(round((lon_max - lon_min) / grid_size))
lats = lat_min + grid_size * np.arange(n_lat_cells)
lons = lon_min + grid_size * np.arange(n_lon_cells)

grid_data = []
for i, lat in enumerate(lats):
    for j, lon in enumerate(lons):
        # Simulate features for each grid cell. The draws are made in the
        # order the keys are listed, so reordering keys changes the data.
        grid_data.append({
            'grid_id': f'{i}_{j}',
            # lat/lon mark the cell origin; shift by half a cell for the center
            'center_lat': lat + grid_size/2,
            'center_lon': lon + grid_size/2,
            'crime_count': np.random.poisson(2),  # Poisson distribution for crime counts
            'population_density': np.random.uniform(100, 1000),
            'distance_to_main_road': np.random.uniform(0, 500),
            'distance_to_school': np.random.uniform(0, 1000),
            'distance_to_bar': np.random.uniform(0, 800),
            'distance_to_atm': np.random.uniform(0, 600),
            'commercial_density': np.random.uniform(0, 50),
            'lighting_quality': np.random.uniform(1, 5),
            'patrol_frequency': np.random.uniform(0, 10)
        })

# Build the frame once from the list of per-cell records
df = pd.DataFrame(grid_data)
print(f"Created grid dataset with {len(df)} cells")
print(f"Crime count distribution: {df['crime_count'].describe()}")

df.head()

In [None]:
# Feature engineering
print("Engineering features...")


def _inverse_distance(values):
    """Return 1 / (1 + values), so larger inputs yield smaller scores."""
    return 1 / (1 + values)


# Straight-line offset of each cell center from the mean cell center,
# computed directly on the latitude/longitude values.
lat_offset = df['center_lat'] - df['center_lat'].mean()
lon_offset = df['center_lon'] - df['center_lon'].mean()
df['distance_to_center'] = np.sqrt(lat_offset**2 + lon_offset**2)

# Proximity features: one inverse-distance column per point-of-interest type
for place in ('main_road', 'school', 'bar', 'atm'):
    df[f'proximity_to_{place}'] = _inverse_distance(df[f'distance_to_{place}'])

# Composite risk score: weighted sum of four inverse-scaled terms.
# NOTE(review): the proximity terms and the lighting/patrol terms are built
# from inputs on different numeric scales, so the weights may not reflect the
# intended relative contribution - confirm.
bar_term = df['proximity_to_bar'] * 0.3
atm_term = df['proximity_to_atm'] * 0.2
lighting_term = _inverse_distance(df['lighting_quality']) * 0.3
patrol_term = _inverse_distance(df['patrol_frequency']) * 0.2
df['vulnerability_score'] = bar_term + atm_term + lighting_term + patrol_term

# Binary target: a cell counts as a hotspot when it has 3 or more crimes
df['is_hotspot'] = df['crime_count'].ge(3).astype(int)

hotspot_flags = df['is_hotspot']
print(f"Hotspot distribution: {hotspot_flags.value_counts()}")
print(f"Hotspot percentage: {hotspot_flags.mean():.1%}")

In [None]:
# Prepare features for modeling
feature_columns = [
    # cell location and raw simulated attributes
    'center_lat',
    'center_lon',
    'population_density',
    'distance_to_main_road',
    'distance_to_school',
    'distance_to_bar',
    'distance_to_atm',
    'commercial_density',
    'lighting_quality',
    'patrol_frequency',
    # engineered features
    'distance_to_center',
    'proximity_to_main_road',
    'proximity_to_school',
    'proximity_to_bar',
    'proximity_to_atm',
    'vulnerability_score',
]

# crime_count is deliberately absent from the features: the target is
# derived from it.
X = df[feature_columns]
y = df['is_hotspot']

print(f"Feature matrix shape: {X.shape}")
print(f"Target distribution: {y.value_counts()}")

# Hold out 20% of the cells, keeping the hotspot ratio equal in both parts
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

In [None]:
# Scale features
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits, so no test-set information is used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully")

In [None]:
# Train Random Forest model
print("Training Random Forest model...")

rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
    'class_weight': 'balanced',  # reweight classes by inverse frequency
}
rf_model = RandomForestClassifier(**rf_params).fit(X_train_scaled, y_train)

# Hard labels plus the predicted probability of the positive (hotspot) class
y_pred_rf = rf_model.predict(X_test_scaled)
y_pred_proba_rf = rf_model.predict_proba(X_test_scaled)[:, 1]

# Evaluate on the held-out split
rf_accuracy = rf_model.score(X_test_scaled, y_test)
rf_roc_auc = roc_auc_score(y_test, y_pred_proba_rf)

print("\n=== Random Forest Results ===")
print(f"Accuracy: {rf_accuracy:.3f}")
print(f"ROC AUC: {rf_roc_auc:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

In [None]:
# Train Logistic Regression model
print("Training Logistic Regression model...")

lr_params = {
    'random_state': 42,
    'class_weight': 'balanced',  # reweight classes by inverse frequency
    'max_iter': 1000,
}
lr_model = LogisticRegression(**lr_params).fit(X_train_scaled, y_train)

# Hard labels plus the predicted probability of the positive (hotspot) class
y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

# Evaluate on the held-out split
lr_accuracy = lr_model.score(X_test_scaled, y_test)
lr_roc_auc = roc_auc_score(y_test, y_pred_proba_lr)

print("\n=== Logistic Regression Results ===")
print(f"Accuracy: {lr_accuracy:.3f}")
print(f"ROC AUC: {lr_roc_auc:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr))

In [None]:
# Feature importance analysis
# Pair every feature name with the forest's importance value, then rank from
# most to least important. The index keeps each feature's original position
# in feature_columns.
feature_importance = pd.DataFrame(
    list(zip(feature_columns, rf_model.feature_importances_)),
    columns=['feature', 'importance'],
).sort_values('importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=feature_importance.head(10), x='importance', y='feature', ax=ax)
ax.set_title('Top 10 Feature Importances (Random Forest)')
ax.set_xlabel('Importance Score')
fig.tight_layout()
plt.show()

print("\nTop 5 Most Important Features:")
for entry in feature_importance.head().itertuples(index=False):
    print(f"{entry.feature}: {entry.importance:.3f}")

In [None]:
# Model comparison
# One record per model, scored on the same held-out split
comparison_records = [
    {
        'Model': 'Random Forest',
        'Accuracy': rf_model.score(X_test_scaled, y_test),
        'ROC_AUC': roc_auc_score(y_test, y_pred_proba_rf),
    },
    {
        'Model': 'Logistic Regression',
        'Accuracy': lr_model.score(X_test_scaled, y_test),
        'ROC_AUC': roc_auc_score(y_test, y_pred_proba_lr),
    },
]
models_comparison = pd.DataFrame(
    comparison_records, columns=['Model', 'Accuracy', 'ROC_AUC']
)

print("\n=== Model Comparison ===")
print(models_comparison)

# Visualize comparison: one bar panel per metric, both on a fixed 0-1 axis
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

metric_panels = [('Accuracy', 'Accuracy'), ('ROC_AUC', 'ROC AUC')]
for panel, (metric_column, metric_label) in zip(axes, metric_panels):
    panel.bar(models_comparison['Model'], models_comparison[metric_column])
    panel.set_title(f'Model {metric_label} Comparison')
    panel.set_ylabel(metric_label)
    panel.set_ylim(0, 1)

plt.tight_layout()
plt.show()

In [None]:
# Generate predictions for entire dataset
# X holds every grid cell, including the rows the forest was trained on, so
# most of these probabilities are in-sample rather than held-out estimates.
X_all_scaled = scaler.transform(X)
hotspot_probabilities = rf_model.predict_proba(X_all_scaled)[:, 1]

# Add predictions to dataframe; a cell is labelled a hotspot at probability >= 0.5
df['hotspot_probability'] = hotspot_probabilities
df['predicted_hotspot'] = (hotspot_probabilities >= 0.5).astype(int)

# Identify top risk areas: the (up to) 20 cells with the highest probability
report_columns = [
    'grid_id', 'center_lat', 'center_lon', 'hotspot_probability', 'crime_count'
]
top_risk_areas = df[report_columns].nlargest(20, 'hotspot_probability')

print("\n=== Top 20 Predicted Hotspot Areas ===")
print(top_risk_areas)

In [None]:
# Visualize predictions
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
prob_map_ax, actual_ax = axes[0]
hist_ax, count_ax = axes[1]

# Hotspot probability map
scatter = prob_map_ax.scatter(
    df['center_lon'], df['center_lat'],
    c=df['hotspot_probability'],
    cmap='YlOrRd', s=50, alpha=0.7,
)
fig.colorbar(scatter, ax=prob_map_ax, label='Hotspot Probability')
prob_map_ax.set_title('Predicted Hotspot Probabilities')
prob_map_ax.set_xlabel('Longitude')
prob_map_ax.set_ylabel('Latitude')

# Actual hotspots: blue where the flag is 0, red otherwise
colors = np.where(df['is_hotspot'] == 0, 'blue', 'red').tolist()
actual_ax.scatter(df['center_lon'], df['center_lat'], c=colors, alpha=0.6, s=30)
actual_ax.set_title('Actual Hotspots (Red = Hotspot)')
actual_ax.set_xlabel('Longitude')
actual_ax.set_ylabel('Latitude')

# Prediction distribution
hist_ax.hist(df['hotspot_probability'], bins=30, alpha=0.7, color='orange')
hist_ax.set_title('Distribution of Hotspot Probabilities')
hist_ax.set_xlabel('Probability')
hist_ax.set_ylabel('Frequency')

# Crime count vs probability
count_ax.scatter(df['crime_count'], df['hotspot_probability'], alpha=0.6)
count_ax.set_title('Crime Count vs Hotspot Probability')
count_ax.set_xlabel('Actual Crime Count')
count_ax.set_ylabel('Predicted Hotspot Probability')

plt.tight_layout()
plt.show()

In [None]:
# Save models and results
print("Saving models and results...")

# Both directories are relative paths, so they resolve against the current
# working directory of the kernel.
output_dir = '../../models/saved_models/'
results_dir = '../../data/processed/'

# Create output directory (no error if it already exists)
os.makedirs(output_dir, exist_ok=True)

# Save models and the fitted scaler. Paths are built with os.path.join so
# the trailing slash on the directory does not yield a doubled separator.
model_artifacts = {
    'random_forest_hotspot_model.pkl': rf_model,
    'logistic_regression_hotspot_model.pkl': lr_model,
    'feature_scaler.pkl': scaler,
}
for artifact_name, artifact in model_artifacts.items():
    joblib.dump(artifact, os.path.join(output_dir, artifact_name))

# Save results as CSV files without the index column
os.makedirs(results_dir, exist_ok=True)

result_tables = {
    'hotspot_predictions.csv': df,
    'top_risk_areas.csv': top_risk_areas,
    'feature_importance.csv': feature_importance,
}
for table_name, table in result_tables.items():
    table.to_csv(os.path.join(results_dir, table_name), index=False)

print("Models and results saved successfully!")
print(f"\nModel files saved to: {output_dir}")
print(f"Results saved to: {results_dir}")

In [None]:
# Summary and recommendations
print("\n" + "="*50)
print("CRIME HOTSPOT PREDICTION MODEL SUMMARY")
print("="*50)

print(f"\nDataset: {len(df)} grid cells analyzed")
print("Hotspot threshold: ≥3 crimes per cell")
print(f"Actual hotspots: {df['is_hotspot'].sum()} ({df['is_hotspot'].mean():.1%})")
print(f"Predicted hotspots: {df['predicted_hotspot'].sum()}")

# NOTE(review): the "best model" label is hard-coded rather than derived from
# the comparison metrics; the figures below are the Random Forest test scores.
print("\nBest model: Random Forest")
print(f"Test accuracy: {rf_model.score(X_test_scaled, y_test):.3f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba_rf):.3f}")

print("\nTop risk factors:")
# Number the entries by rank. The frame's index still holds each feature's
# original position from before sorting, so it cannot be used as the rank.
top_factors = feature_importance.head(3).itertuples(index=False)
for rank, factor in enumerate(top_factors, start=1):
    print(f"  {rank}. {factor.feature}: {factor.importance:.3f}")

print("\nRecommendations:")
print(f"1. Deploy additional patrols to {len(top_risk_areas)} highest-risk grid cells")
print("2. Focus on areas with high vulnerability scores")
print("3. Improve lighting in high-risk areas")
print("4. Consider environmental interventions near bars and ATMs")
print("5. Validate predictions with ground truth data")

print("\nModel ready for deployment in crime prevention planning!")