# Machine Learning Models & Predictive Analytics
## Anomaly Detection, Vulnerability Scoring, and Time Series Forecasting

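This notebook implements:
- **Anomaly Detection**: Identify unusual patterns in Aadhaar data
- **Vulnerability Scoring**: Quantify social exclusion risks
- **Time Series Forecasting**: Predict future enrolment trends
- **Clustering Analysis**: Segment populations for targeted interventions
- **Insight Generation, Model Comparison, and Export**: Summarise findings and save results for the dashboard

> **Note:** Most analysis steps below are commented-out templates. Load and preprocess the data in Section 1 first, then uncomment the steps in each section in order.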

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides any warnings raised by the modelling
# libraries; narrow the filter if those need to stay visible.
warnings.filterwarnings('ignore')

# Add lib to path
# Resolve '../lib' against the current working directory once, so the entry
# stays valid if the working directory changes later, and skip the insert when
# this cell is re-run so sys.path does not accumulate duplicate entries.
LIB_DIR = Path('../lib').resolve()
if str(LIB_DIR) not in sys.path:
    sys.path.insert(0, str(LIB_DIR))

from data_pipeline import AadhaarDataPipeline
from ml_models import (
    AnomalyDetector, 
    VulnerabilityAnalyzer,
    TimeSeriesForecaster,
    PopulationClusterer,
    InsightGenerator
)

# Plot defaults for the whole notebook: seaborn's dark grid theme and a
# 14x8 default figure size, so individual cells only override when needed.
sns.set_style('darkgrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

print("ML libraries imported successfully!")

## 1. Anomaly Detection

In [None]:
# Initialize models
# `pipeline` and `anomaly_detector` are notebook-level names: the commented-out
# steps in this cell and in later cells read `pipeline.enrolment_df`.
pipeline = AadhaarDataPipeline()
anomaly_detector = AnomalyDetector()

# TODO: Load and preprocess data
# Nothing below can be uncommented until these two calls are filled in and run,
# because every later step reads `pipeline.enrolment_df`.
# pipeline.load_datasets(...)
# pipeline.clean_enrolment_data()

# Example anomaly detection (after data loading)
# Template: X is built from the numeric columns only, with NaN replaced by 0.
# NOTE(review): zero-filling may make rows with missing values look unusual to
# the detectors — confirm this is the intended imputation.
# numeric_features = pipeline.enrolment_df.select_dtypes(include=[np.number]).columns
# X = pipeline.enrolment_df[numeric_features].fillna(0)

# Isolation Forest detection
# The template counts entries equal to -1 as anomalies; presumably the detector
# follows the -1 = outlier labelling convention — verify in ml_models.
# anomalies_if = anomaly_detector.detect_isolation_forest(X, contamination=0.05)
# print(f"Isolation Forest found {(anomalies_if == -1).sum()} anomalies")

# LOF detection
# Same -1 counting as above, with a neighbourhood size of 20.
# anomalies_lof = anomaly_detector.detect_lof(X, n_neighbors=20)
# print(f"LOF found {(anomalies_lof == -1).sum()} anomalies")

# Get anomaly scores
# Writes the scores back as an 'Anomaly_Score' column; the comparison and
# export cells later filter on this column / on `scores`.
# scores = anomaly_detector.get_anomaly_scores(X)
# pipeline.enrolment_df['Anomaly_Score'] = scores

print("Anomaly detection ready. Run after loading data.")

## 2. Vulnerability Analysis

In [None]:
# Instantiate the analyzer now; the scoring steps below stay commented out
# until `pipeline.enrolment_df` has been loaded (see the TODO in the
# anomaly-detection cell).
vulnerability_analyzer = VulnerabilityAnalyzer()

# Calculate vulnerability scores
# Template: stores the result as a 'Vulnerability_Score' column, which the
# histogram below and the export cell both read.
# vuln_scores = vulnerability_analyzer.calculate_vulnerability_score(pipeline.enrolment_df)
# pipeline.enrolment_df['Vulnerability_Score'] = vuln_scores

# Identify vulnerable populations
# Uses a cut-off of 0.7 and expects 'State' and 'District' columns in the
# returned frame; only the first 10 rows are printed.
# vulnerable = vulnerability_analyzer.identify_vulnerable_populations(pipeline.enrolment_df, threshold=0.7)
# print(f"\nIdentified {len(vulnerable)} vulnerable regions:")
# print(vulnerable[['State', 'District', 'Vulnerability_Score']].head(10))

# Vulnerability distribution
# NOTE(review): the x-axis label assumes scores lie in 0-1 — confirm against
# VulnerabilityAnalyzer before relying on the label or the 0.7 threshold.
# plt.figure(figsize=(12, 6))
# plt.hist(pipeline.enrolment_df['Vulnerability_Score'], bins=30, edgecolor='black')
# plt.title('Distribution of Vulnerability Scores')
# plt.xlabel('Vulnerability Score (0-1)')
# plt.ylabel('Frequency')
# plt.show()

print("Vulnerability analysis ready. Run after loading data.")

## 3. Time Series Forecasting

In [None]:
# Instantiate the forecaster; every forecasting step below is a commented-out
# template that needs `pipeline.enrolment_df` to be loaded first.
forecaster = TimeSeriesForecaster()

# Prepare time series
# Template: builds the series from the 'Date' and 'Aadhaar Generated' columns.
# ts_data = forecaster.prepare_time_series(
#     pipeline.enrolment_df,
#     date_col='Date',
#     value_col='Aadhaar Generated'
# )

# ARIMA forecasting
# Forecasts 12 periods with order (1, 1, 1). The truthiness guard suggests the
# call can return an empty/None result — presumably on failure; verify.
# arima_result = forecaster.forecast_arima(ts_data, periods=12, order=(1, 1, 1))
# if arima_result:
#     print(f"\nARIMA Model (AIC: {arima_result['aic']:.2f})")
#     print(arima_result['forecast'].head())

# Prophet forecasting
# NOTE(review): this horizon is 30 periods while ARIMA above uses 12; what one
# period means depends on the frequency of `ts_data`, which is not visible
# here — align the horizons before comparing the two forecasts.
# The plot reads the 'ds', 'yhat', 'yhat_lower' and 'yhat_upper' columns of the
# returned forecast frame and shades the lower/upper band.
# prophet_result = forecaster.forecast_prophet(ts_data, periods=30)
# if prophet_result:
#     forecast_df = prophet_result['forecast']
#     plt.figure(figsize=(14, 6))
#     plt.plot(forecast_df['ds'], forecast_df['yhat'], label='Forecast')
#     plt.fill_between(forecast_df['ds'], forecast_df['yhat_lower'], forecast_df['yhat_upper'], alpha=0.3)
#     plt.title('Aadhaar Enrolment Forecast (Prophet)')
#     plt.xlabel('Date')
#     plt.ylabel('Aadhaar Generated')
#     plt.legend()
#     plt.show()

print("Time series forecasting ready. Run after loading data.")

## 4. Population Clustering

In [None]:
# Instantiate the clusterer with a fixed number of clusters (5); the clustering
# steps below are commented-out templates that need loaded data.
clusterer = PopulationClusterer(n_clusters=5)

# Cluster regions
# NOTE(review): the three feature columns must exist in
# `pipeline.enrolment_df`; that cannot be checked from this notebook — confirm
# against the cleaned dataset.
# features_to_cluster = ['Aadhaar Generated', 'Biometric_Updates', 'Demographic_Updates']
# clustered_data = clusterer.cluster_regions(pipeline.enrolment_df, features_to_cluster)

# Get cluster profiles
# Template iterates `profiles.items()`, so `profiles` is expected to be a
# mapping of cluster name -> profile.
# profiles = clusterer.get_cluster_profiles(clustered_data, features_to_cluster)
# for cluster_name, profile in profiles.items():
#     print(f"\n{cluster_name}:")
#     print(profile)

# Cluster distribution
# Reads a 'Cluster' column from `clustered_data`; the export cell later reads
# the same column.
# plt.figure(figsize=(10, 6))
# clustered_data['Cluster'].value_counts().sort_index().plot(kind='bar')
# plt.title('Distribution of Clusters')
# plt.xlabel('Cluster')
# plt.ylabel('Count')
# plt.show()

print("Population clustering ready. Run after loading data.")

## 5. Insight Generation

In [None]:
# Instantiate the insight generator; the calls below are commented-out
# templates.
insight_gen = InsightGenerator()

# Generate insights from different analyses
# NOTE(review): `migration_df`, `biometric_df` and `age_analysis` are not
# defined in any cell visible in this notebook — they must be produced (or
# loaded) before these lines are uncommented.
# migration_insights = insight_gen.generate_migration_insights(migration_df)
# biometric_insights = insight_gen.generate_biometric_insights(biometric_df)
# age_insights = insight_gen.generate_age_group_insights(age_analysis)

# Combine and display
# The three results are joined with `+` and each item is indexed by the keys
# 'severity', 'category', 'finding' and 'recommendation' — presumably lists of
# dicts; verify against InsightGenerator.
# all_insights = migration_insights + biometric_insights + age_insights
# for insight in all_insights:
#     print(f"\n[{insight['severity']}] {insight['category']}")
#     print(f"Finding: {insight['finding']}")
#     print(f"Recommendation: {insight['recommendation']}")

print("Insight generation ready. Run after analysis.")

## 6. Model Performance Comparison

In [None]:
# Compare different anomaly detection methods
# Depends on `anomalies_if`, `anomalies_lof` and `scores` from the
# anomaly-detection cell, so those steps must have been uncommented and run.
# NOTE(review): the third row is labelled 'Statistical Z-score' but counts
# `scores > 0.7`, where `scores` comes from `get_anomaly_scores` — confirm the
# label matches the method actually used.
# NOTE(review): the 'False Positive Rate' values are hard-coded strings, not
# measured from the data; replace them with computed figures or mark them as
# assumptions in the output.
# comparison = pd.DataFrame({
#     'Method': ['Isolation Forest', 'LOF', 'Statistical Z-score'],
#     'Anomalies Detected': [
#         (anomalies_if == -1).sum(),
#         (anomalies_lof == -1).sum(),
#         (scores > 0.7).sum()
#     ],
#     'False Positive Rate': ['5% (expected)', '~3-5%', '~2%']
# })

# print("\nAnomaly Detection Methods Comparison:")
# print(comparison)

print("Model comparison ready. Run after detection.")

## 7. Export Results

In [None]:
# Export model results for dashboard
# Depends on earlier cells: the 'Anomaly_Score' and 'Vulnerability_Score'
# columns on `pipeline.enrolment_df`, and `clustered_data` with a 'Cluster'
# column. The 0.7 cut-off matches the one used in the comparison cell.
# results = {
#     'anomalies': pipeline.enrolment_df[pipeline.enrolment_df['Anomaly_Score'] > 0.7],
#     'vulnerability': pipeline.enrolment_df[['State', 'District', 'Vulnerability_Score']],
#     'clusters': clustered_data[['State', 'District', 'Cluster']]
# }

# Writes one CSV per entry, named '<key>_results.csv', without the index.
# NOTE(review): './model_results/' is relative to the working directory and is
# not created here; `to_csv` does not create missing directories, so make the
# directory first (e.g. Path('./model_results').mkdir(parents=True, exist_ok=True)).
# for name, data in results.items():
#     data.to_csv(f'./model_results/{name}_results.csv', index=False)

print("Ready to export model results.")