# Exploratory Data Analysis: Aadhaar Enrolment & Updates
## Dataset Overview and Statistical Analysis

This notebook provides a comprehensive exploratory data analysis (EDA) of three datasets:
1. **Aadhaar Enrolment Data** - New Aadhaar enrolments by state, district, and demographic group
2. **Aadhaar Demographic Updates** - Address/name/date-of-birth updates
3. **Aadhaar Biometric Updates** - Fingerprint/iris/photo capture updates

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Make the helper modules in ../lib importable from this notebook.
# The path is resolved to an absolute one so the import keeps working if the
# working directory changes later, and the membership check stops re-runs of
# this cell from stacking duplicate entries on sys.path.
lib_dir = str(Path('../lib').resolve())
if lib_dir not in sys.path:
    sys.path.insert(0, lib_dir)

from data_pipeline import AadhaarDataPipeline, DataValidator

# Plot defaults for the figures in this notebook: seaborn's dark grid theme
# first, then the matplotlib figure size and base font size.
sns.set_style('darkgrid')
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 10,
})

print("Libraries imported successfully!")

## 1. Data Loading and Initial Inspection

In [None]:
# Create the pipeline object. Loading the datasets is a separate step,
# sketched in the commented-out template below.
pipeline = AadhaarDataPipeline()

# TODO: Replace the placeholder paths with the actual CSV file paths from
# Google Drive, then uncomment the call to load the three datasets.
# datasets = pipeline.load_datasets(
#     enrolment_path='path/to/enrolment.csv',
#     demographic_path='path/to/demographic.csv',
#     biometric_path='path/to/biometric.csv'
# )

# Status output: two messages, separated by a blank line.
print(
    "Pipeline initialized. Ready to load data.",
    "\nNext step: Load CSV files from Google Drive",
    sep="\n",
)

## 2. Data Quality Assessment

In [None]:
# Data quality checks. The validator is created now; the checks themselves
# are left commented out because they need a loaded DataFrame.
validator = DataValidator()

# Check missing values (uncomment after the enrolment data has been loaded):
# missing = validator.check_missing_values(pipeline.enrolment_df)
# print("Missing values in Enrolment Data:")
# print(missing)

print("Data validation ready. Run after loading data.")

## 3. Geographic Analysis

In [None]:
# Geographic patterns: per-state rollup of the enrolment data.
# Uncomment after the enrolment data has been loaded. The aggregation sums
# 'Aadhaar Generated', counts distinct districts, and counts the rows with a
# non-null 'Age_Group' (labelled 'Records' after the rename below).
# enrol_by_state = pipeline.enrolment_df.groupby('State').agg({
#     'Aadhaar Generated': 'sum',
#     'District': 'nunique',
#     'Age_Group': 'count'
# }).reset_index()

# Give the aggregated columns readable names and rank states by enrolments.
# enrol_by_state.columns = ['State', 'Enrolments', 'Districts', 'Records']
# enrol_by_state = enrol_by_state.sort_values('Enrolments', ascending=False)

# print("\nTop 15 States by Aadhaar Enrolment:")
# print(enrol_by_state.head(15))

print("Geographic analysis ready. Run after loading data.")

## 4. Demographic Analysis

In [None]:
# Age group patterns. Uncomment after loading data.
# NOTE(review): the template below treats the result of
# calculate_age_group_analysis() as a mapping that may hold an
# 'enrolment_age' table with 'Age_Group' and 'Aadhaar Generated' columns.
# This is inferred from the usage here; confirm against the pipeline.
# age_analysis = pipeline.calculate_age_group_analysis()

# if 'enrolment_age' in age_analysis:
#     age_enrol = age_analysis['enrolment_age']
#     age_enrol = age_enrol.sort_values('Aadhaar Generated', ascending=False)
#     print("\nAadhaar Enrolment by Age Group:")
#     print(age_enrol)
#
#     # Visualization: one bar per age group, tallest first
#     plt.figure(figsize=(12, 6))
#     plt.bar(age_enrol['Age_Group'], age_enrol['Aadhaar Generated'])
#     plt.title('Aadhaar Enrolment by Age Group')
#     plt.xlabel('Age Group')
#     plt.ylabel('Aadhaar Generated')
#     plt.xticks(rotation=45)
#     plt.tight_layout()
#     plt.show()

print("Demographic analysis ready. Run after loading data.")

## 5. Migration and Update Patterns

In [None]:
# Migration indicators. Uncomment after loading data.
# The template lists the 15 rows with the largest 'Migration_Risk' value
# returned by calculate_migration_indicators().
# migration = pipeline.calculate_migration_indicators()

# Skip the report when the pipeline returns an empty table.
# if not migration.empty:
#     high_migration = migration.nlargest(15, 'Migration_Risk')
#     print("\nTop 15 Districts with High Migration Risk:")
#     print(high_migration[['State', 'District', 'Update_Rate', 'Migration_Risk']])

print("Migration analysis ready. Run after loading data.")

## 6. Biometric Coverage Analysis

In [None]:
# Biometric system health. Uncomment after loading data.
# The template prints the 10 rows with the highest 'Biometric_Coverage'
# from calculate_biometric_quality().
# biometric_health = pipeline.calculate_biometric_quality()

# Skip the report when the pipeline returns an empty table.
# if not biometric_health.empty:
#     print("\nBiometric Coverage by State:")
#     print(biometric_health.sort_values('Biometric_Coverage', ascending=False).head(10))

print("Biometric analysis ready. Run after loading data.")

## 7. Temporal Trends

In [None]:
# Temporal patterns. Uncomment after loading data.
# The template sums 'Aadhaar Generated' per calendar day and draws the
# totals as a line chart. It only runs when a 'Date' column is present.
# NOTE(review): `.dt.date` needs 'Date' to be a datetime column — confirm
# that the loading step parses it.
# if 'Date' in pipeline.enrolment_df.columns:
#     daily_enrol = pipeline.enrolment_df.groupby(pipeline.enrolment_df['Date'].dt.date)['Aadhaar Generated'].sum()
#
#     plt.figure(figsize=(14, 6))
#     plt.plot(daily_enrol.index, daily_enrol.values)
#     plt.title('Daily Aadhaar Enrolment Trend')
#     plt.xlabel('Date')
#     plt.ylabel('Aadhaar Generated')
#     plt.xticks(rotation=45)
#     plt.tight_layout()
#     plt.show()

print("Temporal analysis ready. Run after loading data.")

## 8. Correlation Analysis

In [None]:
# Correlation between the numeric columns of the enrolment dataset.
# Uncomment after loading data. Only the enrolment DataFrame is used here;
# the demographic and biometric datasets are not part of this matrix.
# if pipeline.enrolment_df is not None:
#     numeric_cols = pipeline.enrolment_df.select_dtypes(include=[np.number]).columns
#     correlation_matrix = pipeline.enrolment_df[numeric_cols].corr()
#
#     # Heatmap centred on 0 so positive and negative correlations get
#     # opposite colours; each cell is annotated with its coefficient.
#     plt.figure(figsize=(10, 8))
#     sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
#     plt.title('Correlation Matrix - Enrolment Dataset')
#     plt.tight_layout()
#     plt.show()

print("Correlation analysis ready. Run after loading data.")

## 9. Summary Statistics

In [None]:
import json

# Ask the pipeline for its summary of the datasets.
# Unlike the commented-out templates in the earlier cells, this call runs
# as soon as the cell is executed.
# NOTE(review): confirm generate_insights() copes with a pipeline that has
# not loaded any data yet.
insights = pipeline.generate_insights()

print("\nDataset Insights:")
# default=str makes json fall back to str() for any value it cannot
# serialize natively.
print(json.dumps(insights, default=str, indent=2))

## 10. Export Processed Data

In [None]:
# Export processed datasets. Uncomment once the data has been loaded and
# processed; the template passes './processed_data' as the export target.
# pipeline.export_processed_data('./processed_data')
# print("Data exported successfully!")

print("Ready to export processed data.")