In [None]:
# ============================================
# CELL 1: Import Libraries
# ============================================
# Every dependency used by the later cells is imported here so the notebook
# can be executed top-to-bottom from a fresh kernel.

import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime, timedelta
import warnings
# NOTE: this silences every warning issued through the `warnings` module for
# the rest of the session, not only the noisy ones.
warnings.filterwarnings('ignore')

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
# Use Plotly's "notebook" renderer as the default for figure display.
pio.renderers.default = "notebook"

# Machine Learning
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Confirmation banner stamped with the wall-clock time of this run.
print(" Libraries imported successfully")
print(f" Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}")

In [None]:
# ============================================
# CELL 2: Load Real UIDAI Datasets
# ============================================


def load_csv_dir(directory):
    """Read every ``*.csv`` file in ``directory`` into a single DataFrame.

    Files are read in sorted filename order so the row order of the result is
    reproducible between runs (plain ``glob`` order is filesystem dependent).

    Parameters
    ----------
    directory : str
        Folder containing the CSV files (with or without a trailing slash).

    Returns
    -------
    tuple[pandas.DataFrame, list[str]]
        The concatenated frame (fresh 0..n-1 index) and the list of files read.

    Raises
    ------
    FileNotFoundError
        If the folder contains no CSV files. Without this check ``pd.concat``
        fails with an unhelpful "No objects to concatenate" error.
    """
    csv_files = sorted(glob.glob(os.path.join(directory, "*.csv")))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in '{directory}'")
    frames = [pd.read_csv(path) for path in csv_files]
    return pd.concat(frames, ignore_index=True), csv_files


# Output directory
OUTPUT_DIR = '../outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(f"{OUTPUT_DIR}/charts", exist_ok=True)

# Data paths
DATA_DIR = '../data/'
ENROL_DIR = f"{DATA_DIR}enrolment/"
DEMO_DIR = f"{DATA_DIR}demographic/"
BIO_DIR = f"{DATA_DIR}biometric/"

print(" LOADING REAL UIDAI DATASETS FOR FORECASTING")
print("="*60)

# Load Enrolment Data
print("\n1️⃣ Loading Enrolment Data...")
df_enrolment, enrol_files = load_csv_dir(ENROL_DIR)
print(f"    Loaded {len(df_enrolment):,} records from {len(enrol_files)} files")

# Load Demographic Data
print("\n2️⃣ Loading Demographic Data...")
df_demographic, demo_files = load_csv_dir(DEMO_DIR)
print(f"    Loaded {len(df_demographic):,} records from {len(demo_files)} files")

# Load Biometric Data
print("\n3️⃣ Loading Biometric Data...")
df_biometric, bio_files = load_csv_dir(BIO_DIR)
print(f"    Loaded {len(df_biometric):,} records from {len(bio_files)} files")

# Parse dates (day-month-year). Values that do not match the format become
# NaT because of errors='coerce'; those rows silently drop out of every later
# date-based groupby, so report how many were lost instead of hiding it.
for dataset_label, frame in (
    ("enrolment", df_enrolment),
    ("demographic", df_demographic),
    ("biometric", df_biometric),
):
    raw_dates = frame['date']
    frame['date'] = pd.to_datetime(raw_dates, format='%d-%m-%Y', errors='coerce')
    n_unparsed = int(frame['date'].isna().sum() - raw_dates.isna().sum())
    if n_unparsed > 0:
        print(f"    ⚠️ {n_unparsed:,} {dataset_label} rows have unparseable dates (set to NaT)")

# Create total enrollment column
df_enrolment['total_enrolments'] = df_enrolment['age_0_5'] + df_enrolment['age_5_17'] + df_enrolment['age_18_greater']

print("\n" + "="*60)
print(" DATASETS LOADED SUCCESSFULLY!")

In [None]:
# ============================================
# CELL 3: Create Monthly Time Series
# ============================================

print("\n CREATING MONTHLY TIME SERIES")
print("=" * 60)

# Tag every enrolment record with its calendar month (monthly Period).
# This column is also reused by the state-level cell further down.
df_enrolment['year_month'] = df_enrolment['date'].dt.to_period('M')

# One row per month: summed counts, plus how many distinct pincodes/states
# reported in that month. Calendar features are derived from the month start.
monthly_data = (
    df_enrolment
    .groupby('year_month')
    .agg(
        total_enrollments=('total_enrolments', 'sum'),
        age_0_5=('age_0_5', 'sum'),
        age_5_17=('age_5_17', 'sum'),
        age_18_plus=('age_18_greater', 'sum'),
        active_pincodes=('pincode', 'nunique'),
        active_states=('state', 'nunique'),
    )
    .reset_index()
    .rename(columns={'year_month': 'month'})
    # Period -> Timestamp (first day of the month) so Plotly can plot it.
    .assign(month=lambda frame: frame['month'].dt.to_timestamp())
    .assign(
        year=lambda frame: frame['month'].dt.year,
        month_num=lambda frame: frame['month'].dt.month,
        quarter=lambda frame: frame['month'].dt.quarter,
    )
    .sort_values('month')
    .reset_index(drop=True)
)

series_start = monthly_data['month'].min()
series_end = monthly_data['month'].max()
monthly_totals = monthly_data['total_enrollments']

print(f"\n Time Series Summary:")
print(f"   Period: {series_start.strftime('%b %Y')} to {series_end.strftime('%b %Y')}")
print(f"   Total Months: {len(monthly_data)}")
print(f"   Avg Monthly Enrollments: {monthly_totals.mean():,.0f}")
print(f"   Min: {monthly_totals.min():,.0f}")
print(f"   Max: {monthly_totals.max():,.0f}")

print("\n Monthly Data:")
print(monthly_data.head(10).to_string(index=False))

In [None]:
# ============================================
# CELL 4: Time Series Visualization
# ============================================
# Builds a 2x2 overview figure (trend, calendar-month seasonality, age-group
# lines, active pincodes) from `monthly_data` and writes it to an HTML file.

print("\n VISUALIZING TIME SERIES PATTERNS")
print("="*60)

# Create comprehensive visualization
fig_ts = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Monthly Enrollment Trend',
        'Monthly Seasonality Pattern',
        'Age Group Distribution Over Time',
        'Active Pincodes Trend'
    )
)

# Plot 1: Time series trend
fig_ts.add_trace(
    go.Scatter(
        x=monthly_data['month'],
        y=monthly_data['total_enrollments'],
        mode='lines+markers',
        line=dict(color='#FF6B35', width=2),
        name='Enrollments'
    ),
    row=1, col=1
)

# Plot 2: Monthly seasonality
# Label each bar from the month number it was actually computed for. Slicing
# a fixed Jan..Dec list by length mislabels the bars whenever the data does
# not start in January or a calendar month is missing.
month_abbreviations = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthly_avg = monthly_data.groupby('month_num')['total_enrollments'].mean()
seasonality_labels = [month_abbreviations[int(m) - 1] for m in monthly_avg.index]
fig_ts.add_trace(
    go.Bar(
        x=seasonality_labels,
        y=monthly_avg.values,
        marker_color='#3498db'
    ),
    row=1, col=2
)

# Plot 3: Age groups over time
for col, name, color in [
    ('age_0_5', '0-5 Years', '#2ecc71'),
    ('age_5_17', '5-17 Years', '#3498db'),
    ('age_18_plus', '18+ Years', '#9b59b6')
]:
    fig_ts.add_trace(
        go.Scatter(
            x=monthly_data['month'],
            y=monthly_data[col],
            mode='lines',
            name=name,
            line=dict(color=color)
        ),
        row=2, col=1
    )

# Plot 4: Active pincodes
fig_ts.add_trace(
    go.Scatter(
        x=monthly_data['month'],
        y=monthly_data['active_pincodes'],
        mode='lines+markers',
        line=dict(color='#e74c3c', width=2),
        name='Active Pincodes'
    ),
    row=2, col=2
)

fig_ts.update_layout(
    title=dict(text='<b>TIME SERIES ANALYSIS - ENROLLMENT PATTERNS</b>', x=0.5),
    height=600,
    showlegend=False,
    template='plotly_white'
)

fig_ts.write_html(f"{OUTPUT_DIR}/charts/05_time_series_overview.html")
print(" Time series overview chart saved!")

In [None]:
# ============================================
# CELL 5: Calculate Trend and Seasonality
# ============================================

print("\n DECOMPOSING TREND & SEASONALITY")
print("=" * 60)

enrollment_series = monthly_data['total_enrollments']

# Trend: centred moving average over up to 3 months. min_periods=1 keeps the
# first and last months defined even though their window is incomplete.
window = min(3, len(monthly_data))  # shrink the window if there are fewer than 3 months
monthly_data['trend'] = enrollment_series.rolling(window=window, center=True, min_periods=1).mean()

# Seasonal index: mean of each calendar month divided by the overall monthly
# mean (>1 = above-average month). It is computed from ALL months in
# `monthly_data`, i.e. before any train/test split is made.
overall_avg = enrollment_series.mean()
monthly_seasonality = enrollment_series.groupby(monthly_data['month_num']).mean()
seasonal_index = monthly_seasonality / overall_avg

print("\n SEASONAL INDEX BY MONTH:")
months_map = dict(enumerate(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    start=1,
))
for month_num, idx in seasonal_index.items():
    month_name = months_map.get(month_num, str(month_num))
    if idx > 1:
        direction = "↑"
    elif idx < 1:
        direction = "↓"
    else:
        direction = "→"
    print(f"   {month_name}: {idx:.3f} {direction}")

# Attach each row's calendar-month index to the monthly table.
monthly_data['seasonal_index'] = monthly_data['month_num'].map(seasonal_index)

# Month-over-month growth in percent (first month has no predecessor -> NaN).
monthly_data['pct_change'] = enrollment_series.pct_change() * 100
growth_rates = monthly_data['pct_change']

print(f"\n TREND ANALYSIS:")
print(f"   Average Growth Rate: {growth_rates.mean():.1f}%")
print(f"   Max Growth: {growth_rates.max():.1f}%")
print(f"   Max Decline: {growth_rates.min():.1f}%")

In [None]:
# ============================================
# CELL 6: Train/Test Split for Forecasting
# ============================================

print("\n PREPARING DATA FOR FORECASTING")
print("=" * 60)


def _month_span(frame):
    """Return the first and last month of ``frame`` as 'Mon YYYY to Mon YYYY'."""
    months = frame['month']
    return f"{months.min().strftime('%b %Y')} to {months.max().strftime('%b %Y')}"


# Chronological split: the earliest 80% of months train the models and the
# remaining, most recent months are held out for evaluation.
train_size = int(len(monthly_data) * 0.8)
train_data = monthly_data.head(train_size).copy()
test_data = monthly_data.iloc[train_size:].copy()

print(f"\n Data Split:")
print(f"   Training: {len(train_data)} months ({_month_span(train_data)})")
print(f"   Testing: {len(test_data)} months ({_month_span(test_data)})")

# Consecutive integer time axis for the regression; the test index continues
# where the training index stops.
n_train = len(train_data)
train_data['time_index'] = range(n_train)
test_data['time_index'] = range(n_train, n_train + len(test_data))

print(f"\n   Train Avg Enrollments: {train_data['total_enrollments'].mean():,.0f}")
print(f"   Test Avg Enrollments: {test_data['total_enrollments'].mean():,.0f}")

In [None]:
# ============================================
# CELL 7: Linear Trend Model
# ============================================

print("\n MODEL 1: LINEAR TREND FORECASTING")
print("=" * 60)

# Fit enrollments ~ time_index on the training months only.
X_train = train_data[['time_index']].to_numpy()
y_train = train_data['total_enrollments'].to_numpy()
lr_model = LinearRegression().fit(X_train, y_train)

# Extrapolate the fitted line over the held-out months.
X_test = test_data[['time_index']].to_numpy()
test_data['lr_prediction'] = lr_model.predict(X_test)

# Score the extrapolation against the held-out actuals.
if test_data.empty:
    print("⚠️ Not enough test data for evaluation")
else:
    actual = test_data['total_enrollments']
    predicted = test_data['lr_prediction']

    mae_lr = mean_absolute_error(actual, predicted)
    rmse_lr = np.sqrt(mean_squared_error(actual, predicted))
    r2_lr = r2_score(actual, predicted)
    # MAPE: mean absolute error relative to the actual value, in percent.
    mape_lr = ((actual - predicted) / actual).abs().mean() * 100

    print(f"\n LINEAR REGRESSION RESULTS:")
    print("-" * 50)
    print(f"   Coefficient (Slope): {lr_model.coef_[0]:,.2f}")
    print(f"   Intercept: {lr_model.intercept_:,.2f}")
    print(f"\n   Test Set Metrics:")
    print(f"   MAE: {mae_lr:,.0f}")
    print(f"   RMSE: {rmse_lr:,.0f}")
    print(f"   R² Score: {r2_lr:.3f}")
    print(f"   MAPE: {mape_lr:.1f}%")

In [None]:
# ============================================
# CELL 8: Moving Average Model
# ============================================

print("\n MODEL 2: MOVING AVERAGE FORECASTING")
print("=" * 60)

# Baseline: the mean of the last (up to) three training months, used as a
# flat prediction for every held-out month.
n_periods = min(len(train_data), 3)
train_totals = train_data['total_enrollments'].to_numpy()
last_values = train_totals[len(train_totals) - n_periods:]

ma_prediction = last_values.mean()
test_data['ma_prediction'] = ma_prediction

# Score the flat forecast against the held-out actuals.
if not test_data.empty:
    actual = test_data['total_enrollments']
    predicted = test_data['ma_prediction']

    mae_ma = mean_absolute_error(actual, predicted)
    rmse_ma = np.sqrt(mean_squared_error(actual, predicted))
    # MAPE: mean absolute error relative to the actual value, in percent.
    mape_ma = ((actual - predicted) / actual).abs().mean() * 100

    print(f"\n MOVING AVERAGE ({n_periods}-PERIOD) RESULTS:")
    print("-" * 50)
    print(f"   Moving Average Value: {ma_prediction:,.0f}")
    print(f"\n   Test Set Metrics:")
    print(f"   MAE: {mae_ma:,.0f}")
    print(f"   RMSE: {rmse_ma:,.0f}")
    print(f"   MAPE: {mape_ma:.1f}%")

In [None]:
# ============================================
# CELL 9: Seasonal Naive Model
# ============================================
# Linear-trend prediction scaled by a calendar-month seasonal index.

print("\n MODEL 3: SEASONAL ADJUSTED FORECAST")
print("="*60)

# Build the seasonal index from the TRAINING months only. The
# `seasonal_index` column already on `test_data` was derived from every month
# in `monthly_data`, including the held-out ones, so multiplying by it leaks
# the test targets into the prediction and makes the metrics look better than
# a real forecast could be.
train_month_means = train_data.groupby('month_num')['total_enrollments'].mean()
train_seasonal_index = (
    (train_month_means / train_data['total_enrollments'].mean())
    .replace([np.inf, -np.inf], np.nan)  # guard against a zero training mean
)

# Calendar months never seen in training get a neutral factor of 1.0, i.e.
# the plain linear-trend prediction.
mapped_index = test_data['month_num'].map(train_seasonal_index)
n_unseen_months = int(mapped_index.isna().sum())
test_data['train_seasonal_index'] = mapped_index.fillna(1.0)

# Apply seasonal adjustment to linear trend
test_data['seasonal_prediction'] = test_data['lr_prediction'] * test_data['train_seasonal_index']

# Calculate metrics
if len(test_data) > 0:
    mae_seasonal = mean_absolute_error(test_data['total_enrollments'], test_data['seasonal_prediction'])
    rmse_seasonal = np.sqrt(mean_squared_error(test_data['total_enrollments'], test_data['seasonal_prediction']))
    mape_seasonal = np.mean(np.abs((test_data['total_enrollments'] - test_data['seasonal_prediction']) / test_data['total_enrollments'])) * 100

    print(f"\n SEASONAL ADJUSTED MODEL RESULTS:")
    print("-" * 50)
    print(f"   Base Model: Linear Trend")
    print(f"   Adjustment: Monthly Seasonal Index")
    if n_unseen_months > 0:
        print(f"   Note: {n_unseen_months} test month(s) have no training history; seasonal factor 1.0 used")
    print(f"\n   Test Set Metrics:")
    print(f"   MAE: {mae_seasonal:,.0f}")
    print(f"   RMSE: {rmse_seasonal:,.0f}")
    print(f"   MAPE: {mape_seasonal:.1f}%")

In [None]:
# ============================================
# CELL 10: Model Comparison
# ============================================

print("\n MODEL COMPARISON")
print("=" * 60)

# The metric variables follow the pattern <metric>_<model> (e.g. mae_lr,
# rmse_ma, mape_seasonal). Any metric that was never computed in this session
# is reported as NaN instead of raising NameError.
model_suffixes = ['lr', 'ma', 'seasonal']
metric_prefixes = {'MAE': 'mae', 'RMSE': 'rmse', 'MAPE%': 'mape'}

comparison_columns = {'Model': ['Linear Trend', 'Moving Average', 'Seasonal Adjusted']}
for column_label, prefix in metric_prefixes.items():
    comparison_columns[column_label] = [
        globals().get(f"{prefix}_{suffix}", np.nan) for suffix in model_suffixes
    ]
models_comparison = pd.DataFrame(comparison_columns)

print("\n MODEL PERFORMANCE COMPARISON:")
print("-" * 60)
print(models_comparison.to_string(index=False))

# The best model is the one with the lowest MAPE.
best_model_idx = models_comparison['MAPE%'].idxmin()
best_row = models_comparison.loc[best_model_idx]
best_model = best_row['Model']
best_mape = best_row['MAPE%']

print(f"\n BEST MODEL: {best_model}")
print(f"   MAPE: {best_mape:.1f}%")
print(f"   Forecast Accuracy: {100-best_mape:.1f}%")

In [None]:
# ============================================
# CELL 11: Generate 6-Month Forecast
# ============================================

print("\n GENERATING 6-MONTH FORECAST")
print("=" * 60)

# Six month-start dates immediately after the last observed month.
last_date = monthly_data['month'].max()
first_future_month = last_date + pd.DateOffset(months=1)
future_dates = pd.date_range(start=first_future_month, periods=6, freq='MS')

# The time index continues the 0..n-1 axis used for the train/test split.
n_history = len(monthly_data)
forecast_df = pd.DataFrame({
    'month': future_dates,
    'time_index': range(n_history, n_history + 6)
})

forecast_df['month_num'] = forecast_df['month'].dt.month
forecast_df['seasonal_index'] = forecast_df['month_num'].map(seasonal_index)

# Forecast = linear trend x seasonal index. This approach is used regardless
# of which model the comparison cell picked as best. Months without a seasonal
# index fall back to a factor of 1, and negative projections are floored at 0.
forecast_df['base_forecast'] = lr_model.predict(forecast_df[['time_index']].values)
seasonal_factor = forecast_df['seasonal_index'].fillna(1)
forecast_df['forecast'] = (forecast_df['base_forecast'] * seasonal_factor).clip(lower=0)

print("\n 6-MONTH ENROLLMENT FORECAST:")
print("-" * 60)
for forecast_month, projected in zip(forecast_df['month'], forecast_df['forecast']):
    month_name = forecast_month.strftime('%B %Y')
    print(f"   {month_name}: {projected:,.0f} projected enrollments")

# Compare the forecast mean with the mean of the last three observed months.
current_avg = monthly_data['total_enrollments'].tail(3).mean()
forecast_avg = forecast_df['forecast'].mean()
growth_pct = (forecast_avg - current_avg) / current_avg * 100

print(f"\n FORECAST SUMMARY:")
print(f"   Current 3-Month Avg: {current_avg:,.0f}")
print(f"   Forecast 6-Month Avg: {forecast_avg:,.0f}")
print(f"   Projected Growth: {growth_pct:+.1f}%")

In [None]:
# ============================================
# CELL 12: Forecast Visualization
# ============================================
# Overlays history, the training span, the held-out predictions and the
# 6-month projection on one figure and writes it to an HTML file.

print("\n CREATING FORECAST VISUALIZATION")
print("="*60)

# Combine historical and forecast data
fig_forecast = go.Figure()

# Historical data
fig_forecast.add_trace(go.Scatter(
    x=monthly_data['month'],
    y=monthly_data['total_enrollments'],
    mode='lines+markers',
    name='Historical',
    line=dict(color='#3498db', width=2)
))

# Training period highlight
fig_forecast.add_trace(go.Scatter(
    x=train_data['month'],
    y=train_data['total_enrollments'],
    mode='lines',
    name='Training Data',
    line=dict(color='#2ecc71', width=3, dash='solid'),
    showlegend=True
))

# Test predictions
if len(test_data) > 0:
    fig_forecast.add_trace(go.Scatter(
        x=test_data['month'],
        y=test_data['seasonal_prediction'],
        mode='lines+markers',
        name='Test Predictions',
        line=dict(color='#f39c12', width=2, dash='dash')
    ))

# Forecast
fig_forecast.add_trace(go.Scatter(
    x=forecast_df['month'],
    y=forecast_df['forecast'],
    mode='lines+markers',
    name='6-Month Forecast',
    line=dict(color='#e74c3c', width=3, dash='dot'),
    marker=dict(size=10, symbol='star')
))

# Add vertical line at forecast start.
# add_vline(..., annotation_text=...) is avoided on purpose: to position the
# label Plotly averages the line's x coordinates, which raises TypeError for
# datetime/Timestamp values in affected Plotly versions. A shape plus a
# separate annotation accepts datetimes directly and draws the same marker.
fig_forecast.add_shape(
    type="line",
    xref="x", yref="paper",      # x in data units, y spans the full plot height
    x0=last_date, x1=last_date,
    y0=0, y1=1,
    line=dict(color="gray", dash="dash")
)
fig_forecast.add_annotation(
    x=last_date, xref="x",
    y=1, yref="paper",
    text="Forecast Start",
    showarrow=False,
    xanchor="left",
    yanchor="top"
)

fig_forecast.update_layout(
    title=dict(
        text='<b>ENROLLMENT DEMAND FORECAST</b><br><sup>Historical + 6-Month Projection</sup>',
        x=0.5
    ),
    xaxis_title='Date',
    yaxis_title='Total Enrollments',
    template='plotly_white',
    height=500
)

fig_forecast.write_html(f"{OUTPUT_DIR}/charts/05_demand_forecast.html")
print(" Forecast chart saved!")

In [None]:
# ============================================
# CELL 13: State-Level Forecast
# ============================================
# Per-state monthly totals, their average, and first-to-last-month growth.

print("\n STATE-LEVEL DEMAND ANALYSIS")
print("="*60)

# Get state-wise monthly enrollments (one row per state and month), ordered
# chronologically within each state so 'first'/'last' below mean the earliest
# and latest month.
state_monthly = df_enrolment.groupby(['state', 'year_month'])['total_enrolments'].sum().reset_index()
state_monthly['year_month'] = state_monthly['year_month'].dt.to_timestamp()
state_monthly = state_monthly.sort_values(['state', 'year_month']).reset_index(drop=True)

# Calculate average monthly demand per state.
# This must average the MONTHLY totals: averaging the raw records gives the
# mean enrolments per input row, which is not a monthly figure even though it
# is labelled and exported as one.
state_avg = state_monthly.groupby('state')['total_enrolments'].mean().sort_values(ascending=False)

print("\n TOP 10 STATES BY AVERAGE MONTHLY ENROLLMENT:")
print("-" * 50)
for i, (state, avg) in enumerate(state_avg.head(10).items(), 1):
    print(f"   {i}. {str(state)[:30]}: {avg:,.0f}")

# Project state-wise growth from the first to the last observed month of each
# state. Using the raw records here would compare two arbitrary input rows
# whose order depends on how the CSV files happened to be read.
state_growth = (
    state_monthly.groupby('state')['total_enrolments']
    .agg(['first', 'last', 'mean'])
    .reset_index()
)
state_growth.columns = ['state', 'first_period', 'last_period', 'avg']
# A zero first month has no defined percentage growth; use NaN rather than
# letting the division produce +/-inf that would dominate the ranking.
growth_baseline = state_growth['first_period'].replace(0, np.nan)
state_growth['growth_rate'] = ((state_growth['last_period'] - state_growth['first_period']) /
                                growth_baseline * 100)
state_growth = state_growth.sort_values('growth_rate', ascending=False)

print("\n TOP GROWING STATES:")
print("-" * 50)
for _, row in state_growth.dropna(subset=['growth_rate']).head(5).iterrows():
    print(f"   {str(row['state'])[:30]}: {row['growth_rate']:+.1f}% growth")

In [None]:
# ============================================
# CELL 14: Save Results & Summary
# ============================================

print("\n SAVING FORECASTING RESULTS")
print("=" * 60)

# Seasonal index as a 12-row table; calendar months with no data get the
# neutral value 1.0.
seasonal_df = pd.DataFrame(
    [(name, seasonal_index.get(num, 1.0)) for num, name in months_map.items()],
    columns=['month', 'seasonal_index'],
)

# (frame, file name, confirmation message) for every CSV export, written in
# this order.
csv_exports = [
    (monthly_data, "05_monthly_enrollments.csv", " Monthly enrollments saved"),
    (forecast_df, "05_6month_forecast.csv", " 6-month forecast saved"),
    (models_comparison, "05_model_comparison.csv", " Model comparison saved"),
    (seasonal_df, "05_seasonal_index.csv", " Seasonal index saved"),
    (state_avg.reset_index(), "05_state_monthly_avg.csv", " State monthly averages saved"),
]
for export_frame, file_name, saved_message in csv_exports:
    export_frame.to_csv(f"{OUTPUT_DIR}/{file_name}", index=False)
    print(saved_message)

print(f"\n All outputs saved to: {OUTPUT_DIR}")

# Final Summary
banner = "=" * 70
print("\n" + banner)
print(" DEMAND FORECASTING - SUMMARY")
print(banner)

summary_text = f"""
 ANALYSIS RESULTS:
   • Data Period: {monthly_data['month'].min().strftime('%b %Y')} to {monthly_data['month'].max().strftime('%b %Y')}
   • Total Months Analyzed: {len(monthly_data)}
   • Best Model: {best_model}
   • Forecast Accuracy: {100-best_mape:.1f}%

 6-MONTH FORECAST:
   • Average Projected: {forecast_df['forecast'].mean():,.0f}
   • Expected Growth: {growth_pct:+.1f}%

 OUTPUTS GENERATED:
   • 5 CSV files with analysis results
   • 2 Interactive HTML charts
"""
print(summary_text)

print(banner)
print(" NOTEBOOK 05 COMPLETE!")
print(banner)