# Graphs And Insights From the Clean Enrollment Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

# Read the cleaned enrolment CSV (decoded as Latin-1), then show its column
# names and its (rows, columns) shape as a quick sanity check.
df = pd.read_csv('Enrollment_final_cleaned.csv', encoding='latin1')
print(df.columns.tolist())
print(df.shape)

In [None]:
# Number of distinct values in the 'state' column (displayed as the cell output).
df['state'].nunique()

In [None]:
# Age wise aadhar enrollment

from matplotlib.colors import Normalize, LinearSegmentedColormap
import warnings


# Silence every warning raised from here on (this also hides deprecation notices).
warnings.filterwarnings("ignore")

# Global plot styling shared by the figures below; the rcParams update is applied
# after sns.set so these font sizes take precedence.
sns.set(style="whitegrid")
plt.rcParams.update({
    'font.size': 12,
    'figure.titlesize': 16,
    'axes.titlesize': 14,
})

# Parse the day-month-year date strings, then derive the month name and the
# per-row child / adult / total enrolment counts.
df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y', dayfirst=True)
df['Month'] = df['date'].dt.strftime('%B')
df['Child'] = df['age_0_4'] + df['age_5_17']
df['Adult'] = df['age_18_plus']
df['Total'] = df['Child'] + df['Adult']


def gradient_colors(values, cmap_name='viridis'):
    """Map numeric values to colours along a log-scaled colormap gradient.

    Each value is transformed with ``log1p`` and normalised between the smallest
    and largest transformed value before being looked up in the colormap.

    Parameters
    ----------
    values : array-like of numbers
        Values to colour; converted to a float array.
    cmap_name : str, default 'viridis'
        Name of a registered matplotlib colormap.

    Returns
    -------
    list
        One colour per input value, in input order. An empty input yields ``[]``.
    """
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        # .min() / .max() raise on an empty array; there is nothing to colour anyway.
        return []
    values_log = np.log1p(values)
    norm = Normalize(vmin=values_log.min(), vmax=values_log.max())
    # plt.cm.get_cmap was deprecated in matplotlib 3.7 and removed in 3.9;
    # plt.get_cmap is the supported lookup and works on older releases too.
    cmap = plt.get_cmap(cmap_name)
    return [cmap(norm(val)) for val in values_log]

# Sum enrolments per age band, tag every row with its largest age band, and
# express each band's total as a percentage of all enrolments.
age_columns = ['age_0_4', 'age_5_17', 'age_18_plus']

age_totals = df[age_columns].sum()
dominant_age = df.assign(dominant_age_group=df[age_columns].idxmax(axis=1))
percentage_share = age_totals.div(age_totals.sum()).mul(100)

print("Total Enrolments by Age Group:\n", age_totals)
print("\nDominant Age Group Count:\n", dominant_age['dominant_age_group'].value_counts())
print("\nPercentage Contribution by Age Group:\n", percentage_share)


# Bar chart of total enrolments per age band, annotated with the exact counts.
fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(
    age_totals.index,
    age_totals.values,
    color=gradient_colors(age_totals.values, "coolwarm"),
    edgecolor='black',
    alpha=0.9,
)

# Write each bar's value just above its top edge (fixed offset of 1500 in data units).
for rect in bars:
    top = rect.get_height()
    ax.text(
        rect.get_x() + rect.get_width() / 2, top + 1500, f'{int(top):,}',
        ha='center', va='bottom', fontweight='bold', fontsize=11,
    )

ax.set_xlabel("Age Group", fontweight='bold')
ax.set_ylabel("Total Enrolments", fontweight='bold')
ax.set_title("Age-wise Aadhaar Enrolment Distribution", fontweight='bold', fontsize=16)
ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()


# Pie chart of each age band's share of total enrolments; every wedge is
# pulled out slightly and outlined in white.
explode = (0.05,) * 3
colors = gradient_colors(age_totals.values, 'Spectral')

fig, ax = plt.subplots(figsize=(7, 7))
ax.pie(
    age_totals.values,
    labels=age_totals.index,
    colors=colors,
    explode=explode,
    autopct='%1.1f%%',
    startangle=90,
    shadow=True,
    wedgeprops={'edgecolor': 'white', 'linewidth': 1.5},
)
ax.set_title("Percentage Share of Aadhaar Enrolments by Age Group", fontweight='bold', fontsize=16)
plt.show()


# Stacked bar chart of enrolments per state, split by age band, on a log y-axis.
#
# Bar colour encodes the state's TOTAL enrolment (gradient), so colour cannot
# identify an age band. Each band therefore gets its own hatch pattern, and the
# legend is built from proxy patches showing those hatches. Passing only label
# names to ax.legend would attach them to the first three bars drawn, which
# belong to three different states rather than to the three age bands.
from matplotlib.patches import Patch

state_age = df.groupby('state')[['age_0_4','age_5_17','age_18_plus']].sum()
fig, ax = plt.subplots(figsize=(18,8))

cmap = LinearSegmentedColormap.from_list("vibrant_stack", ['#1f77b4','#2ca02c','#ff7f0e'], N=256)
all_totals = state_age.sum(axis=1).values
norm = Normalize(vmin=min(all_totals), vmax=max(all_totals))
state_colors = cmap(norm(all_totals))  # one RGBA row per state

positions = np.arange(len(state_age))
age_hatches = dict(zip(state_age.columns, ['', '///', '...']))

# One vectorised bar call per age band, stacked on the running total of the bands below it.
bottom = np.zeros(len(state_age))
for age_group in state_age.columns:
    heights = state_age[age_group].values
    ax.bar(
        positions, heights, bottom=bottom.copy(),
        color=state_colors, edgecolor='white', width=0.6, alpha=0.9,
        hatch=age_hatches[age_group]
    )
    bottom += heights

ax.set_yscale('log')

# Print each state's total just above its stack.
for pos, total in zip(positions, all_totals):
    ax.text(pos, total*1.05, f'{int(total):,}', ha='center', va='bottom', fontweight='bold', fontsize=9, rotation=45)

ax.set_xlabel("State", fontweight='bold')
ax.set_ylabel("Total Enrolments (log scale)", fontweight='bold')
ax.set_title("State-wise Age Group Aadhaar Enrollments (Gradient, Log Scale)", fontweight='bold', fontsize=16)
ax.grid(axis='y', linestyle='--', alpha=0.6)
# Bars sit at numeric positions, so fix the tick locations before labelling them.
ax.set_xticks(positions)
ax.set_xticklabels(state_age.index, rotation=45, ha='right')
legend_handles = [
    Patch(facecolor='#7f7f7f', edgecolor='white', hatch=age_hatches[age_group], label=age_group)
    for age_group in state_age.columns
]
ax.legend(handles=legend_handles, title="Age Group")
plt.tight_layout()
plt.show()

The enrolment data is strongly skewed toward younger age groups.  
Children aged **0–4 years contribute 57.5%** of total enrolments, followed by **5–17 years (37.6%)**, while **adult enrolment (18+) remains very low at ~5%**.  
The state-wise analysis further shows that **high-population states like Uttar Pradesh, Bihar, Maharashtra, and West Bengal dominate enrolments**, especially in the **0–4 and 5–17 age groups**, whereas **18+ enrolments are consistently minimal across almost all states**.



In [None]:
# State Wise Aadhar Enrollment
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Normalise state names (trim whitespace, Title Case), then total the three age
# bands per state and rank states by their overall enrolment, largest first.
df['state'] = df['state'].str.strip().str.title()

state_totals = (
    df.groupby('state')[['age_0_4', 'age_5_17', 'age_18_plus']]
    .sum()
    .assign(total_enrolment=lambda band_sums: band_sums.sum(axis=1))
    .sort_values('total_enrolment', ascending=False)
)


def gradient_bar(ax, x, y, cmap_name='viridis', min_c=0.2, max_c=0.8):
    """Draw one bar per (x, y) pair, shading the bars along a colormap gradient.

    The first bar uses colormap position ``min_c`` and the last uses ``max_c``;
    bars in between are spaced evenly. A dashed grid is switched on afterwards.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    x : sequence
        Bar positions or labels, indexable by integer position.
    y : sequence of numbers
        Bar heights, same length as ``x``.
    cmap_name : str, default 'viridis'
        Name of a registered matplotlib colormap.
    min_c, max_c : float
        Colormap positions used for the first and last bar.
    """
    cmap = plt.get_cmap(cmap_name)
    n = len(y)
    for i, val in enumerate(y):
        # With a single bar there is no gradient to span; use min_c instead of
        # evaluating i / (n - 1), which would divide by zero.
        fraction = i / (n - 1) if n > 1 else 0.0
        color = cmap(min_c + (max_c - min_c) * fraction)
        ax.bar(x[i], val, color=color)
    ax.grid(True, linestyle='--', alpha=0.6)


# Full ranking: one gradient-shaded bar per state, in the order of state_totals.
fig, ax = plt.subplots(figsize=(14, 6))
gradient_bar(ax, state_totals.index, state_totals['total_enrolment'], cmap_name='YlGnBu')
ax.set_xlabel('State')
ax.set_ylabel('Total Enrolments')
ax.set_title('State-wise Aadhaar Enrolment')
plt.xticks(rotation=45, ha='right')
plt.show()


# Grouped bars comparing adult (age_18_plus) and youngest-child (age_0_4)
# enrolments for every state, in the order of state_totals.
# NOTE: the 5–17 band is not part of this comparison.
adult_vs_child = state_totals[['age_18_plus', 'age_0_4']]
plt.figure(figsize=(14,6))
width = 0.4
x = np.arange(len(adult_vs_child))
n = len(adult_vs_child)
cmap_adult = plt.get_cmap('Oranges')
cmap_child = plt.get_cmap('Blues')

# Evenly spaced colormap positions from 0.3 to 0.8, one per state. np.linspace
# matches 0.3 + 0.5 * i / (n - 1) for n > 1 and is also defined for a single
# state, where that expression would divide by zero.
shades = np.linspace(0.3, 0.8, n)

plt.bar(x - width/2, adult_vs_child['age_18_plus'].values, width=width,
        color=cmap_adult(shades), label='Adult (18+)')
plt.bar(x + width/2, adult_vs_child['age_0_4'].values, width=width,
        color=cmap_child(shades), label='Child (0–4)')

plt.xticks(x, adult_vs_child.index, rotation=45, ha='right')
plt.xlabel('State')
plt.ylabel('Enrolments')
plt.title('Adult vs Child Aadhaar Enrolments by State')
plt.grid(True, linestyle='--', alpha=0.6)
# Without a legend the orange and blue series cannot be told apart.
plt.legend()
plt.show()

# Heatmap of enrolment counts: one row per state (in state_totals order),
# one column per age band.
age_band_counts = state_totals[['age_0_4', 'age_5_17', 'age_18_plus']]

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(age_band_counts, cmap='YlOrRd', linewidths=0.5, linecolor='gray', ax=ax)
ax.set_xlabel('Age Group')
ax.set_ylabel('State')
ax.set_title('Heatmap of Aadhaar Enrolments by State')
plt.show()


# First ten rows of the ranked state table, drawn as gradient-shaded bars.
top_10 = state_totals.iloc[:10]

fig, ax = plt.subplots(figsize=(10, 5))
gradient_bar(ax, top_10.index, top_10['total_enrolment'], cmap_name='coolwarm', min_c=0.3, max_c=0.8)
ax.set_xlabel('State')
ax.set_ylabel('Total Enrolments')
ax.set_title('Top 10 States by Total Aadhaar Enrolment')
plt.xticks(rotation=45, ha='right')
plt.show()


The analysis shows that **Uttar Pradesh, Bihar, Maharashtra, and West Bengal** lead in total Aadhaar enrolments, reflecting their large populations.  
Across almost all states, **child enrolments (0–4 and 5–17)** dominate, while **adult enrolment (18+) remains consistently low**.  
The heatmap highlights a **strong concentration of enrolments in younger age groups**, confirming that Aadhaar coverage is highest among children rather than adults.


In [None]:
# Adult vs Child Enrollment by States: 

age_bands = ['age_0_4', 'age_5_17', 'age_18_plus']

# Normalise state names (trim whitespace, Title Case).
df['state'] = df['state'].str.strip().str.title()

# Keep only rows where all three age-band counts are non-negative.
non_negative = (df[age_bands] >= 0).all(axis=1)
df = df[non_negative]

# Per-state totals for each band plus their overall sum.
state_totals = df.groupby('state')[age_bands].sum()
state_totals['total'] = state_totals.sum(axis=1)

# Each band as a percentage of the state's total, ordered by adult share (highest first).
state_percent = (
    state_totals[age_bands]
    .div(state_totals['total'], axis=0)
    .mul(100)
    .sort_values('age_18_plus', ascending=False)
)

# Grouped percentage bars per state, in the existing order of state_percent.
# Legend text is attached to columns by name rather than by position, so a label
# cannot drift onto the wrong series if the column order ever changes. The
# youngest band is labelled 0–4 to match the `age_0_4` column it is computed from.
legend_labels = {
    'age_0_4': 'Children (0–4)',
    'age_5_17': 'Teenage (5-17)',
    'age_18_plus': 'Adults (18+)',
}
state_percent.rename(columns=legend_labels).plot(kind='bar', figsize=(16,10))
plt.xlabel('State')
plt.ylabel('Percentage of Enrolments')
plt.title('Adult vs Child vs Teenager Aadhaar Enrolments by State (Percentage View)')
plt.legend()  # entries come from the renamed column labels
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


The percentage view shows that **children (0–4 years) dominate Aadhaar enrolments across all states**, often contributing **60–90%** of total registrations.  
In contrast, **adult enrolment (18+) remains consistently low**, rarely exceeding **10–15%**, indicating a **nationwide gap in adult Aadhaar participation**.


In [None]:
# This code cleans Aadhaar enrolment data, analyzes district-wise trends, and visualizes key insights using bar charts

from matplotlib import cm
import numpy as np
import matplotlib.pyplot as plt

# Normalise both geography columns (trim whitespace, Title Case).
for geo_col in ('state', 'district'):
    df[geo_col] = df[geo_col].str.strip().str.title()

# Keep only rows where all three age-band counts are non-negative.
age_bands = ['age_0_4', 'age_5_17', 'age_18_plus']
df = df[(df[age_bands] >= 0).all(axis=1)]

# Row-level total across the three age bands.
df['total_enrolment'] = df[age_bands].sum(axis=1)

# Sum every count per (state, district) pair.
district_totals = df.groupby(['state', 'district'])[age_bands + ['total_enrolment']].sum()

# The ten (state, district) pairs with the largest total enrolment.
top_10_districts = district_totals.sort_values('total_enrolment', ascending=False).head(10)

# Horizontal bar chart of the ten districts with the highest total enrolment,
# largest at the top.
plt.figure(figsize=(10,6))
values = top_10_districts['total_enrolment'].values
colors = cm.Blues(np.linspace(0.4, 0.9, len(values)))
# The index is a (state, district) MultiIndex; format it explicitly so the tick
# labels read "District (State)" instead of a raw Python tuple.
district_labels = [f'{district} ({state})' for state, district in top_10_districts.index]
plt.barh(range(len(values)), values, color=colors)
plt.yticks(range(len(values)), district_labels)
plt.xlabel('Total Enrolments')
plt.ylabel('District')
plt.title('Top 10 Districts by Aadhaar Enrolment')
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# Share of each district's enrolments that falls in the age_0_4 band.
# Districts whose total is zero produce NaN here and sort to the end.
district_totals['child_percentage'] = (
    district_totals['age_0_4'] / district_totals['total_enrolment']
) * 100

high_child_districts = district_totals.sort_values(
    'child_percentage', ascending=False
).head(10)

# NOTE(review): the ranking uses the percentage alone, so districts with very few
# enrolments can top the list; consider a minimum-volume filter.
plt.figure(figsize=(10,6))
values = high_child_districts['child_percentage'].values
colors = cm.Greens(np.linspace(0.4, 0.9, len(values)))
# The index is a (state, district) MultiIndex; format it explicitly so the tick
# labels read "District (State)" instead of a raw Python tuple.
district_labels = [f'{district} ({state})' for state, district in high_child_districts.index]
plt.barh(range(len(values)), values, color=colors)
plt.yticks(range(len(values)), district_labels)
plt.xlabel('Child Enrolment Percentage')
plt.ylabel('District')
# The percentage is computed from the age_0_4 column, so the title says 0–4.
plt.title('Districts with High Child (0–4) Aadhaar Enrolments')
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# District-by-district comparison of adult (age_18_plus) and child (age_0_4)
# enrolments inside one chosen state. The lookup raises KeyError if the state
# is not present in district_totals.
selected_state = 'Karnataka'
state_districts = district_totals.loc[selected_state]

x = np.arange(len(state_districts))
adult = state_districts['age_18_plus'].values
child = state_districts['age_0_4'].values

# One shade per district for each series.
adult_colors = cm.Oranges(np.linspace(0.4, 0.9, len(adult)))
child_colors = cm.Purples(np.linspace(0.4, 0.9, len(child)))

fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(x - 0.2, adult, width=0.4, color=adult_colors, label='Adult')
ax.bar(x + 0.2, child, width=0.4, color=child_colors, label='Child')

plt.xticks(x, state_districts.index, rotation=90)
ax.set_xlabel('District')
ax.set_ylabel('Enrolments')
ax.set_title(f'District-wise Adult vs Child Enrolments in {selected_state}')
ax.grid(axis='y', linestyle='--', alpha=0.6)
ax.legend()
plt.tight_layout()
plt.show()


The district-level analysis shows that **a small number of districts contribute disproportionately to total Aadhaar enrolments**, as seen in the top-10 ranking.  
Several districts have an **exceptionally high child (0–4) enrolment share**, indicating **strong early-age registration but limited adult participation**.  
Within **Karnataka**, child enrolments consistently exceed adult (18+) enrolments across districts, highlighting a **state-wide imbalance favoring child Aadhaar coverage**.


In [None]:
# Monthly Aadhar Enrollment Activity:


# Coerce 'date' to datetime (unparseable entries become NaT) and drop rows
# that end up without a date.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df[df['date'].notna()]

# Keep only rows where all three age-band counts are non-negative.
age_bands = ['age_0_4', 'age_5_17', 'age_18_plus']
df = df[(df[age_bands] >= 0).all(axis=1)]

# Row-level total across the three age bands.
df['total_enrolment'] = df[age_bands].sum(axis=1)

# Total enrolments per monthly bin, in chronological order.
month_bins = pd.Grouper(key='date', freq='M')
monthly = df.groupby(month_bins)['total_enrolment'].sum().sort_index()

# Bar chart of total enrolments per month; the index is converted to text so
# each bar gets its own categorical tick label.
fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(monthly.index.astype(str), monthly.values)
ax.set_xlabel('Month')
ax.set_ylabel('Total Aadhaar Enrolments')
ax.set_title('Monthly Aadhaar Enrolment Activity')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Running total of the monthly enrolments, drawn as a line over time.
cumulative = monthly.cumsum()

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(cumulative.index, cumulative.values)
ax.set_xlabel('Month')
ax.set_ylabel('Cumulative Enrolments')
ax.set_title('Cumulative Aadhaar Enrolment Growth Over Time')
ax.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

# Each month's enrolments relative to the first month, scaled so that the
# first month equals 100.
indexed = (monthly / monthly.iloc[0]) * 100

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(indexed.index, indexed.values)
ax.set_xlabel('Month')
ax.set_ylabel('Indexed Enrolment (Base = 100)')
ax.set_title('Indexed Aadhaar Enrolment Trend (Growth / Decline)')
ax.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()


The monthly trend shows **clear fluctuations in Aadhaar enrolment activity**, indicating periods of higher and lower registration demand.  
The cumulative curve rises steadily, confirming **continuous growth in total enrolments over time**.  
The indexed trend highlights **months of acceleration and slowdown**, reflecting changing enrolment intensity rather than a uniform growth pattern.


In [None]:
# Monthly Trend: Initial Surge → Stabilization
# Tag every row with its calendar month (as a monthly Period).
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.to_period('M')

# Total enrolments per month, with the month converted to text for plotting.
monthly_trend = (
    df.groupby('month')['total_enrolment']
    .sum()
    .reset_index()
    .assign(month=lambda trend: trend['month'].astype(str))
)

# Line chart of the monthly totals with a marker on every month.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(monthly_trend['month'], monthly_trend['total_enrolment'], marker='o')
plt.xticks(rotation=45)
ax.set_xlabel('Month')
ax.set_ylabel('Total Enrolments')
ax.set_title('Monthly Aadhaar Enrolment Trend')
ax.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

#UP & Bihar Dominate State-wise Enrolments
# Total enrolments per state, largest first. This rebinds state_totals to a Series.
state_totals = df.groupby('state')['total_enrolment'].sum().sort_values(ascending=False)

# Horizontal bars for the ten largest states, biggest at the top.
top_states = state_totals.head(10)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_states.index, top_states.values)
ax.set_xlabel('Total Enrolments')
ax.set_ylabel('State')
ax.set_title('Top 10 States by Aadhaar Enrolment')
ax.invert_yaxis()
ax.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

# District-level enrolments are highly concentrated (Lorenz-style curve)

# Group on (state, district), as the earlier district-level cell does: grouping
# on the district name alone would merge any districts that share a name across
# different states into a single point.
district_totals = (
    df.groupby(['state', 'district'])['total_enrolment']
    .sum()
    .sort_values()
)

# Cumulative share of all enrolments, from the smallest district upward.
cum_enrolment = district_totals.cumsum()
cum_enrolment = cum_enrolment / cum_enrolment.iloc[-1]

# The k-th smallest of n districts sits at x = k / n, and the curve starts at
# the origin so it can be compared with the equal-distribution diagonal.
n_districts = len(cum_enrolment)
district_share = np.concatenate(([0.0], np.arange(1, n_districts + 1) / n_districts))
enrolment_share = np.concatenate(([0.0], cum_enrolment.values))

plt.figure(figsize=(7,7))
plt.plot(
    district_share,
    enrolment_share,
    label='District Enrolment Distribution'
)
plt.plot([0,1],[0,1],'--', label='Equal Distribution')

plt.xlabel('Cumulative Share of Districts')
plt.ylabel('Cumulative Share of Enrolments')
plt.title('Concentration of Aadhaar Enrolments Across Districts')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()


The monthly trend shows a **sharp peak in enrolments early on**, followed by **stabilization at lower levels**, indicating an initial surge and later normalization.  
**Uttar Pradesh and Bihar clearly dominate state-wise enrolments**, contributing significantly more than other states.  
At the district level, enrolments are **highly concentrated in a few districts**, reinforcing that Aadhaar activity is **clustered rather than evenly distributed**.


In [None]:
# This code generates state-wise Aadhaar enrolment insights showing district concentration, age-group distribution, and monthly trends for each state

# One 2x2 dashboard per state: top / bottom districts, adult vs child share,
# and enrolments by month name.
states = df['state'].dropna().unique()

# Calendar order for the month-name axis (the 'Month' column holds month names).
month_order = ['January','February','March','April','May','June',
               'July','August','September','October','November','December']

for state in states:
    state_df = df[df['state'] == state]

    # Districts of this state ranked by total enrolment, largest first.
    district_enroll = (
        state_df.groupby('district')['total_enrolment']
        .sum()
        .sort_values(ascending=False)
    )

    # NOTE: "Child" in this pie is the age_0_4 band only; age_5_17 is not included.
    age_group = state_df[['age_18_plus','age_0_4']].sum()

    # Totals per month name; months with no rows are shown as 0.
    month_enroll = (
        state_df.groupby('Month')['total_enrolment']
        .sum()
        .reindex(month_order)
        .fillna(0)
    )

    fig, axs = plt.subplots(2, 2, figsize=(16,10))
    fig.suptitle(f"Aadhaar Enrolment Insights – {state}", fontsize=18, fontweight='bold')

    # With fewer than ten districts these two selections overlap.
    top5 = district_enroll.head(5)
    bottom5 = district_enroll.tail(5)

    sns.barplot(x=top5.values, y=top5.index, ax=axs[0,0], palette='Greens_r')
    axs[0,0].set_title('Top 5 Districts')
    axs[0,0].set_xlabel('Enrolments')

    sns.barplot(x=bottom5.values, y=bottom5.index, ax=axs[0,1], palette='Reds_r')
    axs[0,1].set_title('Bottom 5 Districts')
    axs[0,1].set_xlabel('Enrolments')

    if age_group.sum() > 0:
        axs[1,0].pie(
            age_group,
            labels=['Adult','Child'],
            autopct='%1.1f%%',
            startangle=90
        )
    else:
        # A pie cannot be drawn from all-zero values; show a placeholder instead.
        axs[1,0].text(0.5, 0.5, 'No adult or child enrolments',
                      ha='center', va='center', transform=axs[1,0].transAxes)
        axs[1,0].axis('off')
    axs[1,0].set_title('Adult vs Child Share')

    sns.barplot(
        x=month_enroll.index,
        y=month_enroll.values,
        ax=axs[1,1],
        palette='Purples'
    )
    axs[1,1].set_title('Monthly Trend')
    axs[1,1].tick_params(axis='x', rotation=45)

    plt.tight_layout(rect=[0,0.03,1,0.95])
    plt.show()
    # Release the figure so a long state list does not accumulate open figures.
    plt.close(fig)
