<a href="https://colab.research.google.com/github/AkasK09/Data_Hackathon/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


# Location of the enrolment CSV (an absolute path under /content).
file_path = r"/content/api_data_aadhar_enrolment_1000000_1006029.csv"

# Read the whole file into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=file_path)

df.head(n=5)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,31-12-2025,Karnataka,Bidar,585330,2,3,0
1,31-12-2025,Karnataka,Bidar,585402,6,0,0
2,31-12-2025,Karnataka,Bidar,585413,1,0,0
3,31-12-2025,Karnataka,Bidar,585418,1,2,0
4,31-12-2025,Karnataka,Bidar,585421,4,3,0


In [26]:
# Structural audit of the freshly loaded frame: size, schema, nulls, duplicates.
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
df.info()

# A notebook cell only renders its LAST bare expression, so the per-column
# null counts used to be computed and silently discarded. Print both checks
# explicitly so neither result is lost.
print("\nMissing values per column:")
print(df.isnull().sum())
print("\nDuplicate rows:", df.duplicated().sum())


Shape: (6029, 7)

Columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6029 entries, 0 to 6028
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   date            6029 non-null   object
 1   state           6029 non-null   object
 2   district        6029 non-null   object
 3   pincode         6029 non-null   int64 
 4   age_0_5         6029 non-null   int64 
 5   age_5_17        6029 non-null   int64 
 6   age_18_greater  6029 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 329.8+ KB


np.int64(0)

In [27]:
# Convert the day-first date strings to datetimes; values that cannot be
# parsed become NaT instead of raising.
df['date'] = pd.to_datetime(arg=df['date'], errors='coerce', dayfirst=True)


In [28]:
# The three age-band count columns used throughout the notebook.
age_cols = ['age_0_5', 'age_5_17', 'age_18_greater']

# Force each age-band column to a numeric dtype; non-numeric entries become NaN.
df[age_cols] = df[age_cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)


In [29]:
# Rows without a date, state or district are removed; any remaining gaps in
# the age-band counts are replaced with 0.
df = (
    df.dropna(subset=['date', 'state', 'district'])
      .fillna({col: 0 for col in age_cols})
)


In [30]:
# Remove rows that are exact copies of an earlier row (first occurrence kept).
df = df.drop_duplicates(keep='first')


In [31]:
# Trim surrounding whitespace and title-case both place-name columns.
df[['state', 'district']] = df[['state', 'district']].apply(
    lambda names: names.str.strip().str.title()
)


In [32]:
# Add the row-wise total across the age bands plus calendar year and month
# extracted from the parsed date.
df = df.assign(
    total_population=df[age_cols].sum(axis=1),
    year=df['date'].dt.year,
    month=df['date'].dt.month,
)


In [33]:
# Post-cleaning check: schema, a preview of the rows, and summary statistics.
df.info()

# Only the last bare expression in a cell is rendered, so the preview was
# previously computed and thrown away. `display` is the notebook kernel's
# built-in rich renderer; calling it explicitly shows the preview as well.
display(df.head())

df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6029 entries, 0 to 6028
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              6029 non-null   datetime64[ns]
 1   state             6029 non-null   object        
 2   district          6029 non-null   object        
 3   pincode           6029 non-null   int64         
 4   age_0_5           6029 non-null   int64         
 5   age_5_17          6029 non-null   int64         
 6   age_18_greater    6029 non-null   int64         
 7   total_population  6029 non-null   int64         
 8   year              6029 non-null   int32         
 9   month             6029 non-null   int32         
dtypes: datetime64[ns](1), int32(2), int64(5), object(2)
memory usage: 424.0+ KB


Unnamed: 0,date,pincode,age_0_5,age_5_17,age_18_greater,total_population,year,month
count,6029,6029.0,6029.0,6029.0,6029.0,6029.0,6029.0,6029.0
mean,2025-12-31 00:00:00.000000256,518765.547023,3.606734,3.493448,0.096533,7.196716,2025.0,12.0
min,2025-12-31 00:00:00,110003.0,0.0,0.0,0.0,1.0,2025.0,12.0
25%,2025-12-31 00:00:00,380022.0,1.0,0.0,0.0,1.0,2025.0,12.0
50%,2025-12-31 00:00:00,518005.0,2.0,1.0,0.0,3.0,2025.0,12.0
75%,2025-12-31 00:00:00,685595.0,4.0,3.0,0.0,8.0,2025.0,12.0
max,2025-12-31 00:00:00,855116.0,102.0,89.0,9.0,190.0,2025.0,12.0
std,,193308.752319,6.055847,6.694502,0.479475,11.372608,0.0,0.0


In [34]:
#insight 1
# Sum total_population per state and order the states from largest to smallest.
state_summary = (
    df.groupby('state')['total_population']
      .sum()
      .sort_values(ascending=False)
)

# The ten largest states, used by the bar chart.
top_states = state_summary.iloc[:10]


In [36]:
# Bar chart of the ten largest states, with the value printed on each bar.
fig = px.bar(
    data_frame=top_states.reset_index(),
    x='state',
    y='total_population',
    text_auto=True,
    labels={'total_population': 'Total Population', 'state': 'State'},
    title='Top 10 States by Total Population',
)

# Slant the state names so they do not overlap.
fig.update_layout(xaxis=dict(tickangle=-45))
fig.show()


In [85]:
#insight 2
import pandas as pd
import plotly.express as px
import plotly.io as pio

# Make the white template the default for figures created from here on.
pio.templates.default = "plotly_white"

# Pairwise correlation between the three age bands and the row total.
corr_df = df.loc[:, ['age_0_5', 'age_5_17', 'age_18_greater', 'total_population']]
correlation_matrix = corr_df.corr()

print("Correlation Matrix:", correlation_matrix, sep="\n")

# Heatmap of the matrix with each coefficient written inside its cell.
fig = px.imshow(
    correlation_matrix,
    aspect="auto",
    title="Correlation Heatmap of Age Groups and Total Population",
    text_auto=True,
)

# Fixed canvas size, white paper background, black text.
fig.update_layout(
    width=700,
    height=500,
    paper_bgcolor='white',
    font_color='black',
)

fig.show()


Correlation Matrix:
                   age_0_5  age_5_17  age_18_greater  total_population
age_0_5           1.000000  0.545105        0.207614          0.862124
age_5_17          0.545105  1.000000        0.344711          0.893450
age_18_greater    0.207614  0.344711        1.000000          0.355629
total_population  0.862124  0.893450        0.355629          1.000000


In [71]:
#insight 3
# One row per (state, district) pair holding the summed total_population.
treemap_df = df.groupby(['state', 'district'], as_index=False)['total_population'].sum()


In [72]:
# Two-level treemap: states as outer tiles, districts nested inside, each
# sized by total_population. Hover values use thousands separators.
fig = px.treemap(
    data_frame=treemap_df,
    path=['state', 'district'],
    values='total_population',
    hover_data={'total_population':':,'},
    title='Population Distribution by State and District',
)

# Tighten the margins around the plot area.
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})

fig.show()


In [88]:
#insight 4
import plotly.express as px
import plotly.io as pio

pio.templates.default = "plotly_white"

# Column totals for the three age bands, reshaped into a two-column frame:
# 'age_group' (the band name) and 'count' (its overall sum).
contribution = (
    df[['age_0_5', 'age_5_17', 'age_18_greater']]
    .sum()
    .rename_axis('age_group')
    .reset_index(name='count')
)

# Donut chart: a pie with a hole covering 45% of the radius.
fig = px.pie(
    data_frame=contribution,
    names='age_group',
    values='count',
    title="Population Contribution by Age Group",
    hole=0.45,
)

# Fixed canvas size, white paper background, black text.
fig.update_layout(
    width=600,
    height=500,
    paper_bgcolor='white',
    font_color='black',
)

fig.show()


In [89]:
#insight 5
import pandas as pd
import plotly.express as px
import plotly.io as pio

pio.templates.default = "plotly_white"

# -------------------------------
# Aggregate district population per state
# -------------------------------
# One row per (state, district) with the summed total_population.
state_district = (
    df.groupby(['state','district'])['total_population']
      .sum()
      .reset_index()
)

# -------------------------------
# Compute Inequality Index (CV)
# -------------------------------
# Mean and sample standard deviation of the district totals within each state.
inequality = (
    state_district.groupby('state')['total_population']
    .agg(['mean','std'])
    .reset_index()
)

# Coefficient of variation = std / mean. The earlier `mean + 1` denominator was
# not a CV and systematically understated states with small district totals.
# A zero mean is masked to NaN so it cannot produce an infinite index.
inequality['inequality_index'] = (
    inequality['std'] / inequality['mean'].where(inequality['mean'] > 0)
)

# States with a single district have an undefined std (NaN); drop them rather
# than letting them occupy slots in the ranking, then keep the 15 highest.
inequality = (
    inequality.dropna(subset=['inequality_index'])
    .sort_values(by='inequality_index', ascending=False)
    .head(15)
)

# -------------------------------
# Visualization
# -------------------------------
fig = px.bar(
    inequality,
    x='state',
    y='inequality_index',
    title="Top States by Population Distribution Inequality (Service Planning Risk)",
    labels={'inequality_index':'Inequality Index (CV)'},
    text_auto=True
)

fig.update_layout(
    xaxis_tickangle=-45,
    width=900,
    height=500,
    paper_bgcolor='white',
    font=dict(color='black')
)

fig.show()


In [17]:
#insight 6
# Sum the 0-5 counts per district, largest first.
# District names are not guaranteed to be unique across states, so grouping on
# 'district' alone could merge unrelated districts that share a name. Group on
# (state, district) instead, matching the treemap aggregation.
child_density = (
    df.groupby(['state', 'district'])['age_0_5']
      .sum()
      .sort_values(ascending=False)
)

# Collapse the two-level key into a single "District (State)" label and keep
# the index name 'district', so the result is still a Series that yields a
# 'district' column after reset_index().
child_density.index = pd.Index(
    [f"{district} ({state})" for state, district in child_density.index],
    name='district',
)


In [37]:
# First fifteen entries of child_density as a frame with 'district' and
# 'age_0_5' columns.
child_density_top = child_density.iloc[:15].reset_index()

# Bar chart of those districts, with the value printed on each bar.
fig = px.bar(
    data_frame=child_density_top,
    x='district',
    y='age_0_5',
    text_auto=True,
    labels={'age_0_5':'Child Population'},
    title='Top Districts by Child Population (0–5)',
)

# Slant the district names so they do not overlap.
fig.update_layout(xaxis=dict(tickangle=-45))
fig.show()


In [19]:
#insight 7
# Per-district mean of each age band and of the row total; these are the
# inputs to the clustering step.
# District names are not guaranteed to be unique across states, so grouping on
# 'district' alone could average unrelated districts together. Group on
# (state, district) instead, matching the treemap aggregation.
features = (
    df.groupby(['state', 'district'])
      .agg({
          'age_0_5':'mean',
          'age_5_17':'mean',
          'age_18_greater':'mean',
          'total_population':'mean'
      })
      .reset_index()
)

# Fold the state into the 'district' label and drop the separate 'state'
# column, so the frame keeps its original shape: one text column named
# 'district' followed by numeric columns only.
features['district'] = features['district'] + ' (' + features['state'] + ')'
features = features.drop(columns='state')


In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Select the model inputs by name. The previous `features.drop('district')`
# was not idempotent: once this cell had added the 'cluster' column, running
# it again fed the old cluster labels back in as a clustering feature.
X = features[['age_0_5', 'age_5_17', 'age_18_greater', 'total_population']]

# Standardise each column to zero mean / unit variance so no single column
# dominates the distance computation.
X_scaled = StandardScaler().fit_transform(X)

# Three clusters with a fixed seed. n_init is pinned explicitly because its
# default differs between scikit-learn releases, which would otherwise make
# the assignments depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
features['cluster'] = kmeans.fit_predict(X_scaled)


In [38]:
# Cluster ids are category labels, not magnitudes. Plotly Express maps a
# numeric colour column to a continuous colour bar, so cast the ids to strings
# to get one discrete colour and legend entry per cluster.
cluster_labels = features['cluster'].astype(str)

fig = px.scatter(
    features.assign(cluster=cluster_labels),
    x='age_0_5',
    y='age_18_greater',
    color='cluster',
    # Keep the legend in a stable order regardless of row order.
    category_orders={'cluster': sorted(cluster_labels.unique())},
    hover_data=['district'],
    title='District Clustering Based on Population Pattern',
    labels={
        'age_0_5':'Avg Age 0–5',
        'age_18_greater':'Avg Age 18+'
    }
)
fig.show()


In [94]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

pio.templates.default = "plotly_white"

# -------------------------------
# Feature Selection
# -------------------------------
# Named `feature_cols` rather than `features`: the clustering cells store a
# DataFrame under `features`, and rebinding that name to a list here made those
# cells fail when re-run after this one.
feature_cols = ['age_0_5', 'age_5_17', 'age_18_greater']
target = 'total_population'

# NOTE: total_population is created earlier as the row-wise sum of exactly
# these three columns, so the target is a deterministic function of the
# inputs. A perfect fit (R² = 1, MAE = 0) is therefore expected and says
# nothing about predictive power; treat this cell as a sanity check only.
model_df = df[feature_cols + [target]].dropna()

X = model_df[feature_cols]
y = model_df[target]

# -------------------------------
# Train-Test Split
# -------------------------------
# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -------------------------------
# Train Model
# -------------------------------
model = LinearRegression()
model.fit(X_train, y_train)

# -------------------------------
# Predictions
# -------------------------------
y_pred = model.predict(X_test)

# -------------------------------
# Evaluation
# -------------------------------
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Model Performance:")
print("R² Score:", round(r2, 4))
print("MAE:", round(mae, 2))

# Show the fitted weights: coefficients of ~1 with an intercept of ~0 confirm
# the model has simply recovered the sum identity.
print("Coefficients:", dict(zip(feature_cols, np.round(model.coef_, 4))))
print("Intercept:", round(float(model.intercept_), 4))

# -------------------------------
# Visualization: Actual vs Predicted
# -------------------------------
viz_df = pd.DataFrame({
    'Actual': y_test.values,
    'Predicted': y_pred
})

# Scatter of held-out actual vs predicted values with an OLS trendline.
fig = px.scatter(
    viz_df,
    x='Actual',
    y='Predicted',
    title='Actual vs Predicted Total Population',
    trendline='ols'
)

fig.update_layout(
    width=700,
    height=500,
    paper_bgcolor='white',
    font=dict(color='black')
)

fig.show()


Model Performance:
R² Score: 1.0
MAE: 0.0
