# EDA 
## 1. Periodic Labor Force Survey 

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# header=[0,1] reads the first two CSV rows as header rows, so the columns come back as a two-level MultiIndex.
emp_industry = pd.read_csv('Cleaned Data/PLFS/PLFS_emp_industry.csv',header=[0,1])

In [None]:
emp_industry

### This dataset requires a column name and feature engineering 

In [None]:
# Two-level column labels: only the self-employment group has sub-columns,
# every other column carries an empty second level.
column_tuples = [
    ('Industry as per NIC-2008', ''),
    ('Self-Employed', 'own account worker, employer'),
    ('Self-Employed', 'helper in household enterprise'),
    ('Self-Employed', 'all self employed'),
    ('regular wage/salary', ''),
    ('casual labour', ''),
    ('all', ''),
]
col = pd.MultiIndex.from_tuples(column_tuples)
emp_industry.columns = col

In [None]:
emp_industry

In [None]:
emp_industry.iloc[0:13]

In [None]:
# Label every consecutive block of `rows_per_set` rows with one entry of `categories`.
# NOTE(review): assumes the table is stacked as Male / Female / Persons blocks of 13 rows each — confirm against the source file.
categories = ['Male', 'Female', 'Persons'] * 9
rows_per_set = 13

# Integer-dividing a row position by the block size gives the number of the block it sits in.
block_numbers = np.arange(len(emp_industry)) // rows_per_set
category_col = [categories[block] for block in block_numbers]

emp_industry['Category'] = category_col
emp_industry

In [None]:
emp_industry.iloc[0:13]

In [None]:
emp_industry.info()

In [None]:
emp_industry['Self-Employed','own account worker, employer'].iloc[0:11].mean()

## Column Types:

* `Numerical`: own account worker, employer; helper in household enterprise; all self employed; regular wage/salary; casual labour; all
* `Categorical`: Category
* `Mixed`: Industry as per NIC-2008


### Removing the sample worker row in the table because it represents the survey sample size, not actual employment data.

In [None]:
# Flag rows whose industry label mentions "sample workers" (case-insensitive, missing labels
# count as no match) and keep everything else.
is_sample_row = emp_industry['Industry as per NIC-2008'].str.contains('sample workers', case=False, na=False)
emp_industry = emp_industry[~is_sample_row]

In [None]:
# Summary statistics over every row that is not a "total" row.
is_total_row = emp_industry['Industry as per NIC-2008'].str.contains('total', case=False, na=False)
emp_industry.loc[~is_total_row].describe()

## Univariate Analysis on Numerical Columns 

### 1. own account worker, employer:

   * Mean val: 30.5%
   * STD: 18%
   * 25%: 17%, 50%: 27%, 75%: 46%, max: 70% (there might be an outlier)

In [None]:
# Working frame for the univariate analysis: every row except the "total" rows.
# .copy() makes `emp` an independent frame, so later cells that assign into it
# (the industry-label clean-up) do not write into a slice of emp_industry and
# do not trigger SettingWithCopyWarning.
is_total_row = emp_industry['Industry as per NIC-2008'].str.contains('total', case=False, na=False)
emp = emp_industry[~is_total_row].copy()

In [None]:
emp['Self-Employed']['own account worker, employer'].skew()

In [None]:
sns.set_style('darkgrid')

# The column under study, pulled out once and reused by both charts.
own_account = emp['Self-Employed']['own account worker, employer']

# Histogram with a KDE overlay.
plt.figure(figsize=(10, 5))
sns.histplot(own_account, bins=15, kde=True)
plt.title("Distribution of Own account workers & employers")
plt.xlabel('Percentage of Workforce')
plt.ylabel('Frequency')
plt.show()

# Boxplot of the same column.
plt.figure(figsize=(10, 5))
sns.boxplot(x=own_account)
plt.title('Boxplot of Own Account Worker, Employer')
plt.show()

### Conclusion:

* The histogram is roughly bimodal, with peaks around 10-30% and 40-50%.
* This shows that most industries have a low or moderate share of own-account workers, with only a few industries reaching extreme values (around 70%).
* The boxplot shows no outliers, which indicates that there are no extreme deviations.
* The spread is natural and there are no unusual spikes.

In [None]:
# `emp` (built above) already holds emp_industry without the "total" rows.
emp.describe()

### 2. Helper in Household Enterprise

In [None]:
# Pairwise correlations of the numeric columns; `emp` is emp_industry without the "total" rows.
emp.corr(numeric_only=True)

In [None]:
# The column under study, pulled out once and reused by both charts.
helper_share = emp['Self-Employed']['helper in household enterprise']

# Histogram with a KDE overlay.
plt.figure(figsize=(10, 5))
sns.histplot(helper_share, bins=15, kde=True)
plt.title('Distribution of Helpers in Household Enterprises')
plt.xlabel('Percentage of Workforce')
plt.ylabel('Frequency')
plt.show()

# Boxplot of the same column.
plt.figure(figsize=(10, 5))
sns.boxplot(x=helper_share)
plt.title('Boxplot of Helpers in Household Enterprises')
plt.show()

### Conclusion:

* Histogram & KDE:
  * The distribution is right-skewed(positively), showing that most industries have low values of "Helpers in Household Enterprise"
  * The highest frequency is near 0% showing most industries employ very few people
  * long tail extends towards 20-50%, meaning few industries have some values of Helpers in household enterprise
* Boxplot:
  * median is low
  * There are several outliers beyond 30%
  * IQR is narrow, means that most values are concentrated in the lower range
* After doing groupby, we find that females have the highest mean out of the 3 category hence the outliers 

In [None]:
# Column key of the helper share; kept in `h` because the following cell reuses it.
h = [('Self-Employed', 'helper in household enterprise')]

# Summary statistics of the helper share within each Category.
c = emp.groupby('Category')[h].describe()

# Pick the per-category mean out of the describe() table and chart it.
helper_mean_by_category = c[h[0] + ('mean',)]
helper_mean_by_category.plot(
    kind='bar',
    figsize=(10, 5),
    title='Mean Percentage of Helpers by Category',
)
plt.ylabel('Percentage of Workforce')
plt.show()

In [None]:
# Summary statistics of the helper share per industry; `h` is the column key defined in the previous cell.
d = emp.groupby('Industry as per NIC-2008')[h].describe()
d

In [None]:
emp.head(2)

### 3. All Self Employed

In [None]:
all_self_employed = emp['Self-Employed']['all self employed']

# Square-root rule: number of bins = floor(sqrt(number of observations)).
n = len(all_self_employed)
bins = int(np.sqrt(n))

# Histogram with a KDE overlay.
plt.figure(figsize=(10, 5))
sns.histplot(all_self_employed, bins=bins, kde=True)
plt.title('Distribution of All-self employed workers')
plt.xlabel('Percentage of Workforce')
plt.ylabel('Frequency')
plt.show()

# Boxplot of the same column.
plt.figure(figsize=(10, 5))
sns.boxplot(x=all_self_employed)
plt.title('Boxplot of All Self-Employed Workers')
plt.show()

### Conclusions 

* Distribution is bimodal (two peaks)
* Frequency distribution is spread out
* No outliers present
* median line is below and near 40%
* IQR is large , the middle 50% of data is quite spread out.

### 4. Regular wage/Salary

In [None]:
regular_wage = emp['regular wage/salary']

# Histogram with a KDE overlay. The bin count stays hard-coded at 4; the square-root-rule
# `n` / `bins` this cell used to compute were never passed to histplot, so that dead
# computation has been dropped.
plt.figure(figsize=(10, 5))
sns.histplot(regular_wage, kde=True, bins=4)
plt.xlabel('Percentage of Regular wage/Salary Workers')
plt.ylabel('Frequency')
plt.title('Distribution of Regular wage/salary')
plt.show()

# Boxplot of the same column.
plt.figure(figsize=(8, 4))
sns.boxplot(x=regular_wage)
plt.title('Boxplot of Regular wage/salary')
plt.show()

### Conclusions

* Highest frequency is observed in the 30-40% range, meaning most industries fall within this percentage.
* The distribution is right-skewed: frequency declines slightly towards higher percentages, so fewer industries have a very high percentage of regular wage/salary workers.
* No outliers
* median value below and near 40% and IQR covering the middle which is quite spread out 

### 5. Casual Labor

In [None]:
casual_labour = emp['casual labour']

# Histogram with a KDE overlay.
plt.figure(figsize=(10, 5))
sns.histplot(casual_labour, bins=4, kde=True)
plt.title('Distribution of Casual Labour')
plt.xlabel('Percentage of Workforce')
plt.ylabel('Frequency')
plt.show()

# Boxplot of the same column.
plt.figure(figsize=(8, 4))
sns.boxplot(x=casual_labour)
plt.title('Boxplot of casual labour')
plt.show()


In [None]:
# Summary statistics of the casual-labour share within each Category.
casual_by_category = emp.groupby('Category')['casual labour']
c = casual_by_category.describe()

# Chart only the per-category mean.
c['mean'].plot(
    kind='bar',
    figsize=(10, 5),
    title='Mean Percentage of Casual Labors by Category',
)
plt.ylabel('Percentage of Workforce')
plt.show()

### Conclusions 

* Histogram
  * Right-skewed distribution: most industries have a low percentage of casual labour, with the highest frequency in the 0-20% range.
  * Long tail towards higher percentages: in a few industries casual labour is significantly higher (above 50%), but these are less frequent.
* Boxplot
  * Median close to the lower quartile
  * Outliers present: the data has multiple outliers above 50%, i.e. some industries have an unusually high percentage of casual labour.
  * IQR is relatively small means that majority of the data falls within a narrow range (under 30%)
  * Casual Labor is generally low in most industries but a few industries deviate significantly
* Possible Analysis:
  * Investigate the industries with high casual labor participation - are they rural,industrial or economically underdeveloped?
  * Compare casual labor trends with regular wage/salary and self-employment for deeper insights.

## Univariate Analysis on Categorical Columns 

In [None]:
emp.head(2)

In [None]:
emp['Industry as per NIC-2008'].value_counts()

In [None]:
emp[emp['Industry as per NIC-2008'] == '55-56 (accommodation & food services))']

In [None]:
# Map the malformed spellings of the 55-56 industry label onto one canonical label.
# The replacement used to end in a trailing space, which only swapped one misspelling
# for another. Both variants are handled here so the cell gives the same result no
# matter which of them is present in the data.
canonical_label = '55-56 (accommodation & food services)'
emp['Industry as per NIC-2008'] = emp['Industry as per NIC-2008'].replace({
    canonical_label + ')': canonical_label,  # doubled closing parenthesis
    canonical_label + ' ': canonical_label,  # trailing space
})

In [None]:
emp['Industry as per NIC-2008'].value_counts()

In [None]:
# Overwrite the industry label of the row whose index label is 8 with the single-parenthesis spelling.
# NOTE(review): the row is addressed by a hard-coded index label — confirm 8 is still the intended row if the input file changes.
emp.loc[8,'Industry as per NIC-2008'] = '55-56 (accommodation & food services)'

In [None]:
emp['Industry as per NIC-2008'].value_counts().plot(kind='pie',autopct='%0.1f%%')

In [None]:
emp.groupby('Category')[[('Self-Employed','helper in household enterprise')]].max()

In [None]:
emp.head()

## Workforce Distribution & Trends 

### 1. How are workers distributed across different employment types ?
### 2. Which Industries have the highest and lowest percentage of regular wage/salary workers? 

In [None]:
# Keep only the "total" rows (case-insensitive match; missing labels count as no match).
is_total_row = emp_industry['Industry as per NIC-2008'].str.contains('total', case=False, na=False)
total = emp_industry[is_total_row]
total

In [None]:
import matplotlib.pyplot as plt 

In [None]:
# Column key -> label shown on the chart, in plotting order.
type_labels = {
    ('Self-Employed', 'own account worker, employer'): 'Self-Employed (Employer/Own Account)',
    ('Self-Employed', 'helper in household enterprise'): 'Self-Employed (Household Helper)',
    ('Self-Employed', 'all self employed'): 'All Self-Employed',
    ('regular wage/salary', ''): 'Regular Wage/Salary',
    ('casual labour', ''): 'Casual Labour',
}

employment_types = emp[list(type_labels)]

# Mean share of each employment type over all rows of `emp`, relabelled for display.
employment_distribution = employment_types.mean()
employment_distribution.index = list(type_labels.values())

plt.figure(figsize=(8, 5))
sns.barplot(
    x=employment_distribution.index,
    y=employment_distribution.values,
    hue=employment_distribution.index,
    palette='viridis',
)
plt.title('How are workers distributed across different employment types ?')
plt.xticks(rotation=30, ha='right')
plt.show()

### Key Insights:

* Which employment type has the highest percentage?: Regular Wage/Salary Employment type
* Which one is the least common?: Self-Employed(Household Helper)
* How do the employment types compare in terms of gender?: Self-employment is more common among females, regular wage/salary work is more common among males, and casual labour is also more common among female workers than male workers.
* Does any type have an outlier (very high or low percentage)?: Yes. Of all the employment types, helper in household enterprise and casual labour have outliers. The outliers in helper in household enterprise arise because the majority of these workers are female; similarly, the majority of workers in casual labour are female.

In [None]:
# Column keys of the five employment-type shares.
emp_types = [
    ('Self-Employed', 'own account worker, employer'),
    ('Self-Employed', 'helper in household enterprise'),
    ('Self-Employed', 'all self employed'),
    ('regular wage/salary', ''),
    ('casual labour', ''),
]

# Mean share per Category, plus the transposed view (employment types as rows).
grp = emp.groupby('Category')[emp_types].mean()
grp_t = grp.transpose()

In [None]:
emp_types = [
    ('Self-Employed', 'own account worker, employer'),
    ('Self-Employed', 'helper in household enterprise'),
    ('Self-Employed', 'all self employed'),
    ('regular wage/salary', ''),
    ('casual labour', '')
]

# Mean share of each employment type per Category.
grp = emp.groupby('Category')[emp_types].mean()

# Flatten the two-level column labels into plain strings BEFORE melting. With MultiIndex
# columns and a single var_name, melt() keeps only the first column level, which labels
# all three self-employment columns "Self-Employed" and makes the bar plot merge them
# into one bar per category.
grp.columns = [' - '.join(part for part in column_key if part) for column_key in grp.columns]
grp = grp.rename_axis('Category').reset_index()

# Long format: one row per (Category, employment type).
grp_melted = grp.melt(id_vars='Category', var_name='Employment Type', value_name='Percentage')

plt.figure(figsize=(12, 6))
sns.barplot(data=grp_melted, x='Category', y='Percentage', hue='Employment Type', palette='viridis')

# Formatting
plt.title("Percentage Distribution of Employment Types Across Categories")
plt.ylabel("Percentage (%)")
plt.xlabel("Employment Category")
plt.xticks(rotation=45)
plt.legend(title="Employment Type")

# Show the plot
plt.show()

In [None]:
grp_melted

In [None]:
# Mean share of every employment type per industry; `emp_types` comes from the cells above.
industry_grouped = emp.groupby("Industry as per NIC-2008")[emp_types].mean()

# Annotated heatmap: one row per industry, one column per employment type.
plt.figure(figsize=(12, 6))
sns.heatmap(
    industry_grouped,
    annot=True,
    fmt=".1f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title("Employment Type Distribution Across Industries")
plt.ylabel("Industry")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Largest and smallest regular wage/salary share observed for each industry.
regular_by_industry = emp.groupby('Industry as per NIC-2008')['regular wage/salary']
reg = regular_by_industry.agg(['max', 'min'])

reg.plot(kind='bar', figsize=(12, 6), colormap='coolwarm')
plt.title('Max & Min Percentage of Regular Wage/Salary Workers by Industry')
plt.xlabel('Industry')
plt.ylabel('Percentage')
plt.xticks(rotation=90)  # vertical labels so the long industry names stay readable
plt.legend(title='Statistic')
plt.show()

## Industry-Specific Insights

### 1. What percentage of workers are in casual labour across different industries?
### 2. Which industries have the highest rates of self-employment?

In [None]:
emp.groupby('Industry as per NIC-2008')['casual labour'].describe()

In [None]:
# One box per industry, coloured by industry.
plt.figure(figsize=(12, 6))
sns.boxplot(
    data=emp,
    x='Industry as per NIC-2008',
    y='casual labour',
    hue='Industry as per NIC-2008',
    palette='coolwarm',
)
plt.title('Distribution of Casual Labour Across Different Industries')
plt.xlabel('Industry')
plt.xticks(rotation=90)
plt.show()

## Gender & Employment Type Analysis 

### Are men more likely to be self-employed compared to women? (refer to above graph on category and employment type)
### Do women have higher self-employment rates than men? Compare self-employment rates by gender using a bar chart or grouped bar chart

## Casual Labour & Informal Economy

### 1. Are there industries where casual labour is more common than regular wage/salary jobs?
### 2. What does the distribution of casual labour look like across industries?

In [None]:
# Mean casual-labour and regular-wage shares per industry.
col = [('casual labour', ''), ('regular wage/salary', '')]
industry_grp = emp.groupby('Industry as per NIC-2008')[col].mean()

# Industries whose mean casual-labour share is larger than their mean regular-wage share.
casual_exceeds_regular = industry_grp['casual labour'] > industry_grp['regular wage/salary']
high_casual_labour = industry_grp[casual_exceeds_regular]

high_casual_labour.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='coolwarm')
plt.title(
    '1. Industries Where Casual Labour is More Common than Regular Wage Jobs',
    fontsize=14,
    fontweight='bold',
)
plt.xlabel('Industry', fontsize=12)
plt.ylabel('Percentage of Workforce', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend(title="Employment Type")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

## Conclusions:

* Construction has the highest percentage of casual labour
* Agriculture is also heavily casual labour dependent
* Secondary industries have a more mixed employment pattern

In [None]:
# Histogram of the per-industry mean casual-labour share (`industry_grp` comes from the cell above).
histogram_options = dict(bins=20, kde=True, color='red', alpha=0.6)

plt.figure(figsize=(10, 5))
sns.histplot(industry_grp['casual labour'], **histogram_options)
plt.title(
    "2. Distribution of Casual Labour Across Industries (Industry level trends)",
    fontsize=14,
    fontweight='bold',
)
plt.xlabel('Percentage of Casual Labour', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Histogram of the casual-labour share over every row of `emp` (no averaging per industry).
histogram_options = dict(bins=20, kde=True, color='red', alpha=0.6)

plt.figure(figsize=(10, 5))
sns.histplot(emp['casual labour'], **histogram_options)
plt.title(
    "2. Distribution of Casual Labour Across Industries (Worker level trends)",
    fontsize=14,
    fontweight='bold',
)
plt.xlabel('Percentage of Casual Labour', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# One box per industry showing the spread of the casual-labour share.
plt.figure(figsize=(12, 5))
sns.boxplot(
    data=emp,
    x="Industry as per NIC-2008",
    y="casual labour",
)
plt.title("Casual Labour Distribution Across Industries")
plt.xticks(rotation=90)
plt.show()


## Conclusions:

* Industry-level Trends Graph:
  * Each bar counts the industries whose average casual labour percentage falls in that bin.
  * Shows most industries have a casual labour percentage below 20%, but a few industries have much higher rates(~80%)
  * The KDE(curve) suggests that industries mostly fall into low casual labour categories
  * Helps analyze which industries rely more or less on casual labour
* Worker-level Trends Graph:
   * Each bar counts individual rows of the table (one industry–category combination each) rather than per-industry averages
   * Shows most rows fall in the lower casual labour range (0-20%), but some rows belong to industries with high casual labour (80-100%)
   * KDE suggests a skewed distribution meaning casual labour is more common in specific industries
   * Helps understand how widespread casual labour is across workers rather than just industries.

### EDA on PLFS Employment State dataset

In [None]:
emp_state = pd.read_csv('Cleaned Data/PLFS/PLFS_emp_state.csv',index_col=False)

In [None]:
emp_state

In [None]:
# Drop the leftover index column written by an earlier to_csv.
# errors='ignore' keeps the cell re-runnable: on a second run the column is already gone
# and a plain drop would raise KeyError.
emp_state = emp_state.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
emp_state.head()

In [None]:
emp_state.info()

In [None]:
# Label every consecutive block of `rows_per_set` rows with one entry of `categories`.
# NOTE(review): assumes the table is stacked as Male / Female / Persons blocks of 36 rows each — confirm against the source file.
categories = ['Male', 'Female', 'Persons'] * 9
rows_per_set = 36

# Integer-dividing a row position by the block size gives the number of the block it sits in.
block_numbers = np.arange(len(emp_state)) // rows_per_set
category_col = [categories[block] for block in block_numbers]

emp_state['Category'] = category_col
emp_state

In [None]:
emp_state.iloc[55]

In [None]:
# Filtered out all India rows from the dataframe
emp_1 = emp_state[~emp_state['State/UT'].str.contains('all India',case=False, na=False)]
all_india = emp_state[emp_state['State/UT'].str.contains('all India',case=False, na=False)]

In [None]:
# Order the all-India rows by Category. Rebinding the name instead of inplace=True avoids
# mutating a filtered slice of emp_state.
all_india = all_india.sort_values(by='Category')

In [None]:
# Renumber the rows 0..n-1 and discard the old index labels. Rebinding the name instead of
# inplace=True avoids mutating a filtered slice of emp_state.
all_india = all_india.reset_index(drop=True)

In [None]:
all_india

In [None]:
emp_1.info()

## Column Types:

* `Numerical`: own account worker, employer; helper in household enterprise; all self employed; regular wage/salary; casual labour; all
* `Categorical`: Category
* `Mixed`: State/UT


## Univariate Analysis on Numerical Cols

### Own account, worker, employer

In [None]:
emp_1.head()

In [None]:
emp_1.describe()

In [None]:
emp_1['Own account, worker, employer'].skew()

In [None]:
own_account_state = emp_1['Own account, worker, employer']

# Histogram with a KDE overlay.
plt.figure(figsize=(8, 5))
sns.histplot(own_account_state, bins=10, kde=True)
plt.title('Distribution of Own account, worker, employer')
plt.xlabel('Percentage')
plt.ylabel('Frequency')
plt.show()

# Boxplot of the same column.
plt.figure(figsize=(10, 5))
sns.boxplot(x=own_account_state)
plt.title('Boxplot of Own Account Worker, Employer')
plt.show()

## Conclusions:

* Histogram:
  * The distribution looks approximately normal, as its skewness is almost 0; this suggests that the share of own-account workers/employers is spread roughly symmetrically across the States/UTs of India
  * Here percentage peaks at 30-50%
* Boxplot:
  * 2 outliers are present in this data
  * Median value is above 30 and below 40, with a wide IQR indicating significant variability across states.
  * Outliers exist above 65-67% 

### Helper in Household Enterprise

In [None]:
helper_state = emp_1['Helper in household enterprise']

# Histogram with a KDE overlay.
plt.figure(figsize=(10, 5))
sns.histplot(helper_state, bins=10, kde=True)
plt.title('Distribution of Helpers in Household Enterprises')
plt.xlabel('Percentage of Workforce')
plt.ylabel('Frequency')
plt.show()

# Boxplot of the same column.
plt.figure(figsize=(10, 5))
sns.boxplot(x=helper_state)
plt.title('Boxplot of Helpers in Household Enterprises')
plt.show()

In [None]:
# Summary statistics of the helper share within each Category.
helper_by_category = emp_1.groupby('Category')['Helper in household enterprise']
c = helper_by_category.describe()

# Chart only the per-category mean.
c['mean'].plot(
    kind='bar',
    figsize=(10, 5),
    title='Mean Percentage of Helpers by Category',
)
plt.ylabel('Percentage of Workforce')
plt.show()

### Conclusion:

* Histogram & KDE:
  * The distribution is right-skewed(positively), showing that most States have low values of "Helpers in Household Enterprise"
  * The highest frequency is near 0-10%, showing that most states have very few people employed in this type.
  * long tail extends towards 20-70%, meaning few states have some values of Helpers in household enterprise
* Boxplot:
  * median is low
  * There are several outliers beyond 45%
  * IQR is narrow, means that most values are concentrated in the lower range
* After doing groupby, we find that females have the highest mean out of the 3 category hence the outliers 

## All self employed

In [None]:
all_self_employed_state = emp_1['All self employed']

# Square-root rule: number of bins = floor(sqrt(number of observations)).
n = len(all_self_employed_state)
bins = int(np.sqrt(n))

# Histogram with a KDE overlay.
plt.figure(figsize=(10, 5))
sns.histplot(all_self_employed_state, kde=True, bins=bins)
plt.xlabel('Percentage of Workforce')
plt.ylabel('Frequency')
plt.title('Distribution of All-self employed workers')
plt.show()

# Boxplot of the same column. The series is passed as x= (it used to be passed positionally,
# i.e. as `data`), so the box is drawn along the x-axis like every other boxplot in this notebook.
plt.figure(figsize=(10, 5))
sns.boxplot(x=all_self_employed_state)
plt.title('Boxplot of All Self-Employed Workers')
plt.show()

### Conclusions 

* Distribution is roughly normal, the peak frequency occurs around 50-55% of the workforce
* Most data falls between 30-70% of the workforce
* The relatively symmetric shape indicates most states have similar self-employment percentages with fewer cases at the extreme
* Boxplot indicates:
  * median is around 50-55%
  * IQR range approx between 40-65%
  * min: around 15%
  * max: around 90%

In [None]:
regular_wage_state = emp_1['Regular wage/salary']

# Histogram with a KDE overlay. The bin count stays hard-coded at 10; the square-root-rule
# `n` / `bins` this cell used to compute were never passed to histplot, so that dead
# computation has been dropped.
plt.figure(figsize=(10, 5))
sns.histplot(regular_wage_state, kde=True, bins=10)
plt.xlabel('Percentage of Regular wage/Salary Workers')
plt.ylabel('Frequency')
plt.title('Distribution of Regular wage/salary')
plt.show()

# Boxplot of the same column.
plt.figure(figsize=(8, 4))
sns.boxplot(x=regular_wage_state)
plt.title('Boxplot of Regular wage/salary')
plt.show()

### Histogram (Distribution of Regular Wage/Salary)
* Primary Peak: The most prominent feature is the peak in the 10-20% range, with a frequency of approximately 60 rows (each row is one State/UT–category combination, not an individual worker). This indicates that, for the largest group of rows, 10-20% of the workforce is in regular wage/salary employment.

* Right-Skewed Distribution: The distribution shows a clear right skew, with a long tail extending toward higher percentage values. This means that while most States/UTs are concentrated in the lower percentage ranges, there are progressively fewer States/UTs as the percentage increases.

* Declining Frequency Pattern: After the initial peak, there's a steady decline in frequency as the percentage increases. The frequency drops significantly after the 20% mark.

* Low Frequency in Higher Ranges: Very few States/UTs appear in the 60-80% range, showing that a high share of regular wage/salary employment is relatively uncommon.

### Boxplot (Boxplot of Regular Wage/Salary)
* Median Position: The median (vertical line inside the box) is positioned around 40%, indicating that in half of the rows less than 40% of the workforce is in regular wage/salary employment, and in the other half more.

* Interquartile Range (IQR): The box spans from approximately 20% to 60%, representing the middle 50% of the data. This 40% spread shows considerable variability in the central portion of the distribution.

* Whisker Extent: The whiskers extend from near 0% to about 80%, showing the full range of the data without outliers. The longer right whisker (from median to upper end) compared to the left whisker reinforces the right-skewed nature of the distribution.

* Data Concentration: The width of the box relative to the whiskers indicates that while the data spans a wide range (0-80%), there's a notable concentration in the 20-60% range.

* Overall Insights
Bimodal Tendency: Looking at both graphs together, there might be a slight bimodal tendency, with concentrations around the 10-20% range and another smaller concentration around 40-50%.

* Inequality Indicator: The right-skewed distribution suggests that regular wage/salary employment is unevenly spread across States/UTs, with many at low shares and few at high shares.

* Central Tendency vs. Mode Discrepancy: There's an interesting discrepancy between the mode (10-20% from the histogram) and the median (around 40% from the boxplot), which further emphasizes the skewed nature of the distribution.

* Smooth Transition: The density curve overlaid on the histogram shows a relatively smooth transition, suggesting that while there are peaks, the distribution changes gradually rather than having sharp cutoffs between groups.

These features collectively paint a picture of a distribution where, in most States/UTs, a relatively small share of the workforce is in regular wage/salary employment, with progressively fewer States/UTs reaching higher shares.

In [None]:
casual_labour_state = emp_1['Casual labour']

# Histogram with a KDE overlay. The bin count stays hard-coded at 10; the square-root-rule
# `n` / `bins` this cell used to compute were never passed to histplot, so that dead
# computation has been dropped.
plt.figure(figsize=(10, 5))
sns.histplot(casual_labour_state, kde=True, bins=10)
plt.xlabel('Percentage of Casual Labour')
plt.ylabel('Frequency')
plt.title('Distribution of Casual Labour')
plt.show()

# Boxplot of the same column.
plt.figure(figsize=(8, 4))
sns.boxplot(x=casual_labour_state)
plt.title('Boxplot of Casual Labour')
plt.show()

### Histogram: "Distribution of Casual Labour"
* Type of Graph: Histogram with a density curve.
* Key Features:
    * The distribution peaks around the 10% range, with the highest frequency of approximately 60 rows (State/UT–category combinations).
    * The distribution is right-skewed, with a long tail extending toward higher percentages.
    * Frequencies decrease steadily after the peak, with very few rows in the 30-40% range.
### Boxplot: "Boxplot of Casual Labour"
Key Features:
* The median is around 15%.
* The interquartile range (IQR) spans from approximately 15% to 20%.
* There are outliers beyond the upper whisker, around the 40% mark.

### Histogram Features:
* Primary Peak: The distribution has a prominent peak at approximately 10% with a frequency of about 60 rows (State/UT–category combinations), indicating this is the most common percentage of casual labor.
* Right Skewness: The distribution shows significant right skewness, with a long tail extending toward higher percentages (30-40%).
* Rapid Decline: After the peak, there's a steep decline in frequency, showing that higher percentages of casual labor become increasingly uncommon.
* Range: The distribution primarily spans from 0% to about 40%, with very few data points beyond this range.
### Boxplot Features:
* Median: Positioned at approximately 15%, showing the central tendency of casual labor percentage.
* Interquartile Range (IQR): Spans from about 15% to 20%, representing the middle 50% of the data.
* Outliers: Several outliers appear beyond the upper whisker, around the 40% mark, representing unusual cases with higher casual labor percentages.
* Compact Distribution: The relatively narrow IQR suggests consistency in the lower range of casual labor percentages.

In [None]:
emp_1.head()

In [None]:
emp_1.corr(numeric_only=True)

## Further Analysis

### 1. Regional Patterns: 

* How does employment status vary by state or union territory?
* Are there noticeable regional clusters or patterns in the percentages of self-employed, wage/salary, and casual labor workers?

In [None]:
# The five employment-type share columns; `cols` is reused by the gender analysis further down.
cols = [
    'Own account, worker, employer',
    'Helper in household enterprise',
    'All self employed',
    'Regular wage/salary',
    'Casual labour',
]

# Summary statistics of each share within every State/UT.
emp_by_state = emp_1.groupby('State/UT')[cols]
emp_grp = emp_by_state.describe()

In [None]:
# NOTE(review): emp_1 appears to hold several rows per State/UT (one per Category block), so a
# State/UT can occur more than once in a "top 10" and seaborn then averages those rows into
# one bar — confirm whether the ranking should be done within a single Category.

self_emp_cols = ['Own account, worker, employer',
                 'Helper in household enterprise',
                 'All self employed']

# Rank on the combined self-employment share. Sorting on all three columns at once is
# lexicographic (the 2nd and 3rd keys only break ties in the 1st), so it ranked by
# own-account share alone rather than by self-employment as the chart title says.
top_self_emp = emp_1.sort_values('All self employed', ascending=False).head(10)
bottom_self_emp = emp_1.sort_values('All self employed', ascending=True).head(10)

# Ranking by regular wage/salary share.
top_wage_emp = emp_1.sort_values('Regular wage/salary', ascending=False).head(10)
bottom_wage_emp = emp_1.sort_values('Regular wage/salary', ascending=True).head(10)

# Ranking by casual-labour share.
top_casual_emp = emp_1.sort_values('Casual labour', ascending=False).head(10)
bottom_casual_emp = emp_1.sort_values('Casual labour', ascending=True).head(10)

# Long format: one row per (State/UT, employment type) so that hue can split the bars.
top_self_emp_melted = top_self_emp.melt(id_vars=['State/UT'],
                                        value_vars=self_emp_cols,
                                        var_name='Employment Type',
                                        value_name='Percentage')

plt.figure(figsize=(12, 6))
sns.barplot(x='Percentage', y='State/UT', hue='Employment Type', data=top_self_emp_melted, palette='coolwarm')
plt.title('Top 10 States/UTs by Self-Employment Percentage', fontsize=16)
plt.xlabel('Percentage (%)', fontsize=14)
plt.ylabel('State/UT', fontsize=14)
plt.legend(title='Employment Type')
plt.tight_layout()
plt.show()

top_wage_emp_melted = top_wage_emp.melt(id_vars=['State/UT'],
                                        value_vars='Regular wage/salary',
                                        var_name='Employment Type',
                                        value_name='Percentage')

# Each chart opens its own figure so that all three are drawn at the same size.
plt.figure(figsize=(12, 6))
sns.barplot(x='Percentage', y='State/UT', hue='Employment Type', data=top_wage_emp_melted, palette='dark:skyblue', legend=False)
plt.title('Top 10 States/UTs by Wage/Salaried Percentage', fontsize=16)
plt.xlabel('Percentage (%)', fontsize=14)
plt.ylabel('State/UT', fontsize=14)
plt.tight_layout()
plt.show()

top_casual_emp_melted = top_casual_emp.melt(id_vars=['State/UT'],
                                            value_vars='Casual labour',
                                            var_name='Employment Type',
                                            value_name='Percentage')

plt.figure(figsize=(12, 6))
sns.barplot(x='Percentage', y='State/UT', hue='Employment Type', data=top_casual_emp_melted, palette='viridis', legend=False)
plt.title('Top 10 States/UTs by Casual labour Percentage', fontsize=16)
plt.xlabel('Percentage (%)', fontsize=14)
plt.ylabel('State/UT', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Grouping by state to analyze employment trends
emp_statewise = emp_1.groupby('State/UT')[['Own account, worker, employer', 'Helper in household enterprise', 
                                           'All self employed', 'Regular wage/salary', 'Casual labour']].mean()



In [None]:
# Plotting the employment distribution across states
plt.figure(figsize=(14,7))

# Stacked bar chart
emp_statewise.plot(kind='bar', stacked=True, colormap='viridis', figsize=(14,7))

plt.title('Employment Status Distribution Across States/UTs', fontsize=16)
plt.xlabel('State/UT', fontsize=12)
plt.ylabel('Percentage (%)', fontsize=12)
plt.legend(title="Employment Type")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


In [None]:
# Annotated heatmap: one row per State/UT, one column per employment type.
plt.figure(figsize=(14, 8))
sns.heatmap(
    emp_statewise,
    annot=True,
    fmt='.1f',
    cmap='coolwarm',
    linewidth=0.5,
)
plt.title('Employment Patterns Across States/UT', fontsize=16)
plt.xlabel('Employment Type', fontsize=12)
plt.ylabel('State/UT', fontsize=12)
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()

## Gender Based Analysis

* Splitting by the category column:
  * Compare how employment patterns differ among the gender categories.
* Self-Employment Rates by Gender:
  * Examine and compare the self employment percentages between male and female groups.

In [None]:
# Mean share of each employment type per Category; `cols` is the column list defined in the regional-patterns cell above.
gen = emp_1.groupby('Category')[cols].mean()

In [None]:
# Grouped bar chart of the three self-employment shares per Category.
# The axes are created explicitly and handed to DataFrame.plot: calling plt.figure() first
# and then DataFrame.plot() without ax= leaves an empty extra figure in the output.
fig, ax = plt.subplots(figsize=(10, 6))
self_employment_cols = ['Own account, worker, employer', 'Helper in household enterprise', 'All self employed']
gen[self_employment_cols].plot(kind='bar', colormap='viridis', ax=ax)

plt.title('Self-Employment Rates by Gender', fontsize=14, fontweight='bold')
plt.xlabel('Gender Category', fontsize=12)
plt.ylabel('Percentage of Workforce', fontsize=12)
plt.legend(title="Employment Type")
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Bar chart of the mean regular wage/salary share per Category.
regular_wage_by_gender = gen['Regular wage/salary']

plt.figure(figsize=(10, 6))
ax = regular_wage_by_gender.plot(kind='bar', colormap='viridis', figsize=(10, 6))
ax.set_title('Regular wage/salary Rates by Gender', fontsize=14, fontweight='bold')
ax.set_xlabel('Gender Category', fontsize=12)
ax.set_ylabel('Percentage of Workforce', fontsize=12)
ax.legend(title="Employment Type")
plt.xticks(rotation=0)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Bar chart of the mean casual-labour share per Category.
casual_labour_by_gender = gen['Casual labour']

plt.figure(figsize=(10, 6))
ax = casual_labour_by_gender.plot(kind='bar', colormap='viridis', figsize=(10, 6))
ax.set_title('Casual Labour Rates by Gender', fontsize=14, fontweight='bold')
ax.set_xlabel('Gender Category', fontsize=12)
ax.set_ylabel('Percentage of Workforce', fontsize=12)
ax.legend(title="Employment Type")
plt.xticks(rotation=0)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Stacked bars of the employment-type mix per 'Category' value.
# 'All self employed' is left out of the stack because `gen` also holds its two
# components ('Own account, worker, employer' and 'Helper in household enterprise');
# stacking the aggregate on top of its parts would count self-employment twice.
# NOTE(review): if `cols` contains any other total column (e.g. an overall 'All'),
# it should be excluded here as well — confirm against the definition of `cols`.
gen_components = gen.drop(columns=['All self employed'], errors='ignore')
gen_components.plot(kind='bar',stacked=True,colormap='viridis',figsize=(10,6))
plt.title('Employment Patterns by Gender')
plt.ylabel('Percentage of Workforce',fontsize=12)
plt.legend(title='Employment Type', loc='upper left', bbox_to_anchor=(1,1))
plt.xticks(rotation=0)
plt.grid(axis='y',linestyle='--',alpha=0.7)
plt.show()

## Sectoral Comparisons

* Highest Self-Employed States versus Wage/Salary Workers:
    * Identify states where self-employment percentage is highest relative to regular wage/salary workers.
* Correlations Between Employment Types:
  * Explore if there's a statistical correlation between different categories.

In [None]:
# Ratio of the self-employed share to the regular wage/salary share, per row of emp_1.
# NOTE(review): a row whose 'Regular wage/salary' is 0 would produce inf here and
# distort the size/hue scales below — confirm no such row exists.
emp_1['SelfEmp_Wage_Ratio'] = emp_1['All self employed']/emp_1['Regular wage/salary']

# Ten rows with the highest ratio and ten with the lowest.
# Only the top ten are used below (for the text labels); the bottom ten are not used in this cell.
top_self_emp_states = emp_1.sort_values('SelfEmp_Wage_Ratio',ascending=False).head(10)
bottom_self_emp_states = emp_1.sort_values('SelfEmp_Wage_Ratio').head(10)

plt.figure(figsize=(12,6))
# Marker size and colour both encode the ratio.
sns.scatterplot(data=emp_1,x='All self employed',y='Regular wage/salary',size='SelfEmp_Wage_Ratio',hue='SelfEmp_Wage_Ratio',palette='coolwarm',sizes=(50,300),edgecolor='black')

# Annotate the ten highest-ratio rows with their State/UT name.
for i, row in top_self_emp_states.iterrows():
    plt.text(row['All self employed'], row['Regular wage/salary'], row['State/UT'],fontsize=10,ha='right')

plt.title('Self-Employment vs Regular Wage/Salary by State', fontsize=14, fontweight='bold')
plt.xlabel('Self-Employment (%)', fontsize=12)
plt.ylabel('Regular Wage/Salary (%)', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.5)
plt.legend(title="SelfEmp/Wage Ratio", bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()

In [None]:
# Exploring correlations between employment types
emp_cols = ['All self employed','Regular wage/salary','Casual labour']

# Pearson correlation between the three employment-type columns of emp_1.
corr_matrix = emp_1[emp_cols].corr(method='pearson')

plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm',fmt='.2f',linewidths=0.5)
plt.title('Correlation Matrix of Employment Categories',fontsize=16,fontweight='bold')
plt.show()

# One regression panel per non-self-employment column, each against 'All self employed'.
# The loop variable is deliberately NOT called `col`: that name holds the column
# MultiIndex used elsewhere in this notebook and must not be overwritten by a string.
plt.figure(figsize=(12,5))
for panel_idx, emp_col in enumerate(emp_cols[1:]):
    plt.subplot(1,2,panel_idx+1)
    sns.regplot(x=emp_1['All self employed'],y=emp_1[emp_col], scatter_kws={'alpha':0.6},line_kws={'color':'red'})
    plt.title(f"Self-Employment vs {emp_col}")
    plt.xlabel('Self-Employment(%)')
    plt.ylabel(f'{emp_col}(%)')

plt.tight_layout()
plt.show()

# Pairwise regression plots with KDE curves on the diagonal.
sns.pairplot(emp_1[emp_cols],kind='reg',diag_kind='kde',plot_kws={'scatter_kws':{'alpha':0.7}})
plt.suptitle('Pairwise Relationships Between Employment Types',fontsize=16, fontweight='bold',y=1.02)
plt.show()

### Unemployment Rate Dataset (PLFS)

In [None]:
# Read the unemployment-rate table without treating any row as a header and
# skipping only the first line of the file; column names are assigned in a later cell.
# NOTE(review): the other PLFS tables are read with skiprows=2 — confirm that
# skipping a single line here is intentional and does not leave a header row in the data.
unemp_rate = pd.read_csv('Cleaned Data/PLFS/Unemp_Rate.csv',header=None,skiprows=1)
unemp_rate.head()

In [None]:
col = pd.MultiIndex.from_tuples([
    ('State/UT',''),
    ('Rural','Male'),('Rural','Female'),('Rural','Person'),
    ('Urban','Male'),('Urban','Female'),('Urban','Person'),
    ('Rural+Urban','Male'),('Rural+Urban','Female'),('Rural+Urban','Person')
])

In [None]:
unemp_rate.columns = col
unemp_rate.head()

In [None]:
unemp_rate.info()

In [None]:
unemp_rate

In [None]:
# Overwrite the three Rural cells of row 31 with 0.
# NOTE(review): presumably this row has no rural figures in the source table;
# confirm that 0 (rather than a missing value) is the intended placeholder.
for gender_group in ('Male', 'Female', 'Person'):
    unemp_rate.loc[31, ('Rural', gender_group)] = 0

In [None]:
unemp_rate.info()

In [None]:
# Convert every (sector, gender) column to a numeric dtype;
# entries that cannot be parsed become NaN (errors='coerce').
rate_columns = [
    (sector, gender_group)
    for sector in ['Rural','Urban','Rural+Urban']
    for gender_group in ['Male','Female','Person']
]
for rate_key in rate_columns:
    unemp_rate[rate_key] = pd.to_numeric(unemp_rate[rate_key], errors='coerce')

In [None]:
unemp_rate.info()

In [None]:
# Separate the national aggregate row(s) from the per-state rows.
is_all_india = unemp_rate['State/UT'].str.contains('all India',case=False, na=False)
# .copy() makes both frames independent of `unemp_rate`; `unemp_rate_f` gets a new
# column assigned later in the notebook, which on a plain slice triggers
# SettingWithCopyWarning and may not behave as intended.
unemp_rate_f = unemp_rate[~is_all_india].copy()
all_ = unemp_rate[is_all_india].copy()

### Unemployment Rate Distribution

* Histogram of rural, urban, and overall unemployment rates.

* Boxplots to show the spread of unemployment rates across states.

In [None]:
sns.set_style('whitegrid')

plt.figure(figsize=(12, 5))
# Three translucent histograms of the 'Person' rate drawn on the same axes,
# in the order rural, urban, combined.
for sector, legend_label, bar_colour in [
    ("Rural", "Rural", 'blue'),
    ("Urban", "Urban", 'red'),
    ("Rural+Urban", "Overall", 'green'),
]:
    plt.hist(unemp_rate_f[(sector, "Person")], bins=15, alpha=0.6,
             label=legend_label, color=bar_colour, edgecolor='black')

# Axis labels, title and legend
plt.xlabel("Unemployment Rate (%)")
plt.ylabel("Number of States/UTs")
plt.title("Distribution of Unemployment Rates (Rural, Urban, Overall)")
plt.legend()
plt.show()

In [None]:
# Sector keys reused by both figures in this cell.
person_keys = [("Rural", "Person"), ("Urban", "Person"), ("Rural+Urban", "Person")]

# Figure 1: filled kernel-density curves of the 'Person' rate per sector.
plt.figure(figsize=(10, 5))
for rate_key, curve_colour, legend_label in zip(
    person_keys, ["blue", "red", "green"], ["Rural", "Urban", "Overall"]
):
    sns.kdeplot(unemp_rate_f[rate_key], fill=True, color=curve_colour, label=legend_label, alpha=0.5)

plt.xlabel("Unemployment Rate (%)")
plt.ylabel("Density")
plt.title("Distribution of Unemployment Rates (Rural, Urban, Overall)")
plt.legend()
plt.show()

# Figure 2: box plots of the same three columns, side by side.
plt.figure(figsize=(8, 5))
sns.boxplot(data=unemp_rate_f[person_keys],
            palette=["blue", "red", "green"])

plt.title("Spread of Unemployment Rates Across States/UTs")
plt.ylabel("Unemployment Rate (%)")
plt.xticks(ticks=[0, 1, 2], labels=["Rural", "Urban", "Overall"])
plt.show()

### State-wise Comparison

* Bar charts comparing unemployment rates across states
* Highlight the states with the highest and lowest unemployment 

In [None]:
# Rank states by the combined (Rural+Urban, Person) unemployment rate, highest first.
overall_key = ('Rural+Urban','Person')

# Rows without a numeric overall rate are removed by value rather than by a
# hard-coded positional label (previously `drop(36)` after sorting), so the cell
# keeps working if the number of rows in the table changes.
# NOTE(review): this assumes the row formerly dropped as index 36 was the one with
# a missing rate (NaN sorts last) — confirm against the loaded table.
has_rate = unemp_rate_f[overall_key].notna()
unemp_rate_sorted = (
    unemp_rate_f[has_rate]
    .sort_values(by=overall_key, ascending=False)
    .reset_index(drop=True)
)
top_state = unemp_rate_sorted.iloc[0]
bottom_state = unemp_rate_sorted.iloc[-1]

plt.figure(figsize=(12,6))
barplot = sns.barplot(
    data=unemp_rate_sorted,
    y=('State/UT',''),
    x=overall_key,
    hue = overall_key,
    palette='coolwarm'
)

# With `hue` set, the bars can be spread over several containers; label every
# container instead of only the first so that each bar shows its value.
for container in barplot.containers:
    barplot.bar_label(container, fmt='%0.1f%%',fontsize=9)
# First tick = highest rate (red), last tick = lowest rate (green).
barplot.get_yticklabels()[0].set_color('red')
barplot.get_yticklabels()[-1].set_color('green')

plt.xlabel('Unemployment Rate (%)')
plt.ylabel('State/UT')
plt.title('State-wise Unemployment Rate (Overall)')

plt.show()

### Urban vs Rural Unemployment Trends 

* Compare Rural and Urban Unemployment
  * Grouped bar chart or side by side violin plots for rural vs urban unemployment rates
  * Line chart comparing national-level rural and urban unemployment over time (if you have time-series data).
* Ratio of rural to urban unemployment
  * Heatmap or scatter plot showing the rural-to-urban unemployment ratio across states.

In [None]:
# Grouped bar chart: rural vs urban 'Person' rate for every State/UT.
state_key = ('State/UT','')

# Long format: one row per (state, sector); 'Category' receives the sector name.
unemp_melted = unemp_rate_f.melt(
    id_vars=[state_key],
    value_vars=[('Rural','Person'),('Urban','Person')],
    var_name='Category',
    value_name='Unemployment Rate',
)

plt.figure(figsize=(14,6))
sns.barplot(data=unemp_melted, x=state_key, y='Unemployment Rate',
            hue='Category', palette='Set2')
plt.xticks(rotation=90)
plt.xlabel('State/UT',fontsize=12)
plt.ylabel('Unemployment Rate(%)',fontsize=12)
plt.title('Urban vs Rural Unemployment Rate by State',fontsize=16,fontweight='bold')
plt.legend(title='Category')
plt.show()

In [None]:
# Violin plot of the rate distribution per 'Category', using the long-format
# frame `unemp_melted` built in the previous cell.
plt.figure(figsize=(8,6))
sns.violinplot(data=unemp_melted,
              x='Category',
              y='Unemployment Rate',
              palette='muted',
              hue='Category')
plt.xlabel('Category')
plt.ylabel('Unemployment Rate(%)')
plt.title('Distribution of Rural vs. Urban Unemployment Rates')
plt.show()

In [None]:

# Rural-to-urban ratio of the 'Person' unemployment rate for every row.
rural_key, urban_key = ('Rural', 'Person'), ('Urban', 'Person')
if rural_key in unemp_rate_f.columns and urban_key in unemp_rate_f.columns:
    ratio = unemp_rate_f[rural_key].astype(float) / unemp_rate_f[urban_key].astype(float)
    # A zero urban rate gives +/-inf, which would wreck the colour scale; treat it as missing.
    unemp_rate_f['Ratio'] = ratio.replace([np.inf, -np.inf], np.nan)
else:
    raise KeyError("Column names ('Rural', 'Person') or ('Urban', 'Person') not found.")

# Use the plain label when it resolves, otherwise the full MultiIndex tuple.
state_col = 'State/UT' if 'State/UT' in unemp_rate_f.columns else ('State/UT', '')

# One row per state, a single 'Ratio' column.
heatmap_data = unemp_rate_f.pivot_table(index=state_col, values='Ratio')

# Plot heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_data, cmap='coolwarm', annot=True, linewidths=0.5)

# States run down the y-axis and the ratio column sits on the x-axis
# (the two labels were previously swapped).
plt.xlabel('Rural-to-Urban Unemployment Ratio')
plt.ylabel('State/UT')
plt.title('Heatmap of Rural-to-Urban Unemployment Ratio by State')

plt.show()

In [None]:
unemp_rate_f.info()

In [None]:
# Scatter plot: urban rate on x, rural rate on y, one point (and one legend entry) per State/UT.
plt.figure(figsize=(8,6))
sns.scatterplot(data=unemp_rate_f,
               x=('Urban','Person'),
               y=('Rural','Person'),
               hue=('State/UT',''),
               palette='coolwarm')
plt.xlabel("Urban Unemployment Rate (%)")
plt.ylabel("Rural Unemployment Rate (%)")
plt.title("Scatter Plot: Rural vs. Urban Unemployment Across States")
plt.axline((0, 0), slope=1, color="gray", linestyle="--")  # Line y=x for reference: points above it have rural > urban
plt.legend(title="State/UT", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

## Gender-based Unemployment Trends

* Male vs Female Unemployment
  * Side by side bar chart for male vs female unemployment in rural and urban areas.
  * States where female unemployment is significantly higher than male unemployment.

In [None]:
# Flat copy of the male/female rural and urban unemployment rates per State/UT.
gender_unemp = unemp_rate_f[[('State/UT',''),('Rural','Male'),('Rural','Female'),('Urban','Male'),('Urban','Female')]].copy()
gender_unemp.columns = ['State/UT','Rural Male','Rural Female','Urban Male','Urban Female']

# Long format for the grouped bar chart: one row per (state, category).
gender_unemp_melted = gender_unemp.melt(id_vars=['State/UT'], var_name='Category', value_name='Unemployment Rate')

plt.figure(figsize=(14,6))
sns.barplot(data=gender_unemp_melted, x='State/UT',y='Unemployment Rate',hue='Category')
plt.xticks(rotation=90)
plt.ylabel('Unemployment Rate (%)')
plt.xlabel('State/UT')
plt.title('Male vs Female Unemployment in Rural and Urban Areas')
plt.legend(title='Category')
plt.show()

# Relative gap: (female - male) / male, as a fraction (0.5 means female is 50% higher).
# A male rate of 0 yields NaN or inf here; NaN rows never pass the filter below.
gender_unemp['Rural Diff'] = (gender_unemp['Rural Female'] - gender_unemp['Rural Male'])/gender_unemp['Rural Male']
gender_unemp['Urban Diff'] = (gender_unemp['Urban Female'] - gender_unemp['Urban Male'])/gender_unemp['Urban Male']

# Keep states where the relative gap exceeds the threshold in either sector.
threshold = 0.5
high_female_unemp_states = gender_unemp[(gender_unemp['Rural Diff'] > threshold) | (gender_unemp['Urban Diff'] > threshold)]
print("States where Female Unemployment is significantly higher than Male: ")
print(high_female_unemp_states[['State/UT','Rural Diff','Urban Diff']])

plt.figure(figsize=(10,6))
sns.barplot(data=high_female_unemp_states.melt(id_vars=['State/UT'], value_vars=['Rural Diff','Urban Diff'], var_name='Category',value_name='Difference'),
           x='State/UT',y='Difference',hue='Category')

plt.xticks(rotation=90)
# The plotted values are fractions of the male rate, not percentage points.
plt.ylabel('Relative Difference (Female - Male) / Male')
plt.xlabel('State/UT')
plt.title('States where Female Unemployment is Significantly Higher than Male')
# Draw the reference line from `threshold` so that it always matches the filter above.
plt.axhline(y=threshold, color='r', linestyle='--', label=f"{threshold:.0%} Threshold")
plt.legend()
plt.show()

## Identify Anomalies & Trends 

* Outlier Analysis
  * Use boxplots or scatter plots to detect states with extreme unemployment values 

In [None]:
# Flat working copy for the outlier analysis.
# 'Overall Unemployment' is taken from the table's own (Rural+Urban, Person)
# column. It was previously the plain average of the four sector/gender rates,
# which is not an overall rate because the four groups are not weighted by size.
unemp_data = unemp_rate_f[[('State/UT',''),('Rural','Male'),('Rural','Female'),
                           ('Urban','Male'),('Urban','Female'),
                           ('Rural+Urban','Person')]].copy()
unemp_data.columns = ['State/UT', 'Rural Male', 'Rural Female', 'Urban Male', 'Urban Female',
                      'Overall Unemployment']

# Outlier analysis: box plots of the four sector/gender rates.
plt.figure(figsize=(12, 6))
sns.boxplot(data=unemp_data[['Rural Male', 'Rural Female', 'Urban Male', 'Urban Female']])
plt.title('Boxplot of Unemployment Rates (Male vs Female in Rural & Urban)')
plt.ylabel('Unemployment Rate (%)')
plt.xticks(rotation=45)
plt.show()

In [None]:
# One point per State/UT; marker size and colour both encode 'Overall Unemployment'.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=unemp_data, x='State/UT', y='Overall Unemployment', hue='Overall Unemployment', palette='coolwarm', size='Overall Unemployment', sizes=(50, 200))
plt.xticks(rotation=90)
plt.ylabel('Unemployment Rate (%)')
plt.xlabel('State/UT')
plt.title('State-wise Unemployment Rate with Outliers')
plt.show()

In [None]:
# Tukey fences: a value is an outlier when it lies more than 1.5 * IQR
# above the third quartile or below the first quartile.
overall = unemp_data['Overall Unemployment']
Q1 = overall.quantile(0.25)
Q3 = overall.quantile(0.75)
IQR = Q3 - Q1
upper_fence = Q3 + 1.5 * IQR
lower_fence = Q1 - 1.5 * IQR
outliers = unemp_data[(overall > upper_fence) | (overall < lower_fence)]
print("States with Extreme Unemployment Rates (Outliers):")
print(outliers[['State/UT', 'Overall Unemployment']])

In [None]:
unemp_rate_f

### WPR Dataset (PLFS)

In [None]:
# Worker Population Ratio table: no header row is parsed and the first two lines are skipped.
wpr = pd.read_csv('Cleaned Data/PLFS/WPR.csv',header=None,skiprows=2)
is_all_india = wpr[0].str.contains('all India',case=False, na=False)
# .copy() detaches both frames from `wpr`; `wpr_f` is renamed and written to in later
# cells, which on a plain slice triggers SettingWithCopyWarning.
wpr_f = wpr[~is_all_india].copy()
wpr_all = wpr[is_all_india].copy()

In [None]:
wpr_f.columns = col
wpr_f


In [None]:
wpr_f.info()

In [None]:
# Fill the three Rural cells of row 30 with 0.
# NOTE(review): presumably this row has no rural figures in the source table;
# confirm that 0 is the intended placeholder rather than a missing value.
for gender_group in ('Male', 'Female', 'Person'):
    wpr_f.loc[30, ('Rural', gender_group)] = 0

In [None]:
wpr_f.info()

In [None]:
wpr_f.describe()

## Gender Based Analysis

### Male vs. Female WPR across states

In [None]:
# Horizontal bars of male and female WPR per State/UT, drawn on the same axes
# (the female bars are drawn second, over the male bars), ordered by the combined 'Person' WPR.
plt.figure(figsize=(12,6))
wpr_f_sorted = wpr_f.sort_values(by=[('Rural+Urban','Person')],ascending=False)
sns.barplot(y=wpr_f_sorted['State/UT'],x=wpr_f_sorted[('Rural+Urban','Male')],color='blue',label='Male',alpha=0.7)
sns.barplot(y=wpr_f_sorted['State/UT'],x=wpr_f_sorted[('Rural+Urban','Female')],color='red',label='Female',alpha=0.7)
plt.xlabel('Worker Population Ratio (%)')
plt.ylabel('State/UT')
plt.title("Worker Population Ratio by Gender")
plt.legend()
plt.show()

## Urban vs Rural Divide

In [None]:
# Rural minus urban WPR for each gender group, stored under a new
# top-level column 'Rural_Urban_Diff'.
for gender_group in ('Male', 'Female', 'Person'):
    rural_values = wpr_f[('Rural', gender_group)]
    urban_values = wpr_f[('Urban', gender_group)]
    wpr_f[('Rural_Urban_Diff', gender_group)] = rural_values - urban_values

wpr_f

In [None]:
# Rural-minus-urban WPR ('Person') per State/UT; bars to the right of the
# dashed zero line mean the rural WPR is higher.
plt.figure(figsize=(12,6))
# seaborn 0.13 deprecates `palette` without `hue`; assigning the categorical
# variable to `hue` and switching the legend off keeps the per-bar colouring.
sns.barplot(y=wpr_f['State/UT'], x=wpr_f[('Rural_Urban_Diff','Person')],
            hue=wpr_f['State/UT'], palette='RdBu', legend=False)
plt.axvline(x=0, color='black',linestyle='--')

plt.title('Difference in WPR: Rural vs. Urban (Person)')
plt.xlabel('Difference (Rural-Urban)')
plt.ylabel('States/UT')
plt.show()

## Top & Bottom 5 states 

In [None]:
# Full MultiIndex keys of the two columns shown in the rankings.
state_col = ('State/UT', '')
wpr_col = ('Rural+Urban', 'Person')
_ranking_cols = [state_col, wpr_col]

# Five highest and five lowest rows by combined (Rural+Urban, Person) WPR.
top_states = wpr_f.nlargest(5, wpr_col)[_ranking_cols]
bottom_states = wpr_f.nsmallest(5, wpr_col)[_ranking_cols]

print('Top 5 states by WPR: \n', top_states)
print('Bottom 5 states by WPR: \n', bottom_states)



In [None]:
wpr_f.columns.tolist()

## Correlation Analysis 

In [None]:
# Pairwise correlation between the numeric WPR columns.
# The derived 'Rural_Urban_Diff' columns are removed from BOTH axes so that the
# matrix stays square; dropping them from the columns only left them as rows.
corr_matrix = (
    wpr_f.corr(numeric_only = True)
    .drop('Rural_Urban_Diff',axis=1)
    .drop('Rural_Urban_Diff',axis=0)
)

plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix,annot=True,cmap='coolwarm',fmt='.2f',linewidths=0.5)
# This matrix is about WPR by sector and gender, not employment types.
plt.title('Correlation between WPR Categories (Sector x Gender)')
plt.show()

## Scatter Plot for Outliers
### Checking for states where Male WPR is high but Female WPR is very low

In [None]:
# Male WPR (x) against female WPR (y), one point per State/UT.
# Points below the dashed y = x line have a female WPR lower than the male WPR.
plt.figure(figsize=(8,6))
sns.scatterplot(x=wpr_f['Rural+Urban','Male'], y=wpr_f['Rural+Urban','Female'], hue=wpr_f['State/UT'], palette='coolwarm',s=100)
plt.axline((0,0), slope=1, color="black", linestyle="--")  # Reference line for equal WPR
plt.xlabel("Male WPR")
plt.ylabel("Female WPR")
plt.title("Gender-Based WPR Comparison")
plt.legend(title="State/UT", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

### All points fall below the y = x reference line: in every State/UT the female WPR is lower than the male WPR, which indicates a severe gender gap in workforce participation.

In [None]:
lfpr_15_29 = pd.read_csv('Cleaned Data/PLFS/LFPR_15_29.csv',header=None,skiprows=2)
lfpr_15_29.head()

In [None]:
# Per-state rows only (drop the 'all India' aggregate). .copy() detaches the
# result from `lfpr_15_29`, because later cells rename its columns and write into it.
lfpr_15_29_f = lfpr_15_29[~lfpr_15_29[0].str.contains('all India',case=False, na=False)].copy()


In [None]:
lfpr_15_29_f

In [None]:
col = pd.MultiIndex.from_tuples([
    ('State/UT',''),
    ('Rural','Male'),('Rural','Female'),('Rural','Person'),
    ('Urban','Male'),('Urban','Female'),('Urban','Person'),
    ('Rural+Urban','Male'),('Rural+Urban','Female'),('Rural+Urban','Person')
])

In [None]:
lfpr_15_29_f.columns = col

In [None]:
lfpr_15_29_f.head()

In [None]:
# Set the three Rural cells of row 30 to 0.
# NOTE(review): presumably a State/UT without rural figures — confirm that 0 is the intended placeholder.
for gender_group in ('Male', 'Female', 'Person'):
    lfpr_15_29_f.loc[30, ('Rural', gender_group)] = 0

In [None]:
# LFPR, ages 15-59: load, drop the 'all India' row, apply the shared column
# MultiIndex `col`, and fill the Rural cells of row 30 with 0.
lfpr_15_59 = pd.read_csv('Cleaned Data/PLFS/LFPR_15_59.csv',header=None,skiprows=2)

# .copy() so that the column rename and the .loc writes below act on an
# independent frame instead of a slice of `lfpr_15_59` (avoids SettingWithCopyWarning).
lfpr_15_59_f = lfpr_15_59[~lfpr_15_59[0].str.contains('all India',case=False,na=False)].copy()

lfpr_15_59_f.columns = col

for gender_group in ('Male', 'Female', 'Person'):
    lfpr_15_59_f.loc[30,('Rural',gender_group)] = 0

In [None]:
# LFPR, ages 15 and above: load, drop the 'all India' row, apply the shared
# column MultiIndex `col`, and fill the Rural cells of row 30 with 0.
lfpr_15_above = pd.read_csv('Cleaned Data/PLFS/LFPR_15_above.csv',header=None,skiprows=2)

# .copy() so that the column rename and the .loc writes below act on an
# independent frame instead of a slice of `lfpr_15_above` (avoids SettingWithCopyWarning).
lfpr_15_above_f = lfpr_15_above[~lfpr_15_above[0].str.contains('all India',case=False,na=False)].copy()

lfpr_15_above_f.columns = col

for gender_group in ('Male', 'Female', 'Person'):
    lfpr_15_above_f.loc[30,('Rural',gender_group)] = 0

In [None]:
# LFPR, all ages: load, drop the 'all India' row, apply the shared column
# MultiIndex `col`, and fill the Rural cells of row 30 with 0.
lfpr_all_ages = pd.read_csv('Cleaned Data/PLFS/LFPR_all_ages.csv',header=None,skiprows=2)

# .copy() so that the column rename and the .loc writes below act on an
# independent frame instead of a slice of `lfpr_all_ages` (avoids SettingWithCopyWarning).
lfpr_all_ages_f = lfpr_all_ages[~lfpr_all_ages[0].str.contains('all India',case=False,na=False)].copy()

lfpr_all_ages_f.columns = col

for gender_group in ('Male', 'Female', 'Person'):
    lfpr_all_ages_f.loc[30,('Rural',gender_group)] = 0

In [None]:
# Stack the four age-group LFPR tables vertically, in the order
# 15-29, 15-59, 15 and above, all ages. Original row labels are kept at this point.
frames = [lfpr_15_29_f,lfpr_15_59_f,lfpr_15_above_f,lfpr_all_ages_f]
lfpr_res = pd.concat(frames)

In [None]:
lfpr_res.reset_index(drop=True,inplace=True)

In [None]:
lfpr_res

In [None]:
lfpr_res.iloc[0:36]

In [None]:
# One label per source table, in the same order as `frames`.
categories = ['15-29 Age','15-59 Age','15 and above Age','All Ages']

# Repeat each label once per row of its table. Using the actual lengths of the
# concatenated frames replaces the hard-coded "36 rows per table" assumption
# (and the leftover `* 9`), so the labels stay aligned if a table has a different row count.
category_col = [
    age_label
    for age_label, frame in zip(categories, frames)
    for _ in range(len(frame))
]
assert len(category_col) == len(lfpr_res), "age labels do not line up with lfpr_res rows"
lfpr_res['Age Category'] = category_col
lfpr_res

# HCES 

## 1. Absolute and percentage breakup of MPCE by item-groups in 2023-24 All India

In [None]:
# MPCE breakup table: no header row is parsed, the first two lines are skipped,
# and the row index is reset to 0..n-1.
ab_perc_mpce = (
    pd.read_csv('Cleaned Data/HCES/Abs_Perc_MPCE.csv',header=None,skiprows=2)
    .reset_index(drop=True)
)

In [None]:
ab_perc_mpce = ab_perc_mpce.drop(0,axis=1)

In [None]:
cols = pd.MultiIndex.from_tuples([
    ('Item group',''),
    ('MPCE(Rs.)','Rural'),('MPCE(Rs.)','Urban'),
    ('% share in total MPCE','Rural'),('% share in total MPCE','Urban')
])
ab_perc_mpce.columns = cols

In [None]:
ab_perc_mpce

In [None]:
# Split the table into the item-group rows (F) and the 'all items' total row(s) (A).
is_all_items = ab_perc_mpce['Item group'].str.contains('all items',case=False,na=False)
ab_perc_mpceF = ab_perc_mpce[~is_all_items]
ab_perc_mpceA = ab_perc_mpce[is_all_items]

## Analysis
### 1. Food vs. Non-Food Expenditure (Essentials vs. Discretionary Spending)

In [None]:
# Food and non-food MPCE values for rural and urban India, typed in by hand.
# NOTE(review): these literals presumably mirror the 'food total' / 'non-food total'
# rows of `ab_perc_mpce`; they will go stale if the CSV is updated — consider reading them from the frame.
data = {
    'Category':['Food','Non-Food'],
    "Rural": [1939,2183],
    'Urban': [2776,4220]
}
df = pd.DataFrame(data)

fig,axes = plt.subplots(1,2,figsize=(12,6))

#Rural Pie Chart
axes[0].pie(df['Rural'],labels=df['Category'],autopct='%0.1f%%',colors=["#66b3ff", "#ff9999"])
axes[0].set_title('Rural: Food vs. Non-Food Expenditure')

#Urban Pie Chart
axes[1].pie(df['Urban'],labels=df['Category'],autopct='%0.1f%%',colors=["#66b3ff", "#ff9999"])
axes[1].set_title('Urban: Food vs. Non-Food Expenditure')
# plt.legend() acts on the current axes only, i.e. the urban pie.
plt.legend()
plt.show()

### Food spending percentage is higher in rural areas. It suggests they spend more on necessities.

### Urban areas spend a larger share on non-food items, which shows a shift towards discretionary and service-based expenses.

### 2. Major Spending Categories (Top 10 in Rural & Urban)

In [None]:
df = ab_perc_mpceF[~ab_perc_mpceF['Item group'].isin(['food total','non-food total'])]
df

In [None]:
# Ten item groups with the highest rural / urban MPCE.
# The variable names say "top5" but ten rows are selected; the plots below are titled "Top 10".
top5_rural = df.nlargest(10, ('MPCE(Rs.)',"Rural"))
top5_urban = df.nlargest(10, ('MPCE(Rs.)','Urban'))

In [None]:
# Side-by-side horizontal bar charts of the ten largest item groups by MPCE.
fig, axes = plt.subplots(1,2,figsize=(14,6))

# Rural panel (y-axis inverted so the largest item is at the top)
axes[0].barh(top5_rural['Item group'], top5_rural[('MPCE(Rs.)','Rural')],color="#66b3ff")
axes[0].set_title('Top 10 Spending Categories (Rural)')
axes[0].invert_yaxis()
axes[0].set_xlabel('MPCE (Rs.)')

# Urban panel
axes[1].barh(top5_urban['Item group'], top5_urban[('MPCE(Rs.)','Urban')], color="#ff9999")
axes[1].set_title("Top 10 Spending Categories (Urban)")  # fixed typo "Cateogries"
axes[1].invert_yaxis()
axes[1].set_xlabel('MPCE (Rs.)')

plt.tight_layout()
plt.show()

### The top spending category is the same for both Rural and Urban areas

### 3. Rent, Education and Medical Expenditure Comparison

In [None]:
categories = ['rent','education','medical']
df_selected = ab_perc_mpceF[ab_perc_mpceF['Item group'].isin(categories)]

# isin() keeps the rows in the table's own order, which need not match the
# order of `categories`. Tick labels are therefore taken from the selected rows
# themselves, so every bar is guaranteed to carry the name of its own item group
# (and a category missing from the table no longer causes a length mismatch).
bar_labels = df_selected['Item group'].tolist()

x = np.arange(len(bar_labels))
width = 0.3

fig,ax = plt.subplots(figsize=(8,6))

# Rural bars to the left of each tick, urban bars to the right.
bars1 = ax.bar(x-width/2, df_selected[('MPCE(Rs.)','Rural')],width,label='Rural',color="#66b3ff",edgecolor='black')
bars2 = ax.bar(x+width/2,df_selected[('MPCE(Rs.)','Urban')],width,label='Urban',color='#ff9999',edgecolor='black')

ax.set_xticks(x)
ax.set_xticklabels(bar_labels,fontsize=12)
ax.set_ylabel('MPCE (Rs.)')
ax.set_title('Comparison of Rent, Education, and Medical Expenditure')
ax.legend()

# Display the value on top of every bar
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.0f}', xy=(bar.get_x() + bar.get_width()/2, height),
                    xytext=(0, 3), textcoords="offset points", ha='center', fontsize=10)

# Show plot
plt.show()

### 4. Essential vs Luxury Spending Breakdown

In [None]:
# Define essential & luxury categories
essential_items = ["food total", "fuel and light", "medical", "education", "clothing, bedding & footwear", "rent"]
luxury_items = ["misc. goods, entertainment", "consumer services excluding conveyance", "pan, tobacco & intoxicants", 
                "beverages, refreshments, processed food#", "conveyance", "taxes and cesses", "durable goods"]

# Sum from `ab_perc_mpceF`, which still contains the 'food total' row.
# The frame `df` used before had 'food total' filtered out, so the essential
# total silently excluded all food spending.
# NOTE(review): if 'beverages, refreshments, processed food#' is itself part of
# 'food total', it is now counted in both groups — confirm and adjust the lists if so.
item_names = ab_perc_mpceF["Item group"]
essential_rows = ab_perc_mpceF[item_names.isin(essential_items)]
luxury_rows = ab_perc_mpceF[item_names.isin(luxury_items)]

# Compute total spending in each category
essential_rural = essential_rows[('MPCE(Rs.)',"Rural")].sum()
luxury_rural = luxury_rows[('MPCE(Rs.)',"Rural")].sum()
essential_urban = essential_rows[('MPCE(Rs.)',"Urban")].sum()
luxury_urban = luxury_rows[('MPCE(Rs.)',"Urban")].sum()

# Create DataFrame for visualization
data = {
    "Category": ["Rural", "Urban"],
    "Essential": [essential_rural, essential_urban],
    "Luxury": [luxury_rural, luxury_urban]
}
df_vis = pd.DataFrame(data)

# Stacked bar chart: essential spending at the bottom, luxury spending on top
fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(df_vis["Category"], df_vis["Essential"], label="Essential Spending", color="#66b3ff")
ax.bar(df_vis["Category"], df_vis["Luxury"], bottom=df_vis["Essential"], label="Luxury Spending", color="#ff9999")

# Labels & Title
ax.set_ylabel("MPCE (Rs.)")
ax.set_title("Essential vs. Luxury Spending Breakdown")
ax.legend()

# Write each segment's value at the vertical centre of that segment
for i, row in df_vis.iterrows():
    ax.text(i, row["Essential"] / 2, f"{row['Essential']:.0f}", ha="center", va="center", color="white", fontsize=10)
    ax.text(i, row["Essential"] + row["Luxury"] / 2, f"{row['Luxury']:.0f}", ha="center", va="center", color="white", fontsize=10)

# Show plot
plt.show()

## Average MPCE and share of food and non food 

In [None]:
avg_mpce = pd.read_csv('Cleaned Data/HCES/Avg_MPCE.csv',header=None,skiprows=2)

In [None]:
avg_mpce.drop(0,axis=1,inplace=True)

In [None]:
c = pd.MultiIndex.from_tuples([
    ('Item group',''),
    ('Rural India','Average MPCE(Rs.)'),('Rural India','Share in MPCE(%)'),
    ('Urban India','Average MPCE(Rs.)'),('Urban India','Share in MPCE(%)')
])
avg_mpce.columns = c

In [None]:
avg_mpce

In [None]:
# Food / non-food shares of MPCE for rural and urban India, typed in by hand.
# NOTE(review): presumably copied from `avg_mpce`; the literals will go stale if the CSV changes.
categories = ["Rural India", "Urban India"]
food_expense = [47.04, 39.68]  # Share in MPCE (%)
non_food_expense = [52.96, 60.32]  # Share in MPCE (%)

# Stacked bars: food at the bottom, non-food stacked on top of it
fig, ax = plt.subplots(figsize=(6, 5))
bars1 = ax.bar(categories, food_expense, label="Food", color="#66b3ff")
bars2 = ax.bar(categories, non_food_expense, bottom=food_expense, label="Non-Food", color="#ff9999")

# Labels
ax.set_ylabel("Share in MPCE (%)")
ax.set_title("Food vs. Non-Food Expenditure Share")
ax.legend()

# Write each segment's percentage at the vertical centre of that segment
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_y() + height/2, f"{height:.1f}%", 
                ha='center', va='center', color="white", fontsize=10)

# Show plot
plt.show()

## Average MPCE for each state

In [None]:
avg_MPCE_state = pd.read_csv('Cleaned Data/HCES/Avg_MPCE_State.csv',header=None,skiprows=2)

In [None]:
avg_MPCE_state

In [None]:
# Per-state rows only (drop the 'All-India' aggregate, matched on column 1).
# .copy() detaches the result from `avg_MPCE_state`, because later cells drop a
# column from it, rename its columns and add new ones.
avg_MPCE_stateF = avg_MPCE_state[~avg_MPCE_state[1].str.contains('All-India',case=False,na=False)].copy()

In [None]:
# Remove column 0 by rebinding to a new frame instead of mutating in place:
# an in-place drop on a filtered slice raises SettingWithCopyWarning, and
# errors='ignore' lets the cell be re-run after the column is already gone.
avg_MPCE_stateF = avg_MPCE_stateF.drop(columns=0, errors='ignore')

In [None]:
col = pd.MultiIndex.from_tuples([
    ('State/UT',''),
    ('Average MPCE(Rs.)','Rural'),('Average MPCE(Rs.)','Urban')
])
avg_MPCE_stateF.columns = col

In [None]:
avg_MPCE_stateF

### 1. Top 10 States with highest MPCE (Rural & Urban)


In [None]:
# Ten states with the highest rural MPCE and ten with the highest urban MPCE.
df_rural_top10 = avg_MPCE_stateF.nlargest(10,('Average MPCE(Rs.)','Rural'))
df_urban_top10 = avg_MPCE_stateF.nlargest(10, ('Average MPCE(Rs.)','Urban'))

# Two horizontal bar charts side by side: rural on the left, urban on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

df_rural_top10.plot(kind='barh', x='State/UT', y=('Average MPCE(Rs.)','Rural'), ax=axes[0], color='skyblue', legend=False)
axes[0].set_xlabel("Average MPCE (Rs.)")
axes[0].set_title("Top 10 States with Highest Rural MPCE")

df_urban_top10.plot(kind='barh', x='State/UT', y=('Average MPCE(Rs.)','Urban'), ax=axes[1], color='orange', legend=False)
axes[1].set_xlabel("Average MPCE (Rs.)")
axes[1].set_title("Top 10 States with Highest Urban MPCE")

plt.tight_layout()
plt.show()

In [None]:
# Urban minus rural average MPCE per state, added as a new column.
avg_MPCE_stateF['MPCE Difference'] = avg_MPCE_stateF[('Average MPCE(Rs.)','Urban')] - avg_MPCE_stateF[('Average MPCE(Rs.)','Rural')]

# Horizontal bars ordered by the size of the gap, largest first.
avg_sorted = avg_MPCE_stateF.sort_values(by='MPCE Difference',ascending=False)
plt.figure(figsize=(12, 6))
plt.barh(avg_sorted["State/UT"], avg_sorted["MPCE Difference"], color="purple")
plt.xlabel("Urban - Rural MPCE Difference (Rs.)")
plt.ylabel("State/UT")
plt.title("Biggest Gaps in Rural vs. Urban MPCE")
plt.gca().invert_yaxis()  # To show the largest difference at the top

plt.show()

In [None]:
import json

# Simple (unweighted) mean of the rural and urban average MPCE per state.
# NOTE(review): despite the name "Total_MPCE" this is not population-weighted — confirm that is acceptable for the map.
avg_MPCE_stateF["Total_MPCE"] = (avg_MPCE_stateF[('Average MPCE(Rs.)',"Rural")] + avg_MPCE_stateF[('Average MPCE(Rs.)',"Urban")]) / 2

# Load the GeoJSON file for India states. The encoding is given explicitly
# because GeoJSON is UTF-8 by specification, while open() otherwise falls back
# to the platform default (e.g. cp1252 on Windows).
with open("india_state_geo.json", "r", encoding="utf-8") as f:
    india_states = json.load(f)

# Title-case the state names so they can be matched against the GeoJSON names.
# NOTE(review): title-casing alone may not reconcile every spelling difference — verify the join.
avg_MPCE_stateF["State/UT"] = avg_MPCE_stateF["State/UT"].str.title()

In [None]:
avg_MPCE_stateF.head(2)

In [None]:
# Work on a copy with flat string column names.
# Tuple labels are joined with '_', so a label whose second level is empty keeps a
# trailing underscore (e.g. 'State/UT_', 'Total_MPCE_'); the map cell below relies on those names.
avg_MPCE_stateF_copy = avg_MPCE_stateF.copy()
avg_MPCE_stateF_copy.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in avg_MPCE_stateF.columns]
avg_MPCE_stateF_copy.head(2)

In [None]:
import plotly.express as px
# Choropleth of 'Total_MPCE_' per state: rows are joined to GeoJSON features by
# comparing the 'State/UT_' column with each feature's properties.NAME_1.
fig = px.choropleth(
    avg_MPCE_stateF_copy,
    geojson = india_states,
    locations = 'State/UT_',
    featureidkey = 'properties.NAME_1',
    color = 'Total_MPCE_',
    color_continuous_scale = px.colors.sequential.Viridis,
    range_color =[avg_MPCE_stateF_copy['Total_MPCE_'].min(), avg_MPCE_stateF_copy['Total_MPCE_'].max()],
    title='State-wise MPCE in India'
)

# Zoom to the plotted locations and hide the base map.
fig.update_geos(fitbounds='locations',visible=False)
fig.update_layout(margin={'r':0, 't':30, 'l':0, 'b':0})
fig.update_layout(transition={'duration':0})
fig.show()

### Trend in Cereal and Food (HCES)

In [None]:
trend_food = pd.read_csv('Cleaned Data/HCES/Trend_cereal_food.csv',header=None,skiprows=2)
trend_food

In [None]:
trend_food = trend_food.drop(0,axis=1)
col = pd.MultiIndex.from_tuples([
    ('Period',''),
    ('Rural','%share of cereals in avg.'),('Rural','%share of food in avg.'),
    ('Urban','%share of cereals in avg.'),('Urban','%share of food in avg.')
])
trend_food.columns = col
trend_food

### Insights:

* `Declining trend in cereal share:`  The share of cereals in MPCE has consistently decreased from 2011-12 to 2023-24 in both rural (10.75% → 4.99%) and urban (6.66% → 3.76%) areas.
* `Declining share of food overall:` The total food share in MPCE has also reduced over time, indicating a possible shift in expenditure patterns towards non-food items.
* `Rural vs Urban Differences:`
  * Rural areas have always had a higher share of cereals in MPCE compared to urban areas.
  * The drop in cereal share is sharper in rural areas, possibly due to dietary diversification.
  * Urban areas show a more gradual decline in both cereals and food share.

In [None]:
# Flat, single-level column names on a copy of `trend_food`, for easier plotting.
flat_names = ['Period','Rural_Cereal','Rural_Food','Urban_Cereal','Urban_Food']
trend_food_copy = trend_food.copy()
trend_food_copy.columns = flat_names

### Trend over time

In [None]:
# Four lines on one chart: cereal and food share of MPCE, rural and urban, per period.
plt.figure(figsize=(10,5))
sns.lineplot(x='Period',y='Rural_Cereal',data=trend_food_copy,marker='o',label='Cereal (Rural)')
sns.lineplot(x='Period',y='Urban_Cereal',data=trend_food_copy,marker='o',label='Cereal (Urban)')  # closing parenthesis was missing
sns.lineplot(x='Period',y='Rural_Food',data=trend_food_copy,marker='o',label='Food (Rural)')
sns.lineplot(x='Period',y='Urban_Food',data=trend_food_copy,marker='o',label='Food (Urban)')
plt.xlabel('Year')
plt.ylabel('Percentage Share')
plt.title('Trends in Share of Cereal and Food Items in MPCE')
plt.legend()
plt.grid(True)
plt.show()

### Rural vs. Urban Comparison (Bar chart)

In [None]:
# Two panels of overlaid bars per period: rural (blue) behind, urban (red, translucent) in front.
fig, ax = plt.subplots(1,2,figsize=(12,5))

# Left panel: cereal share
sns.barplot(x='Period',y='Rural_Cereal',data=trend_food_copy, ax=ax[0],color='blue',label='Rural')
sns.barplot(x='Period',y='Urban_Cereal',data=trend_food_copy,ax=ax[0],color='red',alpha=0.7,label='Urban')
ax[0].set_title('Cereal Share: Rural vs. Urban')
ax[0].set_ylabel('% Share of Cereal')
ax[0].legend()  # was `ax[0].legend` without parentheses, which never called the method

# Right panel: food share
sns.barplot(x='Period',y='Rural_Food',data=trend_food_copy,ax=ax[1],color='blue',label='Rural')
sns.barplot(x='Period',y='Urban_Food',data=trend_food_copy,ax=ax[1],color='red',alpha=0.7,label='Urban')
ax[1].set_title('Food Share: Rural vs Urban')
ax[1].set_ylabel('% Share of Food')
ax[1].legend()

plt.tight_layout()
plt.show()



### Percentage Decline Calculation 

In [None]:
def percentage_decline(initial, final):
    """Return the drop from `initial` to `final` as a percentage of `initial`.

    The result is rounded to two decimal places. A negative value means
    `final` is larger than `initial`. Raises ZeroDivisionError when
    `initial` is 0.
    """
    drop = initial - final
    relative_drop = drop / initial
    return round(relative_drop * 100, 2)

# Hard-coded (initial share, final share) pairs, in % of MPCE.
# NOTE(review): presumably the first and last periods of the trend table — confirm against the data.
share_endpoints = {
    'cereal_rural': (10.75, 4.99),
    'cereal_urban': (6.66, 3.76),
    'food_rural': (52.90, 47.04),
    'food_urban': (42.62, 39.68),
}

cereal_rural_decline = percentage_decline(*share_endpoints['cereal_rural'])
cereal_urban_decline = percentage_decline(*share_endpoints['cereal_urban'])
food_rural_decline = percentage_decline(*share_endpoints['food_rural'])
food_urban_decline = percentage_decline(*share_endpoints['food_urban'])

# Report each decline on its own line.
print(f"Percentage decline in cereal consumption (Rural): {cereal_rural_decline}%")
print(f"Percentage decline in cereal consumption (Urban): {cereal_urban_decline}%")
print(f"Percentage decline in food consumption (Rural): {food_rural_decline}%")
print(f"Percentage decline in food consumption (Urban): {food_urban_decline}%")

## Trend in level of Consumption since 2011-12 All-India

In [None]:
# Read the consumption-trend table without a header (the first two file rows are
# skipped) and discard the first column; column labels are assigned by hand below.
trend_consump = (
    pd.read_csv('Cleaned Data/HCES/Trend_Consump.csv', header=None, skiprows=2)
    .drop(columns=0)
)

# Two-level header: a 'Sector' label column followed by one MPCE column per period.
mpce_group = 'Average MPCE(Rs.) over different period'
col = pd.MultiIndex.from_tuples(
    [('Sector', '')]
    + [(mpce_group, period) for period in ('2011-12', '2022-23', '2023-24')]
)
trend_consump.columns = col
trend_consump

### Insights:

1. Steady Increase in MPCE Across both sectors
    * The Rural MPCE has grown from ₹1430 in 2011-12 to ₹4122 in 2023-24, an increase of about 2.9×.
    * The Urban MPCE has grown from ₹2630 in 2011-12 to ₹6996 in 2023-24, an increase of about 2.7×.
    * This indicates a steady rise in per capita spending, which may be attributed to factors like inflation, increased income, and improved living standards.
2. Urban-Rural Gap in Consumption
   * The absolute difference in MPCE between Urban and Rural areas has increased over time, but the relative difference (as % of Rural MPCE) has decreased:
     * 2011-12: Urban MPCE was 83.9% higher than Rural.
     * 2022-23: Gap reduced to 71.2%.
     * 2023-24: Further declined to 69.7%.
   * This suggests that rural consumption levels are catching up with urban areas, indicating economic improvement in rural regions.
3. Potential Factors behind the trends
    * Higher Economic Growth & Inflation: The increase in MPCE suggests that household expenditures have risen due to both economic growth and inflation.
    * Government Schemes & Rural Development: The narrowing Urban-Rural gap suggests that rural welfare programs and better income opportunities may have helped bridge the divide.
    * Lifestyle Changes: Increased spending could reflect a shift in consumption patterns, with more spending on non-essential goods, technology, and services.

In [None]:
# Collapse the two-level header into single strings joined by '_'
# (e.g. ('Sector', '') becomes 'Sector_').
# The tuple check keeps this cell safe to re-run: once the labels are plain
# strings, '_'.join() would otherwise split them into characters
# ('Sector_' -> 'S_e_c_t_o_r__') and break the later id_vars=['Sector_'] lookups.
trend_consump.columns = [
    '_'.join(label).strip() if isinstance(label, tuple) else label
    for label in trend_consump.columns
]

In [None]:
# Reshape to long format: one row per (sector, period) with the value in 'MPCE'.
trend_consump_melt = trend_consump.melt(
    id_vars=['Sector_'],
    var_name='Year',
    value_name='MPCE',
)

# Keep only the 'YYYY-YY' period token from each flattened column label, stored as text.
trend_consump_melt['Year'] = (
    trend_consump_melt['Year']
    .str.extract(r'(\d{4}-\d{2})', expand=False)
    .astype(str)
)

In [None]:
# Display the long-format (melted) consumption-trend table.
trend_consump_melt

In [None]:
# The table also carries a 'Difference as % of Rural MPCE' row (plotted on its own
# in the next cell). It is a percentage, not a rupee amount, so it is left out
# here to keep the y-axis in a single unit.
mpce_levels = trend_consump_melt[trend_consump_melt['Sector_'] != 'Difference as % of Rural MPCE']

plt.figure(figsize=(10,5))
sns.lineplot(x='Year',y='MPCE',hue='Sector_',data=mpce_levels,marker='o')
plt.xlabel('Year')
plt.ylabel('Average MPCE (Rs.)')
plt.title('Trend in Level of Consumption (MPCE) Since 2011-12')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Isolate the row holding the urban-rural gap expressed as a percentage of rural MPCE.
df_diff = trend_consump_melt.loc[
    trend_consump_melt['Sector_'] == 'Difference as % of Rural MPCE'
]

# Plot the gap on its own axis, one diamond marker per period.
plt.figure(figsize=(10,5))
sns.lineplot(
    data=df_diff,
    x='Year',
    y='MPCE',
    marker='d',
    color='red',
    label='Urban-Rural MPCE Difference (%)',
)
plt.title('Urban-Rural MPCE Difference Over Time')
plt.xlabel('Year')
plt.ylabel('Difference (%)')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Write `lfpr_res` to a CSV in the working directory (header row kept, index not written).
# NOTE(review): the file name says 'LFPS' while the variable and the markdown below say
# 'LFPR' — possibly a typo; confirm what downstream readers expect before renaming.
lfpr_res.to_csv('LFPS_res.csv',index=False,header=True)

# Combined Datasets (PLFS+HCES)

## Merging Datasets:

* Merging PLFS LFPR_res.csv with Avg MPCE_State: Analyze how LFPR correlates with household expenditure at the state level.


### 1. lfpr_res and avg_MPCE_stateF

In [None]:
# Inspect `lfpr_res` (built earlier in the notebook) before merging it with the MPCE data.
lfpr_res

In [None]:
# Preview the first five rows of `avg_MPCE_stateF`, the other side of the merge.
avg_MPCE_stateF.head(5)

In [None]:
# Keep only the 'All Ages' rows and renumber them from 0.
all_ages_mask = lfpr_res['Age Category'] == 'All Ages'
final_lfpr = lfpr_res[all_ages_mask].reset_index(drop=True)

In [None]:
# 'Age Category' is constant after the 'All Ages' filter, so drop it.
# The label is a tuple because the columns are still a two-level MultiIndex.
final_lfpr = final_lfpr.drop(columns=('Age Category', ''))

In [None]:
# Display the 'All Ages' rows after the 'Age Category' column has been dropped.
final_lfpr

In [None]:
# Inner join on 'State/UT': only states/UTs present in both frames are kept.
merged_df1 = final_lfpr.merge(avg_MPCE_stateF, how='inner', on='State/UT')

In [None]:
# Count missing values per column in the merged frame.
merged_df1.isnull().sum()

In [None]:
# Pairwise correlations between the numeric columns of the merged frame.
merged_df1.corr(numeric_only=True)

### To Analyze the relationship between employment trends and consumption patterns across different states in India


  

In [None]:
# Column dtypes and non-null counts of the merged frame.
merged_df1.info()

In [None]:
# Flatten tuple column labels into '<level0>_<level1>' strings; labels that are
# already plain (non-tuple) values are left untouched.
flat_labels = []
for label in merged_df1.columns:
    if isinstance(label, tuple):
        flat_labels.append('_'.join(label).strip())
    else:
        flat_labels.append(label)
merged_df1.columns = flat_labels
merged_df1.head(2)

### Distribution of MPCE and Employment Rates (Scaled) Boxplot Analysis

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale, and the names the scaled values are stored under in the copy.
# 'Rural+Urban_Person' keeps its name, so it is overwritten in `scaled_df`.
source_cols = ['Average MPCE(Rs.)_Rural', 'Average MPCE(Rs.)_Urban', 'Rural+Urban_Person']
scaled_cols = ['Avg_MPCE_Rural', 'Avg_MPCE_Urban', 'Rural+Urban_Person']

# Min-max scale each column so the rupee amounts and the rate share one axis.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(merged_df1[source_cols])

# Store the scaled values on a copy; `merged_df1` itself is not modified.
scaled_df = merged_df1.copy()
scaled_df[scaled_cols] = scaled_data

# Plot after scaling
plt.figure(figsize=(10,5))
sns.boxplot(data=scaled_df[scaled_cols])
plt.title('Distribution of MPCE and Employment Rates (Scaled)')
plt.show()


### Conclusions:

* 

In [None]:
# Grouped horizontal bars: rural and urban average MPCE for every State/UT,
# ordered by urban MPCE (highest first).
#
# The previous version used 'Total_MPCE', 'State/UT' and the tuple
# ('Average MPCE(Rs.)', 'Urban'). After the column flattening earlier in the
# notebook the frame's labels are 'State/UT_', 'Average MPCE(Rs.)_Rural' and
# 'Average MPCE(Rs.)_Urban', so the cell failed on a top-to-bottom run.
mpce_type_labels = {
    'Average MPCE(Rs.)_Rural': 'Rural',
    'Average MPCE(Rs.)_Urban': 'Urban',
}

df_sorted = merged_df1.sort_values(by='Average MPCE(Rs.)_Urban', ascending=False)

# Long format (one row per state and MPCE type) so seaborn can group bars by hue.
mpce_long = df_sorted.melt(
    id_vars='State/UT_',
    value_vars=list(mpce_type_labels),
    var_name='MPCE Type',
    value_name='MPCE',
)
mpce_long['MPCE Type'] = mpce_long['MPCE Type'].map(mpce_type_labels)

plt.figure(figsize=(12, 6))
sns.barplot(data=mpce_long, x='MPCE', y='State/UT_', hue='MPCE Type', palette="viridis")
plt.xlabel("MPCE (Rs.)")
plt.ylabel("State/UT")
plt.title("State-wise MPCE: Rural vs. Urban")
plt.legend(title="MPCE Type")
plt.show()

### Correlation Analysis

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
corr_matrix = merged_df1.corr(numeric_only=True)

plt.figure(figsize=(12,6))
sns.heatmap(
    corr_matrix,
    annot=True,
    annot_kws={'size':8,'color':'black'},
    cmap='coolwarm',
    linewidth=0.5,
    linecolor='black',
    square=True,
)
plt.title('Correlation Matrix Heatmap',fontsize=14,fontweight='bold',color='darkred')
# Slant the x tick labels so the long column names stay readable.
plt.xticks(rotation=30,ha='right',fontsize=10)
plt.show()

In [None]:
# 1️⃣ State-wise MPCE Analysis (Bar Plot)
# States are ordered by urban MPCE, highest first. The rural bars are drawn
# second, on top of the urban bars, on the same axis.
urban_mpce = 'Average MPCE(Rs.)_Urban'
rural_mpce = 'Average MPCE(Rs.)_Rural'

statewise_data = merged_df1[['State/UT_', rural_mpce, urban_mpce]].sort_values(
    by=urban_mpce, ascending=False
)

plt.figure(figsize=(14, 6))
for mpce_col, bar_color, bar_label in [
    (urban_mpce, 'royalblue', 'Urban'),
    (rural_mpce, 'lightcoral', 'Rural'),
]:
    sns.barplot(data=statewise_data, x='State/UT_', y=mpce_col, color=bar_color, label=bar_label)

plt.title('State-wise Average MPCE (Rural vs Urban)', fontsize=14, fontweight='bold')
plt.xlabel('State/UT')
plt.ylabel('Average MPCE (Rs.)')
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [None]:
# Five highest and five lowest states/UTs by urban average MPCE.
ranking_col = 'Average MPCE(Rs.)_Urban'
shown_cols = ['State/UT_', ranking_col]

top_states = merged_df1.nlargest(5, ranking_col)[shown_cols]
bottom_states = merged_df1.nsmallest(5, ranking_col)[shown_cols]

# Display results
print("🔝 Top 5 States with Highest MPCE (Urban):\n", top_states)
print("\n🔻 Bottom 5 States with Lowest MPCE (Urban):\n", bottom_states)


### MPCE vs Employment Rate (State-Wise)

In [None]:
# One point per State/UT: urban average MPCE (x) against 'Rural+Urban_Person' (y).
# NOTE(review): 'Rural+Urban_Person' comes from the LFPR frame, so it is presumably
# a labour force participation rate, not an employment rate — confirm and relabel
# the title / y-axis if so.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=merged_df1,
    x='Average MPCE(Rs.)_Urban',
    y='Rural+Urban_Person',
    hue='State/UT_',
    palette='coolwarm',
    edgecolor='black',
    s=100,
)

plt.title('MPCE vs Employment Rate (State-wise)', fontsize=14, fontweight='bold')
plt.xlabel('Average MPCE (Urban) (Rs.)')
plt.ylabel('Employment Rate (%)')
plt.grid(True, linestyle='--', alpha=0.7)
# The legend has one entry per state, so it is placed outside the plot area.
plt.legend(title='State/UT', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Flatten any tuple column labels into '_'-joined strings. Non-tuple labels pass
# through unchanged, so this is a no-op if the columns were already flattened
# in an earlier cell.
merged_df1.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in merged_df1.columns]

In [None]:
# Flatten tuple column labels of the unemployment frame into '_'-joined strings
# (e.g. 'Rural+Urban_Female'); labels that are already plain are kept as they are.
def _flatten_label(label):
    """Join a tuple label with '_' and strip it; return any other label unchanged."""
    if isinstance(label, tuple):
        return '_'.join(label).strip()
    return label

unemp_rate_f.columns = [_flatten_label(label) for label in unemp_rate_f.columns]

In [None]:
# Display the consumption-trend table (wide format) before reshaping it.
trend_consump

In [None]:
# Reshape to long format: one row per (sector, period) with the value in 'MPCE'.
trend_consump_long = trend_consump.melt(id_vars=['Sector_'], var_name='Year',value_name='MPCE')
# Keep only the 'YYYY-YY' period token from each column label, stored as text.
# Raw string: in a normal string literal '\d' is an invalid escape sequence, which
# newer Python versions warn about. The pattern itself is unchanged.
trend_consump_long['Year'] = trend_consump_long['Year'].str.extract(r'(\d{4}-\d{2})').astype(str)

In [None]:
# Display the long-format consumption-trend table.
trend_consump_long

### Unemployment Analysis

### 1. Gender Disparity

In [None]:
# Show the first row of the unemployment-rate frame.
unemp_rate_f.head(1)

In [None]:
# Female-minus-male gap in the combined (rural + urban) rate for each State/UT.
# .copy() makes `gender_gap` an independent frame: adding 'Gap' to a bare column
# selection of `unemp_rate_f` triggers pandas' SettingWithCopyWarning.
gender_gap = unemp_rate_f[['State/UT_','Rural+Urban_Female','Rural+Urban_Male']].copy()
gender_gap['Gap'] = gender_gap['Rural+Urban_Female'] - gender_gap['Rural+Urban_Male']
# Largest female-over-male gap first.
gender_gap.sort_values('Gap', ascending=False)

### Insight: States like Kerala, Goa and Chandigarh show high female unemployment rates (>15%) compared to males

### 2. Urban-Rural Divide

In [None]:
# Urban-minus-rural gap in the 'Person' rate, stored as a new column on `unemp_rate_f`.
urban_rate = unemp_rate_f['Urban_Person']
rural_rate = unemp_rate_f['Rural_Person']
unemp_rate_f['Urban_Rural_Gap'] = urban_rate - rural_rate

# Display with the largest urban-over-rural gap first.
unemp_rate_f.sort_values(by='Urban_Rural_Gap', ascending=False)

### Insight: Urban unemployment is higher than rural unemployment in states like Ladakh (9.6%), while in Punjab the urban and rural rates are equal (gap of 0)

### MPCE (Consumption) Analysis 
### 1. Rural vs Urban MPCE Growth

### Unemployment Heatmap 

In [None]:
# Heatmap of the rural, urban and combined 'Person' rates, one row per State/UT.
# Indexing by 'State/UT_' labels each row with its state name; previously the
# rows carried only the positional index, so the values could not be read off
# by state.
unemp_by_state = unemp_rate_f.set_index('State/UT_')[['Rural_Person', 'Urban_Person', 'Rural+Urban_Person']]

# Taller figure plus yticklabels=True so every state label is drawn.
plt.figure(figsize=(12,10))
sns.heatmap(unemp_by_state,annot=True,cmap='YlOrRd',linewidth=0.5,linecolor='black',annot_kws={'size':7,'color':'black'},yticklabels=True)
plt.title('Unemployment Heatmap',fontsize=14,fontweight='bold',color='darkred')
plt.xticks(rotation=0,fontsize=10)
plt.ylabel('State/UT')
plt.show()

In [None]:
# MPCE level per sector across the survey periods.
# The 'Difference as % of Rural MPCE' row is a percentage rather than a rupee
# amount, so it is excluded to keep the y-axis in a single unit.
mpce_levels_long = trend_consump_long[trend_consump_long['Sector_'] != 'Difference as % of Rural MPCE']

plt.figure(figsize=(12,6))
sns.lineplot(data=mpce_levels_long, x='Year', y='MPCE', hue='Sector_')
plt.title('MPCE Growth Trends',fontsize=16)
# Explicit axis labels so the figure can be read on its own.
plt.xlabel('Year')
plt.ylabel('Average MPCE (Rs.)')
plt.show()

### Combined Insights:

1. **Gender & Sectoral Gaps**:
   * High female unemployment correlates with lower MPCE growth in states like Kerala (16.7% female unemployment).
   * Urban areas show higher MPCE but also higher unemployment volatility (e.g., Chandigarh: 16.3% female urban unemployment).
2. **Policy Implications**:
   * **Rural Focus**: Boost employment schemes (e.g., MGNREGA) to align with MPCE growth.
   * **Urban Focus**: Address female unemployment (e.g., skill development) to stabilize consumption.

In [None]:
%who_ls DataFrame |