In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
import matplotlib.dates as mdates

# Set the style for our plots
plt.style.use('fivethirtyeight')
sns.set_palette("deep")

# Load the data
df = pd.read_csv('/content/Unemployment_Rate_upto_11_2020.csv')

# Print column names to find the actual column name for date
# (the code below addresses columns such as ' Date' with a leading space)
print("Column names in the dataset:")
print(df.columns.tolist())

# Display the first few rows to understand the data structure
print("\nData Overview:")
print(df.head())

# Check data information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly: wrapping it in print() appends a stray "None" line.
print("\nData Information:")
df.info()

# Check for missing values (count of nulls per column)
print("\nMissing Values:")
print(df.isnull().sum())

# Parse the ' Date' column (its header carries a leading space) into real
# datetimes; the raw strings are stripped of surrounding whitespace first
raw_date_strings = df[' Date'].str.strip()
df[' Date'] = pd.to_datetime(raw_date_strings, format='%d-%m-%Y')

# Summary statistics of the dataset
print("\nBasic Statistics:")
print(df.describe())

# How many regions the dataset covers, and which ones
region_column = df['Region']
print("\nNumber of unique regions:", region_column.nunique())
print("Regions:", region_column.unique())

# Derive calendar helper columns (month number, month abbreviation, year)
# from the parsed dates to make grouping easier
date_parts = df[' Date'].dt
df['Month'] = date_parts.month
df['Month_Name'] = date_parts.strftime('%b')
df['Year'] = date_parts.year

# Mean unemployment rate per region, highest first
rate_by_region = df.groupby('Region')[' Estimated Unemployment Rate (%)']
region_avg = rate_by_region.mean().sort_values(ascending=False)
print("\nAverage Unemployment Rate by Region:")
print(region_avg)

# Mean unemployment rate per calendar month; grouping on both the month number
# and its abbreviation keeps the readable label next to the sortable number
month_keys = ['Month', 'Month_Name']
month_avg = (
    df.groupby(month_keys)[' Estimated Unemployment Rate (%)']
    .mean()
    .reset_index()
    .sort_values('Month')
)
print("\nAverage Unemployment Rate by Month:")
print(month_avg)

# Visualization 1: one line per region showing its unemployment rate over time
fig, ax = plt.subplots(figsize=(14, 8))
region_names = sorted(df['Region'].unique())
for region in region_names:
    # Rows belonging to the current region only
    region_data = df.loc[df['Region'] == region]
    ax.plot(
        region_data[' Date'],
        region_data[' Estimated Unemployment Rate (%)'],
        label=region,
    )

ax.set_title('Unemployment Rate Trends Across Regions (Jan-Oct 2020)', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Unemployment Rate (%)', fontsize=14)
plt.xticks(rotation=45)
ax.grid(True, alpha=0.3)
# Anchor the legend outside the axes, to the right of the plot area
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig('unemployment_time_series.png', dpi=300)
plt.close(fig)

# Visualization 2: Regional comparison with boxplot
plt.figure(figsize=(14, 10))
# seaborn deprecates passing `palette` without `hue` (removal announced for
# v0.14.0). Mapping the x variable to `hue` and suppressing the legend is the
# replacement the deprecation warning itself prescribes for the same colouring.
sns.boxplot(x='Region', y=' Estimated Unemployment Rate (%)', data=df,
            hue='Region', palette='viridis', legend=False)
plt.title('Distribution of Unemployment Rates by Region', fontsize=16)
plt.xlabel('Region', fontsize=14)
plt.ylabel('Unemployment Rate (%)', fontsize=14)
plt.xticks(rotation=90)  # vertical labels for the region names
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('unemployment_by_region.png', dpi=300)
plt.close()

# Visualization 3: COVID impact - months before April 2020 versus the
# April-May 2020 window
covid_start = pd.Timestamp('2020-04-01')
covid_end = pd.Timestamp('2020-05-31')
observation_dates = df[' Date']

pre_covid = df[observation_dates < covid_start]
# `between` is inclusive on both ends, i.e. covid_start <= date <= covid_end
during_covid = df[observation_dates.between(covid_start, covid_end)]

# Per-region mean unemployment rate in each period
pre_avg = pre_covid.groupby('Region', as_index=False)[' Estimated Unemployment Rate (%)'].mean()
during_avg = during_covid.groupby('Region', as_index=False)[' Estimated Unemployment Rate (%)'].mean()

# Put both periods side by side (only regions present in both survive the
# merge) and compute the change in the rate, largest increase first
comparison = pre_avg.merge(during_avg, on='Region', suffixes=('_pre', '_during'))
rate_during = comparison[' Estimated Unemployment Rate (%)_during']
rate_before = comparison[' Estimated Unemployment Rate (%)_pre']
comparison['change'] = rate_during - rate_before
comparison = comparison.sort_values('change', ascending=False)

fig, ax = plt.subplots(figsize=(14, 10))
ax.bar(comparison['Region'], comparison['change'], color='crimson')
# Zero baseline separates increases from decreases
ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax.set_title('Increase in Unemployment Rate During COVID (Apr-May 2020 vs Pre-COVID)', fontsize=16)
ax.set_xlabel('Region', fontsize=14)
ax.set_ylabel('Change in Unemployment Rate (%)', fontsize=14)
plt.xticks(rotation=90)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('covid_impact.png', dpi=300)
plt.close(fig)

# Visualization 4: Heatmap of unemployment rates over time by region
# Rows are regions, columns are 'Mon-YYYY' labels, cells hold the mean rate.
month_labels = df[' Date'].dt.strftime('%b-%Y')
pivot_df = df.pivot_table(index='Region', columns=month_labels,
                          values=' Estimated Unemployment Rate (%)', aggfunc='mean')

# pivot_table sorts its column labels, and the labels are strings, so the
# months come out alphabetically (Apr, Aug, Feb, ...). Parse the labels back
# into dates and reorder the columns so the heatmap reads chronologically.
chronological_order = pd.to_datetime(pivot_df.columns, format='%b-%Y').argsort()
pivot_df = pivot_df.iloc[:, chronological_order]

plt.figure(figsize=(16, 12))
# annot/fmt print each cell's value with one decimal place
sns.heatmap(pivot_df, cmap='YlOrRd', annot=True, fmt='.1f', linewidths=.5)
plt.title('Unemployment Rate Heatmap by Region and Month (2020)', fontsize=16)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Region', fontsize=14)
plt.tight_layout()
plt.savefig('unemployment_heatmap.png', dpi=300)
plt.close()

# Visualization 5: Regional unemployment by geographical regions
# Look for the column that holds the geographical zone. Besides an explicit
# 'geographical_region', accept 'Region.1': that is the name under which this
# dataset exposes its zone values (e.g. 'South') once loaded -- presumably
# because pandas renamed a second 'Region' header; confirm against the CSV.
geo_column = next(
    (name for name in ('geographical_region', 'Region.1') if name in df.columns),
    None,
)

plt.figure(figsize=(14, 8))
if geo_column is None:
    # No zone information available: fall back to the per-region distribution.
    # `hue` + `legend=False` replaces the deprecated palette-without-hue call.
    sns.boxplot(x='Region', y=' Estimated Unemployment Rate (%)', data=df,
                hue='Region', palette='Set2', legend=False)
    plt.title('Unemployment Distribution by Region', fontsize=16)
    plt.xlabel('Region', fontsize=14)
    plt.xticks(rotation=90)  # many region names: print them vertically
    output_file = 'unemployment_by_region_detailed.png'
else:
    sns.boxplot(x=geo_column, y=' Estimated Unemployment Rate (%)', data=df,
                hue=geo_column, palette='Set2', legend=False)
    plt.title('Unemployment Distribution by Geographical Region', fontsize=16)
    plt.xlabel('Geographical Region', fontsize=14)
    output_file = 'unemployment_by_geo_region.png'

plt.ylabel('Unemployment Rate (%)', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(output_file, dpi=300)
plt.close()

# Visualization 6: Top 5 regions with highest average unemployment during COVID
# Mean rate per region within the `during_covid` subset, five largest kept.
top_during_covid = during_covid.groupby('Region')[' Estimated Unemployment Rate (%)'].mean().sort_values(ascending=False).head(5)

plt.figure(figsize=(12, 6))
# seaborn deprecates passing `palette` without `hue` (removal announced for
# v0.14.0); mapping the x values to `hue` with the legend suppressed keeps the
# same per-bar colouring.
sns.barplot(x=top_during_covid.index, y=top_during_covid.values,
            hue=top_during_covid.index, palette='Reds_r', legend=False)
plt.title('Top 5 Regions with Highest Unemployment During COVID (Apr-May 2020)', fontsize=16)
plt.xlabel('Region', fontsize=14)
plt.ylabel('Average Unemployment Rate (%)', fontsize=14)
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('top5_unemployment_covid.png', dpi=300)
plt.close()

# Visualization 7: labour participation rate against unemployment rate, one
# point per row, coloured by region and sized by the estimated employed count
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x=' Estimated Labour Participation Rate (%)',
    y=' Estimated Unemployment Rate (%)',
    hue='Region',
    size=' Estimated Employed',
    sizes=(20, 200),
    alpha=0.7,
    ax=ax,
)
ax.set_title('Relationship: Labour Participation vs Unemployment Rate', fontsize=16)
ax.set_xlabel('Labour Participation Rate (%)', fontsize=14)
ax.set_ylabel('Unemployment Rate (%)', fontsize=14)
ax.grid(True, alpha=0.3)
# Anchor the legend outside the axes, to the right of the plot area
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig('participation_vs_unemployment.png', dpi=300)
plt.close(fig)

# Deeper analysis: mean unemployment rate per month over all regions combined
month_key = df[' Date'].dt.strftime('%b-%Y')
monthly_trend = (
    df.groupby(month_key)[' Estimated Unemployment Rate (%)']
    .mean()
    .reset_index()
)
# The group labels are strings; turn them back into dates so the rows can be
# ordered chronologically and plotted on a time axis
monthly_trend[' Date'] = pd.to_datetime(monthly_trend[' Date'], format='%b-%Y')
monthly_trend = monthly_trend.sort_values(' Date')

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(
    monthly_trend[' Date'],
    monthly_trend[' Estimated Unemployment Rate (%)'],
    marker='o',
    linewidth=2,
    color='darkred',
)
# Shade the April-May 2020 window
ax.axvspan(
    pd.to_datetime('2020-04-01'),
    pd.to_datetime('2020-05-31'),
    color='red',
    alpha=0.2,
    label='COVID-19 Lockdown Period',
)
ax.set_title('Average Monthly Unemployment Rate Across India (2020)', fontsize=16)
ax.set_xlabel('Month', fontsize=14)
ax.set_ylabel('Unemployment Rate (%)', fontsize=14)
# One tick per plotted month, labelled with the month abbreviation
tick_positions = monthly_trend[' Date']
tick_labels = monthly_trend[' Date'].dt.strftime('%b')
plt.xticks(tick_positions, tick_labels, rotation=45)
ax.grid(True, alpha=0.3)
ax.legend()
fig.tight_layout()
fig.savefig('monthly_unemployment_trend.png', dpi=300)
plt.close(fig)

# Recovery analysis: Compare April peak to October values
# Per-region mean rate for rows dated in April and in October respectively.
april_data = df[df[' Date'].dt.month == 4].groupby('Region')[' Estimated Unemployment Rate (%)'].mean()
october_data = df[df[' Date'].dt.month == 10].groupby('Region')[' Estimated Unemployment Rate (%)'].mean()

# Recovery (%) is the drop from April to October relative to the April value;
# negative values mean the October rate is higher than the April rate.
recovery = pd.DataFrame({'April': april_data, 'October': october_data})
recovery['Recovery (%)'] = ((recovery['April'] - recovery['October']) / recovery['April']) * 100
recovery = recovery.sort_values('Recovery (%)', ascending=False)

plt.figure(figsize=(14, 8))
# seaborn deprecates passing `palette` without `hue` (removal announced for
# v0.14.0); mapping the x values to `hue` with the legend suppressed keeps the
# same per-bar colouring.
sns.barplot(x=recovery.index, y=recovery['Recovery (%)'],
            hue=recovery.index, palette='RdYlGn', legend=False)
plt.title('Recovery from COVID Unemployment Peak (April to October 2020)', fontsize=16)
plt.xlabel('Region', fontsize=14)
plt.ylabel('Recovery Percentage (%)', fontsize=14)
plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)  # zero baseline
plt.xticks(rotation=90)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('unemployment_recovery.png', dpi=300)
plt.close()

print("\nAnalysis Complete! Visualizations saved to files.")

# Statistical analysis: pairwise correlation between the unemployment rate,
# the labour participation rate and the estimated employed count
correlation_columns = [
    ' Estimated Unemployment Rate (%)',
    ' Estimated Labour Participation Rate (%)',
    ' Estimated Employed',
]
correlation = df[correlation_columns].corr()
print("\nCorrelation Matrix:")
print(correlation)

# Headline numbers derived from the period subsets computed earlier
rate_column = ' Estimated Unemployment Rate (%)'
pre_covid_mean = pre_covid[rate_column].mean()
during_covid_mean = during_covid[rate_column].mean()
# First row after sorting in each direction = highest / lowest regional mean
highest_region = during_avg.sort_values(rate_column, ascending=False).iloc[0]
lowest_region = during_avg.sort_values(rate_column).iloc[0]

print("\nKey Findings:")
print(f"1. Average unemployment rate before COVID (Jan-Mar 2020): {pre_covid_mean:.2f}%")
print(f"2. Average unemployment rate during peak COVID period (Apr-May 2020): {during_covid_mean:.2f}%")
print(f"3. Region with highest unemployment during COVID: {highest_region['Region']} ({highest_region[rate_column]:.2f}%)")
print(f"4. Region with lowest unemployment during COVID: {lowest_region['Region']} ({lowest_region[rate_column]:.2f}%)")

# Recovery rate per region: relative drop from the Apr-May mean to the mean of
# all rows dated on or after 2020-09-01
late_covid = df[df[' Date'] >= pd.to_datetime('2020-09-01')]
late_avg = late_covid.groupby('Region')[rate_column].mean()
peak_by_region = during_avg.set_index('Region')[rate_column]
recovery_rate = (peak_by_region - late_avg) / peak_by_region * 100

best_region, best_rate = recovery_rate.idxmax(), recovery_rate.max()
worst_region, worst_rate = recovery_rate.idxmin(), recovery_rate.min()
print(f"5. Region with best recovery by October: {best_region} ({best_rate:.2f}%)")
print(f"6. Region with worst recovery by October: {worst_region} ({worst_rate:.2f}%)")

# Regional distribution: only possible when the dataset carries coordinates
geo_keys = ['Region', 'latitude', 'longitude']
if all(key in df.columns for key in geo_keys):
    # Mean unemployment rate per (region, latitude, longitude) combination --
    # a table that could later feed a map-plotting library
    region_geo = (
        df.groupby(geo_keys)[' Estimated Unemployment Rate (%)']
        .mean()
        .reset_index()
    )
    print("\nRegional Geographical Summary:")
    print(region_geo.head())

Column names in the dataset:
['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)', ' Estimated Employed', ' Estimated Labour Participation Rate (%)', 'Region.1', 'longitude', 'latitude']

Data Overview:
           Region         Date  Frequency   Estimated Unemployment Rate (%)  \
0  Andhra Pradesh   31-01-2020          M                              5.48   
1  Andhra Pradesh   29-02-2020          M                              5.83   
2  Andhra Pradesh   31-03-2020          M                              5.79   
3  Andhra Pradesh   30-04-2020          M                             20.51   
4  Andhra Pradesh   31-05-2020          M                             17.43   

    Estimated Employed   Estimated Labour Participation Rate (%) Region.1  \
0             16635535                                     41.02    South   
1             16545652                                     40.90    South   
2             15881197                                     39.18    South  


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Region', y=' Estimated Unemployment Rate (%)', data=df, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Region', y=' Estimated Unemployment Rate (%)', data=df, palette='Set2')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_during_covid.index, y=top_during_covid.values, palette='Reds_r')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=recovery.index, y=recovery['Recovery (%)'], palett


Analysis Complete! Visualizations saved to files.

Correlation Matrix:
                                         Estimated Unemployment Rate (%)  \
Estimated Unemployment Rate (%)                                 1.000000   
Estimated Labour Participation Rate (%)                        -0.073540   
Estimated Employed                                             -0.245176   

                                         Estimated Labour Participation Rate (%)  \
Estimated Unemployment Rate (%)                                        -0.073540   
Estimated Labour Participation Rate (%)                                 1.000000   
Estimated Employed                                                     -0.047948   

                                         Estimated Employed  
Estimated Unemployment Rate (%)                   -0.245176  
Estimated Labour Participation Rate (%)           -0.047948  
Estimated Employed                                 1.000000  

Key Findings:
1. Average unemployment