In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Source page for the raw CSV; the load cell below reads the file from the
# working directory, so it has to be downloaded by hand first.
url = "https://www.kaggle.com/datasets/gokulrajkmv/unemployment-in-india"
print(
    "Please download the unemployment dataset from: " + url,
    "Expected filename: Unemployment in India.csv",
    sep="\n",
)

In [None]:
# Load the real CSV when it is present in the working directory; otherwise
# build a seeded synthetic frame with the column names the later cells expect,
# so the notebook can still be run end to end as a demonstration.
try:
    df = pd.read_csv('Unemployment in India.csv')
    # NOTE(review): the published CSV reportedly ships headers with stray
    # leading whitespace; strip defensively so exact-name lookups below match.
    df.columns = df.columns.str.strip()
except FileNotFoundError:
    # Only a missing file triggers the fallback. Any other failure (malformed
    # CSV, permission problem, user interrupt) propagates instead of being
    # silently replaced by fake data.
    print("Dataset not found. Creating sample data for demonstration...")
    np.random.seed(42)
    n_samples = 1000

    states = ['Andhra Pradesh', 'Assam', 'Bihar', 'Gujarat', 'Haryana', 'Karnataka',
              'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Odisha', 'Punjab',
              'Rajasthan', 'Tamil Nadu', 'Telangana', 'Uttar Pradesh', 'West Bengal']

    regions = ['Rural', 'Urban']

    # Month-end dates for 2019-2020. Built from monthly periods rather than
    # date_range(freq='M'): the 'M' offset alias is deprecated in recent pandas
    # and its replacement 'ME' is unknown to older releases, whereas the
    # period alias 'M' is accepted by both.
    dates = (
        pd.period_range('2019-01', '2020-12', freq='M')
        .to_timestamp(how='end')
        .normalize()
    )

    # Rows are drawn one at a time, in a fixed order of random calls, so the
    # seeded output stays reproducible.
    data = []
    for _ in range(n_samples):
        state = np.random.choice(states)
        region = np.random.choice(regions)
        date = np.random.choice(dates)

        base_unemployment = np.random.uniform(1, 25)
        estimated_unemployment = base_unemployment + np.random.normal(0, 2)
        estimated_employed = 100 - estimated_unemployment - np.random.uniform(5, 15)
        estimated_labour_participation = np.random.uniform(40, 70)

        data.append({
            'Region': state,
            'Date': date,
            'Frequency': 'Monthly',
            # The Gaussian noise can push the rate below zero; clamp at 0.
            'Estimated Unemployment Rate (%)': max(0, estimated_unemployment),
            'Estimated Employed': max(0, estimated_employed),
            'Estimated Labour Participation Rate (%)': estimated_labour_participation,
            'Region.1': region,
            'longitude': np.random.uniform(68, 97),
            'latitude': np.random.uniform(8, 37)
        })

    df = pd.DataFrame(data)

In [None]:
# Report the frame's dimensions, then its column/dtype summary.
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# Preview the first rows of the frame under a heading.
print("\nFirst 5 rows:", df.head(), sep="\n")

In [None]:
# Summary statistics from DataFrame.describe(), printed under a heading.
print(f"\nDataset Description:\n{df.describe()}")

In [None]:
# Count of null cells in each column.
missing_per_column = df.isna().sum()
print("\nMissing Values:")
print(missing_per_column)

In [None]:
# Impute gaps in numeric columns with that column's mean. Columns without a
# numeric mean are not in the fill mapping and are left as they are.
numeric_means = df.mean(numeric_only=True)
df = df.fillna(value=numeric_means)

In [None]:
# Parse the Date column and derive calendar Year / Month columns from it.
if 'Date' in df.columns:
    df = df.assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        # Later keyword arguments see the already-parsed Date column.
        Year=lambda frame: frame['Date'].dt.year,
        Month=lambda frame: frame['Date'].dt.month,
    )

In [None]:
# Cardinality of every object-dtype column.
print("\nUnique values in categorical columns:")
object_columns = df.select_dtypes(include=['object']).columns
for col, n_unique in df[object_columns].nunique().items():
    print(f"{col}: {n_unique} unique values")

In [None]:
# Column labels shared by the plots, the summaries and the model cell.
unemployment_col, employed_col, participation_col = (
    'Estimated Unemployment Rate (%)',
    'Estimated Employed',
    'Estimated Labour Participation Rate (%)',
)

In [None]:
# Six-panel overview (2 rows x 3 columns) of the unemployment rate. Each panel
# draws on its own Axes object. Panels whose source column is absent are still
# created but left empty.
fig = plt.figure(figsize=(15, 12))

# Panel 1: distribution of the rate.
ax = fig.add_subplot(2, 3, 1)
ax.hist(df[unemployment_col], bins=30, alpha=0.7, color='red')
ax.set_title('Distribution of Unemployment Rate')
ax.set_xlabel('Unemployment Rate (%)')
ax.set_ylabel('Frequency')

# Panel 2: spread of the rate within each 'Region.1' category.
ax = fig.add_subplot(2, 3, 2)
if 'Region.1' in df.columns:
    sns.boxplot(data=df, x='Region.1', y=unemployment_col, ax=ax)
    ax.set_title('Unemployment Rate by Region Type')
    plt.setp(ax.get_xticklabels(), rotation=45)

# Panel 3: mean rate per calendar year.
ax = fig.add_subplot(2, 3, 3)
if 'Year' in df.columns:
    yearly_unemployment = df.groupby('Year')[unemployment_col].mean()
    ax.plot(yearly_unemployment.index, yearly_unemployment.values, marker='o')
    ax.set_title('Average Unemployment Rate by Year')
    ax.set_xlabel('Year')
    ax.set_ylabel('Unemployment Rate (%)')

# Panel 4: the ten 'Region' values with the highest mean rate.
ax = fig.add_subplot(2, 3, 4)
if 'Region' in df.columns:
    state_unemployment = (
        df.groupby('Region')[unemployment_col]
        .mean()
        .sort_values(ascending=False)
        .head(10)
    )
    bar_positions = range(len(state_unemployment))
    ax.bar(bar_positions, state_unemployment.values)
    ax.set_title('Top 10 States by Unemployment Rate')
    ax.set_xlabel('States')
    ax.set_ylabel('Unemployment Rate (%)')
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(state_unemployment.index, rotation=90)

# Panel 5: pairwise correlation of the three employment metrics.
ax = fig.add_subplot(2, 3, 5)
correlation_cols = [unemployment_col, employed_col, participation_col]
if all(col in df.columns for col in correlation_cols):
    correlation_matrix = df[correlation_cols].corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Correlation Matrix')

# Panel 6: mean rate per calendar month (all years pooled).
ax = fig.add_subplot(2, 3, 6)
if 'Month' in df.columns:
    monthly_unemployment = df.groupby('Month')[unemployment_col].mean()
    ax.plot(monthly_unemployment.index, monthly_unemployment.values, marker='s', color='green')
    ax.set_title('Average Unemployment Rate by Month')
    ax.set_xlabel('Month')
    ax.set_ylabel('Unemployment Rate (%)')
    ax.set_xticks(range(1, 13))

fig.tight_layout()
plt.show()

In [None]:
# Headline statistics of the unemployment-rate column, two decimals each.
rate = df[unemployment_col]
print("\nOverall Statistics:")
for label, value in (
    ("Average Unemployment Rate", rate.mean()),
    ("Maximum Unemployment Rate", rate.max()),
    ("Minimum Unemployment Rate", rate.min()),
    ("Standard Deviation", rate.std()),
):
    print(f"{label}: {value:.2f}%")

In [None]:
# Mean / max / min / standard deviation of the rate for each 'Region' value.
if 'Region' in df.columns:
    print("\nState-wise Analysis:")
    rate_by_state = df.groupby('Region')[unemployment_col]
    state_stats = rate_by_state.agg(['mean', 'max', 'min', 'std'])
    print(state_stats.round(2))

In [None]:
# Mean / max / min of the rate and the row count for each 'Region.1' value.
if 'Region.1' in df.columns:
    print("\nRegion-wise Analysis:")
    rate_by_region_type = df.groupby('Region.1')[unemployment_col]
    region_stats = rate_by_region_type.agg(['mean', 'max', 'min', 'count'])
    print(region_stats.round(2))

In [None]:
# Fit two baseline regressors that predict the unemployment rate from every
# other numeric column, then report MSE, R² and MAE on a 20% hold-out split.
target_column = unemployment_col
numeric_columns = list(df.select_dtypes(include=[np.number]).columns)

if target_column in numeric_columns:
    feature_columns = [name_ for name_ in numeric_columns if name_ != target_column]

    if feature_columns:
        df_model = df.copy()

        # Integer-code each object-dtype column of the working copy.
        # feature_columns was derived from the numeric columns of `df` before
        # this step, so these encoded columns are not part of X below.
        le = LabelEncoder()
        for col in df_model.select_dtypes(include=['object']).columns:
            df_model[col] = le.fit_transform(df_model[col].astype(str))

        X, y = df_model[feature_columns], df_model[target_column]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Scaling statistics come from the training split only and are then
        # applied to both splits.
        scaler = StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        models = {
            'Linear Regression': LinearRegression(),
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
        }

        for name, model in models.items():
            # Only the linear model is given the standardised inputs; the
            # forest is trained on the raw feature values.
            wants_scaled = name == 'Linear Regression'
            fit_inputs = X_train_scaled if wants_scaled else X_train
            eval_inputs = X_test_scaled if wants_scaled else X_test

            model.fit(fit_inputs, y_train)
            y_pred = model.predict(eval_inputs)

            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)

            print(f"\n{name} Results:")
            print(f"Mean Squared Error: {mse:.4f}")
            print(f"R² Score: {r2:.4f}")
            print(f"Mean Absolute Error: {mae:.4f}")

In [None]:
# Four-panel summary (2 x 2). The first two panels are only created when their
# source column exists; the last two are always created.
fig = plt.figure(figsize=(12, 8))

# Panel 1: mean rate per year-month period, in chronological order.
if 'Date' in df.columns:
    ax = fig.add_subplot(2, 2, 1)
    monthly_data = df.groupby(df['Date'].dt.to_period('M'))[unemployment_col].mean()
    ax.plot(monthly_data.index.astype(str), monthly_data.values)
    ax.set_title('Unemployment Rate Trend Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Unemployment Rate (%)')
    plt.setp(ax.get_xticklabels(), rotation=45)

# Panel 2: the ten 'Region' values with the largest mean rate.
if 'Region' in df.columns:
    ax = fig.add_subplot(2, 2, 2)
    top_states = df.groupby('Region')[unemployment_col].mean().nlargest(10)
    bar_positions = range(len(top_states))
    ax.barh(bar_positions, top_states.values)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_states.index)
    ax.set_title('Top 10 States with Highest Unemployment')
    ax.set_xlabel('Unemployment Rate (%)')

# Panel 3: participation rate against unemployment rate, one point per row.
ax = fig.add_subplot(2, 2, 3)
ax.scatter(df[participation_col], df[unemployment_col], alpha=0.6)
ax.set_xlabel('Labour Participation Rate (%)')
ax.set_ylabel('Unemployment Rate (%)')
ax.set_title('Unemployment vs Labour Participation')

# Panel 4: pie whose wedges are sized by the mean rate of each 'Region.1'
# category (a share of the summed means, not of people or rows).
ax = fig.add_subplot(2, 2, 4)
if 'Region.1' in df.columns:
    region_comparison = df.groupby('Region.1')[unemployment_col].mean()
    ax.pie(region_comparison.values, labels=region_comparison.index, autopct='%1.1f%%')
    ax.set_title('Unemployment Distribution by Region Type')

fig.tight_layout()
plt.show()

In [None]:
# Closing checklist of what the notebook covered, printed as a numbered list.
print("\nKey Insights:")
insights = (
    "Unemployment analysis completed with statistical summaries",
    "Regional and temporal patterns identified",
    "Correlation analysis performed between different employment metrics",
    "Predictive models trained for unemployment forecasting",
)
for number, insight in enumerate(insights, start=1):
    print(f"{number}. {insight}")