In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.manifold import TSNE
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, kpss
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import geopandas as gpd
from pyproj import CRS
import pickle
from datetime import datetime


# Set visualization style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (14, 8)
plt.rcParams['font.size'] = 12

print(f"EDA started at: {datetime.now()}")






EDA started at: 2025-12-03 12:02:51.994500


In [2]:
# Load cleaned Data 

try:
    df = pd.read_parquet('cleaned_agricultural_data.parquet')
except:
    df = pd.read_csv('cleaned_agricultural_data.csv')

print(f"Dataset loaded: {df.shape}")
print(f"Years: {df['Year'].min()} to {df['Year'].max()}")
print(f"States: {df['State Name'].nunique()}")
print(f"Districts: {df['Dist Name'].nunique()}")





Dataset loaded: (2484, 118)
Years: 2010 to 2017
States: 20
Districts: 311


In [5]:
# 2.Statistical and Distribution Analysis

def comprehensive_statistical_summary(df):
    """Generate comprehensive statistical summary"""
    
    # Basic statistics
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    basic_stats = df[numeric_cols].describe().T
    basic_stats['skewness'] = df[numeric_cols].skew()
    basic_stats['kurtosis'] = df[numeric_cols].kurtosis()
    basic_stats['cv'] = basic_stats['std'] / basic_stats['mean']  # Coefficient of variation
    
    # Normality tests for key columns
    key_columns = [
        'OVERALL_YIELD_Kg_per_ha',
        'TOTAL_PRODUCTION_1000_tons',
        'RICE YIELD (Kg per ha)',
        'WHEAT YIELD (Kg per ha)'
    ]
    
    normality_results = {}
    
    for col in key_columns:
        if col in df.columns:
            col_data = df[col].dropna()
            
            # Shapiro max is 5000 samples
            sample_size = min(len(col_data), 5000)
            
            # Only sample if we have more than 3 values
            if sample_size > 3:
                sample = col_data.sample(sample_size, replace=False)
                stat, p_value = stats.shapiro(sample)
                normality_results[col] = {
                    'statistic': stat,
                    'p_value': p_value
                }
            else:
                normality_results[col] = {
                    'statistic': None,
                    'p_value': None
                }
    
    return basic_stats, normality_results


# Generate statistics
basic_stats, normality_results = comprehensive_statistical_summary(df)

print("Statistical Summary (Key Columns):")
print("=" * 80)
print(basic_stats[['mean', 'std', 'min', '50%', 'max', 'skewness', 'cv']].head(10))



Statistical Summary (Key Columns):
                                      mean          std     min       50%        max  skewness        cv
Dist Code                       270.787440   279.103876     1.0   156.000   917.0000  1.178858  1.030712
Year                           2013.497987     2.292275  2010.0  2013.000  2017.0000  0.001089  0.001138
State Code                        9.579710     4.993319     1.0    10.000    20.0000  0.294842  0.521239
RICE AREA (1000 ha)             134.141732   156.417189     0.0    79.675   747.5777  1.548856  1.166059
RICE PRODUCTION (1000 tons)     338.239573   425.917681     0.0   178.145  1923.3372  1.717933  1.259219
RICE YIELD (Kg per ha)         2078.304614  1115.620580     0.0  2178.335  4684.4664 -0.105758  0.536794
WHEAT AREA (1000 ha)             95.806710   112.937175     0.0    48.970   519.8319  1.388971  1.178802
WHEAT PRODUCTION (1000 tons)    310.925588   455.633676     0.0    90.635  2543.7544  2.378978  1.465411
WHEAT YIELD (Kg per 