In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.manifold import TSNE
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, kpss
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import geopandas as gpd
from pyproj import CRS
import pickle
from datetime import datetime


# Notebook-wide plotting defaults.
# The matplotlib style sheet is applied before the seaborn palette call,
# in the same order as the original cell.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Default figure size (width, height) and base font size, set in one call.
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 12,
})

# Timestamp banner so the captured output shows when this run began.
print(f"EDA started at: {datetime.now()}")






EDA started at: 2025-12-03 12:02:51.994500


In [2]:
# Load the cleaned dataset.
# Parquet is tried first; if that read fails for any ordinary reason
# (missing file, no parquet engine installed, unreadable file) the CSV copy
# is used instead. `except Exception` is used rather than a bare `except:`
# so that KeyboardInterrupt / SystemExit are not swallowed, and the reason
# for the fallback is printed instead of being silently discarded.
try:
    df = pd.read_parquet('cleaned_agricultural_data.parquet')
except Exception as exc:
    print(f"Parquet load failed ({type(exc).__name__}: {exc}); falling back to CSV")
    df = pd.read_csv('cleaned_agricultural_data.csv')

# Quick sanity overview of what was loaded.
print(f"Dataset loaded: {df.shape}")
print(f"Years: {df['Year'].min()} to {df['Year'].max()}")
print(f"States: {df['State Name'].nunique()}")
print(f"Districts: {df['Dist Name'].nunique()}")





Dataset loaded: (2484, 118)
Years: 2010 to 2017
States: 20
Districts: 311


In [None]:
# 2. Statistical and Distribution Analysis

def comprehensive_statistical_summary(df, max_sample=5000, random_state=42):
    """Generate comprehensive statistical summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Only numeric columns contribute to the descriptive table.
    max_sample : int, default 5000
        Maximum number of non-null observations passed to the Shapiro-Wilk
        test per column. Longer columns are randomly down-sampled to this size.
    random_state : int or None, default 42
        Seed used when down-sampling, so repeated runs give the same result.

    Returns
    -------
    basic_stats : pandas.DataFrame
        One row per numeric column: the ``describe()`` statistics plus
        ``skewness``, ``kurtosis`` and ``cv`` (std / mean).
    normality_results : dict
        Maps each key column that is present in ``df`` and has at least three
        non-null values to ``{'statistic': ..., 'p_value': ...}`` from
        ``scipy.stats.shapiro``.
    """

    # Basic statistics
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    basic_stats = df[numeric_cols].describe().T
    basic_stats['skewness'] = df[numeric_cols].skew()
    basic_stats['kurtosis'] = df[numeric_cols].kurtosis()
    # Coefficient of variation; a column whose mean is 0 yields inf/NaN here.
    basic_stats['cv'] = basic_stats['std'] / basic_stats['mean']

    # Normality tests for key columns
    key_columns = ['OVERALL_YIELD_Kg_per_ha', 'TOTAL_PRODUCTION_1000_tons',
                   'RICE YIELD (Kg per ha)', 'WHEAT YIELD (Kg per ha)']

    normality_results = {}
    for col in key_columns:
        if col not in df.columns:
            continue

        values = df[col].dropna()

        # Shapiro-Wilk needs at least three observations; skip columns that
        # cannot be tested instead of aborting the whole summary.
        if len(values) < 3:
            continue

        # Size the sample from the NaN-free series, not from len(df): asking
        # for more rows than remain after dropna() makes Series.sample raise.
        if len(values) > max_sample:
            values = values.sample(max_sample, random_state=random_state)

        stat, p_value = stats.shapiro(values)
        normality_results[col] = {'statistic': stat, 'p_value': p_value}

    return basic_stats, normality_results

# Compute the descriptive table and the normality-test results for the loaded data.
basic_stats, normality_results = comprehensive_statistical_summary(df)

# Header line followed by an 80-character rule, emitted with a single print call.
print("Statistical Summary (Key Columns):", "=" * 80, sep="\n")

# First ten rows of the table, restricted to the headline statistics.
print(basic_stats.head(10)[['mean', 'std', 'min', '50%', 'max', 'skewness', 'cv']])



