# 02_exploratory_data_analysis

## Comprehensive EDA for Biomass and Carbon Data

**Objectives:**
- Analyze spatial patterns
- Identify correlations between variables
- Visualize carbon distribution
- Detect outliers and anomalies
- Analyze feature importance
- Build interactive maps

## 1. Import Dependencies and Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium import plugins
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy.stats as stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

# Setup plotting style: matplotlib's 'seaborn-v0_8' style sheet plus
# seaborn's "husl" colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# IPython magic (notebook-only syntax): render matplotlib figures inline
%matplotlib inline

# Set random seed for reproducibility; this seeds NumPy's global RNG, which
# the simulated temporal data later draws from via np.random.normal
np.random.seed(42)

## 2. Data Loading and Preparation

In [None]:
# Load the processed outputs of the previous notebook. If either file is
# missing, fall back to generating and loading sample data with the project
# loader instead.
try:
    biomass_df = pd.read_csv('data/processed_biomass_csv.csv')
    json_df = pd.read_csv('data/processed_biomass_json.csv')
except FileNotFoundError:
    print("⚠️ Processed data not found. Creating sample data...")
    # Imported lazily: the project loader is only needed on the fallback path
    from src.data_processing.loader import EnhancedBiomassLoader
    loader = EnhancedBiomassLoader()
    loader.create_sample_csv()
    loader.create_sample_json()
    biomass_df = loader.load_csv("sample_biomass.csv")
    json_data = loader.load_json("sample_biomass.json")

    # Flatten each JSON reading (nested coordinates) into one tabular record
    biomass_readings = [
        {
            'site_id': reading['site_id'],
            'biomass': reading['biomass'],
            'latitude': reading['coordinates']['lat'],
            'longitude': reading['coordinates']['lon'],
            'quality_flag': reading['quality_flag'],
        }
        for reading in json_data['biomass_readings']
    ]
    json_df = pd.DataFrame(biomass_readings)
else:
    # Only reached when both processed files were read successfully
    print("✅ Successfully loaded processed data")
    print(f"Biomass data shape: {biomass_df.shape}")
    print(f"JSON data shape: {json_df.shape}")

# Carbon estimation: carbon stock is modelled as a fixed fraction of biomass
def biomass_to_carbon(biomass, conversion_factor=0.47):
    """Return the carbon stock equivalent of ``biomass``.

    The result is ``biomass`` multiplied by ``conversion_factor`` (0.47 unless
    overridden). Any operand that supports ``*`` works, e.g. a scalar, a NumPy
    array or a pandas Series.
    """
    carbon = biomass * conversion_factor
    return carbon

# Add carbon columns derived from each dataset's biomass column.
# The rest of the notebook accepts either 'biomass_value' or 'biomass' as the
# biomass column name, so the same fallback is applied here instead of
# hard-coding one name per dataset.
csv_biomass_col = 'biomass_value' if 'biomass_value' in biomass_df.columns else 'biomass'
json_biomass_col = 'biomass' if 'biomass' in json_df.columns else 'biomass_value'
biomass_df['carbon_stock'] = biomass_to_carbon(biomass_df[csv_biomass_col])
json_df['carbon_stock'] = biomass_to_carbon(json_df[json_biomass_col])

# Display basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\n📊 Biomass Data Overview:")
biomass_df.info()
print("\n📊 JSON Data Overview:")
json_df.info()

## 3. Spatial Distribution Analysis

In [None]:
def analyze_spatial_distribution(df, title_suffix=""):
    """Plot and summarise the spatial distribution of biomass and carbon.

    Builds a 2x2 Plotly figure: the top row holds longitude/latitude scatter
    plots coloured by biomass and by ``carbon_stock``; the bottom row holds
    map-based density heatmaps of the same two quantities. The figure is
    shown and the latitude/longitude extent of ``df`` is printed.

    Args:
        df: DataFrame with ``latitude``, ``longitude`` and ``carbon_stock``
            columns plus a biomass column named ``biomass_value`` or
            ``biomass``.
        title_suffix: Text appended to the figure title, to the two scatter
            subplot titles and to the printed statistics header.
    """
    biomass_col = 'biomass_value' if 'biomass_value' in df.columns else 'biomass'

    # Densitymapbox traces can only be placed in "mapbox" subplots; adding
    # them to the default "xy" cells makes add_trace raise ValueError, so the
    # bottom row is declared as mapbox explicitly.
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            f'Biomass Spatial Distribution {title_suffix}',
            f'Carbon Stock Spatial Distribution {title_suffix}',
            'Biomass Density Heatmap',
            'Carbon Density Heatmap'
        ),
        specs=[[{"type": "xy"}, {"type": "xy"}],
               [{"type": "mapbox"}, {"type": "mapbox"}]]
    )

    # (value column, colour scale, display label, subplot column)
    layers = (
        (biomass_col, 'Viridis', 'Biomass', 1),
        ('carbon_stock', 'Plasma', 'Carbon', 2),
    )

    for value_col, colorscale, label, subplot_col in layers:
        # Each trace gets its own colorbar slot next to its subplot; with the
        # default position all four colorbars would be drawn on top of each
        # other at the right edge of the figure.
        bar_x = 0.46 if subplot_col == 1 else 1.02

        # Row 1: plain lon/lat scatter coloured by the value
        fig.add_trace(
            go.Scatter(
                x=df['longitude'], y=df['latitude'],
                mode='markers',
                marker=dict(
                    size=8,
                    color=df[value_col],
                    colorscale=colorscale,
                    showscale=True,
                    colorbar=dict(title=label, x=bar_x, y=0.79, len=0.4)
                ),
                text=df[value_col].round(2),
                hovertemplate=(
                    '<b>Lat</b>: %{y}<br><b>Lon</b>: %{x}<br>'
                    f'<b>{label}</b>: '
                    '%{text}<extra></extra>'
                )
            ),
            row=1, col=subplot_col
        )

        # Row 2: density heatmap of the same value on a map
        fig.add_trace(
            go.Densitymapbox(
                lat=df['latitude'],
                lon=df['longitude'],
                z=df[value_col],
                radius=20,
                colorscale=colorscale,
                colorbar=dict(title=f'{label} Density', x=bar_x, y=0.21, len=0.4)
            ),
            row=2, col=subplot_col
        )

    # Geographic extent, reused for both the map view and the printed stats
    lat_min, lat_max = df['latitude'].min(), df['latitude'].max()
    lon_min, lon_max = df['longitude'].min(), df['longitude'].max()
    lat_span = lat_max - lat_min
    lon_span = lon_max - lon_min

    # Rough zoom heuristic: each zoom level halves the visible longitude span.
    # Falls back to a fixed zoom when the extent is a single point (or NaN).
    widest_span = max(lat_span, lon_span)
    if widest_span > 0:
        zoom = float(np.clip(np.log2(360.0 / widest_span) - 1, 1, 15))
    else:
        zoom = 10

    # "open-street-map" is a tile style that needs no Mapbox access token
    fig.update_mapboxes(
        style='open-street-map',
        center=dict(lat=(lat_min + lat_max) / 2, lon=(lon_min + lon_max) / 2),
        zoom=zoom
    )

    fig.update_layout(
        title_text=f"Spatial Distribution Analysis {title_suffix}",
        height=800,
        showlegend=False
    )

    fig.show()

    # Spatial statistics
    print(f"\n📈 Spatial Statistics {title_suffix}:")
    print(f"Geographic Range - Lat: {lat_min:.4f} to {lat_max:.4f}")
    print(f"Geographic Range - Lon: {lon_min:.4f} to {lon_max:.4f}")
    print(f"Spatial Coverage: {lat_span:.4f}° lat × {lon_span:.4f}° lon")

# Run the spatial analysis on each dataset in turn (CSV first, then JSON)
for _frame, _label in ((biomass_df, "(CSV Data)"), (json_df, "(JSON Data)")):
    analyze_spatial_distribution(_frame, _label)

## 4. Correlation Matrix of Variables

In [None]:
def comprehensive_correlation_analysis(df, title_suffix=""):
    """Visualise and report pairwise correlations between numeric columns.

    Computes the correlation matrix (``DataFrame.corr`` with its defaults) of
    every numeric column in ``df`` and shows a 2x2 Plotly figure: the matrix
    as a table, the matrix as a heatmap, and bar charts of the other columns'
    correlation with the biomass column and with ``carbon_stock``. Every pair
    of columns whose absolute correlation exceeds 0.5 is then printed.

    Args:
        df: DataFrame to analyse; non-numeric columns are ignored.
        title_suffix: Text appended to the figure title, the first subplot
            title and the printed header.
    """
    # Select numerical columns for correlation
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    corr_matrix = df[numerical_cols].corr()

    # Create subplots; each cell's type is given by the trace kind it hosts
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            f'Correlation Matrix {title_suffix}',
            'Correlation Heatmap',
            'Top Correlations with Biomass',
            'Top Correlations with Carbon'
        ),
        specs=[[{"type": "table"}, {"type": "heatmap"}],
               [{"type": "bar"}, {"type": "bar"}]]
    )

    # 1. Correlation matrix table (first column holds the row labels)
    fig.add_trace(
        go.Table(
            header=dict(values=['Variable'] + list(corr_matrix.columns)),
            cells=dict(values=[corr_matrix.index] + [corr_matrix[col].round(3) for col in corr_matrix.columns])
        ),
        row=1, col=1
    )

    # 2. Correlation heatmap, colour range pinned to the full [-1, 1] interval
    fig.add_trace(
        go.Heatmap(
            z=corr_matrix.values,
            x=corr_matrix.columns,
            y=corr_matrix.index,
            colorscale='RdBu_r',
            zmin=-1, zmax=1,
            colorbar=dict(title='Correlation')
        ),
        row=1, col=2
    )

    # 3. Correlations with biomass, sorted from most positive to most negative
    biomass_col = 'biomass_value' if 'biomass_value' in corr_matrix.index else 'biomass'
    if biomass_col in corr_matrix.index:
        biomass_corrs = corr_matrix[biomass_col].drop(biomass_col).sort_values(ascending=False)
        fig.add_trace(
            go.Bar(x=biomass_corrs.index, y=biomass_corrs.values,
                   marker_color='lightgreen'),
            row=2, col=1
        )

    # 4. Correlations with carbon, sorted the same way
    if 'carbon_stock' in corr_matrix.index:
        carbon_corrs = corr_matrix['carbon_stock'].drop('carbon_stock').sort_values(ascending=False)
        fig.add_trace(
            go.Bar(x=carbon_corrs.index, y=carbon_corrs.values,
                   marker_color='lightcoral'),
            row=2, col=2
        )

    fig.update_layout(
        title_text=f"Comprehensive Correlation Analysis {title_suffix}",
        height=800
    )

    fig.show()

    # Strong pairwise correlations (|r| > 0.5). This is a magnitude threshold
    # only; no significance test is performed.
    print(f"\n🔍 Correlation Insights {title_suffix}:")
    for col1 in corr_matrix.columns:
        for col2 in corr_matrix.columns:
            # `col1 < col2` reports each unordered pair once and skips the diagonal
            if col1 < col2 and abs(corr_matrix.loc[col1, col2]) > 0.5:
                print(f"Strong correlation: {col1} ↔ {col2}: {corr_matrix.loc[col1, col2]:.3f}")

# Run the correlation analysis on each dataset in turn (CSV first, then JSON)
for _frame, _label in ((biomass_df, "(CSV Data)"), (json_df, "(JSON Data)")):
    comprehensive_correlation_analysis(_frame, _label)

## 5. Temporal Trends Analysis (Simulated)

In [None]:
def analyze_temporal_trends():
    """Simulate two years of monthly biomass data and plot its trends.

    The series is a constant base plus a 12-month sine cycle, a linear trend
    and Gaussian noise drawn from NumPy's global RNG; carbon stock is derived
    from it with ``biomass_to_carbon``. A 2x2 Plotly figure (biomass over
    time, carbon over time, per-month averages, per-year averages) is shown
    and a few summary numbers are printed.

    Returns:
        DataFrame with ``date``, ``biomass``, ``carbon_stock``, ``month`` and
        ``year`` columns, one row per simulated month.
    """
    # Month-end dates over 2022-2023. Newer pandas spells the month-end alias
    # "ME" and deprecates "M"; older pandas only knows "M" and raises
    # ValueError for "ME", so try the new spelling first.
    try:
        dates = pd.date_range('2022-01-01', '2023-12-31', freq='ME')
    except ValueError:
        dates = pd.date_range('2022-01-01', '2023-12-31', freq='M')
    n_periods = len(dates)

    # Simulate seasonal biomass patterns
    base_biomass = 2.5
    seasonal_pattern = np.sin(2 * np.pi * np.arange(n_periods) / 12) * 0.8  # Annual cycle
    trend = np.arange(n_periods) * 0.05  # Slight increasing trend
    noise = np.random.normal(0, 0.2, n_periods)

    simulated_biomass = base_biomass + seasonal_pattern + trend + noise
    simulated_carbon = biomass_to_carbon(simulated_biomass)

    temporal_df = pd.DataFrame({
        'date': dates,
        'biomass': simulated_biomass,
        'carbon_stock': simulated_carbon,
        'month': dates.month,
        'year': dates.year
    })

    # Create temporal visualizations
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Biomass Trends Over Time',
            'Carbon Stock Trends Over Time',
            'Seasonal Biomass Patterns',
            'Year-over-Year Comparison'
        )
    )

    # 1. Biomass trends
    fig.add_trace(
        go.Scatter(x=temporal_df['date'], y=temporal_df['biomass'],
                   mode='lines+markers', name='Biomass', line=dict(color='green')),
        row=1, col=1
    )

    # 2. Carbon trends
    fig.add_trace(
        go.Scatter(x=temporal_df['date'], y=temporal_df['carbon_stock'],
                   mode='lines+markers', name='Carbon Stock', line=dict(color='blue')),
        row=1, col=2
    )

    # 3. Seasonal patterns: mean per calendar month across both years
    monthly_avg = temporal_df.groupby('month').agg({'biomass': 'mean', 'carbon_stock': 'mean'}).reset_index()
    fig.add_trace(
        go.Scatter(x=monthly_avg['month'], y=monthly_avg['biomass'],
                   mode='lines+markers', name='Monthly Biomass', line=dict(color='darkgreen')),
        row=2, col=1
    )

    # 4. Year-over-year comparison: mean per calendar year
    yearly_avg = temporal_df.groupby('year').agg({'biomass': 'mean', 'carbon_stock': 'mean'}).reset_index()
    fig.add_trace(
        go.Bar(x=yearly_avg['year'], y=yearly_avg['biomass'],
               name='Yearly Avg Biomass', marker_color='lightgreen'),
        row=2, col=2
    )

    fig.update_layout(
        title_text="Temporal Trends Analysis (Simulated Data)",
        height=700
    )

    fig.show()

    # Summary numbers. The "trend" is last value minus first value, and the
    # "amplitude" is the full max-min range of the series (trend included).
    print("\n📈 Temporal Analysis Insights:")
    print(f"Overall biomass trend: {temporal_df['biomass'].iloc[-1] - temporal_df['biomass'].iloc[0]:.3f} change")
    print(f"Seasonal amplitude: {simulated_biomass.max() - simulated_biomass.min():.3f}")
    print(f"Average monthly biomass: {temporal_df['biomass'].mean():.3f} ± {temporal_df['biomass'].std():.3f}")

    return temporal_df

# Run temporal analysis and keep the returned simulated monthly DataFrame
temporal_data = analyze_temporal_trends()

## 6. Feature Importance Analysis

In [None]:
def feature_importance_analysis(df, target_column):
    """Rank numeric features by random-forest importance for a target column.

    Fits a ``RandomForestRegressor`` (100 trees, fixed seed) on every numeric
    column of ``df`` other than the target and a few identifier/label
    columns, shows the importances as a bar chart next to their cumulative
    sum, and prints the top three features.

    NOTE: a column that is computed from the target (or vice versa) will act
    as a near-perfect proxy and dominate the ranking; drop such columns from
    ``df`` before calling.

    Args:
        df: DataFrame holding the target and candidate feature columns.
        target_column: Name of the column to predict.

    Returns:
        DataFrame with ``feature`` and ``importance`` columns sorted by
        descending importance, or ``None`` when no usable feature remains.
    """
    # Candidate features: numeric columns that are neither the target nor an
    # identifier/label column
    excluded = {target_column, 'site_id', 'quality_flag', 'biomass_category'}
    feature_columns = [col for col in df.columns
                       if col not in excluded and pd.api.types.is_numeric_dtype(df[col])]

    # Rows without a target value cannot be used for fitting (the regressor
    # rejects NaN targets), so they are dropped up front
    y = df[target_column]
    has_target = y.notna()
    y = y[has_target]
    X = df.loc[has_target, feature_columns]

    # Impute missing feature values with the column median
    X = X.fillna(X.median())

    # Remove constant columns (this also removes columns that are entirely NaN)
    X = X.loc[:, X.std() > 0]

    if len(X.columns) == 0:
        print(f"No valid features for {target_column} analysis")
        return

    # Random Forest for feature importance
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X, y)

    # Get feature importance
    importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    # Create visualization
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(
            f'Feature Importance for {target_column}',
            'Cumulative Importance'
        )
    )

    # Feature importance bars
    fig.add_trace(
        go.Bar(x=importance_df['importance'], y=importance_df['feature'],
               orientation='h', marker_color='skyblue'),
        row=1, col=1
    )

    # Cumulative importance, accumulated from the most important feature down
    cumulative_importance = importance_df['importance'].cumsum()
    fig.add_trace(
        go.Scatter(x=cumulative_importance, y=importance_df['feature'],
                   mode='lines+markers', line=dict(color='red', width=3)),
        row=1, col=2
    )

    fig.update_layout(
        title_text=f"Feature Importance Analysis - {target_column}",
        height=500,
        showlegend=False
    )

    fig.show()

    # Print insights
    print(f"\n🔍 Feature Importance for {target_column}:")
    print("Top 3 most important features:")
    for row in importance_df.head(3).itertuples(index=False):
        print(f"  {row.feature}: {row.importance:.3f}")

    # Number of top features needed to REACH 80%: every feature whose running
    # total is still below 0.8, plus the one that crosses the threshold.
    # (Counting `cumulative <= 0.8` alone is off by one and reports 0 when a
    # single feature already carries more than 80%.)
    features_needed = min(int((cumulative_importance < 0.8).sum()) + 1,
                          len(cumulative_importance))
    print(f"\nFeatures needed to reach 80% of cumulative importance: {features_needed}")

    return importance_df

# Analyze feature importance for both targets.
# carbon_stock is computed directly from the biomass column (biomass times a
# constant factor), so whichever of the two is NOT the target is dropped
# before fitting; left in, it would be a perfect proxy for the target and
# swamp every other feature.
print("="*60)
target_col = 'biomass_value' if 'biomass_value' in biomass_df.columns else 'biomass'
biomass_importance = feature_importance_analysis(
    biomass_df.drop(columns=['carbon_stock'], errors='ignore'), target_col)

print("\n" + "="*60)
carbon_importance = feature_importance_analysis(
    biomass_df.drop(columns=[target_col], errors='ignore'), 'carbon_stock')

## 7. Interactive Maps with Folium

In [None]:
def create_interactive_maps(df, map_title="Biomass Distribution"):
    """Build a Folium map with one marker per row plus a biomass heatmap.

    Each row becomes a circle marker coloured by its biomass value
    (light gray < 1.5 <= light green < 2.5 <= green < 3.5 <= dark green) with
    a popup listing biomass, carbon and coordinates (and ``vegetation_type``
    when that column exists). A heatmap layer weighted by biomass, a layer
    control and an HTML title are added on top.

    Args:
        df: DataFrame with ``latitude``, ``longitude`` and ``carbon_stock``
            columns plus a biomass column named ``biomass_value`` or
            ``biomass``.
        map_title: Heading rendered above the map.

    Returns:
        The populated ``folium.Map``.
    """
    def _marker_color(value):
        # Map a biomass value onto the four marker colour classes
        if value < 1.5:
            return 'lightgray'
        if value < 2.5:
            return 'lightgreen'
        if value < 3.5:
            return 'green'
        return 'darkgreen'

    biomass_col = 'biomass_value' if 'biomass_value' in df.columns else 'biomass'
    has_vegetation = 'vegetation_type' in df.columns

    # Create base map centred on the mean coordinate of the data
    m = folium.Map(location=[df['latitude'].mean(), df['longitude'].mean()], zoom_start=10)

    # Add biomass points
    for _, row in df.iterrows():
        biomass_val = row[biomass_col]
        color = _marker_color(biomass_val)

        popup_lines = [
            "<b>Location Info</b>",
            f"Biomass: {biomass_val:.2f}",
            f"Carbon: {row['carbon_stock']:.2f}",
            f"Lat: {row['latitude']:.4f}",
            f"Lon: {row['longitude']:.4f}",
        ]
        if has_vegetation:
            popup_lines.append(f"Vegetation: {row['vegetation_type']}")
        popup_text = "<br>".join(popup_lines) + "<br>"

        # folium's vector options are snake_case (fill / fill_color /
        # fill_opacity); the camelCase Leaflet spellings are not folium's
        # documented keywords, so the markers would not be filled.
        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=8,
            popup=folium.Popup(popup_text, max_width=300),
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
            weight=1
        ).add_to(m)

    # Add heatmap weighted by biomass. Rows with a missing coordinate or
    # weight are skipped because the heatmap layer rejects NaN entries.
    heat_data = df[['latitude', 'longitude', biomass_col]].dropna().values.tolist()
    plugins.HeatMap(heat_data, name='Biomass Heatmap', min_opacity=0.3, max_zoom=18).add_to(m)

    # Add layer control
    folium.LayerControl().add_to(m)

    # Add title
    title_html = f'''
             <h3 align="center" style="font-size:20px"><b>{map_title}</b></h3>
             '''
    m.get_root().html.add_child(folium.Element(title_html))

    return m

# Create interactive maps
print("🌍 Creating Interactive Maps...")

map1 = create_interactive_maps(biomass_df, "Biomass Distribution - CSV Data")
print("✅ CSV Data Map Created")

map2 = create_interactive_maps(json_df, "Biomass Distribution - JSON Data")
print("✅ JSON Data Map Created")

# Legend. The colours and thresholds listed here mirror the marker colour
# classes assigned in create_interactive_maps (lightgray / lightgreen /
# green / darkgreen at 1.5, 2.5 and 3.5).
print("\n📋 Map Legend:")
print("• Dark green points: High biomass (>= 3.5)")
print("• Green points: Medium biomass (2.5-3.5)")
print("• Light green points: Low biomass (1.5-2.5)")
print("• Light gray points: Very low biomass (< 1.5)")

# Save maps; create the target directory first, since saving into a missing
# directory fails
import os
os.makedirs('outputs', exist_ok=True)
map1.save('outputs/biomass_map_csv.html')
map2.save('outputs/biomass_map_json.html')
print("\n💾 Maps saved to 'outputs/' directory")

# Display one map in notebook
display(map1)

## 8. Outlier and Anomaly Detection

In [None]:
def detect_anomalies(df):
    """Flag biomass outliers with three methods and visualise them.

    Methods applied to the biomass column (``biomass_value`` or ``biomass``):
    the 1.5 x IQR fences, an absolute z-score above 3, and values outside the
    1st-99th percentile range. A 2x2 Plotly figure (histogram, per-method
    counts, spatial scatter, carbon-vs-biomass scatter) highlights the IQR
    outliers, and the per-method counts are printed.

    Args:
        df: DataFrame with ``latitude``, ``longitude`` and ``carbon_stock``
            columns plus the biomass column.

    Returns:
        Tuple ``(methods, common_outliers)``: ``methods`` maps ``'IQR'``,
        ``'Z-Score'`` and ``'Percentile'`` to the DataFrame rows each method
        flagged; ``common_outliers`` is the set of index labels flagged by
        at least one method (a union, despite the name).
    """
    biomass_col = 'biomass_value' if 'biomass_value' in df.columns else 'biomass'
    values = df[biomass_col]
    total_rows = len(df)

    def _percent(count):
        # Share of the dataset as a percentage; 0.0 for an empty dataset
        return count / total_rows * 100 if total_rows else 0.0

    # Multiple outlier detection methods
    methods = {}

    # 1. IQR method
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    iqr_outliers = df[(values < lower_bound) | (values > upper_bound)]
    methods['IQR'] = iqr_outliers

    # 2. Z-score method. nan_policy='omit' keeps a single missing value from
    # turning every z-score into NaN (which would silently flag nothing);
    # NaN entries themselves compare False against the threshold.
    z_scores = np.abs(stats.zscore(values.to_numpy(dtype=float), nan_policy='omit'))
    z_outliers = df[np.asarray(z_scores > 3, dtype=bool)]
    methods['Z-Score'] = z_outliers

    # 3. Percentile method
    lower_percentile = values.quantile(0.01)
    upper_percentile = values.quantile(0.99)
    percentile_outliers = df[(values < lower_percentile) | (values > upper_percentile)]
    methods['Percentile'] = percentile_outliers

    # Create visualization
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Biomass Distribution with Outliers',
            'Outlier Detection Methods Comparison',
            'Spatial Distribution of Outliers',
            'Carbon vs Biomass Anomalies'
        )
    )

    # 1. Biomass distribution with the IQR outliers highlighted
    normal_data = df[~df.index.isin(iqr_outliers.index)]

    fig.add_trace(
        go.Histogram(x=normal_data[biomass_col], name='Normal', marker_color='lightblue'),
        row=1, col=1
    )

    fig.add_trace(
        go.Histogram(x=iqr_outliers[biomass_col], name='Outliers', marker_color='red'),
        row=1, col=1
    )

    # 2. Method comparison: number of rows flagged by each method
    method_counts = [len(flagged) for flagged in methods.values()]
    fig.add_trace(
        go.Bar(x=list(methods.keys()), y=method_counts, marker_color='orange'),
        row=1, col=2
    )

    # 3. Spatial outliers
    fig.add_trace(
        go.Scatter(x=normal_data['longitude'], y=normal_data['latitude'],
                   mode='markers', name='Normal', marker=dict(color='blue', size=6)),
        row=2, col=1
    )

    fig.add_trace(
        go.Scatter(x=iqr_outliers['longitude'], y=iqr_outliers['latitude'],
                   mode='markers', name='Outliers', marker=dict(color='red', size=10)),
        row=2, col=1
    )

    # 4. Carbon vs Biomass anomalies
    fig.add_trace(
        go.Scatter(x=normal_data[biomass_col], y=normal_data['carbon_stock'],
                   mode='markers', name='Normal', marker=dict(color='green')),
        row=2, col=2
    )

    fig.add_trace(
        go.Scatter(x=iqr_outliers[biomass_col], y=iqr_outliers['carbon_stock'],
                   mode='markers', name='Outliers', marker=dict(color='red', size=8)),
        row=2, col=2
    )

    fig.update_layout(
        title_text="Comprehensive Anomaly Detection",
        height=700
    )

    fig.show()

    # Print anomaly insights
    print("\n🚨 Anomaly Detection Results:")
    print(f"Total data points: {total_rows}")
    for method, outliers in methods.items():
        print(f"{method} method detected {len(outliers)} outliers ({_percent(len(outliers)):.1f}%)")

    # Union of the index labels flagged by any method
    common_outliers = set()
    for outliers in methods.values():
        common_outliers.update(outliers.index)

    print(f"\nTotal unique outliers detected: {len(common_outliers)} ({_percent(len(common_outliers)):.1f}%)")

    return methods, common_outliers

# Run anomaly detection on the CSV dataset first, then on the JSON dataset,
# each under its own banner
section_rule = "=" * 60

print(section_rule)
print("ANOMALY DETECTION - CSV DATA")
csv_anomalies, csv_common = detect_anomalies(biomass_df)

print("\n" + section_rule)
print("ANOMALY DETECTION - JSON DATA")
json_anomalies, json_common = detect_anomalies(json_df)

## 9. Summary and Key Insights

In [None]:
def generate_eda_summary(biomass_df, json_df, csv_common, json_common):
    """Print a seven-section text summary of the EDA results.

    Sections: dataset sizes, latitude/longitude ranges, biomass mean and
    standard deviation with the combined carbon total, outlier shares, CSV
    spatial extent, latitude/longitude correlation with biomass (CSV data),
    and rule-based recommendations.

    Args:
        biomass_df: CSV-derived DataFrame with ``latitude``, ``longitude``,
            ``carbon_stock`` and a biomass column (``biomass_value`` or
            ``biomass``).
        json_df: JSON-derived DataFrame with the same kinds of columns.
        csv_common: Sized collection of outlier identifiers for
            ``biomass_df``; only its length is used.
        json_common: Sized collection of outlier identifiers for ``json_df``;
            only its length is used.

    Returns:
        None. Everything is written to stdout.
    """
    def _percent(count, total):
        # Share of `count` in `total` as a percentage; 0.0 for an empty
        # dataset instead of raising ZeroDivisionError
        return count / total * 100 if total else 0.0

    biomass_col_csv = 'biomass_value' if 'biomass_value' in biomass_df.columns else 'biomass'
    biomass_col_json = 'biomass_value' if 'biomass_value' in json_df.columns else 'biomass'
    banner = "=" * 80

    print(banner)
    print("📊 EXPLORATORY DATA ANALYSIS - COMPREHENSIVE SUMMARY")
    print(banner)

    print("\n1. DATASET OVERVIEW:")
    print(f"   CSV Data: {biomass_df.shape[0]} samples, {biomass_df.shape[1]} features")
    print(f"   JSON Data: {json_df.shape[0]} samples, {json_df.shape[1]} features")

    print("\n2. SPATIAL COVERAGE:")
    print(f"   CSV - Lat: {biomass_df['latitude'].min():.4f} to {biomass_df['latitude'].max():.4f}")
    print(f"   CSV - Lon: {biomass_df['longitude'].min():.4f} to {biomass_df['longitude'].max():.4f}")
    print(f"   JSON - Lat: {json_df['latitude'].min():.4f} to {json_df['latitude'].max():.4f}")
    print(f"   JSON - Lon: {json_df['longitude'].min():.4f} to {json_df['longitude'].max():.4f}")

    print("\n3. BIOMASS DISTRIBUTION:")
    print(f"   CSV - Mean: {biomass_df[biomass_col_csv].mean():.3f} ± {biomass_df[biomass_col_csv].std():.3f}")
    print(f"   JSON - Mean: {json_df[biomass_col_json].mean():.3f} ± {json_df[biomass_col_json].std():.3f}")
    print(f"   Overall Carbon Stock: {biomass_df['carbon_stock'].sum() + json_df['carbon_stock'].sum():.2f} units")

    print("\n4. DATA QUALITY:")
    print(f"   CSV Outliers: {len(csv_common)} ({_percent(len(csv_common), len(biomass_df)):.1f}%)")
    print(f"   JSON Outliers: {len(json_common)} ({_percent(len(json_common), len(json_df)):.1f}%)")

    # Spatial extent of the CSV data
    print("\n5. SPATIAL PATTERNS:")
    csv_lat_range = biomass_df['latitude'].max() - biomass_df['latitude'].min()
    csv_lon_range = biomass_df['longitude'].max() - biomass_df['longitude'].min()
    print(f"   CSV Spatial Coverage: {csv_lat_range:.4f}° × {csv_lon_range:.4f}°")

    # Correlation insights. Default to NaN so the recommendation check below
    # stays well-defined (and evaluates False) when the correlations were not
    # computed; previously the names were unbound in that case.
    print("\n6. KEY RELATIONSHIPS:")
    lat_corr = lon_corr = float('nan')
    if 'latitude' in biomass_df.columns and biomass_col_csv in biomass_df.columns:
        lat_corr = biomass_df['latitude'].corr(biomass_df[biomass_col_csv])
        lon_corr = biomass_df['longitude'].corr(biomass_df[biomass_col_csv])
        print(f"   Latitude-Biomass correlation: {lat_corr:.3f}")
        print(f"   Longitude-Biomass correlation: {lon_corr:.3f}")

    print("\n7. RECOMMENDATIONS:")
    if len(csv_common) > 0:
        print("   ⚠️  Investigate outliers for data quality issues")
    if biomass_df[biomass_col_csv].std() > 1.5:
        print("   📈 High variability detected - consider stratification")
    if abs(lat_corr) > 0.3 or abs(lon_corr) > 0.3:
        print("   🌍 Strong spatial patterns detected - include spatial features in models")

    print("\n" + banner)
    print("✅ EDA COMPLETED SUCCESSFULLY")
    print(banner)

# Generate final summary
generate_eda_summary(biomass_df, json_df, csv_common, json_common)

# Save processed data with insights. The target directory is created first
# because DataFrame.to_csv does not create missing parent directories.
import os
os.makedirs('outputs', exist_ok=True)
biomass_df.to_csv('outputs/biomass_data_with_insights.csv', index=False)
json_df.to_csv('outputs/json_data_with_insights.csv', index=False)
print("\n💾 Data with insights saved to 'outputs/' directory")