# Comprehensive EDA Part 5: Soil & Area Analysis

**Objective:** Analyze soil property relationships with yield and examine planting/harvesting patterns to understand production efficiency.

**Contents:**
1. Soil Property Distributions and Correlations
2. Soil-Yield Relationships and Optimal Ranges
3. Planting & Harvesting Patterns
4. Production Efficiency Analysis
5. Regional Soil Quality Mapping

**Author:** Ahsan Riaz | CS 245 Machine Learning Project

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
import warnings
# Silence all Python warnings so they do not clutter the rendered cell outputs.
warnings.filterwarnings('ignore')

# Styling applied to matplotlib / seaborn figures.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Notebook banner: title framed by two 80-character rules.
banner_rule = "=" * 80
print(banner_rule, "COMPREHENSIVE EDA - PART 5: SOIL & AREA ANALYSIS", banner_rule, sep="\n")

COMPREHENSIVE EDA - PART 5: SOIL & AREA ANALYSIS


In [3]:
# Read the processed modeling dataset from its relative path and report its size.
dataset_path = '../data/processed/modeling_dataset_final.csv'
df = pd.read_csv(dataset_path)
print(f"Dataset loaded: {len(df):,} rows")

Dataset loaded: 82,436 rows


## 1. Soil Property Distributions

Examine the distribution of key soil properties across counties.

In [4]:
# Identify soil columns
# A column is treated as a soil property when its name contains any of these
# case-sensitive substrings.
soil_keywords = ('Soil', 'AWC', 'Clay', 'pH', 'OM', 'Organic')
soil_cols = [name for name in df.columns if any(key in name for key in soil_keywords)]
print(f"Soil columns found: {soil_cols}\n")

# Get unique county-level soil data
# Collapse the frame to one row per (State, County), taking the first entry of
# each soil column within the group.
county_keys = ['State', 'County']
soil_data = df.groupby(county_keys)[soil_cols].first().reset_index()

# NOTE(review): the recorded summary shows a minimum of 0.00 for Soil_pH, which
# may indicate missing values stored as 0 -- confirm before interpreting means.
soil_summary = soil_data[soil_cols].describe().round(2)
print("SOIL PROPERTY STATISTICS")
print("=" * 80)
print(soil_summary)

Soil columns found: ['Soil_AWC', 'Soil_Clay_Pct', 'Soil_pH', 'Soil_Organic_Matter_Pct']

SOIL PROPERTY STATISTICS
       Soil_AWC  Soil_Clay_Pct  Soil_pH  Soil_Organic_Matter_Pct
count   2635.00        2635.00  2635.00                  2635.00
mean       0.14          24.71     5.83                     1.77
std        0.05          12.77     1.39                     6.13
min        0.00           0.00     0.00                     0.00
25%        0.11          16.28     5.01                     0.50
50%        0.14          24.20     5.69                     0.83
75%        0.17          31.82     6.83                     1.48
max        0.54          85.47     9.10                    89.75


In [5]:
# Distribution plots for each soil property
# One histogram per detected soil column on a two-column grid. The grid is sized
# from the number of columns, so nothing is silently skipped when fewer or more
# than four soil properties are found (four columns still give the 2x2 layout).
if len(soil_cols) > 0:
    n_grid_cols = 2
    n_grid_rows = (len(soil_cols) + n_grid_cols - 1) // n_grid_cols  # ceiling division

    fig = make_subplots(
        rows=n_grid_rows, cols=n_grid_cols,
        subplot_titles=list(soil_cols)
    )

    for idx, col in enumerate(soil_cols):
        # divmod yields 0-based (row, column); plotly subplot positions are 1-based.
        grid_row, grid_col = divmod(idx, n_grid_cols)

        fig.add_trace(
            go.Histogram(x=soil_data[col], name=col, nbinsx=40),
            row=grid_row + 1, col=grid_col + 1
        )

    fig.update_layout(
        height=300 * n_grid_rows,  # 300 px per row -> 600 px for the 2x2 case
        showlegend=False,
        title_text='Soil Property Distributions'
    )
    fig.show()

## 2. Soil-Yield Relationships

Analyze how soil properties correlate with corn yield.

In [6]:
# Calculate correlation with yield
# Correlation of each soil column with yield, computed on the row-level frame.
# soil_cols was derived from df.columns, so no membership check is needed here.
correlations = []
for col in soil_cols:
    corr = df[[col, 'Yield_BU_ACRE']].corr().iloc[0, 1]
    correlations.append({'Soil_Property': col, 'Correlation': corr})

corr_df = pd.DataFrame(correlations).sort_values('Correlation', ascending=False)

print("SOIL PROPERTY CORRELATIONS WITH YIELD")
print("="*80)
print(corr_df.to_string(index=False))

# Bar plot
# The red-yellow-green scale is diverging, so it is centred on zero: negative
# correlations render red, positive ones green, and near-zero values stay
# neutral instead of being stretched to the red end of the observed range.
fig = px.bar(
    corr_df,
    x='Correlation',
    y='Soil_Property',
    orientation='h',
    title='Soil Property Correlations with Yield',
    color='Correlation',
    color_continuous_scale='RdYlGn',
    color_continuous_midpoint=0
)
fig.show()

SOIL PROPERTY CORRELATIONS WITH YIELD
          Soil_Property  Correlation
               Soil_AWC     0.160922
                Soil_pH     0.155930
Soil_Organic_Matter_Pct     0.004136
          Soil_Clay_Pct    -0.002943


In [6]:
# Scatter plots for top correlated properties
# Plots yield against the soil property with the strongest correlation.
if len(soil_cols) > 0:
    # Rank by absolute value so a strong negative relationship is not overlooked;
    # NaN correlations sort last and are only picked if nothing else exists.
    strongest_idx = corr_df['Correlation'].abs().sort_values(ascending=False).index[0]
    top_soil = corr_df.loc[strongest_idx, 'Soil_Property']

    fig = px.scatter(
        # Fixed seed: the same 10,000-row sample (and trendline) on every run.
        df.sample(min(10000, len(df)), random_state=42),
        x=top_soil,
        y='Yield_BU_ACRE',
        trendline='ols',
        title=f'Yield vs {top_soil}',
        opacity=0.3,
        height=500
    )
    fig.show()

## 3. Planting & Harvesting Patterns

Analyze area planted vs. area harvested and the abandonment rate, defined as 1 − (harvested acres / planted acres) and clipped to the 0–1 range.

In [7]:
# Check for area columns
area_cols = [c for c in df.columns if 'Area' in c or 'Planted' in c or 'Harvested' in c]
print(f"Area columns: {area_cols}\n")

if 'Area_Planted_ACRES' in df.columns and 'Area_Harvested_ACRES' in df.columns:
    # Abandonment rate = share of planted acres that was not harvested.
    # Rows with a non-positive planted area have no defined rate: mask them to
    # NaN so the division cannot produce +/-inf that clip() would then turn
    # into a valid-looking 0 or 1.
    planted_acres = df['Area_Planted_ACRES'].where(df['Area_Planted_ACRES'] > 0)
    abandonment = 1 - (df['Area_Harvested_ACRES'] / planted_acres)
    # Ensure 0-1 range (harvested > planted would otherwise give a negative rate)
    df['Abandonment_Rate'] = abandonment.clip(0, 1)

    print("ABANDONMENT RATE STATISTICS")
    print("="*80)
    print(df['Abandonment_Rate'].describe())

    # Time series of abandonment (unweighted mean of the row-level rates per year)
    abandon_trend = df.groupby('Year')['Abandonment_Rate'].mean().reset_index()

    fig = px.line(
        abandon_trend,
        x='Year',
        y='Abandonment_Rate',
        title='Average Abandonment Rate Over Time',
        labels={'Abandonment_Rate': 'Abandonment Rate (fraction)'},
        height=400
    )
    fig.show()

Area columns: ['Area_Planted_ACRES', 'Area_Harvested_ACRES']

ABANDONMENT RATE STATISTICS
count    82436.000000
mean         0.183606
std          0.214977
min          0.000000
25%          0.031250
50%          0.090909
75%          0.258446
max          0.999468
Name: Abandonment_Rate, dtype: float64


In [8]:
# Abandonment vs Yield
# Scatter of yield against abandonment rate on a row sample, followed by the
# correlation computed on the full frame.
if 'Abandonment_Rate' in df.columns:
    fig = px.scatter(
        # Fixed seed: the same 10,000-row sample (and trendline) on every run.
        df.sample(min(10000, len(df)), random_state=42),
        x='Abandonment_Rate',
        y='Yield_BU_ACRE',
        trendline='ols',
        title='Yield vs Abandonment Rate',
        opacity=0.3,
        height=500
    )
    fig.show()

    # The correlation uses every row, not just the plotted sample.
    corr = df[['Abandonment_Rate', 'Yield_BU_ACRE']].corr().iloc[0, 1]
    print(f"\nCorrelation: {corr:.3f}")


Correlation: -0.311


## 4. Production Efficiency Analysis

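Identify high-efficiency counties and states, using average yield (BU/ACRE) as the efficiency measure, as a starting point for identifying best practices.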

In [9]:
# Calculate average yield by county
min_years = 10  # minimum number of yearly records required for ranking
top_n = 20      # number of counties to report

# Named aggregation produces the final column names directly.
# NOTE(review): 'Years' counts non-null Year entries per county; it equals the
# number of distinct years only if each county has one row per year -- confirm.
county_efficiency = (
    df.groupby(['State', 'County'])
    .agg(Avg_Yield=('Yield_BU_ACRE', 'mean'), Years=('Year', 'count'))
    .reset_index()
)

# Filter counties with sufficient data
has_enough_data = county_efficiency['Years'] >= min_years
county_efficiency = county_efficiency[has_enough_data]

# Top performers
top_counties = county_efficiency.nlargest(top_n, 'Avg_Yield')

report_rule = "=" * 80
print("TOP 20 HIGHEST-YIELDING COUNTIES (10+ years data)")
print(report_rule)
print(top_counties.to_string(index=False))

TOP 20 HIGHEST-YIELDING COUNTIES (10+ years data)
     State      County  Avg_Yield  Years
WASHINGTON      BENTON 208.960000     20
    OREGON      MORROW 195.235714     28
WASHINGTON WALLA WALLA 191.823810     21
    OREGON    UMATILLA 188.924138     29
     TEXAS     HARTLEY 188.844118     34
WASHINGTON    FRANKLIN 184.250000     32
     TEXAS       MOORE 183.945238     42
NEW MEXICO       UNION 183.254545     33
     TEXAS      CASTRO 182.702778     36
    KANSAS       MEADE 182.129730     37
  NEBRASKA      PHELPS 181.055814     43
     TEXAS     SHERMAN 180.953488     43
WASHINGTON       GRANT 180.858974     39
     TEXAS     ROBERTS 180.775000     16
CALIFORNIA       GLENN 180.197368     38
    KANSAS        GRAY 179.762500     40
     TEXAS    LIPSCOMB 177.373333     15
     TEXAS  HUTCHINSON 177.197436     39
    KANSAS     HASKELL 177.071795     39
     TEXAS      DALLAM 176.543902     41


In [10]:
# State-level efficiency
# Mean yield per state over all rows, keeping the 15 highest in descending order.
state_efficiency = (
    df.groupby('State')['Yield_BU_ACRE']
    .mean()
    .sort_values(ascending=False)
    .head(15)
)

fig = px.bar(
    x=state_efficiency.values,
    y=state_efficiency.index,
    orientation='h',
    title='Top 15 States by Average Yield',
    labels={'x': 'Average Yield (BU/ACRE)', 'y': 'State'},
    height=500
)
# Horizontal bars are drawn bottom-to-top in data order; reverse the axis so the
# highest-yielding state appears at the top of the chart.
fig.update_layout(yaxis={'autorange': 'reversed'})
fig.show()