# 🌾 Shamba Score: Data Exploration

## Overview
This notebook explores the synthetic farmer dataset for the Shamba Score credit scoring system.
- **Dataset**: 500 Kenyan farmer profiles
- **Features**: 15 ML input features + target variable
- **Purpose**: Understand data distribution and relationships for model training

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Plot appearance: reset matplotlib to its stock style first, then make
# seaborn's "husl" palette the default colour cycle. The order matters,
# because applying a style afterwards would reset the colour cycle again.
plt.style.use('default')
sns.set_palette("husl")

# Notebook banner, one line per entry.
for banner_line in (
    "📊 Shamba Score Data Exploration",
    "🌍 Climate-Adaptive Credit Scoring for Kenyan Farmers",
):
    print(banner_line)

## 1. Load and Inspect Data

In [None]:
# Read the farmer dataset into `df`, the DataFrame used by every later cell.
df = pd.read_csv('../data/farmers_training_data.csv')

print("Dataset Shape:", df.shape)

# Each inspection is a (heading, view) pair. The views are wrapped in
# lambdas so they are evaluated one at a time, right before being displayed.
inspections = [
    ("\nFirst 5 rows:", lambda: df.head()),
    ("\nData Types:", lambda: df.dtypes),
    ("\nMissing Values:", lambda: df.isnull().sum()),
    ("\nCredit Score Statistics:", lambda: df['credit_score'].describe()),
]
for heading, make_view in inspections:
    print(heading)
    display(make_view())

## 2. Feature Analysis

In [None]:
# Names of the model input columns. The list order is the order in which
# they are printed below and later fed into the correlation analysis.
ml_features = [
    'mean_ndvi',
    'ndvi_trend',
    'growing_season_match',
    'transaction_velocity',
    'savings_rate',
    'loan_repayment_history',
    'cooperative_endorsement',
    'chama_participation',
    'neighbor_vouches',
    'fertilizer_purchase_timing',
    'seed_quality_tier',
    'advisory_usage',
    'drought_exposure_index',
    'rainfall_deviation',
    'temperature_anomaly',
]

# Numbered listing of the features, starting at 1.
print("📈 15 ML Features:")
numbered_lines = (
    f"{position:2d}. {name}"
    for position, name in enumerate(ml_features, start=1)
)
print("\n".join(numbered_lines))

# Target column and how many rows fall into each farmer_type value.
print(f"\n🎯 Target Variable: credit_score")
print(f"📊 Farmer Types Distribution:")
farmer_type_counts = df['farmer_type'].value_counts()
print(farmer_type_counts)

## 3. Data Visualizations

In [None]:
# Build a 2x2 dashboard of credit-score views and save it to
# ../docs/data_exploration.png.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Credit Score Distribution
ax_hist = axes[0, 0]
ax_hist.hist(df['credit_score'], bins=25, color='green', alpha=0.7, edgecolor='black')
ax_hist.set_title('Credit Score Distribution')
ax_hist.set_xlabel('Score')
ax_hist.set_ylabel('Frequency')
ax_hist.grid(True, alpha=0.3)

# 2. Score by Farmer Type
ax_type = axes[0, 1]
df.boxplot(column='credit_score', by='farmer_type', ax=ax_type)
ax_type.set_title('Credit Score by Farmer Type')
ax_type.set_xlabel('Farmer Type')
ax_type.set_ylabel('Credit Score')

# 3. Score by County
ax_county = axes[1, 0]
df.boxplot(column='credit_score', by='county', ax=ax_county)
ax_county.set_title('Credit Score by County')
ax_county.set_xlabel('County')
ax_county.set_ylabel('Credit Score')
ax_county.tick_params(axis='x', rotation=45)

# 4. NDVI vs Credit Score (points coloured by their credit score)
ax_ndvi = axes[1, 1]
scatter = ax_ndvi.scatter(df['mean_ndvi'], df['credit_score'],
                          alpha=0.6, c=df['credit_score'], cmap='RdYlGn', s=30)
ax_ndvi.set_title('NDVI vs Credit Score')
ax_ndvi.set_xlabel('Mean NDVI (Vegetation Health)')
ax_ndvi.set_ylabel('Credit Score')
ax_ndvi.grid(True, alpha=0.3)
fig.colorbar(scatter, ax=ax_ndvi, label='Credit Score')

# pandas' grouped boxplot (`by=`) replaces the figure-level suptitle with
# "Boxplot grouped by <column>". Setting the dashboard title only after both
# boxplots are drawn keeps it from being overwritten.
fig.suptitle('🌾 Shamba Score: Data Exploration Dashboard', fontsize=16, fontweight='bold')

plt.tight_layout()

# savefig does not create missing parent directories, so make sure the
# output folder exists before writing the image.
os.makedirs('../docs', exist_ok=True)
fig.savefig('../docs/data_exploration.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✅ Exploration complete! Check docs/data_exploration.png")

## 4. Feature Correlations

In [None]:
# Pairwise correlations between the ML features and the target, shown as an
# annotated heatmap and saved to ../docs/correlation_matrix.png.
correlation_features = ml_features + ['credit_score']
corr_matrix = df[correlation_features].corr()

# Plot correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r', center=0,
            square=True, fmt='.2f', cbar_kws={'label': 'Correlation'})
plt.title('🌾 Feature Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()

# savefig does not create missing parent directories, so make sure the
# output folder exists before writing the image.
os.makedirs('../docs', exist_ok=True)
plt.savefig('../docs/correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

# Top correlations with credit score, ranked by absolute strength (the sign
# of each relationship is visible in the heatmap above).
# The target's correlation with itself is always 1.0, so it is dropped
# BEFORE taking the top 10. Filtering it out afterwards would leave only
# nine features in the listing.
credit_corr = (
    corr_matrix['credit_score']
    .drop('credit_score')
    .abs()
    .sort_values(ascending=False)
)
print("\n🎯 Top Features Correlated with Credit Score:")
for feature, corr in credit_corr.head(10).items():
    print(f"{feature:25s}: {corr:.3f}")

## 5. Summary Statistics

In [None]:
# Credit-score statistics per farmer type. `summary` keeps the full-precision
# values; only the displayed copy is rounded to two decimals.
print("📊 Credit Score Summary by Farmer Type:")
type_stats = ['count', 'mean', 'std', 'min', 'max']
summary = df.groupby('farmer_type')['credit_score'].agg(type_stats)
display(summary.round(2))

# Credit-score statistics per county, rounded to two decimals.
print("\n🌍 Credit Score Summary by County:")
county_stats = ['count', 'mean', 'std']
county_summary = df.groupby('county')['credit_score'].agg(county_stats).round(2)
display(county_summary)

# Headline figures. Each boolean mask marks the rows of one farmer type, so
# its sum is the row count and its mean is the share of all rows.
scores = df['credit_score']
is_excellent = df['farmer_type'] == 'excellent'
is_struggling = df['farmer_type'] == 'struggling'

print("\n📈 Key Insights:")
print(f"• Total Farmers: {len(df)}")
print(f"• Average Credit Score: {scores.mean():.1f}")
print(f"• Score Range: {scores.min():.1f} - {scores.max():.1f}")
print(f"• Excellent Farmers: {is_excellent.sum()} ({is_excellent.mean()*100:.1f}%)")
print(f"• Struggling Farmers: {is_struggling.sum()} ({is_struggling.mean()*100:.1f}%)")

print("\n🚀 Dataset ready for ML model training!")