In [1]:
# ============================================================================
# AGRISHIELD RISK PREDICTION MODEL TRAINING
# ============================================================================

import pandas as pd
import numpy as np
import pickle
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score
)
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print(" "*25 + "AGRISHIELD RISK PREDICTION MODEL TRAINING")
print(" "*28 + "ML Pipeline - 2026")
print("="*80 + "\n")

                         AGRISHIELD RISK PREDICTION MODEL TRAINING
                            ML Pipeline - 2026



In [7]:
# ============================================================================
# STEP 1: LOAD PROCESSED DATA
# ============================================================================

print("üì• STEP 1: LOADING PROCESSED DATA")
print("-"*80)

try:
    df = pd.read_csv('../data/processed/merged_dataset.csv')
    print(f"‚úÖ Dataset loaded: {df.shape}")
    print(f"   Years: {int(df['Year'].min())} - {int(df['Year'].max())}")
    print(f"   Crops: {df['Crop'].nunique()}")
    print(f"   States: {df['State'].nunique()}")
    print(f"   Districts: {df['District'].nunique()}")
    
    # Display first few rows
    print("\nüìä Sample Data:")
    display(df[['Crop', 'State', 'Year', 'Season', 'Yield', 'Crop_Failure']].head())
    
except FileNotFoundError:
    print("‚ùå ERROR: merged_dataset.csv not found!")
    print("   Please run data_preprocessing.ipynb first")

üì• STEP 1: LOADING PROCESSED DATA
--------------------------------------------------------------------------------
‚úÖ Dataset loaded: (15113, 27)
   Years: 1997 - 2019
   Crops: 55
   States: 24
   Districts: 24

üìä Sample Data:


Unnamed: 0,Crop,State,Year,Season,Yield,Crop_Failure
0,Arecanut,Assam,1997,Whole Year,0.796087,1
1,Arhar/Tur,Assam,1997,Kharif,0.710435,0
2,Castor Seed,Assam,1997,Kharif,0.238333,1
3,Coconut,Assam,1997,Whole Year,5238.051739,1
4,Cotton(Lint),Assam,1997,Kharif,0.420909,1


In [9]:
# ============================================================================
# STEP 2: PREPARE FEATURES FOR ML
# ============================================================================

print("\n‚öôÔ∏è  STEP 2: PREPARING FEATURES")
print("-"*80)

# Encode categorical variables
print("Encoding categorical variables...")

# Crop encoder
crop_encoder = LabelEncoder()
df['Crop_Encoded'] = crop_encoder.fit_transform(df['Crop'])

# State encoder
state_encoder = LabelEncoder()
df['State_Encoded'] = state_encoder.fit_transform(df['State'])

print(f"‚úÖ Encoded {df['Crop'].nunique()} crops")
print(f"‚úÖ Encoded {df['State'].nunique()} states")

# Select features (only those a farmer can easily provide)
feature_cols = [
    'Crop_Encoded',
    'State_Encoded',
    'Season_Encoded',
    'Avg_Temperature',
    'Total_Rainfall',
    'Avg_Humidity',
    'Soil_Quality_Score',
    'Rainfall_Deviation',
    'Temperature_Deviation',
    'Disaster_Occurred',
    'Severity_Score'
]

X = df[feature_cols]
y = df['Crop_Failure']

print(f"\n‚úÖ Features prepared:")
print(f"   Feature count: {len(feature_cols)}")
print(f"   Sample size: {len(X):,}")
print(f"   Features: {feature_cols}")
print(f"\nüìä Target distribution:")
print(f"   Failures: {y.sum():,} ({y.mean()*100:.2f}%)")
print(f"   Success: {(y==0).sum():,} ({(1-y.mean())*100:.2f}%)")

# Display feature statistics
print("\nüìä Feature Statistics:")
X.describe()


‚öôÔ∏è  STEP 2: PREPARING FEATURES
--------------------------------------------------------------------------------
Encoding categorical variables...
‚úÖ Encoded 55 crops
‚úÖ Encoded 24 states

‚úÖ Features prepared:
   Feature count: 11
   Sample size: 15,113
   Features: ['Crop_Encoded', 'State_Encoded', 'Season_Encoded', 'Avg_Temperature', 'Total_Rainfall', 'Avg_Humidity', 'Soil_Quality_Score', 'Rainfall_Deviation', 'Temperature_Deviation', 'Disaster_Occurred', 'Severity_Score']

üìä Target distribution:
   Failures: 3,310 (21.90%)
   Success: 11,803 (78.10%)

üìä Feature Statistics:


Unnamed: 0,Crop_Encoded,State_Encoded,Season_Encoded,Avg_Temperature,Total_Rainfall,Avg_Humidity,Soil_Quality_Score,Rainfall_Deviation,Temperature_Deviation,Disaster_Occurred,Severity_Score
count,15113.0,15113.0,15113.0,15113.0,15113.0,15113.0,15113.0,15113.0,15113.0,15113.0,15113.0
mean,29.229736,11.347317,2.039105,25.74115,945.333762,71.379078,0.724067,-4.513236e-16,0.0,0.156951,0.364918
std,16.062429,7.239475,1.147819,4.162029,867.819498,6.237021,0.066468,5.243676e-15,0.0,0.363767,0.87999
min,0.0,0.0,1.0,20.0,14.738462,60.0,0.61,-1.718258e-14,0.0,0.0,0.0
25%,16.0,6.0,1.0,20.0,161.957143,65.0,0.66,0.0,0.0,0.0,0.0
50%,29.0,11.0,2.0,27.0,882.670423,70.0,0.72,0.0,0.0,0.0,0.0
75%,44.0,17.0,3.0,28.0,1211.42,78.0,0.79,0.0,0.0,0.0,0.0
max,54.0,23.0,4.0,35.0,3682.842857,78.0,0.83,1.583372e-14,0.0,1.0,3.0


In [11]:
# ============================================================================
# STEP 3: SAVE ENCODERS AND LISTS
# ============================================================================

print("\nüíæ STEP 3: SAVING ENCODERS AND LISTS")
print("-"*80)

# Create models directory
os.makedirs('../models', exist_ok=True)

# Save encoders
with open('../models/crop_encoder.pkl', 'wb') as f:
    pickle.dump(crop_encoder, f)
print("‚úÖ Saved: crop_encoder.pkl")

with open('../models/state_encoder.pkl', 'wb') as f:
    pickle.dump(state_encoder, f)
print("‚úÖ Saved: state_encoder.pkl")

# Save lists for frontend dropdowns
crop_list = sorted(df['Crop'].unique().tolist())
state_list = sorted(df['State'].unique().tolist())
district_list = sorted(df['District'].unique().tolist())

with open('../models/crop_list.pkl', 'wb') as f:
    pickle.dump(crop_list, f)
print(f"‚úÖ Saved: crop_list.pkl ({len(crop_list)} crops)")

with open('../models/state_list.pkl', 'wb') as f:
    pickle.dump(state_list, f)
print(f"‚úÖ Saved: state_list.pkl ({len(state_list)} states)")

with open('../models/district_list.pkl', 'wb') as f:
    pickle.dump(district_list, f)
print(f"‚úÖ Saved: district_list.pkl ({len(district_list)} districts)")

print(f"\nüìã Available Crops (first 20):")
print(crop_list[:20])

print(f"\nüìã Available States:")
print(state_list)


üíæ STEP 3: SAVING ENCODERS AND LISTS
--------------------------------------------------------------------------------
‚úÖ Saved: crop_encoder.pkl
‚úÖ Saved: state_encoder.pkl
‚úÖ Saved: crop_list.pkl (55 crops)
‚úÖ Saved: state_list.pkl (24 states)
‚úÖ Saved: district_list.pkl (24 districts)

üìã Available Crops (first 20):
['Arecanut', 'Arhar/Tur', 'Bajra', 'Banana', 'Barley', 'Black Pepper', 'Cardamom', 'Cashewnut', 'Castor Seed', 'Coconut', 'Coriander', 'Cotton(Lint)', 'Cowpea(Lobia)', 'Dry Chillies', 'Garlic', 'Ginger', 'Gram', 'Groundnut', 'Guar Seed', 'Horse-Gram']

üìã Available States:
['Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Jammu And Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Punjab', 'Sikkim', 'Tamil Nadu', 'Tripura', 'Uttar Pradesh', 'West Bengal']


In [13]:
# ============================================================================
# STEP 4: TRAIN-TEST SPLIT
# ============================================================================

print("\nüìä STEP 4: SPLITTING DATA")
print("-"*80)

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"‚úÖ Split complete:")
print(f"   Training samples: {len(X_train):,}")
print(f"   Testing samples: {len(X_test):,}")
print(f"\n   Training failures: {y_train.sum():,} ({y_train.mean()*100:.2f}%)")
print(f"   Testing failures: {y_test.sum():,} ({y_test.mean()*100:.2f}%)")


üìä STEP 4: SPLITTING DATA
--------------------------------------------------------------------------------
‚úÖ Split complete:
   Training samples: 12,090
   Testing samples: 3,023

   Training failures: 2,648 (21.90%)
   Testing failures: 662 (21.90%)


In [15]:
# ============================================================================
# STEP 5: FEATURE SCALING
# ============================================================================

print("\nüìè STEP 5: SCALING FEATURES")
print("-"*80)

# Initialize scaler
scaler = StandardScaler()

# Fit on training data and transform both
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save scaler
with open('../models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("‚úÖ Features scaled using StandardScaler")
print("‚úÖ Scaler saved: scaler.pkl")

print("\nüìä Scaled feature ranges (first 5 features):")
for i, col in enumerate(feature_cols[:5]):
    print(f"   {col:30s}: [{X_train_scaled[:, i].min():.2f}, {X_train_scaled[:, i].max():.2f}]")


üìè STEP 5: SCALING FEATURES
--------------------------------------------------------------------------------
‚úÖ Features scaled using StandardScaler
‚úÖ Scaler saved: scaler.pkl

üìä Scaled feature ranges (first 5 features):
   Crop_Encoded                  : [-1.81, 1.54]
   State_Encoded                 : [-1.57, 1.61]
   Season_Encoded                : [-0.91, 1.71]
   Avg_Temperature               : [-1.38, 2.22]
   Total_Rainfall                : [-1.07, 3.15]


In [17]:
# ============================================================================
# STEP 6: TRAIN MULTIPLE MODELS
# ============================================================================

print("\nü§ñ STEP 6: TRAINING ML MODELS")
print("="*80)

# Define models
models = {
    'Logistic Regression': LogisticRegression(
        random_state=42,
        max_iter=1000,
        class_weight='balanced'
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        max_depth=15,
        min_samples_split=5,
        class_weight='balanced',
        n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100,
        random_state=42,
        max_depth=7,
        learning_rate=0.1
    )
}

results = {}

for name, model in models.items():
    print(f"\n{'='*80}")
    print(f"Training: {name}")
    print(f"{'='*80}")
    
    # Train
    print("Training model...")
    model.fit(X_train_scaled, y_train)
    print("‚úÖ Training complete!")
    
    # Predict
    y_pred_train = model.predict(X_train_scaled)
    y_pred_test = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    
    # Calculate metrics
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)
    precision = precision_score(y_test, y_pred_test, zero_division=0)
    recall = recall_score(y_test, y_pred_test, zero_division=0)
    f1 = f1_score(y_test, y_pred_test, zero_division=0)
    auc = roc_auc_score(y_test, y_pred_proba)
    
    # Store results
    results[name] = {
        'model': model,
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc': auc
    }
    
    # Print metrics
    print(f"\nüìä PERFORMANCE METRICS:")
    print(f"   Train Accuracy: {train_acc:.4f}")
    print(f"   Test Accuracy:  {test_acc:.4f}")
    print(f"   Precision:      {precision:.4f}")
    print(f"   Recall:         {recall:.4f}")
    print(f"   F1-Score:       {f1:.4f}")
    print(f"   AUC-ROC:        {auc:.4f}")
    
    # Confusion matrix
    print(f"\nüìâ CONFUSION MATRIX:")
    cm = confusion_matrix(y_test, y_pred_test)
    print(f"                Predicted")
    print(f"                No  Yes")
    print(f"   Actual No  [{cm[0][0]:5d} {cm[0][1]:5d}]")
    print(f"   Actual Yes [{cm[1][0]:5d} {cm[1][1]:5d}]")
    print(f"\n   True Negatives:  {cm[0][0]:,}")
    print(f"   False Positives: {cm[0][1]:,}")
    print(f"   False Negatives: {cm[1][0]:,}")
    print(f"   True Positives:  {cm[1][1]:,}")

print("\n" + "="*80)
print("‚úÖ All models trained successfully!")
print("="*80)


ü§ñ STEP 6: TRAINING ML MODELS

Training: Logistic Regression
Training model...
‚úÖ Training complete!

üìä PERFORMANCE METRICS:
   Train Accuracy: 0.5660
   Test Accuracy:  0.5607
   Precision:      0.2729
   Recall:         0.6042
   F1-Score:       0.3759
   AUC-ROC:        0.6116

üìâ CONFUSION MATRIX:
                Predicted
                No  Yes
   Actual No  [ 1295  1066]
   Actual Yes [  262   400]

   True Negatives:  1,295
   False Positives: 1,066
   False Negatives: 262
   True Positives:  400

Training: Random Forest
Training model...
‚úÖ Training complete!

üìä PERFORMANCE METRICS:
   Train Accuracy: 0.8577
   Test Accuracy:  0.8207
   Precision:      0.5638
   Recall:         0.8006
   F1-Score:       0.6617
   AUC-ROC:        0.8882

üìâ CONFUSION MATRIX:
                Predicted
                No  Yes
   Actual No  [ 1951   410]
   Actual Yes [  132   530]

   True Negatives:  1,951
   False Positives: 410
   False Negatives: 132
   True Positives:  530

Tr

In [19]:
# ============================================================================
# STEP 7: COMPARE MODELS
# ============================================================================

print("\nüìä STEP 7: MODEL COMPARISON")
print("="*80)

# Create comparison dataframe
comparison = pd.DataFrame({
    'Model': list(results.keys()),
    'Train Acc': [results[m]['train_accuracy'] for m in results],
    'Test Acc': [results[m]['test_accuracy'] for m in results],
    'Precision': [results[m]['precision'] for m in results],
    'Recall': [results[m]['recall'] for m in results],
    'F1-Score': [results[m]['f1_score'] for m in results],
    'AUC-ROC': [results[m]['auc'] for m in results]
})

print("\nüèÜ MODEL COMPARISON TABLE:")
print(comparison.to_string(index=False))

# Highlight best model for each metric
print("\nü•á BEST MODELS BY METRIC:")
for metric in ['Test Acc', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC']:
    best_model = comparison.loc[comparison[metric].idxmax(), 'Model']
    best_score = comparison[metric].max()
    print(f"   {metric:12s}: {best_model:25s} ({best_score:.4f})")

comparison


üìä STEP 7: MODEL COMPARISON

üèÜ MODEL COMPARISON TABLE:
              Model  Train Acc  Test Acc  Precision   Recall  F1-Score  AUC-ROC
Logistic Regression   0.566005  0.560701   0.272851 0.604230  0.375940 0.611646
      Random Forest   0.857651  0.820708   0.563830 0.800604  0.661673 0.888160
  Gradient Boosting   0.888420  0.866358   0.771008 0.554381  0.644991 0.900237

ü•á BEST MODELS BY METRIC:
   Test Acc    : Gradient Boosting         (0.8664)
   Precision   : Gradient Boosting         (0.7710)
   Recall      : Random Forest             (0.8006)
   F1-Score    : Random Forest             (0.6617)
   AUC-ROC     : Gradient Boosting         (0.9002)


Unnamed: 0,Model,Train Acc,Test Acc,Precision,Recall,F1-Score,AUC-ROC
0,Logistic Regression,0.566005,0.560701,0.272851,0.60423,0.37594,0.611646
1,Random Forest,0.857651,0.820708,0.56383,0.800604,0.661673,0.88816
2,Gradient Boosting,0.88842,0.866358,0.771008,0.554381,0.644991,0.900237


In [21]:
# ============================================================================
# STEP 8: SELECT BEST MODEL
# ============================================================================

print("\nüèÜ STEP 8: SELECTING BEST MODEL")
print("="*80)

# Select based on F1-score (balanced metric for imbalanced data)
best_model_name = max(results, key=lambda x: results[x]['f1_score'])
best_model = results[best_model_name]['model']
best_metrics = results[best_model_name]

print(f"\n‚úÖ BEST MODEL: {best_model_name}")
print(f"\nüìä FINAL PERFORMANCE:")
print(f"   Test Accuracy: {best_metrics['test_accuracy']:.4f}")
print(f"   Precision:     {best_metrics['precision']:.4f}")
print(f"   Recall:        {best_metrics['recall']:.4f}")
print(f"   F1-Score:      {best_metrics['f1_score']:.4f}")
print(f"   AUC-ROC:       {best_metrics['auc']:.4f}")

# Save best model
model_path = '../models/crop_failure_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)

print(f"\nüíæ Model saved: {model_path}")
print("\n" + "="*80)


üèÜ STEP 8: SELECTING BEST MODEL

‚úÖ BEST MODEL: Random Forest

üìä FINAL PERFORMANCE:
   Test Accuracy: 0.8207
   Precision:     0.5638
   Recall:        0.8006
   F1-Score:      0.6617
   AUC-ROC:       0.8882

üíæ Model saved: ../models/crop_failure_model.pkl



In [23]:
# ============================================================================
# STEP 9: FEATURE IMPORTANCE
# ============================================================================

if hasattr(best_model, 'feature_importances_'):
    print("\nüìä STEP 9: FEATURE IMPORTANCE")
    print("="*80)
    
    importances = best_model.feature_importances_
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': importances
    }).sort_values('Importance', ascending=False)
    
    print("\nüîç TOP 10 MOST IMPORTANT FEATURES:")
    for i, row in feature_importance.head(10).iterrows():
        print(f"   {row['Feature']:30s}: {row['Importance']:.4f} {'‚ñà' * int(row['Importance']*100)}")
    
    feature_importance
else:
    print("\n‚ö†Ô∏è  Selected model doesn't have feature importance")


üìä STEP 9: FEATURE IMPORTANCE

üîç TOP 10 MOST IMPORTANT FEATURES:
   Crop_Encoded                  : 0.6137 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
   State_Encoded                 : 0.1278 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
   Total_Rainfall                : 0.0978 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
   Soil_Quality_Score            : 0.0686 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
   Rainfall_Deviation            : 0.0216 ‚ñà‚ñà
   Avg_Humidity                  : 0.0171 ‚ñà
   Severity_Score                : 0.0167 ‚ñà
   Avg_Temperature               : 0.0146 ‚ñà
   Season_Encoded                : 0.0125 ‚ñà
   Disaster_Occurred             : 0.0096 


In [27]:
# ============================================================================
# STEP 10: FINAL SUMMARY
# ============================================================================

print("\n" + "="*80)
print(" "*30 + "‚úÖ TRAINING COMPLETE!")
print("="*80)

print(f"\nüìÅ FILES CREATED IN ../models/:")
print(f"   ‚îú‚îÄ‚îÄ crop_failure_model.pkl      (Best ML model)")
print(f"   ‚îú‚îÄ‚îÄ scaler.pkl                  (Feature scaler)")
print(f"   ‚îú‚îÄ‚îÄ crop_encoder.pkl            (Crop label encoder)")
print(f"   ‚îú‚îÄ‚îÄ state_encoder.pkl           (State label encoder)")
print(f"   ‚îú‚îÄ‚îÄ crop_list.pkl               (List of crops)")
print(f"   ‚îú‚îÄ‚îÄ state_list.pkl              (List of states)")
print(f"   ‚îú‚îÄ‚îÄ district_list.pkl           (List of districts)")
print(f"   ‚îî‚îÄ‚îÄ district_info.pkl           (District soil info)")

print(f"\nüéØ BEST MODEL: {best_model_name}")
print(f"   F1-Score: {best_metrics['f1_score']:.4f}")
print(f"   Accuracy: {best_metrics['test_accuracy']:.4f}")

print(f"\nüìä DATASET INFO:")
print(f"   Total samples: {len(df):,}")
print(f"   Training samples: {len(X_train):,}")
print(f"   Testing samples: {len(X_test):,}")
print(f"   Number of crops: {df['Crop'].nunique()}")
print(f"   Number of states: {df['State'].nunique()}")

print("\n" + "="*80 + "\n")

print("‚úÖ Model is ready for deployment!")


                              ‚úÖ TRAINING COMPLETE!

üìÅ FILES CREATED IN ../models/:
   ‚îú‚îÄ‚îÄ crop_failure_model.pkl      (Best ML model)
   ‚îú‚îÄ‚îÄ scaler.pkl                  (Feature scaler)
   ‚îú‚îÄ‚îÄ crop_encoder.pkl            (Crop label encoder)
   ‚îú‚îÄ‚îÄ state_encoder.pkl           (State label encoder)
   ‚îú‚îÄ‚îÄ crop_list.pkl               (List of crops)
   ‚îú‚îÄ‚îÄ state_list.pkl              (List of states)
   ‚îú‚îÄ‚îÄ district_list.pkl           (List of districts)
   ‚îî‚îÄ‚îÄ district_info.pkl           (District soil info)

üéØ BEST MODEL: Random Forest
   F1-Score: 0.6617
   Accuracy: 0.8207

üìä DATASET INFO:
   Total samples: 15,113
   Training samples: 12,090
   Testing samples: 3,023
   Number of crops: 55
   Number of states: 24


‚úÖ Model is ready for deployment!
