In [1]:
# 1. Import libraries and setup 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE, VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier

# Additional utilities
import joblib
import pickle
from datetime import datetime
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import shap

# Notebook-wide display defaults: show every column, cap printed rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Plot styling. Order matters: the style sheet is applied first so that the
# palette and the explicit rcParams below override it rather than the reverse.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 12,
})

# Timestamp the run so the saved output records when training began.
print(f"Model Training started at: {datetime.now()}")





Model Training started at: 2025-12-11 11:34:46.493630


In [2]:
# 2. Load Processed Data

# The cleaned dataset is a CSV file. The previous version first called
# pd.read_parquet() on this same .csv path and relied on a bare `except:` to
# fall through to read_csv(); the CSV is now read directly so that genuine
# problems (missing file, parse errors) surface instead of being masked.
df = pd.read_csv('cleaned_agricultural_data.csv')


# Load EDA results for reference.
# This stays best-effort: on any failure an empty dict is used, but the reason
# is now reported instead of being silently swallowed.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load a file that was produced by this project.
try:
    with open('eda_analysis_results.pkl', 'rb') as f:
        eda_results = pickle.load(f)
except Exception as exc:  # not a bare except: let KeyboardInterrupt/SystemExit propagate
    print(f"Warning: could not load 'eda_analysis_results.pkl' "
          f"({type(exc).__name__}: {exc}); continuing with empty EDA results.")
    eda_results = {}

print(f"Dataset Shape: {df.shape}")
# Shallow memory usage (object/string columns are counted by pointer size only).
print(f"Memory Usage: {df.memory_usage().sum() / 1024**2:.2f} MB")

# Display first few rows
print("\nDataset Preview:")
print(df.head())



Dataset Shape: (2484, 118)
Memory Usage: 2.24 MB

Dataset Preview:
   Dist Code  Year  State Code      State Name   Dist Name  \
0         52  2010           1  Andhra Pradesh  Ananthapur   
1         52  2011           1  Andhra Pradesh  Ananthapur   
2         52  2012           1  Andhra Pradesh  Ananthapur   
3         52  2013           1  Andhra Pradesh  Ananthapur   
4         52  2014           1  Andhra Pradesh  Ananthapur   

   RICE AREA (1000 ha)  RICE PRODUCTION (1000 tons)  RICE YIELD (Kg per ha)  \
0                59.80                       171.40                 2866.22   
1                48.67                       120.07                 2467.02   
2                29.05                        76.45                 2631.67   
3                40.40                        87.94                 2176.73   
4                29.21                        82.53                 2825.40   

   WHEAT AREA (1000 ha)  WHEAT PRODUCTION (1000 tons)  \
0                  0.13     

In [5]:
# 3. Problem Formulation and Target Definition

def define_problems(df, eda_results=None):
    """
    Define multiple prediction problems for the agricultural dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to model. Must contain the column 'OVERALL_YIELD_Kg_per_ha'
        (its median is used as the high-yield threshold).
    eda_results : dict, optional
        Results of the earlier EDA step. When omitted, the function falls back
        to a notebook-level variable named ``eda_results`` if one exists, and
        to an empty dict otherwise, so existing ``define_problems(df)`` calls
        keep working.

    Returns
    -------
    dict
        Maps a problem name to a config dict with the keys 'type', 'target',
        'description' and 'metrics'. The 'high_yield_classification' entry
        additionally carries 'threshold' (the median overall yield).

    Raises
    ------
    KeyError
        If ``df`` has no 'OVERALL_YIELD_Kg_per_ha' column.

    Notes
    -----
    This function only *describes* the problems. It does not add the
    'HIGH_YIELD_FLAG' or 'CLUSTER' target columns to ``df``.
    """
    if eda_results is None:
        # Backward compatibility: the original implementation read the
        # notebook global directly. A missing global now means "no EDA
        # results" instead of raising NameError.
        eda_results = globals().get('eda_results') or {}

    problems = {}

    # Problem 1: Overall Yield Prediction (Regression)
    problems['yield_prediction'] = {
        'type': 'regression',
        'target': 'OVERALL_YIELD_Kg_per_ha',
        'description': 'Predict overall agricultural yield based on various features',
        'metrics': ['RMSE', 'MAE', 'R2']
    }

    # Problem 2: Crop-Specific Yield Prediction (Rice)
    problems['rice_yield_prediction'] = {
        'type': 'regression',
        'target': 'RICE YIELD (Kg per ha)',
        'description': 'Predict rice yield specifically',
        'metrics': ['RMSE', 'MAE', 'R2']
    }

    # Problem 3: High-Yield District Classification (Binary)
    # High yield is defined as above the median overall yield. The threshold is
    # stored in the config so whoever builds the flag column uses the same cut.
    median_yield = df['OVERALL_YIELD_Kg_per_ha'].median()
    problems['high_yield_classification'] = {
        'type': 'classification',
        'target': 'HIGH_YIELD_FLAG',
        'description': 'Classify districts as high-yield vs low-yield',
        'metrics': ['Accuracy', 'Precision', 'Recall', 'F1', 'AUC-ROC'],
        'threshold': float(median_yield)
    }

    # Problem 4: Productivity Cluster Prediction (Multi-class)
    # Only offered when the EDA step produced cluster assignments. For a
    # DataFrame, `in` tests column labels.
    if 'Cluster' in eda_results.get('district_clusters', pd.DataFrame()):
        problems['cluster_prediction'] = {
            'type': 'multiclass',
            'target': 'CLUSTER',
            'description': 'Predict district productivity cluster',
            'metrics': ['Accuracy', 'F1_macro', 'F1_micro']
        }

    # Problem 5: Year-over-Year Yield Change (Regression)
    # Only offered when the growth feature exists in the dataset.
    if 'OVERALL_YIELD_Kg_per_ha_YoY_Growth' in df.columns:
        problems['yoy_change_prediction'] = {
            'type': 'regression',
            'target': 'OVERALL_YIELD_Kg_per_ha_YoY_Growth',
            'description': 'Predict year-over-year yield change',
            'metrics': ['RMSE', 'MAE', 'R2']
        }

    return problems

# Build the problem catalogue for the loaded dataset and print a short summary.
problems = define_problems(df)

print("Defined Prediction Problems:")
print("=" * 80)
for prob_name, prob_config in problems.items():
    # One print per problem; sep="\n" yields the same four lines as separate prints.
    print(
        f"\n{prob_name.upper()}:",
        f"  Type: {prob_config['type']}",
        f"  Target: {prob_config['target']}",
        f"  Description: {prob_config['description']}",
        sep="\n",
    )



        

    





Defined Prediction Problems:

YIELD_PREDICTION:
  Type: regression
  Target: OVERALL_YIELD_Kg_per_ha
  Description: Predict overall agricultural yield based on various features

RICE_YIELD_PREDICTION:
  Type: regression
  Target: RICE YIELD (Kg per ha)
  Description: Predict rice yield specifically

HIGH_YIELD_CLASSIFICATION:
  Type: classification
  Target: HIGH_YIELD_FLAG
  Description: Classify districts as high-yield vs low-yield

CLUSTER_PREDICTION:
  Type: multiclass
  Target: CLUSTER
  Description: Predict district productivity cluster

YOY_CHANGE_PREDICTION:
  Type: regression
  Target: OVERALL_YIELD_Kg_per_ha_YoY_Growth
  Description: Predict year-over-year yield change
