In [24]:
# 1. Import Libraries and Setup

# %%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE, VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier


from sklearn.preprocessing import FunctionTransformer
import traceback
# extra  Scikit-learn models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.svm import SVR, SVC

# XGBoost
from xgboost import XGBRegressor, XGBClassifier

# LightGBM
from lightgbm import LGBMRegressor, LGBMClassifier

# Additional utilities
import joblib
import pickle
from datetime import datetime
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import shap

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (14, 8)
plt.rcParams['font.size'] = 12

print(f"Model Training started at: {datetime.now()}")


Model Training started at: 2025-12-12 12:21:24.678491


In [25]:
# 2. Load Processed Data

# %%
try:
    # Load data from previous step
    df = pd.read_parquet('cleaned_agricultural_data.parquet')
except:
    df = pd.read_csv('cleaned_agricultural_data.csv')

# Load EDA results for reference
try:
    with open('eda_analysis_results.pkl', 'rb') as f:
        eda_results = pickle.load(f)
except:
    eda_results = {}

print(f"Dataset Shape: {df.shape}")
print(f"Memory Usage: {df.memory_usage().sum() / 1024**2:.2f} MB")

# Display first few rows
print("\nDataset Preview:")
print(df.head())


Dataset Shape: (2484, 118)
Memory Usage: 2.24 MB

Dataset Preview:
   Dist Code  Year  State Code      State Name   Dist Name  \
0         52  2010           1  Andhra Pradesh  Ananthapur   
1         52  2011           1  Andhra Pradesh  Ananthapur   
2         52  2012           1  Andhra Pradesh  Ananthapur   
3         52  2013           1  Andhra Pradesh  Ananthapur   
4         52  2014           1  Andhra Pradesh  Ananthapur   

   RICE AREA (1000 ha)  RICE PRODUCTION (1000 tons)  RICE YIELD (Kg per ha)  \
0                59.80                       171.40                 2866.22   
1                48.67                       120.07                 2467.02   
2                29.05                        76.45                 2631.67   
3                40.40                        87.94                 2176.73   
4                29.21                        82.53                 2825.40   

   WHEAT AREA (1000 ha)  WHEAT PRODUCTION (1000 tons)  \
0                  0.13     

In [26]:
# 3. Problem Formulation and Target Definition

def define_problems(df):
    """
    Define multiple prediction problems for the agricultural dataset
    Returns dictionary with problem definitions and target columns
    """
    
    problems = {}
    
    # Problem 1: Overall Yield Prediction (Regression)
    problems['yield_prediction'] = {
        'type': 'regression',
        'target': 'OVERALL_YIELD_Kg_per_ha',
        'description': 'Predict overall agricultural yield based on various features',
        'metrics': ['RMSE', 'MAE', 'R2']
    }
    
    # Problem 2: Crop-Specific Yield Prediction (Rice)
    problems['rice_yield_prediction'] = {
        'type': 'regression',
        'target': 'RICE YIELD (Kg per ha)',
        'description': 'Predict rice yield specifically',
        'metrics': ['RMSE', 'MAE', 'R2']
    }
    
    # Problem 3: High-Yield District Classification (Binary)
    # Define high yield as above median overall yield
    median_yield = df['OVERALL_YIELD_Kg_per_ha'].median()
    problems['high_yield_classification'] = {
        'type': 'classification',
        'target': 'HIGH_YIELD_FLAG',
        'description': 'Classify districts as high-yield vs low-yield',
        'metrics': ['Accuracy', 'Precision', 'Recall', 'F1', 'AUC-ROC']
    }
    
    # Problem 4: Productivity Cluster Prediction (Multi-class)
    if 'Cluster' in eda_results.get('district_clusters', pd.DataFrame()):
        problems['cluster_prediction'] = {
            'type': 'multiclass',
            'target': 'CLUSTER',
            'description': 'Predict district productivity cluster',
            'metrics': ['Accuracy', 'F1_macro', 'F1_micro']
        }
    
    # Problem 5: Year-over-Year Yield Change (Regression)
    if 'OVERALL_YIELD_Kg_per_ha_YoY_Growth' in df.columns:
        problems['yoy_change_prediction'] = {
            'type': 'regression',
            'target': 'OVERALL_YIELD_Kg_per_ha_YoY_Growth',
            'description': 'Predict year-over-year yield change',
            'metrics': ['RMSE', 'MAE', 'R2']
        }
    
    return problems

problems = define_problems(df)
print("Defined Prediction Problems:")
print("=" * 80)
for prob_name, prob_config in problems.items():
    print(f"\n{prob_name.upper()}:")
    print(f"  Type: {prob_config['type']}")
    print(f"  Target: {prob_config['target']}")
    print(f"  Description: {prob_config['description']}")



        

    





Defined Prediction Problems:

YIELD_PREDICTION:
  Type: regression
  Target: OVERALL_YIELD_Kg_per_ha
  Description: Predict overall agricultural yield based on various features

RICE_YIELD_PREDICTION:
  Type: regression
  Target: RICE YIELD (Kg per ha)
  Description: Predict rice yield specifically

HIGH_YIELD_CLASSIFICATION:
  Type: classification
  Target: HIGH_YIELD_FLAG
  Description: Classify districts as high-yield vs low-yield

CLUSTER_PREDICTION:
  Type: multiclass
  Target: CLUSTER
  Description: Predict district productivity cluster

YOY_CHANGE_PREDICTION:
  Type: regression
  Target: OVERALL_YIELD_Kg_per_ha_YoY_Growth
  Description: Predict year-over-year yield change


In [27]:
# 4. Data Preparation for ML


def prepare_ml_data(df, problems):
    """
    Prepare data for machine learning by creating target variables
    and feature sets for each problem
    """
    
    ml_data = {}
    
    # Create a copy for modification
    df_ml = df.copy()
    
    # Problem 1: Overall Yield Prediction
    target1 = problems['yield_prediction']['target']
    if target1 in df_ml.columns:
        # Drop rows with missing target
        df_yield = df_ml.dropna(subset=[target1]).copy()
        ml_data['yield_prediction'] = {
            'X': df_yield.drop(columns=[target1]),
            'y': df_yield[target1],
            'problem_type': 'regression'
        }
    
    # Problem 2: Rice Yield Prediction
    target2 = problems['rice_yield_prediction']['target']
    if target2 in df_ml.columns:
        df_rice = df_ml.dropna(subset=[target2]).copy()
        ml_data['rice_yield_prediction'] = {
            'X': df_rice.drop(columns=[target2]),
            'y': df_rice[target2],
            'problem_type': 'regression'
        }
    
    # Problem 3: High-Yield Classification
    median_yield = df_ml['OVERALL_YIELD_Kg_per_ha'].median()
    df_ml['HIGH_YIELD_FLAG'] = (df_ml['OVERALL_YIELD_Kg_per_ha'] > median_yield).astype(int)
    
    df_class = df_ml.dropna(subset=['HIGH_YIELD_FLAG']).copy()
    ml_data['high_yield_classification'] = {
        'X': df_class.drop(columns=['HIGH_YIELD_FLAG']),
        'y': df_class['HIGH_YIELD_FLAG'],
        'problem_type': 'classification'
    }
    
    # Problem 4: Cluster Prediction (if available)
    if 'cluster_prediction' in problems:
        cluster_df = eda_results.get('district_clusters', pd.DataFrame())
        if not cluster_df.empty and 'Cluster' in cluster_df.columns:
            # Merge cluster information
            cluster_mapping = cluster_df.set_index(['State Name', 'Dist Name'])['Cluster'].to_dict()
            df_ml['CLUSTER'] = df_ml.apply(
                lambda row: cluster_mapping.get((row['State Name'], row['Dist Name']), -1),
                axis=1
            )
            
            df_cluster = df_ml[df_ml['CLUSTER'] != -1].copy()
            ml_data['cluster_prediction'] = {
                'X': df_cluster.drop(columns=['CLUSTER']),
                'y': df_cluster['CLUSTER'],
                'problem_type': 'multiclass'
            }
    
    # Problem 5: YoY Change Prediction
    if 'yoy_change_prediction' in problems:
        target5 = problems['yoy_change_prediction']['target']
        if target5 in df_ml.columns:
            df_yoy = df_ml.dropna(subset=[target5]).copy()
            ml_data['yoy_change_prediction'] = {
                'X': df_yoy.drop(columns=[target5]),
                'y': df_yoy[target5],
                'problem_type': 'regression'
            }
    
    return ml_data

ml_data = prepare_ml_data(df, problems)

print("\nML Data Preparation Summary:")
print("=" * 80)
for prob_name, data_dict in ml_data.items():
    print(f"\n{prob_name.upper()}:")
    print(f"  Samples: {len(data_dict['y'])}")
    print(f"  Features: {data_dict['X'].shape[1]}")
    print(f"  Problem Type: {data_dict['problem_type']}")
    print(f"  Target Distribution:")
    if data_dict['problem_type'] == 'classification':
        print(data_dict['y'].value_counts(normalize=True).round(3))
    elif data_dict['problem_type'] == 'regression':
        print(f"    Mean: {data_dict['y'].mean():.2f}, Std: {data_dict['y'].std():.2f}")



ML Data Preparation Summary:

YIELD_PREDICTION:
  Samples: 2481
  Features: 117
  Problem Type: regression
  Target Distribution:
    Mean: 1784.72, Std: 900.71

RICE_YIELD_PREDICTION:
  Samples: 2484
  Features: 117
  Problem Type: regression
  Target Distribution:
    Mean: 2078.30, Std: 1115.62

HIGH_YIELD_CLASSIFICATION:
  Samples: 2484
  Features: 118
  Problem Type: classification
  Target Distribution:
HIGH_YIELD_FLAG
0    0.501
1    0.499
Name: proportion, dtype: float64

CLUSTER_PREDICTION:
  Samples: 2484
  Features: 119
  Problem Type: multiclass
  Target Distribution:

YOY_CHANGE_PREDICTION:
  Samples: 2170
  Features: 119
  Problem Type: regression
  Target Distribution:
    Mean: 0.19, Std: 1.74


In [28]:
# 5. Feature Engineering for ML


def create_ml_features(X_df):
    """
    Create additional features specifically for machine learning
    """
    
    X_processed = X_df.copy()
    
    # 1. Create temporal features
    if 'Year' in X_processed.columns:
        X_processed['DECADE'] = (X_processed['Year'] // 10) * 10
        X_processed['YEAR_SIN'] = np.sin(2 * np.pi * X_processed['Year'] / 10)
        X_processed['YEAR_COS'] = np.cos(2 * np.pi * X_processed['Year'] / 10)
    
    # 2. Create interaction features
    if all(col in X_processed.columns for col in ['TOTAL_AREA_1000_ha', 'TOTAL_PRODUCTION_1000_tons']):
        X_processed['AREA_PRODUCTION_RATIO'] = (
            X_processed['TOTAL_PRODUCTION_1000_tons'] / 
            X_processed['TOTAL_AREA_1000_ha'].replace(0, np.nan)
        )
    
    # 3. Create polynomial features for key numerical columns
    key_numeric_cols = [
        'TOTAL_AREA_1000_ha', 'TOTAL_PRODUCTION_1000_tons',
        'CROP_DIVERSIFICATION_INDEX', 'PRODUCTIVITY_EFFICIENCY'
    ]
    
    for col in key_numeric_cols:
        if col in X_processed.columns:
            X_processed[f'{col}_SQ'] = X_processed[col] ** 2
            X_processed[f'{col}_LOG'] = np.log1p(np.abs(X_processed[col]))
    
    # 4. Create statistical aggregation features
    # Group by state and create state-level statistics
    if 'State Name' in X_processed.columns:
        state_stats = X_processed.groupby('State Name').agg({
            'TOTAL_AREA_1000_ha': ['mean', 'std', 'min', 'max'],
            'TOTAL_PRODUCTION_1000_tons': ['mean', 'std']
        }).round(2)
        
        state_stats.columns = ['STATE_' + '_'.join(col).strip() for col in state_stats.columns]
        state_stats = state_stats.reset_index()
        
        X_processed = X_processed.merge(state_stats, on='State Name', how='left')
    
    # 5. Create lag features for time-series analysis
    if 'Year' in X_processed.columns and 'Dist Name' in X_processed.columns:
        # Sort by district and year
        X_processed = X_processed.sort_values(['Dist Name', 'Year'])
        
        # Create 1-year lag for key features
        lag_cols = ['TOTAL_PRODUCTION_1000_tons', 'OVERALL_YIELD_Kg_per_ha']
        for col in lag_cols:
            if col in X_processed.columns:
                X_processed[f'{col}_LAG1'] = X_processed.groupby('Dist Name')[col].shift(1)
                X_processed[f'{col}_CHANGE'] = X_processed[col] - X_processed[f'{col}_LAG1']
    
    # 6. Create seasonality features
    if 'Year' in X_processed.columns:
        X_processed['IS_EVEN_YEAR'] = (X_processed['Year'] % 2 == 0).astype(int)
        X_processed['YEAR_QUARTER'] = ((X_processed['Year'] % 4) + 1).astype(int)
    
    return X_processed

# Apply feature engineering to all problems
for prob_name, data_dict in ml_data.items():
    if prob_name in ['yield_prediction', 'rice_yield_prediction', 'yoy_change_prediction']:
        ml_data[prob_name]['X'] = create_ml_features(data_dict['X'])

print("\nFeature Engineering Completed:")
print("=" * 80)
for prob_name, data_dict in ml_data.items():
    if prob_name in ['yield_prediction', 'rice_yield_prediction', 'yoy_change_prediction']:
        original_features = df.shape[1]
        new_features = data_dict['X'].shape[1]
        print(f"{prob_name}: {original_features} → {new_features} features")



Feature Engineering Completed:
yield_prediction: 118 → 139 features
rice_yield_prediction: 118 → 141 features
yoy_change_prediction: 118 → 143 features


In [29]:
# 6. Feature Selection Pipeline


def create_feature_selection_pipeline(X, y, problem_type='regression', n_features=50):
    """
    Feature selection pipeline with strict protection against:
    - inf values
    - overflow
    - zero-variance scaling bugs
    """

    #  PRE-CLEAN RAW VALUES
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.applymap(lambda v: np.nan if isinstance(v, (int, float)) and (abs(v) > 1e308) else v)

    # Split columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

    # Remove ID-like columns
    exclude_cols = ['Year', 'Dist Code', 'State Code', 'Dist Name', 'State Name']
    numerical_cols = [col for col in numerical_cols if col not in exclude_cols]

    # Remove zero-variance cols BEFORE scaling to avoid divide-by-zero
    zero_var = [col for col in numerical_cols if X[col].nunique() <= 1]
    if len(zero_var) > 0:
        numerical_cols = [c for c in numerical_cols if c not in zero_var]

    # Numerical pipeline
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical pipeline
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols),
        ],
        remainder='drop'
    )

    # Feature selection pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('variance_threshold', VarianceThreshold(threshold=0.0001)),
        ('feature_selection', SelectKBest(
            score_func=f_regression if problem_type == 'regression' else mutual_info_regression,
            k=min(n_features, 2000)
        ))
    ])

    return pipeline


def select_features(ml_data, n_features=50):
    selected_data = {}

    for prob_name, data_dict in ml_data.items():
        print(f"\nProcessing {prob_name}...")

        X = data_dict['X'].copy()
        y = data_dict['y']
        problem_type = data_dict['problem_type']

        # Hard clean raw X
        X = X.replace([np.inf, -np.inf], np.nan)
        X = X.applymap(lambda v: np.nan if isinstance(v, (int, float)) and (abs(v) > 1e308) else v)

        try:
            fs_pipeline = create_feature_selection_pipeline(
                X, y, problem_type, n_features
            )

            # Fit-transform safely
            X_transformed = fs_pipeline.named_steps['preprocessor'].fit_transform(X)

            # -------- POST-CLEAN TRANSFORMED DATA --------
            X_transformed = np.nan_to_num(X_transformed, nan=0.0, posinf=0.0, neginf=0.0)

            # Replace preprocessor output with cleaned version
            fs_pipeline.named_steps['preprocessor'].transform = lambda _X: X_transformed

            # Now run variance threshold + KBest
            X_selected = fs_pipeline.named_steps['variance_threshold'].fit_transform(X_transformed)
            X_selected = fs_pipeline.named_steps['feature_selection'].fit_transform(X_selected, y)

            # Feature names
            preprocessor = fs_pipeline.named_steps['preprocessor']
            feature_names = []

            # Numerical
            num_features = preprocessor.transformers_[0][2]
            feature_names.extend(num_features)

            # Categorical
            cat_features = preprocessor.transformers_[1][2]
            if len(cat_features) > 0:
                cat_pipeline = preprocessor.transformers_[1][1]
                onehot = cat_pipeline.named_steps['onehot']
                cat_feature_names = onehot.get_feature_names_out(cat_features)
                feature_names.extend(cat_feature_names)

            # Selected
            selector = fs_pipeline.named_steps['feature_selection']
            selected_indices = selector.get_support(indices=True)
            selected_feature_names = [feature_names[i] for i in selected_indices]

            selected_data[prob_name] = {
                'X': X_selected,
                'y': y.values,
                'feature_names': selected_feature_names,
                'problem_type': problem_type,
                'fs_pipeline': fs_pipeline
            }

            print(f"  Selected {len(selected_feature_names)} features")
            print(f"  Top 10 features: {selected_feature_names[:10]}")

        except Exception as e:
            print(f"  Error in feature selection: {str(e)}")
            selected_data[prob_name] = {
                'X': X.values,
                'y': y.values,
                'feature_names': X.columns.tolist(),
                'problem_type': problem_type,
                'fs_pipeline': None
            }

    return selected_data


# Run feature selection
selected_data = select_features(ml_data, n_features=30)



Processing yield_prediction...
  Selected 30 features
  Top 10 features: ['LINSEED AREA (1000 ha)', 'LINSEED PRODUCTION (1000 tons)', 'SUGARCANE AREA (1000 ha)', 'SUGARCANE PRODUCTION (1000 tons)', 'FINGER MILLET AREA (1000 ha)_PROPORTION', 'LINSEED AREA (1000 ha)_PROPORTION', 'Dist Name_Almorah', 'Dist Name_Bundi', 'Dist Name_Chamba', 'Dist Name_Chamoli']

Processing rice_yield_prediction...
  Selected 30 features
  Top 10 features: ['RICE AREA (1000 ha)', 'RICE AREA (1000 ha)_PROPORTION', 'GROUNDNUT AREA (1000 ha)_PROPORTION', 'FRUITS AREA (1000 ha)_PROPORTION', 'State Name_Uttarakhand', 'Dist Name_Bharatpur', 'Dist Name_Bhavnagar', 'Dist Name_Bhind', 'Dist Name_Bijnor', 'Dist Name_Bilaspur']

Processing high_yield_classification...
  Selected 30 features
  Top 10 features: ['RICE AREA (1000 ha)', 'RICE PRODUCTION (1000 tons)', 'RICE YIELD (Kg per ha)', 'WHEAT AREA (1000 ha)', 'WHEAT PRODUCTION (1000 tons)', 'WHEAT YIELD (Kg per ha)', 'SORGHUM YIELD (Kg per ha)', 'BARLEY AREA (1000 

In [30]:
# 7. Model Training Setup



def get_model_configs():
    """
    Define model configurations for different problem types
    """
    
    model_configs = {
        'regression': {
            'Random Forest': {
                'model': RandomForestRegressor(random_state=42),
                'params': {
                    'n_estimators': [100, 200, 300],
                    'max_depth': [10, 20, 30, None],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'max_features': ['sqrt', 'log2']
                }
            },
            'Gradient Boosting': {
                'model': GradientBoostingRegressor(random_state=42),
                'params': {
                    'n_estimators': [100, 200],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'max_depth': [3, 5, 7],
                    'min_samples_split': [2, 5],
                    'min_samples_leaf': [1, 2]
                }
            },
            'XGBoost': {
                'model': XGBRegressor(random_state=42, verbosity=0),
                'params': {
                    'n_estimators': [100, 200],
                    'max_depth': [3, 5, 7],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'subsample': [0.8, 0.9, 1.0],
                    'colsample_bytree': [0.8, 0.9, 1.0]
                }
            },
            'LightGBM': {
                'model': LGBMRegressor(random_state=42, verbosity=-1),
                'params': {
                    'n_estimators': [100, 200],
                    'max_depth': [5, 10, 15],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'num_leaves': [31, 50, 100],
                    'subsample': [0.8, 0.9]
                }
            },
            'Linear Regression': {
                'model': LinearRegression(),
                'params': {}
            },
            'Ridge Regression': {
                'model': Ridge(random_state=42),
                'params': {
                    'alpha': [0.1, 1.0, 10.0, 100.0]
                }
            },
            'SVR': {
                'model': SVR(),
                'params': {
                    'C': [0.1, 1, 10],
                    'epsilon': [0.01, 0.1, 0.2],
                    'kernel': ['linear', 'rbf']
                }
            }
        },
        
        'classification': {
            'Random Forest': {
                'model': RandomForestClassifier(random_state=42),
                'params': {
                    'n_estimators': [100, 200],
                    'max_depth': [10, 20, None],
                    'min_samples_split': [2, 5],
                    'min_samples_leaf': [1, 2],
                    'class_weight': ['balanced', None]
                }
            },
            'XGBoost': {
                'model': XGBClassifier(random_state=42, verbosity=0),
                'params': {
                    'n_estimators': [100, 200],
                    'max_depth': [3, 5, 7],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'subsample': [0.8, 0.9]
                }
            },
            'LightGBM': {
                'model': LGBMClassifier(random_state=42, verbosity=-1),
                'params': {
                    'n_estimators': [100, 200],
                    'max_depth': [5, 10],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'num_leaves': [31, 50]
                }
            },
            'Logistic Regression': {
                'model': LogisticRegression(random_state=42, max_iter=1000),
                'params': {
                    'C': [0.1, 1, 10],
                    'penalty': ['l2', 'none']
                }
            },
            'SVM': {
                'model': SVC(random_state=42, probability=True),
                'params': {
                    'C': [0.1, 1, 10],
                    'kernel': ['linear', 'rbf'],
                    'gamma': ['scale', 'auto']
                }
            }
        }
    }
    
    return model_configs

model_configs = get_model_configs()

print(model_configs.keys())


dict_keys(['regression', 'classification'])


In [31]:
# 8. Train-Test Split and Cross-Validation Setup

# %%
def prepare_train_test_splits(selected_data, test_size=0.2, random_state=42):
    """
    Create train-test splits for all problems
    """
    
    splits = {}
    
    for prob_name, data_dict in selected_data.items():
        X = data_dict['X']
        y = data_dict['y']
        
        # Create train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state,
            stratify=y if data_dict['problem_type'] in ['classification', 'multiclass'] else None
        )
        
        splits[prob_name] = {
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test,
            'feature_names': data_dict['feature_names'],
            'problem_type': data_dict['problem_type'],
            'fs_pipeline': data_dict['fs_pipeline']
        }
        
        print(f"{prob_name}:")
        print(f"  Train shape: {X_train.shape}, Test shape: {X_test.shape}")
        print(f"  Train target distribution: {np.unique(y_train, return_counts=True)[1] if len(y_train.shape) == 1 else 'N/A'}")
    
    return splits

splits = prepare_train_test_splits(selected_data)


yield_prediction:
  Train shape: (1984, 30), Test shape: (497, 30)
  Train target distribution: [1 1 1 ... 1 1 1]
rice_yield_prediction:
  Train shape: (1987, 30), Test shape: (497, 30)
  Train target distribution: [189   1   1 ...   1   1  23]
high_yield_classification:
  Train shape: (1987, 30), Test shape: (497, 30)
  Train target distribution: [995 992]
cluster_prediction:
  Train shape: (1987, 30), Test shape: (497, 30)
  Train target distribution: [637 454 333 147 416]
yoy_change_prediction:
  Train shape: (1736, 30), Test shape: (434, 30)
  Train target distribution: [1 1 1 ... 1 1 1]


In [32]:
# 9. Model Training with Cross-Validation

# %%
def train_models_with_cv(splits, model_configs, cv_folds=5):
    """
    Train multiple models with cross-validation
    """
    
    trained_models = {}
    
    for prob_name, split_data in splits.items():
        print(f"\n{'='*80}")
        print(f"Training models for: {prob_name}")
        print(f"{'='*80}")
        
        X_train = split_data['X_train']
        y_train = split_data['y_train']
        problem_type = split_data['problem_type']
        
        # Get appropriate model configs
        if problem_type == 'regression':
            configs = model_configs['regression']
            scoring = 'neg_root_mean_squared_error'
        else:  # classification or multiclass
            configs = model_configs['classification']
            scoring = 'accuracy'
        
        prob_models = {}
        
        for model_name, model_info in configs.items():
            print(f"\nTraining {model_name}...")
            
            try:
                # Create model pipeline
                if split_data['fs_pipeline'] is not None:
                    # Use pipeline with preprocessing
                    pipeline = Pipeline([
                        ('preprocessing', split_data['fs_pipeline']),
                        ('model', model_info['model'])
                    ])
                else:
                    pipeline = make_pipeline(StandardScaler(), model_info['model'])
                
                # Perform cross-validation
                cv_scores = cross_val_score(
                    pipeline, X_train, y_train,
                    cv=cv_folds, scoring=scoring, n_jobs=-1
                )
                
                # Fit model on full training data
                pipeline.fit(X_train, y_train)
                
                # Store model and results
                prob_models[model_name] = {
                    'pipeline': pipeline,
                    'cv_scores': cv_scores,
                    'cv_mean': cv_scores.mean(),
                    'cv_std': cv_scores.std(),
                    'model_params': model_info.get('params', {})
                }
                
                print(f"  CV Scores: {cv_scores.round(4)}")
                print(f"  CV Mean: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")
                
            except Exception as e:
                print(f"  Error training {model_name}: {str(e)}")
                continue
        
        # Store all models for this problem
        trained_models[prob_name] = prob_models
    
    return trained_models

trained_models = train_models_with_cv(splits, model_configs, cv_folds=5)


Training models for: yield_prediction

Training Random Forest...
  Error training Random Forest: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\phill\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_indexing.py", line 341, in _get_column_indices
    all_columns = X.columns
                  ^^^^^^^^^
AttributeError: 'numpy.ndarray' object has no attribute 'columns'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\phill\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_p

In [33]:
# 9. Model Training with Cross-Validation

# %%

def ensure_dataframe(X):
    """Ensures that the input is a pandas DataFrame."""
    if isinstance(X, pd.DataFrame):
        return X
    return pd.DataFrame(X)

def train_models_with_cv(splits, model_configs, cv_folds=5):
    """
    Robust training function with DataFrame-safe pipelines 
    to prevent 'A given column is not a column of the dataframe'.
    """
    
    trained_models = {}
    
    for prob_name, split_data in splits.items():
        print(f"\n{'='*80}")
        print(f"Training models for: {prob_name}")
        print(f"{'='*80}")
        
        # Always enforce DataFrame
        X_train = ensure_dataframe(split_data['X_train'])
        y_train = split_data['y_train']
        problem_type = split_data['problem_type']

        # Choose model configs
        if problem_type == "regression":
            configs = model_configs["regression"]
            scoring = "neg_root_mean_squared_error"
        else:
            configs = model_configs["classification"]
            scoring = "accuracy"

        prob_models = {}

        for model_name, model_info in configs.items():
            print(f"\nTraining {model_name}...")

            try:
                # --- FIX: Wrap fs_pipeline to preserve DataFrame structure ---
                if split_data['fs_pipeline'] is not None:
                    fs_pipe = Pipeline([
                        ("frame", FunctionTransformer(ensure_dataframe)),
                        ("fs", split_data["fs_pipeline"])
                    ])
                    pipeline = Pipeline([
                        ("preprocessing", fs_pipe),
                        ("model", model_info["model"])
                    ])
                else:
                    pipeline = make_pipeline(StandardScaler(), model_info["model"])
                
                # Cross Validation
                cv_scores = cross_val_score(
                    pipeline, X_train, y_train,
                    cv=cv_folds, scoring=scoring, n_jobs=-1,
                    error_score="raise"
                )

                # Fit full model
                pipeline.fit(X_train, y_train)

                prob_models[model_name] = {
                    "pipeline": pipeline,
                    "cv_scores": cv_scores,
                    "cv_mean": cv_scores.mean(),
                    "cv_std": cv_scores.std(),
                    "model_params": model_info.get("params", {})
                }

                print(f"  CV Scores: {cv_scores.round(4)}")
                print(f"  CV Mean: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")

            except Exception as e:
                print(f"  Error training {model_name}: {str(e)}")
                print(traceback.format_exc())
                continue
        
        trained_models[prob_name] = prob_models

    return trained_models



trained_models = train_models_with_cv(splits, model_configs, cv_folds=5)



Training models for: yield_prediction

Training Random Forest...
  Error training Random Forest: A given column is not a column of the dataframe
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\phill\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_indexing.py", line 364, in _get_column_indices
    col_idx = all_columns.get_loc(col)
  File "C:\Users\phill\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\core\indexes\range.py", line 417, in get_loc
    raise KeyError(key)
KeyError: 'RICE AREA (1000 ha)'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\phill\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\process_executor.py", line 463, in _process_worker
    r = call_item()
  File "C:\Users\phill\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\pro

In [34]:
# 10. Hyperparameter Tuning with Grid Search

# %%
def perform_hyperparameter_tuning(trained_models, splits, n_iter=20):
    """
    Perform hyperparameter tuning for best models
    """
    
    tuned_models = {}
    
    for prob_name, models_dict in trained_models.items():
        print(f"\n{'='*80}")
        print(f"Hyperparameter Tuning for: {prob_name}")
        print(f"{'='*80}")
        
        split_data = splits[prob_name]
        X_train = split_data['X_train']
        y_train = split_data['y_train']
        problem_type = split_data['problem_type']
        
        # Select top 3 models for tuning based on CV scores
        sorted_models = sorted(
            models_dict.items(),
            key=lambda x: x[1]['cv_mean'],
            reverse=True
        )[:3]
        
        prob_tuned_models = {}
        
        for model_name, model_info in sorted_models:
            print(f"\nTuning {model_name}...")
            
            # Get model config
            if problem_type == 'regression':
                configs = model_configs['regression']
            else:
                configs = model_configs['classification']
            
            if model_name not in configs or not configs[model_name]['params']:
                print(f"  No hyperparameters to tune for {model_name}")
                prob_tuned_models[model_name] = model_info
                continue
            
            try:
                # Create pipeline
                if split_data['fs_pipeline'] is not None:
                    pipeline = Pipeline([
                        ('preprocessing', split_data['fs_pipeline']),
                        ('model', configs[model_name]['model'])
                    ])
                else:
                    pipeline = make_pipeline(StandardScaler(), configs[model_name]['model'])
                
                # Prepare parameter grid
                param_grid = {}
                for param_name, param_values in configs[model_name]['params'].items():
                    param_grid[f'model__{param_name}'] = param_values
                
                # Use RandomizedSearchCV for efficiency
                scoring = 'neg_root_mean_squared_error' if problem_type == 'regression' else 'accuracy'
                
                grid_search = RandomizedSearchCV(
                    pipeline,
                    param_grid,
                    n_iter=n_iter,
                    cv=3,
                    scoring=scoring,
                    random_state=42,
                    n_jobs=-1,
                    verbose=0
                )
                
                # Perform grid search
                grid_search.fit(X_train, y_train)
                
                # Store tuned model
                prob_tuned_models[model_name] = {
                    'pipeline': grid_search.best_estimator_,
                    'best_params': grid_search.best_params_,
                    'best_score': grid_search.best_score_,
                    'cv_results': grid_search.cv_results_
                }
                
                print(f"  Best Score: {grid_search.best_score_:.4f}")
                print(f"  Best Params: {grid_search.best_params_}")
                
            except Exception as e:
                print(f"  Error tuning {model_name}: {str(e)}")
                prob_tuned_models[model_name] = model_info
        
        tuned_models[prob_name] = prob_tuned_models
    
    return tuned_models

# Perform hyperparameter tuning
tuned_models = perform_hyperparameter_tuning(trained_models, splits, n_iter=10)



Hyperparameter Tuning for: yield_prediction

Hyperparameter Tuning for: rice_yield_prediction

Hyperparameter Tuning for: high_yield_classification

Hyperparameter Tuning for: cluster_prediction

Hyperparameter Tuning for: yoy_change_prediction
