In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import gc
import random
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.spatial.distance import cosine, mahalanobis
from scipy.stats import ks_2samp, skew, kurtosis, entropy, f_oneway, norm
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, Multiply, Add, BatchNormalization, LayerNormalization, MultiHeadAttention, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.utils import plot_model

import shap
from sklearn.inspection import permutation_importance

# Make sure every output directory used by later cells exists (no-op when already present).
for output_dir in ("models", "architectures", "results", "feature_analysis", "optimization"):
    os.makedirs(output_dir, exist_ok=True)

# Keep the cyclic garbage collector enabled and silence library warnings in cell output.
gc.enable()
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Seed every random number generator the notebook relies on.
seed = 42

# PYTHONHASHSEED is exported for completeness; Python reads it at interpreter start-up.
os.environ['PYTHONHASHSEED'] = str(seed)

# NumPy, the stdlib `random` module and TensorFlow each keep their own generator.
for set_seed in (np.random.seed, random.seed, tf.random.set_seed):
    set_seed(seed)

In [None]:
# Load the dataset from the Kaggle input directory.
data_path = '/kaggle/input/al-moutmir/data.csv'
data = pd.read_csv(data_path)

# Sampling Methods

## Train-Test Split

In [None]:
# Columns that never enter the feature matrix: the target and the yield_quartile column.
target_col = 'grain_yield_kg'
non_feature_cols = [target_col, 'yield_quartile']

# Feature matrix (X) and target variable (y) for the whole dataset
X = data.drop(columns=non_feature_cols)
y = data[target_col]

# Random split: 80/20 hold-out with a fixed seed
X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Temporal split: rows of the largest growth_season value form the test set,
# every other row goes to training.
is_last_season = data['growth_season'] == data['growth_season'].max()
train_data = data[~is_last_season]
test_data = data[is_last_season]

X_train_temp, y_train_temp = train_data.drop(columns=non_feature_cols), train_data[target_col]
X_test_temp, y_test_temp = test_data.drop(columns=non_feature_cols), test_data[target_col]

## Data Splits Evaluation
### Distribution of Crops Across Splits

In [None]:
# Row labels of each split, used to look up the crop of every sample in that split.
split_indices = {
    "Random Train": X_train_rand.index,
    "Random Test": X_test_rand.index,
    "Temporal Train": X_train_temp.index,
    "Temporal Test": X_test_temp.index,
}

# Crop proportions per split, one column per split. The variable name says "quartile",
# but the values are crop shares. Crops missing from a split are filled with 0.
crop_shares = [
    data.loc[split_index.intersection(data.index), 'crop']
        .value_counts(normalize=True)
        .rename(split_label)
    for split_label, split_index in split_indices.items()
]
quartile_distribution = pd.concat(crop_shares, axis=1).fillna(0)

# Replace the integer crop codes with readable names
crop_mapping = {0: "Barley", 1: "Dry Wheat", 2: "Soft Wheat"}
quartile_distribution.index = quartile_distribution.index.map(crop_mapping)

# Stacked bar chart of the crop proportions
ax = quartile_distribution.plot.bar(stacked=True, figsize=(10, 6))
ax.set_title("Distribution of Crops Across Splits")
ax.set_ylabel("Proportion")
ax.set_xlabel("Crops")
ax.legend(title="Splits", loc="upper left", bbox_to_anchor=(1, 1))
plt.tight_layout()

# Save the figure, then display it
plt.savefig(os.path.join("feature_analysis", "target_distribution_across_splits.png"), dpi=300)
plt.show()

In [None]:
# Crop-share vectors of the four splits
rand_train_shares = quartile_distribution["Random Train"]
rand_test_shares = quartile_distribution["Random Test"]
temp_train_shares = quartile_distribution["Temporal Train"]
temp_test_shares = quartile_distribution["Temporal Test"]

# Cosine Similarity (scipy's `cosine` is a distance, hence 1 - distance)
random_similarity = 1 - cosine(rand_train_shares, rand_test_shares)
temporal_similarity = 1 - cosine(temp_train_shares, temp_test_shares)

# KL Divergence between train and test shares (`entropy` called with two distributions)
kl_random = entropy(rand_train_shares, rand_test_shares)
kl_temporal = entropy(temp_train_shares, temp_test_shares)

# Shannon Entropy of each split's crop shares (`entropy` called with one distribution)
entropy_random_train, entropy_random_test, entropy_temporal_train, entropy_temporal_test = (
    entropy(shares)
    for shares in (rand_train_shares, rand_test_shares, temp_train_shares, temp_test_shares)
)

# Output (only the train-side entropies are printed)
print(f"Cosine Similarity (Random Split): {random_similarity:.4f}")
print(f"Cosine Similarity (Temporal Split): {temporal_similarity:.4f}")
print(f"KL Divergence (Random Split): {kl_random:.4f}")
print(f"KL Divergence (Temporal Split): {kl_temporal:.4f}")
print(f"Shannon Entropy (Random Train): {entropy_random_train:.4f}")
print(f"Shannon Entropy (Temporal Train): {entropy_temporal_train:.4f}")


### Distribution of Target Variable

In [None]:
# Overlay kernel density estimates of the target for the four train/test subsets.
plt.figure(figsize=(10, 6))
kde_layers = [
    (y_train_rand, "Random Train", "blue"),
    (y_test_rand, "Random Test", "red"),
    (y_train_temp, "Temporal Train", "green"),
    (y_test_temp, "Temporal Test", "orange"),
]
for target_values, layer_label, layer_color in kde_layers:
    sns.kdeplot(target_values, label=layer_label, color=layer_color, fill=True, alpha=0.3)

plt.title("Distribution of Target Variable (Yield)")
plt.xlabel("Yield (kg)")
plt.ylabel("Density")
plt.legend()
plt.grid(True)
plt.tight_layout()

# The file name must not start with "/": os.path.join drops every earlier component
# when a later one is absolute, which would write the figure to the filesystem root
# instead of the feature_analysis directory.
plt.savefig(os.path.join("feature_analysis", "target_distribution_comparison.png"), dpi=300)
plt.show()

In [None]:
# Two-sample Kolmogorov-Smirnov tests comparing train and test targets of each split
ks_rand, ks_temp = (
    ks_2samp(train_target, test_target)
    for train_target, test_target in (
        (y_train_rand, y_test_rand),
        (y_train_temp, y_test_temp),
    )
)

# Descriptive Statistics
def describe_data(y):
    """Summarise a target series with five statistics rounded to 4 decimals.

    Parameters:
    - y: object exposing `.mean()`, `.median()` and `.var()` (e.g. a pandas Series);
      it is also passed to scipy's `skew` and `kurtosis`.

    Returns:
    - dict with keys "Mean", "Median", "Variance", "Skewness", "Kurtosis".
    """
    raw_summary = {
        "Mean": y.mean(),
        "Median": y.median(),
        "Variance": y.var(),
        "Skewness": skew(y),
        "Kurtosis": kurtosis(y),
    }
    return {stat_name: np.round(stat_value, 4) for stat_name, stat_value in raw_summary.items()}

# Descriptive statistics of the target in each of the four subsets
stats_rand_train, stats_rand_test, stats_temp_train, stats_temp_test = map(
    describe_data, (y_train_rand, y_test_rand, y_train_temp, y_test_temp)
)

# Output (only the train-side statistics are printed)
print(f"KS Test (Random Split): Statistic={ks_rand.statistic:.4f}, p-value={ks_rand.pvalue:.4f}")
print(f"KS Test (Temporal Split): Statistic={ks_temp.statistic:.4f}, p-value={ks_temp.pvalue:.4f}")
print("Descriptive Statistics (Random Train):", stats_rand_train)
print("Descriptive Statistics (Temporal Train):", stats_temp_train)

# Machine Learning Models

In [None]:
# Function to calculate core metrics
def calculate_metrics(y_true, y_pred, n_features=None):
    """Compute R2, MAE, MAPE and RMSE for a set of predictions.

    Parameters:
    - y_true: array-like of observed values.
    - y_pred: array-like of predicted values.
    - n_features: accepted for call compatibility; not used in the computation.

    Returns:
    - dict with keys "R2", "MAE", "MAPE", "RMSE" (in that order).
    """
    # Flatten both inputs to 1-D float64 so Series, lists and (n, 1) arrays are handled alike.
    actual, predicted = (
        np.asarray(values, dtype=np.float64).ravel() for values in (y_true, y_pred)
    )

    scorers = {
        "R2": r2_score,
        "MAE": mean_absolute_error,
        "MAPE": mean_absolute_percentage_error,
    }
    scores = {metric_name: scorer(actual, predicted) for metric_name, scorer in scorers.items()}
    scores["RMSE"] = np.sqrt(mean_squared_error(actual, predicted))
    return scores

# Function to calculate grouped metrics
def calculate_grouped_metrics(df, group_col, y_true, y_pred):
    """Apply `calculate_metrics` separately to every group of `df`.

    Parameters:
    - df: DataFrame holding the grouping column and both value columns.
    - group_col: name of the column to group by.
    - y_true: name of the column with observed values.
    - y_pred: name of the column with predicted values.

    Returns:
    - the result of `df.groupby(group_col).apply(...)`, one metrics dict per group.
    """
    def _metrics_for(group):
        return calculate_metrics(group[y_true], group[y_pred])

    return df.groupby(group_col).apply(_metrics_for)

In [None]:
# ML Models

# AdaBoost's weak-learner keyword was renamed from `base_estimator` to `estimator`
# in scikit-learn 1.2 and the old name was removed in later releases. Use whichever
# keyword the installed version exposes so the cell runs on both.
_ADA_LEARNER_KW = 'estimator' if 'estimator' in AdaBoostRegressor().get_params() else 'base_estimator'


def _make_gbm_base_estimators():
    """Return fresh (name, estimator) pairs used as base learners of one StackingRegressor."""
    return [
        ('cb', CatBoostRegressor(iterations=2000, learning_rate=0.05, verbose=0, random_seed=42)),
        ('xgb', XGBRegressor(n_estimators=2000, learning_rate=0.05, random_state=42)),
        ('lgbm', LGBMRegressor(n_estimators=2000, learning_rate=0.05, verbose=-1, random_state=42)),
    ]


# Insertion order matters: per-model predictions are later stored in lists that follow it.
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=2000, learning_rate=0.05, random_state=42),
    'LightGBM': LGBMRegressor(n_estimators=2000, learning_rate=0.05, verbose=-1, random_state=42),
    'AdaBoost': AdaBoostRegressor(
        n_estimators=10,
        learning_rate=0.05,
        random_state=42,
        **{_ADA_LEARNER_KW: RandomForestRegressor(n_estimators=100, random_state=42)},
    ),
    'CatBoost': CatBoostRegressor(iterations=2000, learning_rate=0.05, verbose=0, random_seed=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=2000, learning_rate=0.05, random_state=42),
    'Ridge': Ridge(alpha=1.0, random_state=42),
    'Lasso': Lasso(alpha=0.1, random_state=42),
    'SVR': SVR(kernel='rbf', C=1.0, epsilon=0.1),
    # Stacked ensembles: same three boosted base learners, different meta-learner each.
    'Stacking_LR': StackingRegressor(
        estimators=_make_gbm_base_estimators(),
        final_estimator=LinearRegression()
    ),
    'Stacking_RandomForestRegressor': StackingRegressor(
        estimators=_make_gbm_base_estimators(),
        final_estimator=RandomForestRegressor(n_estimators=100, random_state=42)
    ),
    'Stacking_CatBoostRegressor': StackingRegressor(
        estimators=_make_gbm_base_estimators(),
        final_estimator=CatBoostRegressor(iterations=2000, learning_rate=0.05, verbose=0, random_seed=42)
    ),
    'Stacking_LGBMRegressor': StackingRegressor(
        estimators=_make_gbm_base_estimators(),
        final_estimator=LGBMRegressor(n_estimators=2000, learning_rate=0.05, verbose=-1, random_state=42)
    ),
    'Stacking_XGBRegressor': StackingRegressor(
        estimators=_make_gbm_base_estimators(),
        final_estimator=XGBRegressor(n_estimators=2000, learning_rate=0.05, random_state=42)
    )
}

In [None]:
# Evaluate each model
results = {}
grouped_results = {}
# Per-model test predictions, appended in the iteration order of `models`.
y_pred_ensemble_rand = []
y_pred_ensemble_temp = []


def _fit_evaluate_save(model, X_train, y_train, X_test, y_test, save_path):
    """Fit `model`, score it on the test data and persist the fitted model.

    Returns:
    - (metrics dict, copy of X_test with 'y_true'/'y_pred' columns, raw predictions).
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = calculate_metrics(y_test, y_pred)
    scored = X_test.copy()
    scored['y_true'], scored['y_pred'] = y_test, y_pred
    joblib.dump(model, save_path)
    return metrics, scored, y_pred


for model_name, model in models.items():
    print(f"\n Evaluate {model_name}")

    # Random Split
    # Fitted without sample weights: `sample_weights_rand` was never defined in this
    # notebook (NameError on a fresh run) and the temporal fit is unweighted as well.
    rand_metrics, rand_df, y_pred_rand = _fit_evaluate_save(
        model, X_train_rand, y_train_rand, X_test_rand, y_test_rand,
        f"models/{model_name}_random_split.pkl",
    )

    # Temporal Split (the same estimator object is refitted; the random-split fit is
    # already persisted on disk at this point)
    temp_metrics, temp_df, y_pred_temp = _fit_evaluate_save(
        model, X_train_temp, y_train_temp, X_test_temp, y_test_temp,
        f"models/{model_name}_temporal_split.pkl",
    )

    # Grouped Metrics
    grouped_rand_crop = calculate_grouped_metrics(rand_df, 'crop', 'y_true', 'y_pred')
    grouped_temp_crop = calculate_grouped_metrics(temp_df, 'crop', 'y_true', 'y_pred')

    # Store Results
    results[model_name] = {'Random Split': rand_metrics, 'Temporal Split': temp_metrics}
    grouped_results[model_name] = {
        'Random Split Crop': grouped_rand_crop,
        'Temporal Split Crop': grouped_temp_crop,
    }

    # For ensemble predictions
    y_pred_ensemble_rand.append(y_pred_rand)
    y_pred_ensemble_temp.append(y_pred_temp)

# Flatten the results into a DataFrame
def flatten_results(results):
    """Turn the nested {model: {split: metrics}} mapping into a long DataFrame.

    Parameters:
    - results: dict mapping model name -> dict mapping split name -> metrics dict.
      Split entries that are not dicts are skipped.

    Returns:
    - DataFrame with one row per (model, split): the metric columns followed by
      'Model' and 'Split'.

    The input is left untouched: each row is a copy of the metrics dict, so the
    'Model'/'Split' labels are not injected into `results` itself.
    """
    flattened = [
        {**metrics, 'Model': model, 'Split': split}
        for model, splits in results.items()
        for split, metrics in splits.items()
        if isinstance(metrics, dict)
    ]
    return pd.DataFrame(flattened)

# Long table: one row per (model, split)
results_df = flatten_results(results)

# Wide table: one row per model, one "<metric>_<split>" column per metric/split pair
metric_names = ['R2', 'MAE', 'MAPE', 'RMSE']
results_pivot = results_df.pivot(index='Model', columns='Split', values=metric_names)

# Collapse the (metric, split) column MultiIndex into flat names and restore 'Model' as a column
results_pivot.columns = ["_".join(column_key) for column_key in results_pivot.columns]
results_pivot = results_pivot.reset_index()

# Neural Network Models

In [None]:
# Prepare data
# One scaler per split, each fitted on that split's training features only. Keeping
# them separate preserves the random-split scaling parameters, which were lost when a
# single scaler was refitted on the temporal split.
scaler_rand = MinMaxScaler()
X_train_rand_scaled = scaler_rand.fit_transform(X_train_rand)
X_test_rand_scaled = scaler_rand.transform(X_test_rand)

scaler_temp = MinMaxScaler()
X_train_temp_scaled = scaler_temp.fit_transform(X_train_temp)
X_test_temp_scaled = scaler_temp.transform(X_test_temp)

# Backward-compatible alias: `scaler` previously ended this cell fitted on the
# temporal training split.
scaler = scaler_temp

# Number of input features fed to the neural networks
input_dim = X_train_rand_scaled.shape[1]

In [None]:
# Define Neural Network Architectures
def dnn(input_dim):
    """Build and compile a fully connected regression network.

    Architecture: Dense 1024 -> 512 -> 256 -> 128 (ReLU), each followed by batch
    normalization and dropout (0.4, 0.4, 0.3, 0.2), then a single linear output unit.

    Parameters:
    - input_dim: number of input features.

    Returns:
    - compiled Sequential model (Adam, lr=0.001, MSE loss, MAE metric).
    """
    model = Sequential([
        # Explicit Input object instead of the deprecated `input_dim=` layer keyword.
        Input(shape=(input_dim,)),
        Dense(1024, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(512, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(1, activation='linear')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    return model

def feature_attention_nn(input_dim):
    """Build and compile a dense regressor with a softmax feature gate.

    A Dense(input_dim, softmax) layer named "Attention_Weights" produces one weight per
    feature; the inputs are multiplied element-wise by these weights before going
    through a 1024 -> 512 -> 256 -> 128 ReLU stack (batch norm + dropout after each)
    and a single linear output unit.

    Parameters:
    - input_dim: number of input features.

    Returns:
    - compiled functional Model (Adam, lr=0.001, MSE loss, MAE metric).
    """
    inputs = Input(shape=(input_dim,))
    attention_weights = Dense(input_dim, activation='softmax', name="Attention_Weights")(inputs)
    x = Multiply()([inputs, attention_weights])

    # (units, dropout rate) of each hidden block, in order
    hidden_blocks = ((1024, 0.4), (512, 0.4), (256, 0.3), (128, 0.2))
    for units, dropout_rate in hidden_blocks:
        x = Dense(units, activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(dropout_rate)(x)

    outputs = Dense(1, activation='linear')(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    return model

def resnet(input_dim):
    """Build and compile a dense network with one residual (skip) connection.

    Main path: Dense 512 -> 256 (ReLU + batch norm). A linear Dense(256) projection of
    the raw inputs is added to it, followed by Dense 128 -> 64 (ReLU + batch norm) and a
    single linear output unit.

    Parameters:
    - input_dim: number of input features.

    Returns:
    - compiled functional Model (Adam, lr=0.001, MSE loss, MAE metric).
    """
    def dense_bn(tensor, units):
        # ReLU dense layer followed by batch normalization
        hidden = Dense(units, activation='relu')(tensor)
        return BatchNormalization()(hidden)

    inputs = Input(shape=(input_dim,))
    main_path = dense_bn(dense_bn(inputs, 512), 256)

    # Project the inputs to 256 units so they can be added to the main path
    shortcut = Dense(256, activation='linear')(inputs)
    merged = Add()([main_path, shortcut])

    head = dense_bn(dense_bn(merged, 128), 64)
    outputs = Dense(1, activation='linear')(head)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    return model

def transformer(input_dim):
    """Build and compile a single-block, transformer-style regressor.

    The feature vector is reshaped to a sequence of length 1, passed through
    multi-head self-attention and a two-layer feed-forward block (each with a skip
    connection and layer normalization), flattened, and mapped to one linear output.

    Parameters:
    - input_dim: number of input features.

    Returns:
    - compiled functional Model named "Transformer_Model" (Adam, lr=0.001, MSE loss,
      MAE metric).

    NOTE(review): with a sequence length of 1 the attention has a single key to attend
    to, so it cannot mix information across features — confirm this is intended.
    """
    # Input layer
    inputs = Input(shape=(input_dim,))

    # Expand dimensions to create a sequence-like structure
    reshaped_inputs = Reshape((1, input_dim))(inputs)  # Sequence length = 1, feature dim = input_dim

    # Multi-Head Attention; key_dim is clamped to at least 1 because
    # `input_dim // 4` is 0 for fewer than 4 features.
    head_dim = max(1, input_dim // 4)
    attention_output = MultiHeadAttention(num_heads=4, key_dim=head_dim)(reshaped_inputs, reshaped_inputs)
    attention_output = LayerNormalization()(attention_output + reshaped_inputs)  # Skip connection

    # Feedforward Network (both layers keep the width at input_dim so the skip connection matches)
    ffn = Dense(input_dim, activation='relu')(attention_output)
    ffn = Dense(input_dim, activation='relu')(ffn)
    ffn = LayerNormalization()(ffn + attention_output)  # Skip connection

    # Remove sequence dimension for final output
    flattened_ffn = Reshape((input_dim,))(ffn)  # Flatten to (None, input_dim)

    # Output layer
    outputs = Dense(1, activation='linear')(flattened_ffn)

    # Compile the model
    model = Model(inputs=inputs, outputs=outputs, name="Transformer_Model")
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

    return model

def autoencoder_regressor(input_dim):
    """Build and compile a regressor on top of a dense encoder.

    The inputs are encoded by Dense(256) and Dense(128) layers (ReLU, each followed by
    batch normalization); a single linear unit on the 128-unit encoding is the model
    output.

    NOTE(review): the `decoded` reconstruction branch below is created but never passed
    to `Model(...)`, so it is not part of the returned model and no reconstruction loss
    is trained. This builder is also not listed in `model_functions`, so the training
    loop does not use it.

    Parameters:
    - input_dim: number of input features.

    Returns:
    - compiled functional Model (Adam, lr=0.001, MSE loss, MAE metric).
    """
    inputs = Input(shape=(input_dim,))
    # Encoder
    encoded = Dense(256, activation='relu')(inputs)
    encoded = BatchNormalization()(encoded)
    encoded = Dense(128, activation='relu')(encoded)
    encoded = BatchNormalization()(encoded)

    # Decoder (currently disconnected from the model outputs, see the note above)
    decoded = Dense(256, activation='relu')(encoded)
    decoded = BatchNormalization()(decoded)
    decoded = Dense(input_dim, activation='linear')(decoded)

    # Regression head reading the encoding
    regression_output = Dense(1, activation='linear')(encoded)

    model = Model(inputs=inputs, outputs=regression_output)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    return model

# Accumulates one metrics record per (network, split) during training
results_nn = []

# Architectures to train, as (display name, builder) pairs
nn_architectures = [
    ('DNN', dnn),
    ('FeatureAttentionNN', feature_attention_nn),
    ('ResNet', resnet),
    ('Transformer', transformer),
]
model_functions = [builder for _, builder in nn_architectures]
model_names = [display_name for display_name, _ in nn_architectures]

In [None]:
# Save architectures as images for visualization
architecture_dir = "architectures"

# Define the models and their corresponding functions
model_functions = [dnn, feature_attention_nn, resnet, transformer]
model_names = ['DNN', 'FeatureAttentionNN', 'ResNet', 'Transformer']

# One panel per architecture. The panel count follows the list length instead of a
# hard-coded 4; squeeze=False + ravel keeps `axs` a 1-D array even for a single model.
n_models = len(model_functions)
fig, axs = plt.subplots(1, n_models, figsize=(5 * n_models, 5), squeeze=False)
axs = axs.ravel()

for idx, (model_fn, model_name) in enumerate(zip(model_functions, model_names)):
    # Build an untrained instance only to render its layer graph
    model = model_fn(input_dim)

    # Save the model architecture
    architecture_path = f"{architecture_dir}/{model_name}_architecture.png"
    plot_model(model, to_file=architecture_path, show_shapes=True, show_layer_names=True)

    # Display the saved image in this model's panel
    img = plt.imread(architecture_path)
    axs[idx].imshow(img)
    axs[idx].axis('off')
    axs[idx].set_title(model_name)

# Display all architectures in a single row
plt.tight_layout()
plt.show()

In [None]:
# Loop through models and splits
# NOTE(review): the test set is passed as `validation_data`, so EarlyStopping (with
# restore_best_weights=True) and ReduceLROnPlateau are driven by the same data the
# metrics are reported on. Consider holding out part of the training split instead.
nn_splits = [
    # (split name, scaled train X, train y, scaled test X, test y, unscaled test X)
    ("Random Split", X_train_rand_scaled, y_train_rand, X_test_rand_scaled, y_test_rand, X_test_rand),
    ("Temporal Split", X_train_temp_scaled, y_train_temp, X_test_temp_scaled, y_test_temp, X_test_temp),
]

for split_name, X_train_scaled, y_train, X_test_scaled, y_test, X_test_raw in nn_splits:
    for model_fn, model_name in zip(model_functions, model_names):
        print(f"\n Evaluate {model_name} - {split_name}")
        # Initialize and train the model
        model = model_fn(input_dim)
        early_stopping = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-6)

        model.fit(
            X_train_scaled, y_train,
            validation_data=(X_test_scaled, y_test),
            epochs=2000, batch_size=32, verbose=0,
            callbacks=[early_stopping, reduce_lr]
        )

        # Predict and calculate metrics
        y_pred = model.predict(X_test_scaled).flatten()
        metrics = calculate_metrics(y_test, y_pred)

        # One record per (model, split); the columns carry the split name
        results_nn.append({
            'Model': model_name,
            f"R2_{split_name}": metrics['R2'],
            f"MAE_{split_name}": metrics['MAE'],
            f"MAPE_{split_name}": metrics['MAPE'],
            f"RMSE_{split_name}": metrics['RMSE']
        })

        # Grouped Metrics: score THIS network's predictions per crop. The unscaled test
        # frame supplies the 'crop' column. Previously the frames left over from the
        # last classical model were reused, so every network got that model's metrics.
        nn_eval_df = X_test_raw.copy()
        nn_eval_df['y_true'], nn_eval_df['y_pred'] = y_test, y_pred
        grouped_results.setdefault(model_name, {})[f"{split_name} Crop"] = calculate_grouped_metrics(
            nn_eval_df, 'crop', 'y_true', 'y_pred'
        )

        # Save the model.
        # NOTE(review): this yields e.g. "DNN_Random_Split.keras", while the ensemble
        # loaders look for lower-case "<name>_random_split.keras". The file name is
        # kept as is so existing consumers of these files keep working.
        model.save(f"models/{model_name}_{split_name.replace(' ', '_')}.keras")

# One record per (model, split): each row only fills the columns of its own split
results_nn_combined = pd.DataFrame(results_nn)

# Collapse to one row per model by taking the first non-null value of every column
results_nn_final = results_nn_combined.groupby('Model').first().reset_index()

# Model Evaluation

In [None]:
# Stack the classical-model table and the neural-network table into one leaderboard
all_model_tables = [results_pivot, results_nn_final]
combined_results = pd.concat(all_model_tables, ignore_index=True)
combined_results

In [None]:
# Selecting the top 3 models based on MAPE for Random and Temporal Splits
top_3_random_models = combined_results.nsmallest(3, "MAPE_Random Split")["Model"].tolist()
top_3_temporal_models = combined_results.nsmallest(3, "MAPE_Temporal Split")["Model"].tolist()

# Function to plot and save prediction vs actual values
def plot_and_save(model_name, y_test, y_pred, split_name):
    """Scatter predicted against actual values, save the figure under results/ and show it.

    Parameters:
    - model_name: model label used in the title and file name.
    - y_test: observed values.
    - y_pred: predicted values, same length as `y_test`.
    - split_name: split label used in the title and file name.
    """
    lowest, highest = min(y_test), max(y_test)
    plt.figure(figsize=(7, 7))
    plt.scatter(y_test, y_pred, alpha=0.7, label="Predicted vs Actual")
    # Identity line: points on it are predicted exactly
    plt.plot([lowest, highest], [lowest, highest], 'r--', label="Perfect Prediction")
    plt.title(f"Predicted vs Actual - {model_name} - {split_name}")
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.legend()
    plt.grid()
    plt.tight_layout()
    plt.savefig(f"results/Predicted vs Actual - {model_name} - {split_name}.png")
    plt.show()
    plt.close()

# Function to extract predictions and ground truth
def get_predictions_and_actuals(model_name, split_type):
    """Return (y_test, y_pred) of `model_name` for the given split.

    Classical models are looked up in the stored prediction lists by their position
    in `models` (the lists were filled in that order). Looking them up by rank within
    the top-3 list returned the predictions of unrelated models. Networks, which have
    no stored predictions, are reloaded from their saved .keras file and evaluated on
    the scaled test features.

    Parameters:
    - model_name: a key of `models` or the name of a saved neural network.
    - split_type: "Random Split" or "Temporal Split".

    Raises:
    - ValueError: unknown `split_type`.
    - RuntimeError: the stored predictions are no longer per-model lists.
    - FileNotFoundError: no stored predictions and no saved network for the model.
    """
    if split_type == "Random Split":
        y_test, stored_preds, X_test_scaled = y_test_rand, y_pred_ensemble_rand, X_test_rand_scaled
    elif split_type == "Temporal Split":
        y_test, stored_preds, X_test_scaled = y_test_temp, y_pred_ensemble_temp, X_test_temp_scaled
    else:
        raise ValueError(f"Unknown split type: {split_type!r}")

    ml_model_names = list(models)
    if model_name in ml_model_names:
        # The ensemble cell later rebinds these names to a single blended array.
        if not isinstance(stored_preds, list):
            raise RuntimeError(
                "Per-model predictions were overwritten; re-run the model evaluation cell first."
            )
        return y_test, stored_preds[ml_model_names.index(model_name)]

    nn_path = f"models/{model_name}_{split_type.replace(' ', '_')}.keras"
    if not os.path.exists(nn_path):
        raise FileNotFoundError(f"No stored predictions or saved model for {model_name} ({split_type})")
    nn_model = tf.keras.models.load_model(nn_path)
    return y_test, nn_model.predict(X_test_scaled, verbose=0).flatten()

# Plotting and saving the predictions for the top models
for split_label, top_models in (
    ("Random Split", top_3_random_models),
    ("Temporal Split", top_3_temporal_models),
):
    for model_name in top_models:
        y_test, y_pred = get_predictions_and_actuals(model_name, split_label)
        plot_and_save(model_name, y_test, y_pred, split_label)

# Crop-Specific Model Evaluation

In [None]:
wide_table_with_crops = []

# Iterate through each model and its splits
for model, splits in grouped_results.items():
    # Iterate through each split (e.g., 'Random Split Crop', 'Temporal Split Crop')
    for split, crop_metrics in splits.items():
        # `crop_metrics` comes from groupby('crop').apply(...), so `.items()` yields
        # (crop label, metrics dict). Using the label instead of a positional counter
        # keeps rows correct when a crop is missing from a split, and the loop variable
        # is no longer called `data`, which used to overwrite the dataset DataFrame.
        for crop_label, metrics in crop_metrics.items():
            row = {'Model': model, 'Split': split, 'Crop': crop_label}
            row.update(metrics)
            wide_table_with_crops.append(row)

# Convert the table list into a DataFrame
wide_results_table_with_crops = pd.DataFrame(wide_table_with_crops)

# Pivot the DataFrame to the desired wide format
wide_results_table = wide_results_table_with_crops.pivot(
    index=['Model', 'Crop'],
    columns='Split',
    values=['R2', 'MAE', 'MAPE', 'RMSE']
)

# Flatten the MultiIndex columns
wide_results_table.columns = [
    f"{metric}_{split}" for metric, split in wide_results_table.columns
]

# Reset index to make it a clean DataFrame
wide_results_table = wide_results_table.reset_index()
wide_results_table

# Ensemble Learning

In [None]:
# Define helper function to load models
def load_model(filepath):
    """Load a persisted model, choosing the loader from the file extension.

    Parameters:
    - filepath: path ending in ".pkl" (joblib dump) or ".keras" (Keras model).

    Returns:
    - the deserialized model object.

    Raises:
    - ValueError: the extension is neither ".pkl" nor ".keras".

    NOTE: joblib/pickle files can execute arbitrary code when loaded; only load files
    produced by this notebook.
    """
    if filepath.endswith(".pkl"):
        return joblib.load(filepath)  # For machine learning models
    elif filepath.endswith(".keras"):
        # `keras` itself is never imported in this notebook; go through the imported `tf`.
        return tf.keras.models.load_model(filepath)  # For neural network models
    else:
        raise ValueError(f"Unsupported file format for {filepath}")

## Unified Ensemble Learning

In [None]:
# Build Ensemble of Top N Models
ensemble_sizes = list(range(2, 6))
ensemble_results = []


def _weighted_ensemble_prediction(member_names, member_weights, split_suffix, X_test):
    """Blend the predictions of saved models with the given weights.

    Parameters:
    - member_names: model names; each is loaded from models/<name>_<split_suffix>.pkl
      or .keras (first one found).
    - member_weights: one weight per member, in the same order.
    - split_suffix: "random_split" or "temporal_split".
    - X_test: features passed to every member's `predict`.

    Returns:
    - 1-D float64 array with the weighted average prediction.

    Raises:
    - RuntimeError: none of the members could be loaded.
    """
    # Float accumulator: deriving it from the target with zeros_like would inherit an
    # integer dtype and make the in-place float addition fail.
    blended = np.zeros(len(X_test), dtype=np.float64)
    used_weight = 0.0
    for member_name, member_weight in zip(member_names, member_weights):
        base_path = f"models/{member_name}_{split_suffix}"
        for extension in (".pkl", ".keras"):
            if os.path.exists(base_path + extension):
                member = load_model(base_path + extension)
                break
        else:
            print(f"Warning: no saved model found for {member_name} ({split_suffix}); skipped")
            continue
        blended += member_weight * np.asarray(member.predict(X_test), dtype=np.float64).ravel()
        used_weight += member_weight
    if used_weight == 0:
        raise RuntimeError(f"No ensemble member could be loaded for {split_suffix}")
    # Renormalise so skipped members do not shrink the prediction
    return blended / used_weight


# Rank the candidate models once per split by MAPE (lower is better)
candidate_results = {k: v for k, v in results.items() if 'Ensemble' not in k}
ranked_rand = sorted(candidate_results, key=lambda name: candidate_results[name]['Random Split']['MAPE'])
ranked_temp = sorted(candidate_results, key=lambda name: candidate_results[name]['Temporal Split']['MAPE'])

for n in ensemble_sizes:
    # Select top N models for Random and Temporal splits based on MAPE
    top_model_names_rand = ranked_rand[:n]
    top_model_names_temp = ranked_temp[:n]

    # Compute weights for the ensemble (inverse of MAPE)
    weights_rand = np.array([1 / results[model]['Random Split']['MAPE'] for model in top_model_names_rand])
    weights_temp = np.array([1 / results[model]['Temporal Split']['MAPE'] for model in top_model_names_temp])

    # Normalize weights to sum to 1
    weights_rand /= weights_rand.sum()
    weights_temp /= weights_temp.sum()

    # Compute ensemble predictions (weighted average).
    # NOTE: these names held the per-model prediction lists of the evaluation cell and
    # are rebound here to a single blended array.
    y_pred_ensemble_rand = _weighted_ensemble_prediction(
        top_model_names_rand, weights_rand, "random_split", X_test_rand
    )
    y_pred_ensemble_temp = _weighted_ensemble_prediction(
        top_model_names_temp, weights_temp, "temporal_split", X_test_temp
    )

    # Calculate ensemble metrics
    ensemble_rand_metrics = calculate_metrics(y_test_rand, y_pred_ensemble_rand)
    ensemble_temp_metrics = calculate_metrics(y_test_temp, y_pred_ensemble_temp)

    # Store results including model names
    ensemble_results.append({
        'Model': f"Ensemble_{n}",
        'Number of Models': n,
        'Model Names (Random Split)': ', '.join(top_model_names_rand),
        'Model Names (Temporal Split)': ', '.join(top_model_names_temp),
        **{f"{metric}_Random Split": value for metric, value in ensemble_rand_metrics.items()},
        **{f"{metric}_Temporal Split": value for metric, value in ensemble_temp_metrics.items()}
    })

# Convert results into a DataFrame
ensemble_results_df = pd.DataFrame(ensemble_results)
ensemble_results_df

## Diebold-Mariano Test of Ensemble Learning 

In [None]:
# Perform Diebold-Mariano Test for Top Ensembles for Random and Temporal Splits
dm_test_results = []

# Define DM Test function
def dm_test(y_true, y_pred1, y_pred2, h=1, loss="MSE"):
    """
    Perform Diebold-Mariano test.

    Parameters:
    - y_true: Array-like, true values.
    - y_pred1: Array-like, predictions from Model 1.
    - y_pred2: Array-like, predictions from Model 2.
    - h: Int >= 1, forecast horizon (default=1); autocovariances of the loss
      differential up to lag h-1 enter the variance estimate.
    - loss: String, loss function ('MSE'; any other value uses absolute error).

    Returns:
    - DM statistic and two-sided p-value from the standard normal distribution.

    Raises:
    - ValueError: if h < 1.

    All inputs are converted to flat float arrays first. With a pandas Series the
    lagged products would align on index labels instead of positions (wrong
    autocovariances for h > 1), and (n, 1) predictions would broadcast to (n, n).
    """
    if h < 1:
        raise ValueError("h must be a positive integer")

    y_true = np.asarray(y_true, dtype=np.float64).ravel()
    e1 = y_true - np.asarray(y_pred1, dtype=np.float64).ravel()
    e2 = y_true - np.asarray(y_pred2, dtype=np.float64).ravel()

    # Loss differential between the two models
    d = e1**2 - e2**2 if loss == "MSE" else np.abs(e1) - np.abs(e2)
    d_mean = np.mean(d)
    n = len(d)

    # gamma[k] is the lag-k autocovariance of d; lag 0 is the (population) variance
    gamma = np.zeros(h)
    gamma[0] = np.var(d)
    for lag in range(1, h):
        gamma[lag] = np.sum((d[:-lag] - d_mean) * (d[lag:] - d_mean)) / (n - lag)

    dm_stat = d_mean / np.sqrt((gamma[0] + 2 * np.sum(gamma[1:])) / n)
    p_value = 2 * (1 - norm.cdf(np.abs(dm_stat)))
    return dm_stat, p_value

# Extract top #1 ensemble for Random and Temporal splits.
# Rows of ensemble_results_df are ordered by ensemble size, not by score, so the best
# ensemble of each split is the row with the lowest MAPE rather than the first row.
best_rand_row = ensemble_results_df['MAPE_Random Split'].idxmin()
best_temp_row = ensemble_results_df['MAPE_Temporal Split'].idxmin()
top_random_ensemble = ensemble_results_df.loc[best_rand_row]
top_temporal_ensemble = ensemble_results_df.loc[best_temp_row]

# Parse model names for each ensemble
top_random_models = top_random_ensemble['Model Names (Random Split)'].split(', ')
top_temporal_models = top_temporal_ensemble['Model Names (Temporal Split)'].split(', ')

# Predictions dictionary for ensembles
model_predictions = {
    'Random Split': {},
    'Temporal Split': {}
}

# Helper function to load predictions for all models in an ensemble
def get_ensemble_predictions(models, X_test, split_type):
    """Load each saved model of an ensemble and predict on `X_test`.

    Parameters:
    - models: iterable of model names.
    - X_test: features passed to every model's `predict`.
    - split_type: split label; spaces become underscores and it is lower-cased to
      form the file suffix (e.g. 'random_split').

    Returns:
    - dict mapping model name -> 1-D array of predictions. Models without a saved
      .pkl or .keras file are reported and left out.
    """
    split_suffix = split_type.replace(' ', '_').lower()
    predictions = {}
    for model_name in models:
        model_path = f"models/{model_name}_{split_suffix}"
        if os.path.exists(f"{model_path}.pkl"):
            model = load_model(f"{model_path}.pkl")
        elif os.path.exists(f"{model_path}.keras"):
            model = load_model(f"{model_path}.keras")
        else:
            # Report the gap instead of skipping silently
            print(f"Warning: no saved model found at {model_path}.pkl/.keras; skipped")
            continue
        # Flatten so (n, 1) network outputs line up with the 1-D targets
        predictions[model_name] = np.asarray(model.predict(X_test)).ravel()
    return predictions

# Load predictions for Random and Temporal splits
model_predictions['Random Split'] = get_ensemble_predictions(top_random_models, X_test_rand, 'random_split')
model_predictions['Temporal Split'] = get_ensemble_predictions(top_temporal_models, X_test_temp, 'temporal_split')

# DM Test for all model pairs within each split
for split_type, y_test in [("Random Split", y_test_rand), ("Temporal Split", y_test_temp)]:
    predictions = model_predictions[split_type]
    # Flatten once per split so (n, 1) predictions and pandas targets line up
    # element-wise instead of broadcasting or aligning on index labels.
    y_true = np.asarray(y_test, dtype=float).ravel()
    model_names = list(predictions.keys())

    for i, model_name_1 in enumerate(model_names):
        # Only pairs with j > i: no self-comparisons, no mirrored duplicates.
        for model_name_2 in model_names[i + 1:]:
            y_pred1 = np.asarray(predictions[model_name_1], dtype=float).ravel()
            y_pred2 = np.asarray(predictions[model_name_2], dtype=float).ravel()

            # Calculate DM test statistics
            dm_stat, p_value = dm_test(y_true, y_pred1, y_pred2, loss="MSE")

            # Store results
            dm_test_results.append({
                'Model 1': model_name_1,
                'Model 2': model_name_2,
                'Split': split_type,
                'DM Statistic': dm_stat,
                'P-Value': p_value
            })

# Convert results to DataFrame. Explicit columns keep the sort below valid even
# when no model pair was available (e.g. a single-model ensemble).
dm_test_results_df = pd.DataFrame(
    dm_test_results,
    columns=['Model 1', 'Model 2', 'Split', 'DM Statistic', 'P-Value']
)

# Display DM Test Results Table
dm_test_results_df = dm_test_results_df.sort_values(by=['Split', 'P-Value'])
dm_test_results_df

## Crop-Specific Ensemble Model Evaluation

In [None]:
ensemble_results_crops = []


def _weighted_ensemble_predict(model_names, weights, X_subset, split_suffix):
    """
    Weighted sum of the predictions of the saved models for one split.

    Models are looked up as models/<name>_<split_suffix>.pkl, then .keras, and
    opened with the notebook's `load_model` helper. Each prediction is
    flattened to 1-D before it is accumulated.

    Returns (combined_prediction, skipped_model_names). When some models have
    no saved file, the sum is rescaled by the weight actually used so the
    remaining weights still add up to one.
    """
    combined = np.zeros(len(X_subset), dtype=float)
    used_weight = 0.0
    skipped = []
    for model_name, weight in zip(model_names, weights):
        model_path = f"models/{model_name}_{split_suffix}"
        if os.path.exists(f"{model_path}.pkl"):
            model = load_model(f"{model_path}.pkl")
        elif os.path.exists(f"{model_path}.keras"):
            model = load_model(f"{model_path}.keras")
        else:
            skipped.append(model_name)
            continue

        # ravel() keeps (n, 1) outputs from breaking the in-place accumulation.
        combined += weight * np.asarray(model.predict(X_subset), dtype=float).ravel()
        used_weight += weight

    # Rescale only when something was skipped, so a complete ensemble is
    # numerically untouched.
    if skipped and used_weight > 0:
        combined /= used_weight
    return combined, skipped


for n in ensemble_sizes:
    for crop in wide_results_table_with_crops['Crop'].unique():
        crop_results = wide_results_table_with_crops[wide_results_table_with_crops['Crop'] == crop]

        # Select top N models for Random and Temporal splits
        sorted_models_rand = crop_results[crop_results['Split'] == 'Random Split Crop'].nsmallest(n, 'MAPE')
        sorted_models_temp = crop_results[crop_results['Split'] == 'Temporal Split Crop'].nsmallest(n, 'MAPE')

        top_model_names_rand = sorted_models_rand['Model'].tolist()
        top_model_names_temp = sorted_models_temp['Model'].tolist()

        # Inverse-MAPE weights, normalised to sum to one.
        weights_rand = np.array([1 / mape for mape in sorted_models_rand['MAPE']])
        weights_temp = np.array([1 / mape for mape in sorted_models_temp['MAPE']])

        weights_rand /= weights_rand.sum()
        weights_temp /= weights_temp.sum()

        # Restrict the test sets to the rows of the current crop.
        rand_mask = X_test_rand['crop'] == crop
        temp_mask = X_test_temp['crop'] == crop
        X_test_rand_crop = X_test_rand[rand_mask]
        X_test_temp_crop = X_test_temp[temp_mask]
        y_test_rand_crop = y_test_rand[rand_mask]
        y_test_temp_crop = y_test_temp[temp_mask]

        # Aggregate predictions for Random and Temporal splits
        y_pred_ensemble_rand, skipped_rand = _weighted_ensemble_predict(
            top_model_names_rand, weights_rand, X_test_rand_crop, "random_split"
        )
        y_pred_ensemble_temp, skipped_temp = _weighted_ensemble_predict(
            top_model_names_temp, weights_temp, X_test_temp_crop, "temporal_split"
        )
        for split_label, skipped in (("Random Split", skipped_rand), ("Temporal Split", skipped_temp)):
            if skipped:
                print(f"Ensemble_{n} / {crop} / {split_label}: no saved model for {', '.join(skipped)}")

        # Calculate metrics for Random and Temporal Splits
        ensemble_rand_metrics = calculate_metrics(y_test_rand_crop, y_pred_ensemble_rand)
        ensemble_temp_metrics = calculate_metrics(y_test_temp_crop, y_pred_ensemble_temp)

        # Append results
        ensemble_results_crops.append({
            'Model': f"Ensemble_{n}",
            'Crop': crop,
            'Number of Models': n,
            'Model Names (Random Split)': ', '.join(top_model_names_rand),
            'Model Names (Temporal Split)': ', '.join(top_model_names_temp),
            **{f"{metric}_Random Split Crop": value for metric, value in ensemble_rand_metrics.items()},
            **{f"{metric}_Temporal Split Crop": value for metric, value in ensemble_temp_metrics.items()}
        })

# Convert results to DataFrame
ensemble_results_crops_df = pd.DataFrame(ensemble_results_crops)
ensemble_results_crops_df

# Feature Importance and Interpretability Analysis
## Feature Analysis

In [None]:
# Set flag to include/exclude temporal models in analysis
include_temporal = True  # Set to False to exclude temporal models

# Candidate models: everything except ensemble and stacking models.
is_base_model = (
    ~results_pivot["Model"].str.contains("Ensemble") & ~results_pivot["Model"].str.contains("Stacking")
)
base_models = results_pivot[is_base_model]

# Top 3 for Random Split
top_models_random = base_models.nsmallest(3, "MAPE_Random Split")

# Top 3 for Temporal Split, if enabled
if include_temporal:
    top_models_temporal = base_models.nsmallest(3, "MAPE_Temporal Split")

# Combine the models while maintaining uniqueness
unique_top_models = set(top_models_random["Model"])
if include_temporal:
    unique_top_models.update(top_models_temporal["Model"])

# Placeholder for storing analysis results
feature_analysis_results = {}


def _analyze_fitted_model(model, X_test, y_test):
    """
    Run the three importance analyses on one fitted model.

    Returns (feature_importances, permutation_importance_mean, shap_values);
    all three are None when `model` is None. Any exception propagates to the
    caller.
    """
    if model is None:
        return None, None, None

    feature_importance = model.feature_importances_ if hasattr(model, "feature_importances_") else None

    # Seeded so a re-run of this cell alone reproduces the same shuffles.
    perm_result = permutation_importance(
        model, X_test, y_test, scoring='neg_mean_absolute_error', random_state=seed
    )

    shap_values = shap.TreeExplainer(model).shap_values(X_test)
    return feature_importance, perm_result.importances_mean, shap_values


# Loop through unique models for analysis (sorted: set order is not stable across runs)
for model_name in sorted(unique_top_models):
    print(f"\nEvaluating {model_name}")
    model_path_random = f"models/{model_name}_random_split.pkl"
    model_path_temporal = f"models/{model_name}_temporal_split.pkl"

    # Initialize model variables
    model_random = None
    model_temporal = None

    # Load models safely
    try:
        if os.path.exists(model_path_random):
            model_random = joblib.load(model_path_random)
        if include_temporal and os.path.exists(model_path_temporal):
            model_temporal = joblib.load(model_path_temporal)
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        continue

    try:
        (feature_importance_random,
         perm_importance_random,
         shap_values_random) = _analyze_fitted_model(model_random, X_test_rand, y_test_rand)

        # The temporal model stays None when include_temporal is False, so
        # every temporal result is None in that case.
        (feature_importance_temporal,
         perm_importance_temporal,
         shap_values_temporal) = _analyze_fitted_model(model_temporal, X_test_temp, y_test_temp)

        # Store results
        feature_analysis_results[model_name] = {
            "feature_importance_random": feature_importance_random,
            "feature_importance_temporal": feature_importance_temporal,
            "perm_importance_random": perm_importance_random,
            "perm_importance_temporal": perm_importance_temporal,
            "shap_values_random": shap_values_random,
            "shap_values_temporal": shap_values_temporal,
        }
    except Exception as e:
        # Any failing step drops the whole model from the results.
        print(f"Error analyzing model {model_name}: {e}")
        continue

## Importance Plotting

In [None]:
# Create a directory to save plots if it doesn't already exist
feature_analysis_dir = "feature_analysis"
os.makedirs(feature_analysis_dir, exist_ok=True)

# Define the number of top features to display
TOP_FEATURES = 25


def _plot_top_importances(importances, feature_names, title, plot_path, top_n=TOP_FEATURES):
    """Draw, save and show a bar chart of the `top_n` largest importance scores."""
    importances = np.asarray(importances)
    top_idx = np.argsort(-importances)[:top_n]
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(x=importances[top_idx], y=feature_names[top_idx], ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Importance Score")
    ax.set_ylabel("Features")
    plt.tight_layout()
    plt.savefig(plot_path)
    plt.show()
    plt.close(fig)


def _plot_shap_summary(shap_values, X, title, plot_path, top_n=TOP_FEATURES):
    """Draw, save and show a SHAP summary plot limited to `top_n` features."""
    fig, ax = plt.subplots(figsize=(12, 8))
    shap.summary_plot(shap_values, X, max_display=top_n, show=False)
    plt.title(title)
    plt.savefig(plot_path)
    plt.show()
    plt.close(fig)


# (result-key suffix, title label, feature names, test features) per split
split_specs = [("random", "Random Split", X_train_rand.columns, X_test_rand)]
if include_temporal:
    split_specs.append(("temporal", "Temporal Split", X_train_temp.columns, X_test_temp))

# Loop through top models for visualization
for model_name, results in feature_analysis_results.items():
    for split_key, split_label, feature_names, X_test_split in split_specs:
        feature_importance = results.get(f"feature_importance_{split_key}")
        if feature_importance is not None:
            _plot_top_importances(
                feature_importance,
                feature_names,
                f"Feature Importance - {model_name} - {split_label}",
                os.path.join(feature_analysis_dir, f"{model_name}_feature_importance_{split_key}.png"),
            )

        perm_importance = results.get(f"perm_importance_{split_key}")
        if perm_importance is not None:
            _plot_top_importances(
                perm_importance,
                feature_names,
                f"Permutation Importance - {model_name} - {split_label}",
                os.path.join(feature_analysis_dir, f"{model_name}_permutation_importance_{split_key}.png"),
            )

        shap_values = results.get(f"shap_values_{split_key}")
        if shap_values is not None:
            _plot_shap_summary(
                shap_values,
                X_test_split,
                f"SHAP Summary - {model_name} - {split_label}",
                os.path.join(feature_analysis_dir, f"{model_name}_shap_{split_key}.png"),
            )

In [None]:
# Prepare a list to hold feature importance data for all models
feature_analysis_data = []

# (split label, key in the per-model results, feature names) for each split
split_columns = (
    ("Random", "feature_importance_random", X_train_rand.columns),
    ("Temporal", "feature_importance_temporal", X_train_temp.columns),
)

# Loop through each model and extract feature importance results
for model_name, results in feature_analysis_results.items():
    for split_label, result_key, feature_names in split_columns:
        importances = results.get(result_key)
        # The stored importance is None when the model exposes no
        # feature_importances_ or was not loaded for this split; skip it
        # rather than index into None.
        if importances is None:
            continue
        for feature_name, importance in zip(feature_names, importances):
            feature_analysis_data.append({
                "Model": model_name,
                "Split": split_label,
                "Feature": feature_name,
                "Importance": importance
            })

# Convert the list of dictionaries to a DataFrame. Explicit columns keep the
# sort below valid even when no model contributed any rows.
feature_analysis_df = pd.DataFrame(
    feature_analysis_data, columns=["Model", "Split", "Feature", "Importance"]
)
feature_analysis_df.sort_values(by="Importance", ascending=False)

## Random-Temporal Importance Comparison

In [None]:
# Mean importance of every feature across models, computed separately per split
is_random_row = feature_analysis_df['Split'] == 'Random'
is_temporal_row = feature_analysis_df['Split'] == 'Temporal'
random_split_importance = feature_analysis_df[is_random_row].groupby('Feature')['Importance'].mean()
temporal_split_importance = feature_analysis_df[is_temporal_row].groupby('Feature')['Importance'].mean()

# One row per feature of the random split; temporal values are looked up by
# feature name and are NaN where the temporal split has no such feature.
temporal_aligned = temporal_split_importance.reindex(random_split_importance.index)
importance_comparison = pd.DataFrame({
    'Feature': random_split_importance.index,
    'Random Importance': random_split_importance.values,
    'Temporal Importance': temporal_aligned.values
})

# Signed and absolute gap between the two splits
signed_gap = importance_comparison['Random Importance'] - importance_comparison['Temporal Importance']
importance_comparison['Difference'] = signed_gap
importance_comparison['Absolute Difference'] = importance_comparison['Difference'].abs()

# Largest gaps first; the final expression is what the cell displays
importance_comparison = importance_comparison.sort_values(by='Absolute Difference', ascending=False)
importance_comparison.sort_values(by="Absolute Difference", ascending=False)