In [1]:
import os
import numpy as np
import pandas as pd
import rasterio
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from datetime import datetime

In [24]:
import os
from datetime import datetime
import rasterio
import numpy as np
import pandas as pd

# Base folder containing all drone image files
image_folder = r"D:\Yield\Estimation\DroneData"
records = []

# List all .tif files within the specified drone image folder
files = [f for f in os.listdir(image_folder) if f.endswith('.tif')]
files.sort()

for file in files:
    file_path = os.path.join(image_folder, file)

    try:
        with rasterio.open(file_path) as src:
            red = src.read(1).astype('float32')
            green = src.read(2).astype('float32')
            blue = src.read(3).astype('float32')

        # Calculate Vegetation Indices
        exg = 2 * green - red - blue
        gli = (2 * green - red - blue) / (2 * green + red + blue + 1e-10)
        vari = (green - red) / (green + red - blue + 1e-10)
        tgi = green - 0.39 * red - 0.61 * blue
        sum_rgb = red + green + blue + 1e-10
        r_norm = red / sum_rgb
        g_norm = green / sum_rgb
        b_norm = blue / sum_rgb
        cive = 0.441 * r_norm - 0.881 * g_norm + 0.385 * b_norm + 18.78745

        # Clean up potential infinite or NaN values
        exg = np.where(np.isfinite(exg), exg, np.nan)
        gli = np.where(np.isfinite(gli), gli, np.nan)
        vari = np.where(np.isfinite(vari), vari, np.nan)
        tgi = np.where(np.isfinite(tgi), tgi, np.nan)
        cive = np.where(np.isfinite(cive), cive, np.nan)

        # Calculate Mean Values (using UNMASKED data)
        mean_exg = np.nanmean(exg)
        mean_gli = np.nanmean(gli)
        mean_vari = np.nanmean(vari)
        mean_tgi = np.nanmean(tgi)
        mean_cive = np.nanmean(cive)
        mean_red = np.nanmean(red)
        mean_green = np.nanmean(green)
        mean_blue = np.nanmean(blue)

        # Extract polygon name from filename
        try:
            polygon = file.split('.')[0]
            date = None
        except Exception as e:
            print(f"Error extracting polygon name from '{file}': {e}. Polygon set to None.")
            polygon = None
            date = None

        # Append extracted features and metadata to records list
        records.append({
            'polygon': polygon,
            'date': date,
            'filename': file,
            'mean_ExG': mean_exg,
            'mean_GLI': mean_gli,
            'mean_VARI': mean_vari,
            'mean_TGI': mean_tgi,
            'mean_CIVE': mean_cive,
            'mean_Red': mean_red,
            'mean_Green': mean_green,
            'mean_Blue': mean_blue,
        })

    except rasterio.errors.RasterioIOError as e:
        print(f"Error opening or reading raster file '{file_path}': {e}. Skipping this file.")
    except Exception as e:
        print(f"An unexpected error occurred while processing '{file_path}': {e}. Skipping this file.")

# Create DataFrame from collected records
df_indices = pd.DataFrame(records)

print("✅ Extracted vegetation indices and features from drone imagery (no masking, no standard deviations):")
print(df_indices.head())
print("\nUnique polygons found in drone data:")
print(df_indices['polygon'].unique())


✅ Extracted vegetation indices and features from drone imagery (no masking, no standard deviations):
  polygon  date    filename   mean_ExG  mean_GLI  mean_VARI   mean_TGI  \
0  Munda1  None  Munda1.tif  29.988546  0.084960   0.005946  18.162916   
1  Munda2  None  Munda2.tif  16.802973  0.040981  -0.038630  11.502631   
2  Munda3  None  Munda3.tif  29.551044  0.077395  -0.044689  19.322803   
3  Munda4  None  Munda4.tif  21.299973  0.058161  -0.026116  13.652133   
4  Munda5  None  Munda5.tif  29.342831  0.077635   0.011176  17.655802   

   mean_CIVE   mean_Red  mean_Green  mean_Blue  
0  18.727116  72.174706   72.766083  43.368896  
1  18.757044  64.890701   59.196167  36.698555  
2  18.732512  88.398834   82.504814  47.059834  
3  18.746469  58.684658   55.688457  31.392330  
4  18.732330  68.685951   69.791962  41.555145  

Unique polygons found in drone data:
['Munda1' 'Munda2' 'Munda3' 'Munda4' 'Munda5' 'Munda6' 'Munda7']


In [27]:
import pandas as pd # Ensure pandas is imported if not already in the environment

# --- Load and Merge Polygon Area Data ---
df_area = pd.read_csv("polygon_areas.csv")

# Merge the area data (df_area) with your drone vegetation index features (df_indices).
# 'how='left'' ensures all rows from df_indices are kept.
df_indices = pd.merge(df_indices, df_area, on='polygon', how='left')

# Check for any polygons missing area values after the merge.
missing_area = df_indices[df_indices['area_ha'].isna()]
if not missing_area.empty:
    print("Warning: Some polygons in your drone dataset are missing area values:")
    print(missing_area['polygon'].unique())
    print("Please ensure these polygons have entries in 'polygon_areas.csv'.")

print("✅ Merged drone vegetation index features with polygon area data:")
print(df_indices.head())


# --- Load and Merge Yield Data ---
df_yield = pd.read_csv("yield_tons.csv")

# Merge the drone features (df_indices, now including 'area_ha') with the yield data.
df = pd.merge(df_indices, df_yield, on='polygon', how='inner')

print("\nAvailable columns after merging with yield data:")
print(df.columns.tolist())

# Drop rows with any missing data in specified columns.
df = df.dropna(subset=[
    'mean_ExG', 'mean_GLI', 'mean_VARI',
    'mean_TGI', 'mean_CIVE',
    'mean_Red', 'mean_Green', 'mean_Blue',
    'area_ha', 'yield_tons'
])

print("\n✅ Per-image data ready for modeling (drone data):")
print(df.head())

print("\nPolygons included in the training dataset:")
print(df['polygon'].unique())

# --- Prepare df_summary for Cross-Validation ---
# Select columns for df_summary.
df_summary = df[[
    'polygon',
    'mean_ExG',
    'mean_GLI',
    'mean_VARI',
    'mean_TGI',
    'mean_CIVE',
    'mean_Red',
    'mean_Green',
    'mean_Blue',
    'area_ha',
    'yield_tons',
    'filename'
]].copy()

# Drop any rows with missing values in df_summary.
df_summary = df_summary.dropna()

print("\n✅ Image-level training dataset (for drone data - df_summary):")
print(df_summary.head())


✅ Merged drone vegetation index features with polygon area data:
  polygon  date    filename   mean_ExG  mean_GLI  mean_VARI   mean_TGI  \
0  Munda1  None  Munda1.tif  29.988546  0.084960   0.005946  18.162916   
1  Munda2  None  Munda2.tif  16.802973  0.040981  -0.038630  11.502631   
2  Munda3  None  Munda3.tif  29.551044  0.077395  -0.044689  19.322803   
3  Munda4  None  Munda4.tif  21.299973  0.058161  -0.026116  13.652133   
4  Munda5  None  Munda5.tif  29.342831  0.077635   0.011176  17.655802   

   mean_CIVE   mean_Red  mean_Green  mean_Blue   area_ha  
0  18.727116  72.174706   72.766083  43.368896  0.101974  
1  18.757044  64.890701   59.196167  36.698555  0.339510  
2  18.732512  88.398834   82.504814  47.059834  0.308073  
3  18.746469  58.684658   55.688457  31.392330  0.289367  
4  18.732330  68.685951   69.791962  41.555145  0.185417  

Available columns after merging with yield data:
['polygon', 'date', 'filename', 'mean_ExG', 'mean_GLI', 'mean_VARI', 'mean_TGI', 'mean

In [29]:
# Load yield data.
df_yield = pd.read_csv("yield_tons.csv")

# Merge drone feature data with yield data.
df = pd.merge(df_indices, df_yield, on='polygon', how='inner')

# Check available columns after merging.
print("Available columns after merging with yield data:")
print(df.columns.tolist())

# Define all features, including the new ones, for dropping rows with missing data.
all_features_for_dropna = [
    'mean_ExG', 'mean_GLI', 'mean_VARI',
    'mean_TGI', 'mean_CIVE',
    'mean_Red', 'mean_Green', 'mean_Blue',
    'area_ha', 'yield_tons'
]

# Drop rows with any missing data in the specified columns.
df = df.dropna(subset=all_features_for_dropna)

print("\n✅ Per-image data ready for modeling (drone data - base for LOOCV):")
print(df.head())
print("\nPolygons included in the base dataset for LOOCV:")
print(df['polygon'].unique())


Available columns after merging with yield data:
['polygon', 'date', 'filename', 'mean_ExG', 'mean_GLI', 'mean_VARI', 'mean_TGI', 'mean_CIVE', 'mean_Red', 'mean_Green', 'mean_Blue', 'area_ha', 'yield_tons']

✅ Per-image data ready for modeling (drone data - base for LOOCV):
  polygon  date    filename   mean_ExG  mean_GLI  mean_VARI   mean_TGI  \
0  Munda1  None  Munda1.tif  29.988546  0.084960   0.005946  18.162916   
1  Munda2  None  Munda2.tif  16.802973  0.040981  -0.038630  11.502631   
2  Munda3  None  Munda3.tif  29.551044  0.077395  -0.044689  19.322803   
3  Munda4  None  Munda4.tif  21.299973  0.058161  -0.026116  13.652133   
4  Munda5  None  Munda5.tif  29.342831  0.077635   0.011176  17.655802   

   mean_CIVE   mean_Red  mean_Green  mean_Blue   area_ha  yield_tons  
0  18.727116  72.174706   72.766083  43.368896  0.101974         0.9  
1  18.757044  64.890701   59.196167  36.698555  0.339510         1.3  
2  18.732512  88.398834   82.504814  47.059834  0.308073         1.

In [30]:
# --- DEBUGGING ADDITION START ---
print("\n--- Debugging df before df_summary creation ---")
print(f"Columns in df before creating df_summary: {df.columns.tolist()}")
print("--- Debugging df END ---\n")
# --- DEBUGGING ADDITION END ---

# Select only the columns needed for training and testing in the LOOCV loop.
# This ensures df_summary contains all the new features.
df_summary = df[[
    'polygon',
    'mean_ExG',
    'mean_GLI',
    'mean_VARI',
    'mean_TGI',
    'mean_CIVE',
    'mean_Red',
    'mean_Green',
    'mean_Blue',
    'area_ha',
    'yield_tons',
    'filename'
]].copy()

# Drop any rows with missing values as a final clean-up step for the summary DataFrame.
df_summary = df_summary.dropna()

print("\n✅ Image-level training dataset (for drone data - LOOCV base):")
print(df_summary.head())
print("\nPolygons in df_summary (all available for LOOCV):")
print(df_summary['polygon'].unique())



--- Debugging df before df_summary creation ---
Columns in df before creating df_summary: ['polygon', 'date', 'filename', 'mean_ExG', 'mean_GLI', 'mean_VARI', 'mean_TGI', 'mean_CIVE', 'mean_Red', 'mean_Green', 'mean_Blue', 'area_ha', 'yield_tons']
--- Debugging df END ---


✅ Image-level training dataset (for drone data - LOOCV base):
  polygon   mean_ExG  mean_GLI  mean_VARI   mean_TGI  mean_CIVE   mean_Red  \
0  Munda1  29.988546  0.084960   0.005946  18.162916  18.727116  72.174706   
1  Munda2  16.802973  0.040981  -0.038630  11.502631  18.757044  64.890701   
2  Munda3  29.551044  0.077395  -0.044689  19.322803  18.732512  88.398834   
3  Munda4  21.299973  0.058161  -0.026116  13.652133  18.746469  58.684658   
4  Munda5  29.342831  0.077635   0.011176  17.655802  18.732330  68.685951   

   mean_Green  mean_Blue   area_ha  yield_tons    filename  
0   72.766083  43.368896  0.101974         0.9  Munda1.tif  
1   59.196167  36.698555  0.339510         1.3  Munda2.tif  
2   82.504

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error
import numpy as np
from itertools import combinations

# Define features for training and prediction.
# These features now only include mean indices and raw bands, as per recent changes.
features = [
    'mean_ExG',
    'mean_GLI',
    'mean_VARI',
    'mean_TGI',
    'mean_CIVE',
    'mean_Red',
    'mean_Green',
    'mean_Blue'
]

# Debugging information to check df_summary before cross-validation.
print("\n--- Debugging df_summary before Custom Cross-Validation ---")
print(f"Shape of df_summary: {df_summary.shape}")
print(f"Columns in df_summary: {df_summary.columns.tolist()}")
print(f"Unique polygons in df_summary: {df_summary['polygon'].unique().tolist()}")
print("--- Debugging df_summary END ---\n")

# Get all unique polygon identifiers from the df_summary DataFrame.
unique_polygons = df_summary['polygon'].unique().tolist()
num_total_polygons = len(unique_polygons)

# Define the desired sizes for the training and test sets in each fold.
train_set_size = 5
test_set_size = 1

# Initialize lists to store performance metrics for both test and training sets.
test_r2_scores = []
test_rmse_scores = []
test_mae_scores = []

train_r2_scores = []
train_rmse_scores = []
train_mae_scores = []

print(f"Starting Custom Cross-Validation: Training on {train_set_size} fields, Testing on {test_set_size} field.")
print(f"Total unique polygons available: {num_total_polygons}")
print(f"Polygons in dataset: {unique_polygons}")

fold_counter = 0

# Outer loop: Iterate through each unique polygon, setting it as the test polygon for a set of folds.
for test_polygon in unique_polygons:
    # Create the test set DataFrame for the current test polygon.
    df_test_fold = df_summary[df_summary['polygon'] == test_polygon].copy()

    # Skip if the test set is empty.
    if df_test_fold.empty:
        print(f"Warning: Test set is empty for '{test_polygon}'. Skipping folds with this test polygon.")
        continue

    # Identify candidate polygons for the training set (all except the current test polygon).
    candidate_train_polygons = [p for p in unique_polygons if p != test_polygon]

    # Inner loop: Generate all combinations of 'train_set_size' polygons from candidates for training.
    for train_subset_polygons in combinations(candidate_train_polygons, train_set_size):
        fold_counter += 1
        print(f"\n--- Fold {fold_counter}: Testing on '{test_polygon}' ---")
        print(f"  Training polygons for this fold: {list(train_subset_polygons)}")

        # Identify and print polygons not used in this specific fold.
        unused_polygons = [p for p in candidate_train_polygons if p not in train_subset_polygons]
        print(f"  Unused polygons in this fold: {unused_polygons}")

        # Create the training set DataFrame for the current fold.
        df_train_fold = df_summary[df_summary['polygon'].isin(train_subset_polygons)].copy()

        # Skip if the training set is empty.
        if df_train_fold.empty:
            print(f"Warning: Training set is empty for fold {fold_counter}. Skipping this fold.")
            continue

        # Define X (features) and y (target) for the training set.
        X_train = df_train_fold[features]
        y_train = df_train_fold['yield_tons']

        # Define X (features) and y (target) for the test set.
        X_test = df_test_fold[features]
        y_test = df_test_fold['yield_tons']

        # Initialize and train the linear regression model.
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Calculate Training Set Performance.
        y_pred_train = model.predict(X_train)

        # Calculate R² for training, handling cases with no variance in y_train.
        if len(y_train.unique()) > 1:
            fold_train_r2 = r2_score(y_train, y_pred_train)
        else:
            fold_train_r2 = np.nan
            print(f"  Warning: y_train for fold {fold_counter} has no variance. Training R² will be NaN.")

        fold_train_rmse = root_mean_squared_error(y_train, y_pred_train)
        fold_train_mae = mean_absolute_error(y_train, y_pred_train)

        train_r2_scores.append(fold_train_r2)
        train_rmse_scores.append(fold_train_rmse)
        train_mae_scores.append(fold_train_mae)

        # Calculate Test Set Performance.
        y_pred_test = model.predict(X_test)

        # Calculate R² for testing, handling cases with no variance in y_test.
        if len(y_test.unique()) > 1:
            fold_test_r2 = r2_score(y_test, y_pred_test)
        else:
            fold_test_r2 = np.nan
            print(f"  Warning: y_test for '{test_polygon}' has no variance. Test R² will be NaN for this fold.")

        fold_test_rmse = root_mean_squared_error(y_test, y_pred_test)
        fold_test_mae = mean_absolute_error(y_test, y_pred_test)

        test_r2_scores.append(fold_test_r2)
        test_rmse_scores.append(fold_test_rmse)
        test_mae_scores.append(fold_test_mae)

        # Print performance metrics for the current fold.
        print(f"  Training R²: {fold_train_r2:.4f} | Training RMSE: {fold_train_rmse:.4f} | Training MAE: {fold_train_mae:.4f}")
        print(f"  Test R²: {fold_test_r2:.4f} | Test RMSE: {fold_test_rmse:.4f} | Test MAE: {fold_test_mae:.4f}")


# Calculate overall average performance metrics across all folds.
# np.nanmean is used to correctly handle any NaN values.
if test_r2_scores:
    avg_test_r2 = np.nanmean(test_r2_scores)
    avg_test_rmse = np.nanmean(test_rmse_scores)
    avg_test_mae = np.nanmean(test_mae_scores)

    avg_train_r2 = np.nanmean(train_r2_scores)
    avg_train_rmse = np.nanmean(train_rmse_scores)
    avg_train_mae = np.nanmean(train_mae_scores)

    print("\n--- Average Model Performance (Custom Cross-Validation Results) ---")
    print(f"Total number of folds completed: {fold_counter}")
    print("--- Test Set Averages ---")
    print(f"Average Test R² Score: {avg_test_r2:.4f}")
    print(f"Average Test RMSE (tons/ha): {avg_test_rmse:.4f}")
    print(f"Average Test MAE (tons/ha): {avg_test_mae:.4f}")

    print("\n--- Training Set Averages ---")
    print(f"Average Training R² Score: {avg_train_r2:.4f}")
    print(f"Average Training RMSE (tons/ha): {avg_train_rmse:.4f}")
    print(f"Average Training MAE (tons/ha): {avg_train_mae:.4f}")
else:
    print("\nNo valid cross-validation folds were completed. Check your data and parameters.")



--- Debugging df_summary before Custom Cross-Validation ---
Shape of df_summary: (6, 12)
Columns in df_summary: ['polygon', 'mean_ExG', 'mean_GLI', 'mean_VARI', 'mean_TGI', 'mean_CIVE', 'mean_Red', 'mean_Green', 'mean_Blue', 'area_ha', 'yield_tons', 'filename']
Unique polygons in df_summary: ['Munda1', 'Munda2', 'Munda3', 'Munda4', 'Munda5', 'Munda7']
--- Debugging df_summary END ---

Starting Custom Cross-Validation: Training on 5 fields, Testing on 1 field.
Total unique polygons available: 6
Polygons in dataset: ['Munda1', 'Munda2', 'Munda3', 'Munda4', 'Munda5', 'Munda7']

--- Fold 1: Testing on 'Munda1' ---
  Training polygons for this fold: ['Munda2', 'Munda3', 'Munda4', 'Munda5', 'Munda7']
  Unused polygons in this fold: []
  Training R²: 1.0000 | Training RMSE: 0.0000 | Training MAE: 0.0000
  Test R²: nan | Test RMSE: 0.2948 | Test MAE: 0.2948

--- Fold 2: Testing on 'Munda2' ---
  Training polygons for this fold: ['Munda1', 'Munda3', 'Munda4', 'Munda5', 'Munda7']
  Unused polyg

  avg_test_r2 = np.nanmean(test_r2_scores)


In [32]:

# Define features that your 'model' was trained on.
# This list must match the 'features' list used in your model training (e.g., in the LOOCV cell).
features = [
    'mean_ExG',
    'mean_GLI',
    'mean_VARI',
    'mean_TGI',
    'mean_CIVE',
    'mean_Red',
    'mean_Green',
    'mean_Blue'
]

# Get a list of all unique polygons present in df_indices.
# df_indices should already contain all the drone features and the 'area_ha' column
# merged from 'polygon_areas.csv'.
unique_polygons_for_prediction = df_indices['polygon'].unique()

print("Starting yield prediction for all available polygons...")

# Loop through each unique polygon to predict its yield.
for polygon_name in unique_polygons_for_prediction:
    print(f"\n--- Predicting for '{polygon_name}' ---")

    # Filter df_indices to get data only for the current polygon.
    df_current_polygon = df_indices[df_indices['polygon'] == polygon_name].copy()

    # Ensure the current polygon has data and all necessary features.
    if df_current_polygon.empty:
        print(f"Warning: No data found for polygon '{polygon_name}'. Skipping prediction.")
        continue

    # Check if all required features are present in the current polygon's data.
    missing_features = [f for f in features if f not in df_current_polygon.columns]
    if missing_features:
        print(f"Error: Missing features for polygon '{polygon_name}': {missing_features}. Skipping prediction.")
        continue

    # Predict yield per hectare for each image within the current polygon.
    # The 'model' object must have been trained in a previous cell.
    # Ensure that the 'model' object is available in your environment.
    try:
        df_current_polygon['predicted_yield_per_ha'] = model.predict(df_current_polygon[features])
    except NameError:
        print("Error: 'model' is not defined. Please ensure the model training cell was run successfully.")
        break # Exit the loop if model is not trained
    except Exception as e:
        print(f"Error predicting for '{polygon_name}': {e}. Skipping prediction for this polygon.")
        continue

    # Calculate the average predicted yield per hectare across all images for this polygon.
    avg_predicted_yield_per_ha = np.nanmean(df_current_polygon['predicted_yield_per_ha'])

    # Get the polygon area for the current polygon.
    # This assumes 'area_ha' column exists and is not NaN for this polygon.
    # We take the mean in case there are multiple entries for the same polygon, though ideally 'area_ha' should be constant.
    if 'area_ha' in df_current_polygon.columns and not df_current_polygon['area_ha'].empty and not pd.isna(df_current_polygon['area_ha'].iloc[0]):
        polygon_area = df_current_polygon['area_ha'].iloc[0] # Assuming area is constant per polygon

        # Calculate the total predicted yield in tons for the entire polygon.
        total_predicted_yield = avg_predicted_yield_per_ha * polygon_area

        # Print the results for the current polygon.
        print(f"  Average predicted yield per hectare: {avg_predicted_yield_per_ha:.3f} tons/ha")
        print(f"  Total predicted maize yield: {total_predicted_yield:.3f} tons")
    else:
        # Handle cases where 'area_ha' is missing or invalid for the current polygon.
        print(f"  ⚠️ Warning: '{polygon_name}' 'area_ha' is missing or invalid. Cannot calculate total predicted yield.")
        print(f"  Average predicted yield per hectare: {avg_predicted_yield_per_ha:.3f} tons/ha (Total yield calculation requires area)")

print("\n✅ Prediction process completed for all available polygons.")


Starting yield prediction for all available polygons...

--- Predicting for 'Munda1' ---
  Average predicted yield per hectare: 0.900 tons/ha
  Total predicted maize yield: 0.092 tons

--- Predicting for 'Munda2' ---
  Average predicted yield per hectare: 1.300 tons/ha
  Total predicted maize yield: 0.441 tons

--- Predicting for 'Munda3' ---
  Average predicted yield per hectare: 1.400 tons/ha
  Total predicted maize yield: 0.431 tons

--- Predicting for 'Munda4' ---
  Average predicted yield per hectare: 0.700 tons/ha
  Total predicted maize yield: 0.203 tons

--- Predicting for 'Munda5' ---
  Average predicted yield per hectare: 1.100 tons/ha
  Total predicted maize yield: 0.204 tons

--- Predicting for 'Munda6' ---
  Average predicted yield per hectare: 1.843 tons/ha
  Total predicted maize yield: 0.115 tons

--- Predicting for 'Munda7' ---
  Average predicted yield per hectare: 77945.680 tons/ha
  Total predicted maize yield: 69137.818 tons

✅ Prediction process completed for all 

In [19]:
import pandas as pd # Ensure pandas is imported if not already in the environment
import numpy as np # Ensure numpy is imported for np.nanmean

# Define features that your 'model' was trained on.
# This list must match the 'features' list used in your model training (e.g., in the LOOCV cell).
features = [
    'mean_ExG',
    'mean_GLI',
    'mean_VARI',
    'mean_TGI',
    'mean_CIVE',
    'mean_Red',
    'mean_Green',
    'mean_Blue',
]

# Define the specific polygon for which we want to predict the yield.
polygon_to_predict = 'Munda6'

print(f"Starting yield prediction for '{polygon_to_predict}' only...")

# Filter df_indices to get data only for the specified polygon.
# df_indices should already contain all the drone features and the 'area_ha' column
# merged from 'polygon_areas.csv'.
df_specific_polygon = df_indices[df_indices['polygon'] == polygon_to_predict].copy()

# Ensure the specific polygon has data and all necessary features.
if df_specific_polygon.empty:
    print(f"Error: No data found for polygon '{polygon_to_predict}'. Cannot proceed with prediction.")
else:
    # Check if all required features are present in the polygon's data.
    missing_features = [f for f in features if f not in df_specific_polygon.columns]
    if missing_features:
        print(f"Error: Missing features for polygon '{polygon_to_predict}': {missing_features}. Cannot proceed with prediction.")
    else:
        # Predict yield per hectare for each image within this polygon.
        # The 'model' object must have been trained in a previous cell (e.g., the LOOCV cell).
        # Ensure that the 'model' object is available in your environment.
        try:
            df_specific_polygon['predicted_yield_per_ha'] = model.predict(df_specific_polygon[features])
        except NameError:
            print("Error: 'model' is not defined. Please ensure the model training cell was run successfully before running this prediction.")
        except Exception as e:
            print(f"An unexpected error occurred during prediction for '{polygon_to_predict}': {e}.")
        else:
            # Calculate the average predicted yield per hectare across all images for this polygon.
            avg_predicted_yield_per_ha = np.nanmean(df_specific_polygon['predicted_yield_per_ha'])

            # Get the polygon area for the current polygon.
            # This assumes 'area_ha' column exists and is not NaN for this polygon.
            # We take the first value as area should be constant for a given polygon.
            if 'area_ha' in df_specific_polygon.columns and not df_specific_polygon['area_ha'].empty and not pd.isna(df_specific_polygon['area_ha'].iloc[0]):
                polygon_area = df_specific_polygon['area_ha'].iloc[0]

                # Calculate the total predicted yield in tons for the entire polygon.
                total_predicted_yield = avg_predicted_yield_per_ha * polygon_area

                # Print the results for the specific polygon.
                print(f"  Average predicted yield per hectare: {avg_predicted_yield_per_ha:.3f} tons/ha")



Starting yield prediction for 'Munda6' only...
  Average predicted yield per hectare: 1.843 tons/ha
