In [33]:
import os
import numpy as np
import pandas as pd
import rasterio
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from datetime import datetime

In [None]:
import os

# Base folder containing all drone image files
image_folder = r"D:\Yield\LR\DroneData"
records = []

# List all .tif files within the specified drone image folder
files = [f for f in os.listdir(image_folder) if f.endswith('.tif')]
files.sort()

for file in files:
    file_path = os.path.join(image_folder, file)

    try:
        with rasterio.open(file_path) as src:
            red = src.read(1).astype('float32')
            green = src.read(2).astype('float32')
            blue = src.read(3).astype('float32')

        # Calculate Vegetation Indices
        exg = 2 * green - red - blue
        gli = (2 * green - red - blue) / (2 * green + red + blue + 1e-10)
        vari = (green - red) / (green + red - blue + 1e-10)
        tgi = green - 0.39 * red - 0.61 * blue
        sum_rgb = red + green + blue + 1e-10
        r_norm = red / sum_rgb
        g_norm = green / sum_rgb
        b_norm = blue / sum_rgb
        cive = 0.441 * r_norm - 0.881 * g_norm + 0.385 * b_norm + 18.78745

        # Clean up potential infinite or NaN values
        exg = np.where(np.isfinite(exg), exg, np.nan)
        gli = np.where(np.isfinite(gli), gli, np.nan)
        vari = np.where(np.isfinite(vari), vari, np.nan)
        tgi = np.where(np.isfinite(tgi), tgi, np.nan)
        cive = np.where(np.isfinite(cive), cive, np.nan)

        # Implement Masking for Green Vegetation (e.g., ExG > threshold)
        vegetation_mask = exg > 10 

        # Calculate percentage of green cover
        total_valid_pixels = np.sum(~np.isnan(exg))
        percent_green_cover = (np.sum(vegetation_mask) / total_valid_pixels) * 100 if total_valid_pixels > 0 else np.nan

        # Apply the mask to all bands and indices for mean calculation
        red_masked = np.where(vegetation_mask, red, np.nan)
        green_masked = np.where(vegetation_mask, green, np.nan)
        blue_masked = np.where(vegetation_mask, blue, np.nan)
        exg_masked = np.where(vegetation_mask, exg, np.nan)
        gli_masked = np.where(vegetation_mask, gli, np.nan)
        vari_masked = np.where(vegetation_mask, vari, np.nan)
        tgi_masked = np.where(vegetation_mask, tgi, np.nan)
        cive_masked = np.where(vegetation_mask, cive, np.nan)

        # Calculate mean value for each masked index and raw band
        mean_exg_masked = np.nanmean(exg_masked)
        mean_gli_masked = np.nanmean(gli_masked)
        mean_vari_masked = np.nanmean(vari_masked)
        mean_tgi_masked = np.nanmean(tgi_masked)
        mean_cive_masked = np.nanmean(cive_masked)
        mean_red_masked = np.nanmean(red_masked)
        mean_green_masked = np.nanmean(green_masked)
        mean_blue_masked = np.nanmean(blue_masked)

        # Add Variability Features (Standard Deviation)
        std_exg_masked = np.nanstd(exg_masked)
        std_green_masked = np.nanstd(green_masked)

        # Extract polygon name from filename
        try:
            polygon = file.split('.')[0]
            date = None
        except Exception as e:
            print(f"Error extracting polygon name from '{file}': {e}. Polygon set to None.")
            polygon = None
            date = None

        # Append extracted features and metadata to records list
        records.append({
            'polygon': polygon,
            'date': date,
            'filename': file,
            'mean_ExG': mean_exg_masked,
            'mean_GLI': mean_gli_masked,
            'mean_VARI': mean_vari_masked,
            'mean_TGI': mean_tgi_masked,
            'mean_CIVE': mean_cive_masked,
            'mean_Red': mean_red_masked,
            'mean_Green': mean_green_masked,
            'mean_Blue': mean_blue_masked,
            'std_ExG': std_exg_masked,
            'std_Green': std_green_masked,
            'percent_green_cover': percent_green_cover
        })

    except rasterio.errors.RasterioIOError as e:
        print(f"Error opening or reading raster file '{file_path}': {e}. Skipping this file.")
    except Exception as e:
        print(f"An unexpected error occurred while processing '{file_path}': {e}. Skipping this file.")

# Create DataFrame from collected records
df_indices = pd.DataFrame(records)

print("✅ Extracted vegetation indices and variability features from drone imagery:")
print(df_indices.head())
print("\nUnique polygons found in drone data:")
print(df_indices['polygon'].unique())


✅ Extracted vegetation indices and variability features from drone imagery:
  polygon  date    filename   mean_ExG  mean_GLI  mean_VARI   mean_TGI  \
0  Munda1  None  Munda1.tif  53.546490  0.150254   0.023449  32.256836   
1  Munda2  None  Munda2.tif  41.190311  0.100614  -0.054592  27.334774   
2  Munda3  None  Munda3.tif  46.276634  0.120017  -0.044731  29.833282   
3  Munda4  None  Munda4.tif  49.481083  0.133959  -0.028991  31.134373   
4  Munda5  None  Munda5.tif  58.587349  0.154823   0.029838  34.988140   

   mean_CIVE    mean_Red  mean_Green  mean_Blue    std_ExG  std_Green  \
0  18.681017  126.814560  128.662354  76.963638  24.812695  60.642323   
1  18.714027  146.354401  136.314972  85.085243  19.989790  57.192657   
2  18.702696  133.024643  125.731285  72.161270  19.136429  53.785713   
3  18.693714  128.715103  124.392838  70.589462  21.671139  60.535816   
4  18.678141  129.899765  133.309601  78.132088  24.228174  53.942223   

   percent_green_cover  
0            55

In [37]:
# Load polygon area data
df_area = pd.read_csv("polygon_areas.csv")

# Debugging Polygon Names for Merge
print("\n--- Debugging Polygon Names for Merge ---")
print("Unique polygons in df_indices (from drone data processing):")
print(df_indices['polygon'].unique())
print("\nUnique polygons in df_area (from polygon_areas.csv):")
print(df_area['polygon'].unique())

common_polygons = set(df_indices['polygon'].unique()) & set(df_area['polygon'].unique())
print(f"\nPolygons common to both df_indices and df_area: {list(common_polygons)}")

missing_in_area_file = set(df_indices['polygon'].unique()) - set(df_area['polygon'].unique())
if missing_in_area_file:
    print(f"Polygons found in drone data (df_indices) but NOT in polygon_areas.csv: {list(missing_in_area_file)}")
else:
    print("All polygons from drone data (df_indices) are present in polygon_areas.csv.")
print("--- Debugging Polygon Names for Merge End ---\n")

# Merge area data with drone vegetation index features.
df_indices = pd.merge(df_indices, df_area, on='polygon', how='left')

# Check for missing area values after merge.
missing_area = df_indices[df_indices['area_ha'].isna()]
if not missing_area.empty:
    print("Warning: Some polygons in your drone dataset are missing area values (area_ha is NaN) after merge:")
    print(missing_area['polygon'].unique())
    print("This indicates a mismatch in polygon names or missing entries in 'polygon_areas.csv'.")

print("✅ Merged drone vegetation index features with polygon area data:")
print(df_indices.head())



--- Debugging Polygon Names for Merge ---
Unique polygons in df_indices (from drone data processing):
['Munda1' 'Munda2' 'Munda3' 'Munda4' 'Munda5' 'Munda6' 'Munda7']

Unique polygons in df_area (from polygon_areas.csv):
['Munda1' 'Munda2' 'Munda3' 'Munda4' 'Munda5' 'Munda6' 'Munda7' 'Munda8']

Polygons common to both df_indices and df_area: ['Munda2', 'Munda5', 'Munda4', 'Munda7', 'Munda3', 'Munda6', 'Munda1']
All polygons from drone data (df_indices) are present in polygon_areas.csv.
--- Debugging Polygon Names for Merge End ---

✅ Merged drone vegetation index features with polygon area data:
  polygon  date    filename   mean_ExG  mean_GLI  mean_VARI   mean_TGI  \
0  Munda1  None  Munda1.tif  53.546490  0.150254   0.023449  32.256836   
1  Munda2  None  Munda2.tif  41.190311  0.100614  -0.054592  27.334774   
2  Munda3  None  Munda3.tif  46.276634  0.120017  -0.044731  29.833282   
3  Munda4  None  Munda4.tif  49.481083  0.133959  -0.028991  31.134373   
4  Munda5  None  Munda5.ti

In [20]:

image_folder = r"D:\Yield\LR\DroneData"
records = []

# List all .tif files within the specified drone image folder
files = [f for f in os.listdir(image_folder) if f.endswith('.tif')]
files.sort() # Sort files to ensure consistent order

for file in files:
    file_path = os.path.join(image_folder, file)

    try:
        with rasterio.open(file_path) as src:
            # Assume Red=1, Green=2, Blue=3 for standard RGB drone images.
            # Adjust these band indices (e.g., src.read(3) for Red, src.read(2) for Green, etc.)
            # if your drone imagery has a different band order (e.g., BGR).
            red = src.read(1).astype('float32')
            green = src.read(2).astype('float32')
            blue = src.read(3).astype('float32')

        # === Calculate Vegetation Indices for RGB only ===
        # Excess Green (ExG): Highlights green vegetation
        exg = 2 * green - red - blue
        # Green Leaf Index (GLI): Sensitive to green biomass
        gli = (2 * green - red - blue) / (2 * green + red + blue + 1e-10)
        # Visible Atmospherically Resistant Index (VARI): Reduces atmospheric effects
        vari = (green - red) / (green + red - blue + 1e-10)

        # --- NEW INDICES ---
        # Triangular Greenness Index (TGI): Estimates chlorophyll content
        tgi = green - 0.39 * red - 0.61 * blue

        # Color Index of Vegetation (CIVE): Designed to be robust to light conditions
        # Normalize RGB first for CIVE
        sum_rgb = red + green + blue + 1e-10 # Add epsilon to avoid division by zero
        r_norm = red / sum_rgb
        g_norm = green / sum_rgb
        b_norm = blue / sum_rgb
        cive = 0.441 * r_norm - 0.881 * g_norm + 0.385 * b_norm + 18.78745
        # --- END NEW INDICES ---

        # Clean up potential infinite or NaN values that can arise from divisions
        # np.isfinite() checks for numbers that are not NaN or infinity.
        exg = np.where(np.isfinite(exg), exg, np.nan)
        gli = np.where(np.isfinite(gli), gli, np.nan)
        vari = np.where(np.isfinite(vari), vari, np.nan)
        tgi = np.where(np.isfinite(tgi), tgi, np.nan) # Clean TGI
        cive = np.where(np.isfinite(cive), cive, np.nan) # Clean CIVE

        # Calculate the mean value for each vegetation index and raw band across the image
        mean_exg = np.nanmean(exg)
        mean_gli = np.nanmean(gli)
        mean_vari = np.nanmean(vari)
        mean_tgi = np.nanmean(tgi) 
        mean_cive = np.nanmean(cive)
        mean_red = np.nanmean(red)
        mean_green = np.nanmean(green)
        mean_blue = np.nanmean(blue)

        # === Extract metadata (polygon name) from filename ===
        # Based on your input, filenames are simply "PolygonName.tif" (e.g., "Munda1.tif")
        # The date is not present in the filename.
        try:
            # Extract polygon name by splitting on the first '.' (before the file extension)
            polygon = file.split('.')[0]
            date = None # Explicitly set date to None as it's not in the filename
        except Exception as e:
            # Catch any unexpected errors during metadata extraction
            print(f"Error extracting polygon name from '{file}': {e}. Polygon set to None.")
            polygon = None
            date = None

        # Append the extracted features and metadata to the records list
        records.append({
            'polygon': polygon,
            'date': date, # Date will be None
            'filename': file,
            'mean_ExG': mean_exg,
            'mean_GLI': mean_gli,
            'mean_VARI': mean_vari,
            'mean_TGI': mean_tgi, 
            'mean_CIVE': mean_cive, 
            'mean_Red': mean_red,
            'mean_Green': mean_green,
            'mean_Blue': mean_blue
        })

    except rasterio.errors.RasterioIOError as e:
        print(f"Error opening or reading raster file '{file_path}': {e}. Skipping this file.")
    except Exception as e:
        print(f"An unexpected error occurred while processing '{file_path}': {e}. Skipping this file.")

# === Step 2: Create DataFrame from collected records ===
df_indices = pd.DataFrame(records)

# Load area data
df_area = pd.read_csv("polygon_areas.csv")

# Merge area with vegetation index data
df_indices = pd.merge(df_indices, df_area, on='polygon', how='left')

# Check for any polygons without area info
missing_area = df_indices[df_indices['area_ha'].isna()]
if not missing_area.empty:
    print("Warning: Some polygons are missing area values:")
    print(missing_area['polygon'].unique())

# Show result
print("Merged vegetation index + area data:")
print(df_indices.head())

# Also print unique polygons found to confirm correct extraction
print("\nUnique polygons found in drone data:")
print(df_indices['polygon'].unique())


Merged vegetation index + area data:
  polygon  date    filename   mean_ExG  mean_GLI  mean_VARI   mean_TGI  \
0  Munda1  None  Munda1.tif  29.988546  0.084960   0.005946  18.162916   
1  Munda2  None  Munda2.tif  16.802973  0.040981  -0.038630  11.502631   
2  Munda3  None  Munda3.tif  29.551044  0.077395  -0.044689  19.322803   
3  Munda4  None  Munda4.tif  21.299973  0.058161  -0.026116  13.652133   
4  Munda5  None  Munda5.tif  29.342831  0.077635   0.011176  17.655802   

   mean_CIVE   mean_Red  mean_Green  mean_Blue   area_ha  
0  18.727116  72.174706   72.766083  43.368896  0.101974  
1  18.757044  64.890701   59.196167  36.698555  0.339510  
2  18.732512  88.398834   82.504814  47.059834  0.308073  
3  18.746469  58.684658   55.688457  31.392330  0.289367  
4  18.732330  68.685951   69.791962  41.555145  0.185417  

Unique polygons found in drone data:
['Munda1' 'Munda2' 'Munda3' 'Munda4' 'Munda5' 'Munda6' 'Munda7']


In [30]:
import os
from datetime import datetime
import rasterio
import numpy as np
import pandas as pd

# === Step 1: Process all images in Drone folder ===
# Base folder containing all drone image files (e.g., Munda1.tif, Munda2.tif, etc.)
image_folder = r"D:\Yield\LR\DroneData"
records = []

# List all .tif files within the specified drone image folder
files = [f for f in os.listdir(image_folder) if f.endswith('.tif')]
files.sort() # Sort files to ensure consistent order

for file in files:
    file_path = os.path.join(image_folder, file)

    try:
        with rasterio.open(file_path) as src:
            # Assume Red=1, Green=2, Blue=3 for standard RGB drone images.
            # Adjust these band indices (e.g., src.read(3) for Red, src.read(2) for Green, etc.)
            # if your drone imagery has a different band order (e.g., BGR).
            red = src.read(1).astype('float32')
            green = src.read(2).astype('float32')
            blue = src.read(3).astype('float32')

        # === Calculate Vegetation Indices for RGB only ===
        exg = 2 * green - red - blue
        gli = (2 * green - red - blue) / (2 * green + red + blue + 1e-10)
        vari = (green - red) / (green + red - blue + 1e-10)
        tgi = green - 0.39 * red - 0.61 * blue
        sum_rgb = red + green + blue + 1e-10
        r_norm = red / sum_rgb
        g_norm = green / sum_rgb
        b_norm = blue / sum_rgb
        cive = 0.441 * r_norm - 0.881 * g_norm + 0.385 * b_norm + 18.78745

        # Clean up potential infinite or NaN values from initial calculations
        exg = np.where(np.isfinite(exg), exg, np.nan)
        gli = np.where(np.isfinite(gli), gli, np.nan)
        vari = np.where(np.isfinite(vari), vari, np.nan)
        tgi = np.where(np.isfinite(tgi), tgi, np.nan)
        cive = np.where(np.isfinite(cive), cive, np.nan)

        # === Implement Masking for Non-Maize Areas ===
        # Create a mask for green vegetation (e.g., ExG > a certain threshold)
        # Adjust the threshold (e.g., 10, 20, 30) based on your image characteristics
        # A simple threshold like ExG > 0 is a good starting point to exclude bare soil/non-vegetation.
        vegetation_mask = exg > 10 # Example threshold, adjust as needed (e.g., 0, 10, 20)

        # Calculate percentage of green cover
        # Count pixels where vegetation_mask is True and divide by total valid pixels
        total_valid_pixels = np.sum(~np.isnan(exg)) # Count non-NaN pixels for the denominator
        if total_valid_pixels > 0:
            percent_green_cover = (np.sum(vegetation_mask) / total_valid_pixels) * 100
        else:
            percent_green_cover = np.nan # No valid pixels to calculate cover

        # Apply the mask to all bands and indices
        red_masked = np.where(vegetation_mask, red, np.nan)
        green_masked = np.where(vegetation_mask, green, np.nan)
        blue_masked = np.where(vegetation_mask, blue, np.nan)
        exg_masked = np.where(vegetation_mask, exg, np.nan)
        gli_masked = np.where(vegetation_mask, gli, np.nan)
        vari_masked = np.where(vegetation_mask, vari, np.nan)
        tgi_masked = np.where(vegetation_mask, tgi, np.nan)
        cive_masked = np.where(vegetation_mask, cive, np.nan)

        # Calculate the mean value for each masked vegetation index and raw band
        # Use np.nanmean to ignore NaN values introduced by masking
        mean_exg_masked = np.nanmean(exg_masked)
        mean_gli_masked = np.nanmean(gli_masked)
        mean_vari_masked = np.nanmean(vari_masked)
        mean_tgi_masked = np.nanmean(tgi_masked)
        mean_cive_masked = np.nanmean(cive_masked)
        mean_red_masked = np.nanmean(red_masked)
        mean_green_masked = np.nanmean(green_masked)
        mean_blue_masked = np.nanmean(blue_masked)

        # === Add Simple Variability Features (Standard Deviation) ===
        # Calculate standard deviation for masked ExG and Green band
        std_exg_masked = np.nanstd(exg_masked)
        std_green_masked = np.nanstd(green_masked)

        # === Extract metadata (polygon name) from filename ===
        try:
            polygon = file.split('.')[0]
            date = None # Date not present in filename
        except Exception as e:
            print(f"Error extracting polygon name from '{file}': {e}. Polygon set to None.")
            polygon = None
            date = None

        # Append the extracted features and metadata to the records list
        records.append({
            'polygon': polygon,
            'date': date,
            'filename': file,
            'mean_ExG': mean_exg_masked, # Now using masked mean
            'mean_GLI': mean_gli_masked, # Now using masked mean
            'mean_VARI': mean_vari_masked, # Now using masked mean
            'mean_TGI': mean_tgi_masked, # Now using masked mean
            'mean_CIVE': mean_cive_masked, # Now using masked mean
            'mean_Red': mean_red_masked, # Now using masked mean
            'mean_Green': mean_green_masked, # Now using masked mean
            'mean_Blue': mean_blue_masked, # Now using masked mean
            'std_ExG': std_exg_masked, # New variability feature
            'std_Green': std_green_masked, # New variability feature
            'percent_green_cover': percent_green_cover # New green cover feature
        })

    except rasterio.errors.RasterioIOError as e:
        print(f"Error opening or reading raster file '{file_path}': {e}. Skipping this file.")
    except Exception as e:
        print(f"An unexpected error occurred while processing '{file_path}': {e}. Skipping this file.")

# === Step 2: Create DataFrame from collected records ===
df_indices = pd.DataFrame(records)

# Show extracted features for verification
print("✅ Extracted vegetation indices and variability features from drone imagery:")
print(df_indices.head())

# Also print unique polygons found to confirm correct extraction
print("\nUnique polygons found in drone data:")
print(df_indices['polygon'].unique())


✅ Extracted vegetation indices and variability features from drone imagery:
  polygon  date    filename   mean_ExG  mean_GLI  mean_VARI   mean_TGI  \
0  Munda1  None  Munda1.tif  53.546490  0.150254   0.023449  32.256836   
1  Munda2  None  Munda2.tif  41.190311  0.100614  -0.054592  27.334774   
2  Munda3  None  Munda3.tif  46.276634  0.120017  -0.044731  29.833282   
3  Munda4  None  Munda4.tif  49.481083  0.133959  -0.028991  31.134373   
4  Munda5  None  Munda5.tif  58.587349  0.154823   0.029838  34.988140   

   mean_CIVE    mean_Red  mean_Green  mean_Blue    std_ExG  std_Green  \
0  18.681017  126.814560  128.662354  76.963638  24.812695  60.642323   
1  18.714027  146.354401  136.314972  85.085243  19.989790  57.192657   
2  18.702696  133.024643  125.731285  72.161270  19.136429  53.785713   
3  18.693714  128.715103  124.392838  70.589462  21.671139  60.535816   
4  18.678141  129.899765  133.309601  78.132088  24.228174  53.942223   

   percent_green_cover  
0            55

In [38]:
# Load yield data.
df_yield = pd.read_csv("yield_tons.csv")

# Merge drone feature data with yield data.
df = pd.merge(df_indices, df_yield, on='polygon', how='inner')

# Check available columns after merging.
print("Available columns after merging with yield data:")
print(df.columns.tolist())

# Define all features, including the new ones, for dropping rows with missing data.
all_features_for_dropna = [
    'mean_ExG', 'mean_GLI', 'mean_VARI',
    'mean_TGI', 'mean_CIVE',
    'mean_Red', 'mean_Green', 'mean_Blue',
    'std_ExG', 'std_Green', 'percent_green_cover', # Include new features
    'area_ha', 'yield_tons'
]

# Drop rows with any missing data in the specified columns.
df = df.dropna(subset=all_features_for_dropna)

print("\n✅ Per-image data ready for modeling (drone data - base for LOOCV):")
print(df.head())
print("\nPolygons included in the base dataset for LOOCV:")
print(df['polygon'].unique())


Available columns after merging with yield data:
['polygon', 'date', 'filename', 'mean_ExG', 'mean_GLI', 'mean_VARI', 'mean_TGI', 'mean_CIVE', 'mean_Red', 'mean_Green', 'mean_Blue', 'std_ExG', 'std_Green', 'percent_green_cover', 'area_ha_x', 'area_ha_y', 'area_ha', 'yield_tons']

✅ Per-image data ready for modeling (drone data - base for LOOCV):
  polygon  date    filename   mean_ExG  mean_GLI  mean_VARI   mean_TGI  \
0  Munda1  None  Munda1.tif  53.546490  0.150254   0.023449  32.256836   
1  Munda2  None  Munda2.tif  41.190311  0.100614  -0.054592  27.334774   
2  Munda3  None  Munda3.tif  46.276634  0.120017  -0.044731  29.833282   
3  Munda4  None  Munda4.tif  49.481083  0.133959  -0.028991  31.134373   
4  Munda5  None  Munda5.tif  58.587349  0.154823   0.029838  34.988140   

   mean_CIVE    mean_Red  mean_Green  mean_Blue    std_ExG  std_Green  \
0  18.681017  126.814560  128.662354  76.963638  24.812695  60.642323   
1  18.714027  146.354401  136.314972  85.085243  19.989790  5

In [39]:
# --- DEBUGGING ADDITION START ---
print("\n--- Debugging df before df_summary creation ---")
print(f"Columns in df before creating df_summary: {df.columns.tolist()}")
print("--- Debugging df END ---\n")
# --- DEBUGGING ADDITION END ---

# Select only the columns needed for training and testing in the LOOCV loop.
# This ensures df_summary contains all the new features.
df_summary = df[[
    'polygon',
    'mean_ExG',
    'mean_GLI',
    'mean_VARI',
    'mean_TGI',
    'mean_CIVE',
    'mean_Red',
    'mean_Green',
    'mean_Blue',
    'std_ExG',           # Include new feature
    'std_Green',         # Include new feature
    'percent_green_cover',# Include new feature
    'area_ha',
    'yield_tons',
    'filename'
]].copy() # Use .copy() to avoid SettingWithCopyWarning

# Drop any rows with missing values as a final clean-up step for the summary DataFrame.
df_summary = df_summary.dropna()

print("\n✅ Image-level training dataset (for drone data - LOOCV base):")
print(df_summary.head())
print("\nPolygons in df_summary (all available for LOOCV):")
print(df_summary['polygon'].unique())



--- Debugging df before df_summary creation ---
Columns in df before creating df_summary: ['polygon', 'date', 'filename', 'mean_ExG', 'mean_GLI', 'mean_VARI', 'mean_TGI', 'mean_CIVE', 'mean_Red', 'mean_Green', 'mean_Blue', 'std_ExG', 'std_Green', 'percent_green_cover', 'area_ha_x', 'area_ha_y', 'area_ha', 'yield_tons']
--- Debugging df END ---


✅ Image-level training dataset (for drone data - LOOCV base):
  polygon   mean_ExG  mean_GLI  mean_VARI   mean_TGI  mean_CIVE    mean_Red  \
0  Munda1  53.546490  0.150254   0.023449  32.256836  18.681017  126.814560   
1  Munda2  41.190311  0.100614  -0.054592  27.334774  18.714027  146.354401   
2  Munda3  46.276634  0.120017  -0.044731  29.833282  18.702696  133.024643   
3  Munda4  49.481083  0.133959  -0.028991  31.134373  18.693714  128.715103   
4  Munda5  58.587349  0.154823   0.029838  34.988140  18.678141  129.899765   

   mean_Green  mean_Blue    std_ExG  std_Green  percent_green_cover   area_ha  \
0  128.662354  76.963638  24.8126

In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error
import numpy as np
from itertools import combinations

# Define features for training and prediction.
# These features now include masked means, standard deviations, and percentage green cover.
features = [
    'mean_ExG',
    'mean_GLI',
    'mean_VARI',
    'mean_TGI',
    'mean_CIVE',
    'mean_Red',
    'mean_Green',
    'mean_Blue',
    'std_ExG',           # Standard deviation of ExG
    'std_Green',         # Standard deviation of Green band
    'percent_green_cover'# Percentage of green cover
]

# Debugging information to check df_summary before cross-validation.
print("\n--- Debugging df_summary before Custom Cross-Validation ---")
print(f"Shape of df_summary: {df_summary.shape}")
print(f"Columns in df_summary: {df_summary.columns.tolist()}")
print(f"Unique polygons in df_summary: {df_summary['polygon'].unique().tolist()}")
print("--- Debugging df_summary END ---\n")

# Get all unique polygon identifiers from the df_summary DataFrame.
unique_polygons = df_summary['polygon'].unique().tolist()
num_total_polygons = len(unique_polygons)

# Define the desired sizes for the training and test sets in each fold.
train_set_size = 5
test_set_size = 1

# Initialize lists to store performance metrics for both test and training sets.
test_r2_scores = []
test_rmse_scores = []
test_mae_scores = []

train_r2_scores = []
train_rmse_scores = []
train_mae_scores = []

print(f"Starting Custom Cross-Validation: Training on {train_set_size} fields, Testing on {test_set_size} field.")
print(f"Total unique polygons available: {num_total_polygons}")
print(f"Polygons in dataset: {unique_polygons}")

fold_counter = 0

# Outer loop: Iterate through each unique polygon, setting it as the test polygon for a set of folds.
for test_polygon in unique_polygons:
    # Create the test set DataFrame for the current test polygon.
    df_test_fold = df_summary[df_summary['polygon'] == test_polygon].copy()

    # Skip if the test set is empty.
    if df_test_fold.empty:
        print(f"Warning: Test set is empty for '{test_polygon}'. Skipping folds with this test polygon.")
        continue

    # Identify candidate polygons for the training set (all except the current test polygon).
    candidate_train_polygons = [p for p in unique_polygons if p != test_polygon]

    # Inner loop: Generate all combinations of 'train_set_size' polygons from candidates for training.
    for train_subset_polygons in combinations(candidate_train_polygons, train_set_size):
        fold_counter += 1
        print(f"\n--- Fold {fold_counter}: Testing on '{test_polygon}' ---")
        print(f"  Training polygons for this fold: {list(train_subset_polygons)}")

        # Identify and print polygons not used in this specific fold.
        unused_polygons = [p for p in candidate_train_polygons if p not in train_subset_polygons]
        print(f"  Unused polygons in this fold: {unused_polygons}")

        # Create the training set DataFrame for the current fold.
        df_train_fold = df_summary[df_summary['polygon'].isin(train_subset_polygons)].copy()

        # Skip if the training set is empty.
        if df_train_fold.empty:
            print(f"Warning: Training set is empty for fold {fold_counter}. Skipping this fold.")
            continue

        # Define X (features) and y (target) for the training set.
        X_train = df_train_fold[features]
        y_train = df_train_fold['yield_tons']

        # Define X (features) and y (target) for the test set.
        X_test = df_test_fold[features]
        y_test = df_test_fold['yield_tons']

        # Initialize and train the linear regression model.
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Calculate Training Set Performance.
        y_pred_train = model.predict(X_train)

        # Calculate R² for training, handling cases with no variance in y_train.
        if len(y_train.unique()) > 1:
            fold_train_r2 = r2_score(y_train, y_pred_train)
        else:
            fold_train_r2 = np.nan
            print(f"  Warning: y_train for fold {fold_counter} has no variance. Training R² will be NaN.")

        fold_train_rmse = root_mean_squared_error(y_train, y_pred_train)
        fold_train_mae = mean_absolute_error(y_train, y_pred_train)

        train_r2_scores.append(fold_train_r2)
        train_rmse_scores.append(fold_train_rmse)
        train_mae_scores.append(fold_train_mae)

        # Calculate Test Set Performance.
        y_pred_test = model.predict(X_test)

        # Calculate R² for testing, handling cases with no variance in y_test.
        if len(y_test.unique()) > 1:
            fold_test_r2 = r2_score(y_test, y_pred_test)
        else:
            fold_test_r2 = np.nan
            print(f"  Warning: y_test for '{test_polygon}' has no variance. Test R² will be NaN for this fold.")

        fold_test_rmse = root_mean_squared_error(y_test, y_pred_test)
        fold_test_mae = mean_absolute_error(y_test, y_pred_test)

        test_r2_scores.append(fold_test_r2)
        test_rmse_scores.append(fold_test_rmse)
        test_mae_scores.append(fold_test_mae)

        # Print performance metrics for the current fold.
        print(f"  Training R²: {fold_train_r2:.4f} | Training RMSE: {fold_train_rmse:.4f} | Training MAE: {fold_train_mae:.4f}")
        print(f"  Test R²: {fold_test_r2:.4f} | Test RMSE: {fold_test_rmse:.4f} | Test MAE: {fold_test_mae:.4f}")


# Calculate overall average performance metrics across all folds.
# np.nanmean is used to correctly handle any NaN values.
if test_r2_scores:
    avg_test_r2 = np.nanmean(test_r2_scores)
    avg_test_rmse = np.nanmean(test_rmse_scores)
    avg_test_mae = np.nanmean(test_mae_scores)

    avg_train_r2 = np.nanmean(train_r2_scores)
    avg_train_rmse = np.nanmean(train_rmse_scores)
    avg_train_mae = np.nanmean(train_mae_scores)

    print("\n--- Average Model Performance (Custom Cross-Validation Results) ---")
    print(f"Total number of folds completed: {fold_counter}")
    print("--- Test Set Averages ---")
    print(f"Average Test R² Score: {avg_test_r2:.4f}")
    print(f"Average Test RMSE (tons/ha): {avg_test_rmse:.4f}")
    print(f"Average Test MAE (tons/ha): {avg_test_mae:.4f}")

    print("\n--- Training Set Averages ---")
    print(f"Average Training R² Score: {avg_train_r2:.4f}")
    print(f"Average Training RMSE (tons/ha): {avg_train_rmse:.4f}")
    print(f"Average Training MAE (tons/ha): {avg_train_mae:.4f}")
else:
    print("\nNo valid cross-validation folds were completed. Check your data and parameters.")



--- Debugging df_summary before Custom Cross-Validation ---
Shape of df_summary: (6, 15)
Columns in df_summary: ['polygon', 'mean_ExG', 'mean_GLI', 'mean_VARI', 'mean_TGI', 'mean_CIVE', 'mean_Red', 'mean_Green', 'mean_Blue', 'std_ExG', 'std_Green', 'percent_green_cover', 'area_ha', 'yield_tons', 'filename']
Unique polygons in df_summary: ['Munda1', 'Munda2', 'Munda3', 'Munda4', 'Munda5', 'Munda7']
--- Debugging df_summary END ---

Starting Custom Cross-Validation: Training on 5 fields, Testing on 1 field.
Total unique polygons available: 6
Polygons in dataset: ['Munda1', 'Munda2', 'Munda3', 'Munda4', 'Munda5', 'Munda7']

--- Fold 1: Testing on 'Munda1' ---
  Training polygons for this fold: ['Munda2', 'Munda3', 'Munda4', 'Munda5', 'Munda7']
  Unused polygons in this fold: []
  Training R²: 1.0000 | Training RMSE: 0.0000 | Training MAE: 0.0000
  Test R²: nan | Test RMSE: 1.3095 | Test MAE: 1.3095

--- Fold 2: Testing on 'Munda2' ---
  Training polygons for this fold: ['Munda1', 'Munda3

  avg_test_r2 = np.nanmean(test_r2_scores)


In [41]:

# Define features that your 'model' was trained on.
# This list must match the 'features' list used in your model training (e.g., in the LOOCV cell).
features = [
    'mean_ExG',
    'mean_GLI',
    'mean_VARI',
    'mean_TGI',
    'mean_CIVE',
    'mean_Red',
    'mean_Green',
    'mean_Blue'
]

# Get a list of all unique polygons present in df_indices.
# df_indices should already contain all the drone features and the 'area_ha' column
# merged from 'polygon_areas.csv'.
unique_polygons_for_prediction = df_indices['polygon'].unique()

print("Starting yield prediction for all available polygons...")

# Loop through each unique polygon to predict its yield.
for polygon_name in unique_polygons_for_prediction:
    print(f"\n--- Predicting for '{polygon_name}' ---")

    # Filter df_indices to get data only for the current polygon.
    df_current_polygon = df_indices[df_indices['polygon'] == polygon_name].copy()

    # Ensure the current polygon has data and all necessary features.
    if df_current_polygon.empty:
        print(f"Warning: No data found for polygon '{polygon_name}'. Skipping prediction.")
        continue

    # Check if all required features are present in the current polygon's data.
    missing_features = [f for f in features if f not in df_current_polygon.columns]
    if missing_features:
        print(f"Error: Missing features for polygon '{polygon_name}': {missing_features}. Skipping prediction.")
        continue

    # Predict yield per hectare for each image within the current polygon.
    # The 'model' object must have been trained in a previous cell.
    # Ensure that the 'model' object is available in your environment.
    try:
        df_current_polygon['predicted_yield_per_ha'] = model.predict(df_current_polygon[features])
    except NameError:
        print("Error: 'model' is not defined. Please ensure the model training cell was run successfully.")
        break # Exit the loop if model is not trained
    except Exception as e:
        print(f"Error predicting for '{polygon_name}': {e}. Skipping prediction for this polygon.")
        continue

    # Calculate the average predicted yield per hectare across all images for this polygon.
    avg_predicted_yield_per_ha = np.nanmean(df_current_polygon['predicted_yield_per_ha'])

    # Get the polygon area for the current polygon.
    # This assumes 'area_ha' column exists and is not NaN for this polygon.
    # We take the mean in case there are multiple entries for the same polygon, though ideally 'area_ha' should be constant.
    if 'area_ha' in df_current_polygon.columns and not df_current_polygon['area_ha'].empty and not pd.isna(df_current_polygon['area_ha'].iloc[0]):
        polygon_area = df_current_polygon['area_ha'].iloc[0] # Assuming area is constant per polygon

        # Calculate the total predicted yield in tons for the entire polygon.
        total_predicted_yield = avg_predicted_yield_per_ha * polygon_area

        # Print the results for the current polygon.
        print(f"  Average predicted yield per hectare: {avg_predicted_yield_per_ha:.3f} tons/ha")
        print(f"  Total predicted maize yield: {total_predicted_yield:.3f} tons")
    else:
        # Handle cases where 'area_ha' is missing or invalid for the current polygon.
        print(f"  ⚠️ Warning: '{polygon_name}' 'area_ha' is missing or invalid. Cannot calculate total predicted yield.")
        print(f"  Average predicted yield per hectare: {avg_predicted_yield_per_ha:.3f} tons/ha (Total yield calculation requires area)")

print("\n✅ Prediction process completed for all available polygons.")


Starting yield prediction for all available polygons...

--- Predicting for 'Munda1' ---
Error predicting for 'Munda1': The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- percent_green_cover
- std_ExG
- std_Green
. Skipping prediction for this polygon.

--- Predicting for 'Munda2' ---
Error predicting for 'Munda2': The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- percent_green_cover
- std_ExG
- std_Green
. Skipping prediction for this polygon.

--- Predicting for 'Munda3' ---
Error predicting for 'Munda3': The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- percent_green_cover
- std_ExG
- std_Green
. Skipping prediction for this polygon.

--- Predicting for 'Munda4' ---
Error predicting for 'Munda4': The feature names should match those that were passed during fit.
Feature names seen at fit

In [46]:
import pandas as pd # Ensure pandas is imported if not already in the environment
import numpy as np # Ensure numpy is imported for np.nanmean

# Define features that your 'model' was trained on.
# This list must match the 'features' list used in your model training (e.g., in the LOOCV cell).
features = [
    'mean_ExG',
    'mean_GLI',
    'mean_VARI',
    'mean_TGI',
    'mean_CIVE',
    'mean_Red',
    'mean_Green',
    'mean_Blue',
    'std_ExG',            # Standard deviation of ExG
    'std_Green',          # Standard deviation of Green band
    'percent_green_cover' # Percentage of green cover
]

# Define the specific polygon for which we want to predict the yield.
polygon_to_predict = 'Munda6'

print(f"Starting yield prediction for '{polygon_to_predict}' only...")

# Filter df_indices to get data only for the specified polygon.
# df_indices should already contain all the drone features and the 'area_ha' column
# merged from 'polygon_areas.csv'.
df_specific_polygon = df_indices[df_indices['polygon'] == polygon_to_predict].copy()

# Ensure the specific polygon has data and all necessary features.
if df_specific_polygon.empty:
    print(f"Error: No data found for polygon '{polygon_to_predict}'. Cannot proceed with prediction.")
else:
    # Check if all required features are present in the polygon's data.
    missing_features = [f for f in features if f not in df_specific_polygon.columns]
    if missing_features:
        print(f"Error: Missing features for polygon '{polygon_to_predict}': {missing_features}. Cannot proceed with prediction.")
    else:
        # Predict yield per hectare for each image within this polygon.
        # The 'model' object must have been trained in a previous cell (e.g., the LOOCV cell).
        # Ensure that the 'model' object is available in your environment.
        try:
            df_specific_polygon['predicted_yield_per_ha'] = model.predict(df_specific_polygon[features])
        except NameError:
            print("Error: 'model' is not defined. Please ensure the model training cell was run successfully before running this prediction.")
        except Exception as e:
            print(f"An unexpected error occurred during prediction for '{polygon_to_predict}': {e}.")
        else:
            # Calculate the average predicted yield per hectare across all images for this polygon.
            avg_predicted_yield_per_ha = np.nanmean(df_specific_polygon['predicted_yield_per_ha'])

            # Get the polygon area for the current polygon.
            # This assumes 'area_ha' column exists and is not NaN for this polygon.
            # We take the first value as area should be constant for a given polygon.
            if 'area_ha' in df_specific_polygon.columns and not df_specific_polygon['area_ha'].empty and not pd.isna(df_specific_polygon['area_ha'].iloc[0]):
                polygon_area = df_specific_polygon['area_ha'].iloc[0]

                # Calculate the total predicted yield in tons for the entire polygon.
                total_predicted_yield = avg_predicted_yield_per_ha * polygon_area

                # Print the results for the specific polygon.
                print(f"  Average predicted yield per hectare: {avg_predicted_yield_per_ha:.3f} tons/ha")



Starting yield prediction for 'Munda6' only...
  Average predicted yield per hectare: 1.485 tons/ha
