In [14]:
import os
import numpy as np
import pandas as pd
import rasterio
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [15]:
# Base folder containing polygon folders
base_folder = r"D:\Yield\LR\LandsatData"

# Polygon folders to process
polygon_folders = ['Munda1', 'Munda2', 'Munda3', 'Munda4', 'Munda5', 'Munda6', 'Munda7']

# Store results
records = []

for poly in polygon_folders:
    folder_path = os.path.join(base_folder, poly)

    if not os.path.exists(folder_path):
        print(f"Warning: Folder not found for polygon '{poly}': {folder_path}")
        continue # Skip to the next polygon if folder does not exist

    files = [f for f in os.listdir(folder_path) if f.endswith('.tif')]
    files.sort()

    if not files:
        print(f"Warning: No .tif files found in folder for polygon '{poly}': {folder_path}")
        continue # Skip to the next polygon
    else:
        print(f"Found {len(files)} .tif files in '{poly}' folder: {folder_path}")

    for file in files:
        file_path = os.path.join(folder_path, file)

        with rasterio.open(file_path) as src:
            red = src.read(4).astype('float32')
            green = src.read(3).astype('float32')
            nir = src.read(5).astype('float32')

        # Vegetation Indices
        ndvi = (nir - red) / (nir + red + 1e-10)
        gndvi = (nir - green) / (nir + green + 1e-10)
        savi = ((nir - red) / (nir + red + 0.428)) * 1.428  # L=0.428

        # Mask invalid VI values
        ndvi = np.where((ndvi >= -1) & (ndvi <= 1), ndvi, np.nan)
        gndvi = np.where((gndvi >= -1) & (gndvi <= 1), gndvi, np.nan)
        savi = np.where((savi >= -1) & (savi <= 1), savi, np.nan)

        # Calculate VI means
        mean_ndvi = np.nanmean(ndvi)
        mean_gndvi = np.nanmean(gndvi)
        mean_savi = np.nanmean(savi)

        # Calculate mean reflectance of raw bands ===
        mean_red = np.nanmean(red)
        mean_green = np.nanmean(green)
        mean_nir = np.nanmean(nir)

        # Extract date
        try:
            date_str = file.split('_')[1]
            date = datetime.strptime(date_str, "%Y%m%d")
        except Exception:
            date = None 

        # Append results
        records.append({
            'polygon': poly,
            'date': date,
            'filename': file,
            'mean_NDVI': mean_ndvi,
            'mean_GNDVI': mean_gndvi,
            'mean_SAVI': mean_savi,
            'mean_Red': mean_red,
            'mean_Green': mean_green,
            'mean_NIR': mean_nir
        })

# Create DataFrame
df_indices = pd.DataFrame(records)
print("Raw per-image features:")
print(df_indices.head())

Found 4 .tif files in 'Munda1' folder: D:\Yield\LR\LandsatData\Munda1
Found 4 .tif files in 'Munda2' folder: D:\Yield\LR\LandsatData\Munda2
Found 4 .tif files in 'Munda3' folder: D:\Yield\LR\LandsatData\Munda3
Found 4 .tif files in 'Munda4' folder: D:\Yield\LR\LandsatData\Munda4
Found 4 .tif files in 'Munda5' folder: D:\Yield\LR\LandsatData\Munda5
Found 4 .tif files in 'Munda6' folder: D:\Yield\LR\LandsatData\Munda6
Found 4 .tif files in 'Munda7' folder: D:\Yield\LR\LandsatData\Munda7
Raw per-image features:
  polygon       date                                 filename  mean_NDVI  \
0  Munda1        NaT                    20250210_20250215.tif   0.069857   
1  Munda1        NaT                                   M1.tif   0.096473   
2  Munda1 2024-11-13  M525-058polygon20241106_20241113_02.tif   0.022826   
3  Munda1        NaT     M525-058polygon20241224_20241228.tif   0.021880   
4  Munda2        NaT                    20250210_20250215.tif   0.049058   

   mean_GNDVI  mean_SAVI  mea

In [16]:
# Load area data
df_area = pd.read_csv("polygon_areas.csv")

# Merge area with vegetation index data
df_indices = pd.merge(df_indices, df_area, on='polygon', how='left')

# Check for any polygons without area info
missing_area = df_indices[df_indices['area_ha'].isna()]
if not missing_area.empty:
    print("Warning: Some polygons are missing area values:")
    print(missing_area['polygon'].unique())

# Show result
print("Merged vegetation index + area data:")
print(df_indices.head())

Merged vegetation index + area data:
  polygon       date                                 filename  mean_NDVI  \
0  Munda1        NaT                    20250210_20250215.tif   0.069857   
1  Munda1        NaT                                   M1.tif   0.096473   
2  Munda1 2024-11-13  M525-058polygon20241106_20241113_02.tif   0.022826   
3  Munda1        NaT     M525-058polygon20241224_20241228.tif   0.021880   
4  Munda2        NaT                    20250210_20250215.tif   0.049058   

   mean_GNDVI  mean_SAVI  mean_Red  mean_Green   mean_NIR   area_ha  
0    0.071177   0.099755  2532.375   2503.5000  4497.1250  0.101974  
1    0.088087   0.137761  2134.000   2306.3750  4816.2500  0.101974  
2    0.040924   0.032595  3234.500   2791.6250  3884.5000  0.101974  
3    0.042063   0.031244  3496.625   2967.0000  4167.3750  0.101974  
4    0.050969   0.070054  2032.625   1988.3125  3474.1875  0.339510  


In [17]:
df_yield = pd.read_csv("yield_tons.csv")

# Merge with yield data
df = pd.merge(df_indices, df_yield, on='polygon', how='inner')

# Check which columns exist
print("Available columns:", df.columns.tolist())

# Drop rows with missing data including band means
df = df.dropna(subset=[
    'mean_NDVI', 'mean_GNDVI', 'mean_SAVI', 
    'mean_Red', 'mean_Green', 'mean_NIR', 
    'area_ha', 'yield_tons'
])

# Show preview
print("Per-image data ready for modeling:")
print(df.head())

Available columns: ['polygon', 'date', 'filename', 'mean_NDVI', 'mean_GNDVI', 'mean_SAVI', 'mean_Red', 'mean_Green', 'mean_NIR', 'area_ha', 'yield_tons']
Per-image data ready for modeling:
  polygon       date                                 filename  mean_NDVI  \
0  Munda1        NaT                    20250210_20250215.tif   0.069857   
1  Munda1        NaT                                   M1.tif   0.096473   
2  Munda1 2024-11-13  M525-058polygon20241106_20241113_02.tif   0.022826   
3  Munda1        NaT     M525-058polygon20241224_20241228.tif   0.021880   
4  Munda2        NaT                    20250210_20250215.tif   0.049058   

   mean_GNDVI  mean_SAVI  mean_Red  mean_Green   mean_NIR   area_ha  \
0    0.071177   0.099755  2532.375   2503.5000  4497.1250  0.101974   
1    0.088087   0.137761  2134.000   2306.3750  4816.2500  0.101974   
2    0.040924   0.032595  3234.500   2791.6250  3884.5000  0.101974   
3    0.042063   0.031244  3496.625   2967.0000  4167.3750  0.101974   

In [18]:
# Select only the columns needed for training
df_summary = df[[
    'polygon',
    'mean_NDVI',
    'mean_GNDVI',
    'mean_SAVI',
    'mean_Red',
    'mean_Green',
    'mean_NIR',
    'area_ha',
    'yield_tons',
    'filename'
]]

# Drop rows with missing values (optional clean-up)
df_summary = df_summary.dropna()

# Show the final table
print("Image-level training dataset:")
print(df_summary.head())


Image-level training dataset:
  polygon  mean_NDVI  mean_GNDVI  mean_SAVI  mean_Red  mean_Green   mean_NIR  \
0  Munda1   0.069857    0.071177   0.099755  2532.375   2503.5000  4497.1250   
1  Munda1   0.096473    0.088087   0.137761  2134.000   2306.3750  4816.2500   
2  Munda1   0.022826    0.040924   0.032595  3234.500   2791.6250  3884.5000   
3  Munda1   0.021880    0.042063   0.031244  3496.625   2967.0000  4167.3750   
4  Munda2   0.049058    0.050969   0.070054  2032.625   1988.3125  3474.1875   

    area_ha  yield_tons                                 filename  
0  0.101974         0.9                    20250210_20250215.tif  
1  0.101974         0.9                                   M1.tif  
2  0.101974         0.9  M525-058polygon20241106_20241113_02.tif  
3  0.101974         0.9     M525-058polygon20241224_20241228.tif  
4  0.339510         1.3                    20250210_20250215.tif  


In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Define features to use for training
features = [
    'mean_NDVI',
    'mean_GNDVI',
    'mean_SAVI',
    'mean_Red',
    'mean_Green',
    'mean_NIR'
]

# Drop any rows with missing values in these features (safety check)
df_summary = df_summary.dropna(subset=features + ['yield_tons'])

# Define input (X) and output (y)
X = df_summary[features]
y = df_summary['yield_tons']  # Already in tons/ha

# Train linear regression model
model = LinearRegression()
model.fit(X, y)

# Predict on training data
y_pred = model.predict(X)

# Evaluate model
r2 = r2_score(y, y_pred)
rmse = mean_squared_error(y, y_pred, squared=False)
mae = mean_absolute_error(y, y_pred)

# Print performance
print("Image-level Model Performance:")
print(f"R² Score: {r2:.4f}")
print(f"RMSE (tons/ha): {rmse:.4f}")
print(f"MAE (tons/ha): {mae:.4f}")

Image-level Model Performance:
R² Score: 0.2331
RMSE (tons/ha): 0.4020
MAE (tons/ha): 0.3258




In [22]:
# Filter only test field from the df_indices DataFrame,
df_munda6 = df_indices[df_indices['polygon'] == 'Munda6'].copy()

# Check if test field data is actually present for prediction
if df_munda6.empty:
    print("Error: No Munda6 data found in df_indices for prediction. Ensure it loaded correctly in earlier steps.")
else:
    # Predict yield per hectare for each test field image
    # Only pass the 'features' columns to the model for prediction
    df_munda6['predicted_yield_per_ha'] = model.predict(df_munda6[features])

    # Calculate average predicted yield per hectare across all Munda6 images
    avg_predicted_yield_per_ha = df_munda6['predicted_yield_per_ha'].mean()

    print(f"\nAverage predicted yield per hectare for Munda6: {avg_predicted_yield_per_ha:.3f} tons/ha")



Average predicted yield per hectare for Munda6: 1.961 tons/ha
