This code compiles the VI values from Landsat and Sentinel-2 data calculated at each crop location (centroid of the field polygon) from the shapefile (generated using Google Earth Engine). It reads two sets of input csv files - first from the Landsat data named as VIs_Landsat_croppolygons.csv (eg. VIs_Landsat_mangopolygons.csv) and second from the Sentinel-2 data named as VIs_S2_croppolygon.csv (eg. VIs_S2_mangopolygons.csv). As the output, the code will generate three combined csv files by crop which are saved in the "output" folder within the input folder "GEE_outputs". The file names are saved as combined_fruitpolygon.csv (eg. combined_mangopolygon.csv). All the calculations are done locally and no temporary files are created during the calculations. 

In [36]:
import os
import sys
import subprocess

# This gives the name of the environment directory
print("Environment name:", os.path.basename(sys.prefix))

Environment name: A4I064-ML


In [37]:
# Install necessary packages, if needed

required_packages = ["pandas"]

for package in required_packages:
    try:
        __import__(package if package != "scikit-learn" else "sklearn")
        print(f"{package} is already installed.")
    except ImportError:
        print(f"{package} not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

print("All packages have been installed!")

pandas is already installed.
All packages have been installed!


In [38]:
# Import necessary packages
import pandas as pd
import os

In [39]:
# Manage input and output folders
source_folder = r"C:\Users\U8019357\UniSQ\A4I Geospatial Tech - UniSQ Internal - UniSQ Internal\2 - ML\Raw Data\GEE_VIs"

# Enter crop name
crop_name = "Mango"

In [40]:
output_folder = source_folder
os.makedirs(output_folder, exist_ok=True)

In [41]:
def process_csv(file_path, source_name, crop_name):
    df = pd.read_csv(file_path)

    # Drop columns you don't need (only if they exist)
    drop_cols = [
        "Model_ID", "MangoVarie", "DragonFrui", "AverageYie", "YieldPrevi",
        "altitudeMo", "begin", "descriptio", "end", "extrude", "icon",
        "tessellate", "timestamp", "visibility", "system:time_start", ".geo", "system:index"
    ]
    cols_to_drop = [c for c in drop_cols if c in df.columns]
    df.drop(columns=cols_to_drop, inplace=True)

    # Ensure Plot_ID exists
    if "Plot_ID" not in df.columns:
        raise KeyError(f"'Plot_ID' column missing in {file_path}")

    # Parse Date if exists
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"], format="%d/%m/%Y", errors="coerce")

    # Attach crop + source
    df["Crop"] = crop_name
    df["Source"] = source_name

    # Round vegetation indices
    excluded_cols = ["system:index", "Plot_ID", "latitude", "longitude", "Date", "Crop", "Source"]
    veg_indices = [col for col in df.columns if col not in excluded_cols]
    df[veg_indices] = df[veg_indices].round(4)

    # Drop duplicate columns if any
    df = df.loc[:, ~df.columns.duplicated()]

    # Final dataframe
    final_df = df[["Crop", "Source", "Plot_ID", "latitude", "longitude", "Date"] + veg_indices]
    final_df = final_df.sort_values(by=["Plot_ID", "Date"]).reset_index(drop=True)

    return final_df

In [43]:
# Identify all CSV files containing the crop name
matching_files = [
    f for f in os.listdir(source_folder)
    if f.lower().endswith(".csv")
       and crop_name.lower() in f.lower()
       and "polygon" in f.lower()
       and "minmaxmean" not in f.lower()
]

if not matching_files:
    print(f"No CSV files found for crop '{crop_name}' with 'polygon' and excluding 'minmaxmean'.")
else:
    print(f"Found {len(matching_files)} file(s) to process:")
    for f in matching_files:
        print(f" - {f}")

    combined_df = pd.DataFrame()

    for file_name in matching_files:
        file_path = os.path.join(source_folder, file_name)

        # Determine source from filename
        filename_parts = os.path.splitext(file_name)[0].split("_")
        source = "Unknown"
        for part in filename_parts:
            if part.lower() == "landsat":
                source = "Landsat"
                break
            elif part.lower() in ["s2", "sentinel2", "sentinel-2"]:
                source = "Sentinel2"
                break

        if source == "Unknown":
            print(f"Skipping file '{file_name}' — unknown source format.")
            continue

        # Map crop name if needed
        crop_label = "Dragonfruit" if crop_name.lower() == "dragon" else crop_name

        # Process the file
        processed_df = process_csv(file_path, source, crop_label)
        combined_df = pd.concat([combined_df, processed_df], ignore_index=True)

    # Final sort
    combined_df = combined_df.sort_values(by=["Plot_ID", "Date"]).reset_index(drop=True)

    # Save final combined output
    output_path = os.path.join(output_folder, f"VIs_Combined_{crop_label}_Polygon.csv")
    combined_df.to_csv(output_path, index=False)
    print(f"\nFinal combined file saved to: {os.path.basename(output_path)}")

Found 4 file(s) to process:
 - VIs_Combined_Mango_Polygon.csv
 - VIs_Landsat_Mango_Polygons.csv
 - VIs_S2_Mango_Polygons.csv
 - VIs_S2_Mango_Polygons_MinMaxMmean.csv
Skipping file 'VIs_Combined_Mango_Polygon.csv' — unknown source format.

Final combined file saved to: VIs_Combined_Mango_Polygon.csv
