In [1]:
import os
import ee
import math
import time
import geemap
import datetime 
import pandas as pd

# ee.Authenticate()

# Initialize Earth Engine
try:
    # Connect through the high-volume Earth Engine endpoint URL.
    ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')
except ee.EEException as init_error:
    # Report the failure, then re-raise so the notebook stops here.
    print('Google Earth Engine failed to initialize!', init_error)
    raise
else:
    print('Google Earth Engine initialized successfully!')

Google Earth Engine initialized successfully!


In [2]:
# List the per-state CSV files available in the AGRI_2023-24 folder.
# The commented-out line below lists the alternative "2024" folder instead.
# sorted(os.listdir("./shc_data/NORMALISED_DATA/2024"))
sorted(os.listdir("./shc_data/NORMALISED_DATA/AGRI_2023-24/"))

['ANDAMAN & NICOBAR_AGRI_2023-24.csv',
 'ANDHRA PRADESH_AGRI_2023-24.csv',
 'ARUNACHAL PRADESH_AGRI_2023-24.csv',
 'ASSAM_AGRI_2023-24.csv',
 'BIHAR_AGRI_2023-24.csv',
 'CHHATTISGARH_AGRI_2023-24.csv',
 'GOA_AGRI_2023-24.csv',
 'GUJARAT_AGRI_2023-24.csv',
 'HARYANA_AGRI_2023-24.csv',
 'HIMACHAL PRADESH_AGRI_2023-24.csv',
 'JAMMU & KASHMIR_AGRI_2023-24.csv',
 'JHARKHAND_AGRI_2023-24.csv',
 'KARNATAKA_AGRI_2023-24.csv',
 'LADAKH_AGRI_2023-24.csv',
 'MADHYA PRADESH_AGRI_2023-24.csv',
 'MAHARASHTRA_AGRI_2023-24.csv',
 'MEGHALAYA_AGRI_2023-24.csv',
 'MIZORAM_AGRI_2023-24.csv',
 'NAGALAND_AGRI_2023-24.csv',
 'ODISHA_AGRI_2023-24.csv',
 'PUDUCHERRY_AGRI_2023-24.csv',
 'PUNJAB_AGRI_2023-24.csv',
 'RAJASTHAN_AGRI_2023-24.csv',
 'SIKKIM_AGRI_2023-24.csv',
 'TAMIL NADU_AGRI_2023-24.csv',
 'TELANGANA_AGRI_2023-24.csv',
 'TRIPURA_AGRI_2023-24.csv',
 'UTTAR PRADESH_AGRI_2023-24.csv',
 'UTTARAKHAND_AGRI_2023-24.csv',
 'WEST BENGAL_AGRI_2023-24.csv']

## Use exactly the spellings listed below (the asset's `State_Name` values) when filtering for the `state` variable

0: 
Andaman & Nicobar
1: 
Andhra Pradesh
2: 
Arunachal Pradesh
3: 
Assam
4: 
Bihar
5: 
Chandigarh
6: 
Chhattishgarh
7: 
Daman and Diu and Dadra and Nagar Haveli
8: 
Delhi
9: 
Goa
10: 
Gujarat
11: 
Haryana
12: 
Himachal Pradesh
13: 
Jammu and Kashmir
14: 
Jharkhand
15: 
Karnataka
16: 
Kerala
17: 
Ladakh
18: 
Lakshadweep
19: 
Madhya Pradesh
20: 
Maharashtra
21: 
Manipur
22: 
Meghalaya
23: 
Mizoram
24: 
Nagaland
25: 
Odisha
26: 
Puducherry
27: 
Puducherry
28: 
Punjab
29: 
Rajasthan
30: 
Sikkim
31: 
Tamilnadu
32: 
Telengana
33: 
Tripura
34: 
Uttar Pradesh
35: 
Uttarakhand
36: 
West Bengal

In [7]:
# === CONFIGURABLE PARAMETERS ===
# The state is spelled differently in the CSV file names and in the Earth
# Engine asset (e.g. "TAMIL NADU" vs "Tamilnadu"), so both spellings are
# configured side by side. Change both when switching to another state.
STATE_CSV_NAME = "WEST BENGAL"    # prefix of "<STATE>_AGRI_2023-24.csv" inside DATA_DIR
STATE_ASSET_NAME = "West Bengal"  # value of the asset's 'State_Name' property

DATA_DIR = "./shc_data/NORMALISED_DATA/AGRI_2023-24/"
INPUT_CSV = os.path.join(DATA_DIR, f"{STATE_CSV_NAME}_AGRI_2023-24.csv")  # CSV containing soil properties
BATCH_SIZE = 3000  # Number of points processed per batch

# Boundary features; keep only the feature(s) of the configured state.
india_districts = ee.FeatureCollection("projects/ee-aakash312000/assets/state")
state = india_districts.filter(ee.Filter.eq('State_Name', STATE_ASSET_NAME))

# Initialize start date and end date.
# Survey rows dated outside [SD, ED] are not exported, and the same two dates
# are passed on as the imagery date range.
SD = pd.to_datetime("2023-07-01")
ED = pd.to_datetime("2024-06-30")

In [4]:
def maskS2clouds(image):
    """Mask cloud/cirrus pixels of a Sentinel-2 image and return six renamed bands.

    The 'QA60' band is read as a bit field: a pixel is kept only when both
    bit 10 (clouds) and bit 11 (cirrus) are zero. All bands are divided by
    10000, bands B2, B3, B4, B8, B11 and B12 are selected and renamed to
    BLUE, GREEN, RED, NIR, SWIR1 and SWIR2, the mask is applied and the result
    is cast to float.

    Args:
        image: image object exposing 'QA60' and the six bands listed above.

    Returns:
        The masked, renamed, float-typed six-band image.
    """
    CLOUD_BIT = 10
    CIRRUS_BIT = 11

    qa_band = image.select('QA60')
    cloud_free = qa_band.bitwiseAnd(1 << CLOUD_BIT).eq(0)
    cirrus_free = qa_band.bitwiseAnd(1 << CIRRUS_BIT).eq(0)
    clear_mask = cloud_free.And(cirrus_free)

    source_bands = ['B2', 'B3', 'B4', 'B8', 'B11', 'B12']
    output_names = ['BLUE', 'GREEN', 'RED', 'NIR', 'SWIR1', 'SWIR2']
    reflectance = image.divide(10000).select(source_bands, output_names)

    return reflectance.updateMask(clear_mask).toFloat()


def fetch_satellite_data(start_date, end_date, roi):
    """Build one multi-band image of covariates for ``roi`` over a date range.

    Bands of the returned image, in order:
      * BLUE, GREEN, RED, NIR, SWIR1, SWIR2 - mean of the cloud-masked
        Sentinel-2 images (scenes with CLOUDY_PIXEL_PERCENTAGE < 40).
      * temp - mean MODIS MOD11A2 'LST_Day_1km', scaled to Kelvin.
      * precipitation - mean of the CHIRPS daily images.
      * elevation, slope, aspect - SRTM elevation and its terrain derivatives.
      * TWI - ln(upstream area / tan(slope)) using MERIT Hydro 'upa'.
      * sand05, sand515, clay05, clay515, silt05, silt515 - SoilGrids means
        for the 0-5 cm and 5-15 cm layers.

    Args:
        start_date: start of the date range passed to ``filterDate``.
        end_date: end of the date range passed to ``filterDate``.
        roi: geometry used to filter the collections and clip static layers.

    Returns:
        ee.Image: a single image stacking all bands listed above.
    """
    # MOD11A2 stores LST as scaled integers; the catalog scale factor is 0.02
    # (result in Kelvin). The previous factor of 0.002 gave values 10x too small.
    LST_SCALE = 0.02

    def update(image):
        return image.multiply(LST_SCALE)

    def soil_fraction(name):
        # Top two SoilGrids depth layers of one texture fraction, renamed to
        # '<name>05' and '<name>515'.
        return (
            ee.Image(f"projects/soilgrids-isric/{name}_mean")
            .select([f"{name}_0-5cm_mean", f"{name}_5-15cm_mean"])
            .clip(roi)
            .rename([f"{name}05", f"{name}515"])
        )

    # maskS2clouds already casts every image to float, so the composite can be
    # taken directly.
    s2 = (
        ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
        .filterBounds(roi)
        .filterDate(start_date, end_date)
        .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 40))
        .map(maskS2clouds)
        .mean()
    )

    temp = (
        ee.ImageCollection("MODIS/061/MOD11A2")
        .select('LST_Day_1km')
        .filterDate(start_date, end_date)
        .filterBounds(roi)
        .map(update)
        .mean()
        .rename('temp')
    )

    preci = (
        ee.ImageCollection('UCSB-CHG/CHIRPS/DAILY')
        .filterDate(start_date, end_date)
        .filterBounds(roi)
        .mean()
        .rename('precipitation')
    )

    elevation = ee.Image("USGS/SRTMGL1_003").select("elevation").clip(roi)
    slope = ee.Terrain.slope(elevation).rename('slope')
    aspect = ee.Terrain.aspect(elevation).rename('aspect')

    # Upstream Area (Flow Accumulation)
    flow_accumulation = ee.Image("MERIT/Hydro/v1_0_1").select("upa").clip(roi)

    # TWI = ln(upstream area / tan(slope)); the slope is converted from degrees
    # to radians first, and a zero tangent is replaced by 0.001 so flat pixels
    # do not divide by zero.
    slope_radians = slope.multiply(math.pi).divide(180)
    tan_slope = slope_radians.tan()
    safe_slope = tan_slope.where(tan_slope.eq(0), 0.001)
    twi = flow_accumulation.divide(safe_slope).log().rename('TWI')

    sand = soil_fraction("sand")
    silt = soil_fraction("silt")
    clay = soil_fraction("clay")

    # Band order is kept as before: it determines the exported column order.
    collection = ee.Image([s2, temp, preci, elevation, slope, aspect, twi, sand, clay, silt])

    return collection


def fetch_indices_for_district(df, district, month, start_date, end_date):
    """Sample covariates at a district's survey points and export them in batches.

    The district's rows are processed ``BATCH_SIZE`` at a time. Within a batch,
    rows whose 'date' falls inside [SD, ED] become point features carrying the
    survey attributes; points outside the state geometry are filtered out; the
    covariate image from ``fetch_satellite_data`` is sampled at the remaining
    points (scale 30) and the result is exported to Google Drive as a CSV.

    A batch with no rows inside the survey window is skipped instead of
    submitting an export of an empty collection. Batch numbers are still
    consumed by skipped batches, so the file names of the other batches are
    unaffected.

    Args:
        df: survey DataFrame with 'district', 'village', 'date', 'long', 'lat'
            and the soil property columns.
        district: value of the 'district' column to process.
        month: unused; kept for compatibility with existing callers.
        start_date: start of the imagery date range (passed to
            ``fetch_satellite_data``).
        end_date: end of the imagery date range (passed to
            ``fetch_satellite_data``).

    Returns:
        None. Export tasks are started as a side effect; "processed" in the
        printed message means the task was submitted.
    """
    soil_columns = ['N', 'P', 'K', 'B', 'Fe', 'Zn', 'Cu', 'S', 'OC', 'pH', 'Mn', 'EC']

    # Filter district data
    district_df = df[df["district"] == district].reset_index(drop=True)
    total_features = len(district_df)

    # Export names are derived from the CSV file name; this does not change
    # between batches, so compute it once.
    state_name = INPUT_CSV.split("/")[-1].split(".csv")[0].replace(" ", "_").replace("&", "_")

    batch_starts = range(0, total_features, BATCH_SIZE)
    for batch_number, offset in enumerate(batch_starts, start=1):
        batch_df = district_df.iloc[offset : offset + BATCH_SIZE]

        valid_points = []
        for _, row in batch_df.iterrows():
            # Keep only surveys dated inside the configured window. The date is
            # parsed once per row; an unparseable-as-missing date (NaT) fails
            # both comparisons and is skipped.
            survey_date = pd.to_datetime(row['date'])
            if not (survey_date >= SD and survey_date <= ED):
                continue

            properties = {
                'district': row['district'],
                'village': row['village'],
                'date': row['date'],
                'start_date': SD,
                'end_date': ED,
            }
            for column in soil_columns:
                properties[column] = row[column]

            valid_points.append(
                ee.Feature(ee.Geometry.Point(row['long'], row['lat']), properties)
            )

        if not valid_points:
            # Nothing to sample: avoid building an empty collection and
            # submitting an export task for it.
            print(f"Batch {batch_number} skipped: no samples dated within the survey window.")
            continue

        points = ee.FeatureCollection(valid_points)
        points = points.filterBounds(state.geometry())
        roi = points.geometry().bounds()
        collections = fetch_satellite_data(start_date, end_date, roi)
        sampled_points = collections.sampleRegions(
            collection=points, scale=30, tileScale=8, geometries=True
        )

        # Export the CSV file to google drive
        # NOTE(review): the description omits the district, so tasks from
        # different districts share the same description; only the file name
        # prefix distinguishes them.
        task = ee.batch.Export.table.toDrive(
            collection=sampled_points,
            description=f"{state_name}_sampled_batch_{batch_number}",
            folder='GEE_Exports_Aakash',
            fileNamePrefix=f"{state_name}_{district}_batch{batch_number}",
            fileFormat='CSV'
        )
        task.start()

        print(f"Batch {batch_number} processed. ")

In [5]:
def main():
    """Read INPUT_CSV and submit the export batches for every district in it.

    The CSV is validated up front for every column that
    ``fetch_indices_for_district`` reads, so a malformed file fails before any
    export task has been started.

    Raises:
        ValueError: if any required column is missing from the CSV.
    """
    df = pd.read_csv(INPUT_CSV)

    # Ensure necessary columns exist (location/metadata plus the soil
    # properties attached to every exported point).
    required_columns = {
        "long", "lat", "district", "village", "date",
        "N", "P", "K", "B", "Fe", "Zn", "Cu", "S", "OC", "pH", "Mn", "EC",
    }
    missing_columns = required_columns.difference(df.columns)
    if missing_columns:
        raise ValueError(
            f"CSV file must contain columns: {sorted(required_columns)}; "
            f"missing: {sorted(missing_columns)}"
        )

    # Process districts
    unique_districts = df["district"].unique()
    print(f"Total Districts: {len(unique_districts)}")

    start_date, end_date = ee.Date(SD), ee.Date(ED)

    for district in unique_districts:
        start_time = time.perf_counter()

        print(f"\nProcessing District: {district}")
        # Count rows with a boolean sum; no filtered copy is needed just to report the size.
        total_features = int((df["district"] == district).sum())
        print(f"Total Features in {district}: {total_features}")

        fetch_indices_for_district(df, district, None, start_date, end_date)

        elapsed = datetime.timedelta(seconds=time.perf_counter() - start_time)
        print("Time taken : " + str(elapsed))

    # Printed once every district's batches have been handed to fetch_indices_for_district.
    print("\nAll districts processed successfully!")

In [8]:
# Run the export for every district in INPUT_CSV. Each "Batch N processed."
# line is printed right after task.start(), i.e. once the export task has
# been submitted.
main()

Total Districts: 22

Processing District: 24 PARAGANAS NORTH
Total Features in 24 PARAGANAS NORTH: 8574
Batch 1 processed. 
Batch 2 processed. 
Batch 3 processed. 
Time taken : 0:00:57.116135

Processing District: 24 PARAGANAS SOUTH
Total Features in 24 PARAGANAS SOUTH: 13340
Batch 1 processed. 
Batch 2 processed. 
Batch 3 processed. 
Batch 4 processed. 
Batch 5 processed. 
Time taken : 0:01:31.794129

Processing District: ALIPURDUAR
Total Features in ALIPURDUAR: 4298
Batch 1 processed. 
Batch 2 processed. 
Time taken : 0:00:28.690450

Processing District: BANKURA
Total Features in BANKURA: 10713
Batch 1 processed. 
Batch 2 processed. 
Batch 3 processed. 
Batch 4 processed. 
Time taken : 0:01:13.782102

Processing District: BIRBHUM
Total Features in BIRBHUM: 15061
Batch 1 processed. 
Batch 2 processed. 
Batch 3 processed. 
Batch 4 processed. 
Batch 5 processed. 
Batch 6 processed. 
Time taken : 0:01:40.810254

Processing District: COOCHBEHAR
Total Features in COOCHBEHAR: 7364
Batch 1 p