Load the Sentinel-2 features 

In [1]:
import ee 
ee.Authenticate()

True

In [2]:
ee.Initialize()


Extract the Sentinel features for test set 

In [3]:
import pandas as pd 
df = pd.read_csv("/Users/hereagain/Desktop/OpenAItoZ/dataset/test_UpperXingu.csv")

In [5]:
features = [
    ee.Feature(ee.Geometry.Point([row['longitude'], row['latitude']]))
    for i, row in df.iterrows()
]

points = ee.FeatureCollection(features)

In [None]:
# load Sentinel-2 SR Image Collection
start = '2020-01-01'
end = '2021-12-31'

s2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED') \
    .filterDate(start, end) \
    .filterBounds(points.geometry()) \
    .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20)) \
    .median()


In [7]:
# NDVI = (B8 - B4) / (B8 + B4)
ndvi = s2.normalizedDifference(['B8', 'B4']).rename('NDVI')

# NDBI = (B11 - B8) / (B11 + B8)
ndbi = s2.normalizedDifference(['B11', 'B8']).rename('NDBI')

# BSI = ((B11 + B4) - (B8 + B2)) / ((B11 + B4) + (B8 + B2))
b11 = s2.select('B11')
b4 = s2.select('B4')
b8 = s2.select('B8')
b2 = s2.select('B2')
bsi = b11.add(b4).subtract(b8.add(b2)).divide(b11.add(b4).add(b8).add(b2)).rename('BSI')

# Stack indices
sentinel_indices = ndvi.addBands(ndbi).addBands(bsi)


In [8]:
# sample at our test sites
sampled = sentinel_indices.sampleRegions(
    collection=points,
    scale=10,  # Sentinel-2 resolution
    geometries=True
)

In [9]:
results = sampled.getInfo()['features']

records = []
for f in results:
    props = f['properties']
    coords = f['geometry']['coordinates']
    props['lon'] = coords[0]
    props['lat'] = coords[1]
    records.append(props)

sentinel_df = pd.DataFrame(records)


In [33]:
sentinel_df.head()

Unnamed: 0,BSI,NDBI,NDVI,longitude,latitude
0,0.285881,0.253903,0.25892,-54.054371,-13.238966
1,-0.294443,-0.330374,0.856848,-53.324938,-12.632514
2,0.131558,0.118401,0.509402,-53.266548,-12.352958
3,0.261757,0.234384,0.281234,-55.353963,-11.506835
4,0.239771,0.227358,0.354979,-53.964808,-13.214442


In [11]:
sentinel_df = sentinel_df.rename(columns={'lon': 'longitude', 'lat': 'latitude'})


In [44]:
df['longitude'].head()

0   -54.054361
1   -53.324932
2   -53.266561
3   -55.353967
4   -53.964775
Name: longitude, dtype: float64

In [45]:
sentinel_df['longitude'].head()

0   -54.054371
1   -53.324938
2   -53.266548
3   -55.353963
4   -53.964808
Name: longitude, dtype: float64

In [53]:
#!!!! notice the x,y will change lil bit every time after we mapped to GEE, need to be cautious combining 
df['NDVI'] = sentinel_df['NDVI']
df['BSI'] = sentinel_df['BSI']
df['NDBI'] = sentinel_df['NDBI']

In [55]:
df.head()
# substitute with the original x,y 
original_df =pd.read_csv("/Users/hereagain/Desktop/OpenAItoZ/dataset/Filtered_Upper_Xingu_Candidates__50_pts_.csv")

In [56]:
original_df.head()

Unnamed: 0,x,y,type
0,-54.055652,-13.239949,candidate
1,-53.324514,-12.632771,candidate
2,-53.26571,-12.35209,candidate
3,-55.354909,-11.506891,candidate
4,-53.965133,-13.215377,candidate


In [57]:
df['latitude'] = original_df['x']
df['longitude'] = original_df['y']

In [58]:
df.head()

Unnamed: 0,bulk_density,cec,clay,distriver1,distriver2,ph,slope,soc,tri,type,...,bio13,bio14,bio15,bio16,bio17,bio18,bio19,NDVI,BSI,NDBI
0,124,170,285,120202.560737,41888.893348,51,1.070147,196,4.049082,candidate,...,70.97289,101.320854,33.6,12.7,20.899998,24.8,22.866667,0.25892,0.285881,0.253903
1,127,64,292,222586.076474,73665.842083,48,0.166004,154,0.666667,candidate,...,70.07389,102.19097,33.8,13.5,20.3,25.1,23.166666,0.856848,-0.294443,-0.330374
2,131,88,244,245487.103656,55371.915133,52,0.165912,157,0.31427,candidate,...,67.4312,98.91454,35.5,13.7,21.8,25.633333,23.883333,0.509402,0.131558,0.118401
3,131,78,207,21525.564974,164316.430523,49,1.576528,118,6.40216,candidate,...,69.98255,55.507298,34.6,15.5,19.099998,25.3,24.733334,0.281234,0.261757,0.234384
4,127,105,289,130162.573413,49880.052296,51,0.84159,165,3.15446,candidate,...,71.27831,108.71146,33.2,12.6,20.6,24.7,22.583334,0.354979,0.239771,0.227358


In [59]:
df.to_csv("/Users/hereagain/Desktop/OpenAItoZ/dataset/UpperXingu_test_with_allfeatures.csv", index=False)

In [136]:
df.columns

Index(['bulk_density', 'cec', 'clay', 'distriver1', 'distriver2', 'ph',
       'slope', 'soc', 'tri', 'type', 'longitude', 'latitude', 'bio1', 'bio2',
       'bio3', 'bio4', 'bio5', 'bio6', 'bio7', 'bio8', 'bio9', 'bio10',
       'bio11', 'bio12', 'bio13', 'bio14', 'bio15', 'bio16', 'bio17', 'bio18',
       'bio19', 'NDVI', 'BSI', 'NDBI'],
      dtype='object')

In [73]:
df.shape

(50, 34)

Get the Sentinel-2 features for train dataset 

In [60]:
train_df = pd.read_csv('/Users/hereagain/Desktop/OpenAItoZ/dataset/RobertSWalker/submit.csv')

In [74]:
train_df.shape

(2081, 68)

In [61]:
train_df.head()

Unnamed: 0,type,x,y,wc2.1_30s_bio_1,wc2.1_30s_bio_10,wc2.1_30s_bio_11,wc2.1_30s_bio_12,wc2.1_30s_bio_13,wc2.1_30s_bio_14,wc2.1_30s_bio_15,...,T_C,S_C,T_OC,S_OC,AWT_S_OC,AWT_T_OC,AWC,T_CEC_CLAY,T_BULK_DEN,S_BULK_DEN
0,earthwork,-67.07,-10.4828,25.870832,26.566666,24.816666,1730,246,28,56.194942,...,6.699,7.056,1.54,0.84,7.137375,6.94275,1,4.0,1.4,1.34
1,earthwork,-67.076,-10.2873,26.0375,26.733334,24.983334,1745,245,31,55.252506,...,6.699,5.3193,1.54,0.51,4.73704,6.76125,1,5.0,1.41,1.34
2,earthwork,-67.114,-10.4285,25.854166,26.533333,24.799999,1724,243,29,55.525566,...,6.699,7.056,1.54,0.84,7.137375,6.94275,1,4.0,1.4,1.34
3,earthwork,-67.209,-10.4633,25.716667,26.416668,24.633333,1706,239,32,54.147835,...,6.699,5.3193,1.54,0.51,4.73704,6.76125,1,4.0,1.29,1.34
4,earthwork,-67.22,-10.3069,25.9625,26.666666,24.85,1745,238,36,51.742565,...,6.699,4.935,1.54,0.5,4.235805,6.7425,1,12.0,1.4,1.34


In [18]:
arnas_df = pd.read_csv('/Users/hereagain/Desktop/OpenAItoZ/dataset/arnas_training_data.csv')

In [21]:
arnas_df = arnas_df.rename(columns={'x': 'longitude', 'y': 'latitude'})

In [62]:
train_features = [
    ee.Feature(ee.Geometry.Point([row['x'], row['y']]))
    for i, row in train_df.iterrows()
]

trainpoints = ee.FeatureCollection(train_features)

In [91]:
# Define required bands for indices
required_bands = ['B2', 'B4', 'B8', 'B11']

In [93]:
# updated s2
# NDVI = (B8 - B4) / (B8 + B4)
# Filter and reduce the collection to a single composite image
s2 = ee.ImageCollection("COPERNICUS/S2_SR") \
    .filterBounds(trainpoints) \
    .filterDate("2018-01-01", "2021-12-31") \
    .select(required_bands) \
    .median()

# Now compute indices on this composite
ndvi = s2.normalizedDifference(['B8', 'B4']).rename('NDVI')
ndbi = s2.normalizedDifference(['B11', 'B8']).rename('NDBI')

bsi = s2.expression(
    '((B11 + B4) - (B8 + B2)) / ((B11 + B4) + (B8 + B2))',
    {
        'B2': s2.select('B2'),
        'B4': s2.select('B4'),
        'B8': s2.select('B8'),
        'B11': s2.select('B11')
    }
).rename('BSI')

sentinel_indices = ndvi.addBands(ndbi).addBands(bsi)


In [94]:
trainsampled = sentinel_indices.sampleRegions(
    collection=trainpoints,
    scale=10,  # Sentinel-2 resolution
    geometries=True
)

In [None]:
# train_results = trainsampled.getInfo()['features']

# train_records = []
# for f in train_results:
#     props = f['properties']
#     coords = f['geometry']['coordinates']
#     props['lon'] = coords[0]
#     props['lat'] = coords[1]
#     train_records.append(props)


# hit memory error 

EEException: User memory limit exceeded.

In [96]:
task = ee.batch.Export.table.toDrive(
    collection=trainsampled,
    description='sentinel2_train_features',  # Task name
    folder='earthengine',  # Optional: folder in your Google Drive
    fileNamePrefix='sentinel2_train_features',  # Optional: file prefix
    fileFormat='CSV'
)

In [97]:
task.start()

In [98]:
import time
while task.active():
    print('Exporting... Please wait.')
    time.sleep(10)

print('Export complete. Check your Google Drive.')

Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Exporting... Please wait.
Export complete. Check your Google Drive.


In [99]:
sentinel_traindf = pd.read_csv('/Users/hereagain/Desktop/OpenAItoZ/dataset/sentinel2_train_features.csv')
sentinel_traindf.head()

Unnamed: 0,system:index,BSI,NDBI,NDVI,.geo
0,0_0,-0.215753,-0.277196,0.425166,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
1,1_0,-0.153677,-0.225465,0.481983,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
2,2_0,-0.132422,-0.158282,0.418867,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
3,3_0,-0.044102,-0.059675,0.355608,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
4,4_0,-0.074577,-0.11054,0.514203,"{""geodesic"":false,""type"":""Point"",""coordinates""..."


In [100]:
sentinel_traindf.shape

(2081, 5)

In [103]:
arnas_df.columns

Index(['Unnamed: 0', 'type', 'longitude', 'latitude', 'wc2.1_30s_bio_1',
       'wc2.1_30s_bio_10', 'wc2.1_30s_bio_11', 'wc2.1_30s_bio_12',
       'wc2.1_30s_bio_13', 'wc2.1_30s_bio_14', 'wc2.1_30s_bio_15',
       'wc2.1_30s_bio_16', 'wc2.1_30s_bio_17', 'wc2.1_30s_bio_18',
       'wc2.1_30s_bio_19', 'wc2.1_30s_bio_2', 'wc2.1_30s_bio_3',
       'wc2.1_30s_bio_4', 'wc2.1_30s_bio_5', 'wc2.1_30s_bio_6',
       'wc2.1_30s_bio_7', 'wc2.1_30s_bio_8', 'wc2.1_30s_bio_9',
       'wc2.1_30s_elev', 'pest', 'distriver1', 'distriver2', 'distriver3',
       'rivtype', 'distlake', 'distshore', 'agpot', 'distblack', 'distclear',
       'distwhite', 'npp', 'T_SILT', 'phsoil', 'T_GRAVEL', 'S_CEC_CLAY',
       'T_PH_H2O', 'S_SAND', 'S_GRAVEL', 'mgs', 'nure', 'nuav', 'rug',
       'nitrogen', 'bdod', 'cec', 'cfvo', 'clay', 'phh2o', 'sand', 'silt',
       'soc', 'phos', 'smn30cm', 'smc30cm', 'T_C', 'S_C', 'T_OC', 'S_OC',
       'AWT_S_OC', 'AWT_T_OC', 'AWC', 'T_CEC_CLAY', 'T_BULK_DEN', 'S_BULK_DEN',
       

In [106]:
arnas_df['BSI']  = sentinel_traindf['BSI']
arnas_df['NDBI'] = sentinel_traindf['NDBI']
arnas_df['NDVI'] = sentinel_traindf['NDVI']

# update train dataset with slope/terrain and soil features 

In [110]:
arnas_df

Unnamed: 0.1,Unnamed: 0,type,longitude,latitude,wc2.1_30s_bio_1,wc2.1_30s_bio_10,wc2.1_30s_bio_11,wc2.1_30s_bio_12,wc2.1_30s_bio_13,wc2.1_30s_bio_14,...,AWT_S_OC,AWT_T_OC,AWC,T_CEC_CLAY,T_BULK_DEN,S_BULK_DEN,lidar_elevation_value,BSI,NDBI,NDVI
0,0,earthwork,-67.070000,-10.482800,25.870832,26.566666,24.816666,1730,246,28,...,7.137375,6.942750,1,4.0,1.40,1.34,184.0,-0.215753,-0.277196,0.425166
1,1,earthwork,-67.076000,-10.287300,26.037500,26.733334,24.983334,1745,245,31,...,4.737040,6.761250,1,5.0,1.41,1.34,170.0,-0.153677,-0.225465,0.481983
2,2,earthwork,-67.114000,-10.428500,25.854166,26.533333,24.799999,1724,243,29,...,7.137375,6.942750,1,4.0,1.40,1.34,204.0,-0.132422,-0.158282,0.418867
3,3,earthwork,-67.209000,-10.463300,25.716667,26.416668,24.633333,1706,239,32,...,4.737040,6.761250,1,4.0,1.29,1.34,144.0,-0.044102,-0.059675,0.355608
4,4,earthwork,-67.220000,-10.306900,25.962500,26.666666,24.850000,1745,238,36,...,4.235805,6.742500,1,12.0,1.40,1.34,186.0,-0.074577,-0.110540,0.514203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2076,2076,earthwork,-64.234926,-12.354810,26.262501,27.083334,25.150000,1558,251,12,...,2.993760,2.942280,1,6.0,1.45,1.49,,-0.001065,-0.052833,0.431288
2077,2077,earthwork,-63.433897,-12.222452,25.616667,26.299999,24.566668,1569,255,10,...,2.993760,2.942280,1,6.0,1.45,1.60,,0.063598,0.005347,0.400672
2078,2078,earthwork,-63.430354,-12.191449,25.700001,26.383333,24.650000,1574,257,10,...,2.993760,2.942280,1,6.0,1.45,1.60,,-0.022223,-0.083438,0.486630
2079,2079,earthwork,-63.796731,-13.691274,25.941668,26.833334,24.516666,1417,235,14,...,4.621050,5.994825,1,0.0,1.37,1.26,,-0.251704,-0.294071,0.715786


In [107]:
ph = ee.Image("projects/soilgrids-isric/phh2o_mean").select("phh2o_15-30cm_mean").rename("ph")
soc = ee.Image("projects/soilgrids-isric/soc_mean").select("soc_15-30cm_mean").rename("soc")
clay = ee.Image("projects/soilgrids-isric/clay_mean").select("clay_15-30cm_mean").rename("clay")
cec = ee.Image("projects/soilgrids-isric/cec_mean").select("cec_15-30cm_mean").rename("cec")
bdod = ee.Image("projects/soilgrids-isric/bdod_mean").select("bdod_15-30cm_mean").rename("bulk_density")


In [108]:
elev = ee.Image("USGS/SRTMGL1_003")

# Compute slope (in degrees)
slope = ee.Terrain.slope(elev).rename("slope")



# Compute TRI using focal standard deviation
tri = elev.reduceNeighborhood(
    reducer=ee.Reducer.stdDev(),
    kernel=ee.Kernel.square(1)  # This gives a 3x3 window
).rename("tri")

In [109]:
wholep_features = ph.addBands(soc).addBands(clay).addBands(cec).addBands(bdod).addBands(slope).addBands(tri)

In [117]:
# add train data points 
def create_feature(row):
    return ee.Feature(ee.Geometry.Point([row['longitude'], row['latitude']]), {'site_id': row.name})

features = [create_feature(row) for _, row in arnas_df.iterrows()]
points2 = ee.FeatureCollection(features)

In [119]:
sampled = wholep_features.sampleRegions(
    collection=points2,
    scale=250,
    geometries=True
)

In [120]:
train_results =sampled.getInfo()['features']

In [124]:
df_sampled = pd.DataFrame([f['properties'] for f in sampled.getInfo()['features']])


In [127]:
df_sampled.head()

Unnamed: 0,bulk_density,cec,clay,ph,site_id,slope,soc,tri
0,127,76,383,51,0,1.076476,184,5.335648
1,130,83,345,46,1,0.992471,66,3.664983
2,131,65,282,47,2,4.312538,72,13.695092
3,131,66,387,47,3,0.583209,98,1.523479
4,134,92,295,47,4,0.579662,78,2.249829


In [126]:
arnas_df.head()

Unnamed: 0.1,Unnamed: 0,type,longitude,latitude,wc2.1_30s_bio_1,wc2.1_30s_bio_10,wc2.1_30s_bio_11,wc2.1_30s_bio_12,wc2.1_30s_bio_13,wc2.1_30s_bio_14,...,AWT_S_OC,AWT_T_OC,AWC,T_CEC_CLAY,T_BULK_DEN,S_BULK_DEN,lidar_elevation_value,BSI,NDBI,NDVI
0,0,earthwork,-67.07,-10.4828,25.870832,26.566666,24.816666,1730,246,28,...,7.137375,6.94275,1,4.0,1.4,1.34,184.0,-0.215753,-0.277196,0.425166
1,1,earthwork,-67.076,-10.2873,26.0375,26.733334,24.983334,1745,245,31,...,4.73704,6.76125,1,5.0,1.41,1.34,170.0,-0.153677,-0.225465,0.481983
2,2,earthwork,-67.114,-10.4285,25.854166,26.533333,24.799999,1724,243,29,...,7.137375,6.94275,1,4.0,1.4,1.34,204.0,-0.132422,-0.158282,0.418867
3,3,earthwork,-67.209,-10.4633,25.716667,26.416668,24.633333,1706,239,32,...,4.73704,6.76125,1,4.0,1.29,1.34,144.0,-0.044102,-0.059675,0.355608
4,4,earthwork,-67.22,-10.3069,25.9625,26.666666,24.85,1745,238,36,...,4.235805,6.7425,1,12.0,1.4,1.34,186.0,-0.074577,-0.11054,0.514203


In [None]:
df_merged = arnas_df.merge(df_sampled, left_index=True, right_on='site_id', how='left')

In [131]:
df_merged.columns

Index(['Unnamed: 0', 'type', 'longitude', 'latitude', 'wc2.1_30s_bio_1',
       'wc2.1_30s_bio_10', 'wc2.1_30s_bio_11', 'wc2.1_30s_bio_12',
       'wc2.1_30s_bio_13', 'wc2.1_30s_bio_14', 'wc2.1_30s_bio_15',
       'wc2.1_30s_bio_16', 'wc2.1_30s_bio_17', 'wc2.1_30s_bio_18',
       'wc2.1_30s_bio_19', 'wc2.1_30s_bio_2', 'wc2.1_30s_bio_3',
       'wc2.1_30s_bio_4', 'wc2.1_30s_bio_5', 'wc2.1_30s_bio_6',
       'wc2.1_30s_bio_7', 'wc2.1_30s_bio_8', 'wc2.1_30s_bio_9',
       'wc2.1_30s_elev', 'pest', 'distriver1', 'distriver2', 'distriver3',
       'rivtype', 'distlake', 'distshore', 'agpot', 'distblack', 'distclear',
       'distwhite', 'npp', 'T_SILT', 'phsoil', 'T_GRAVEL', 'S_CEC_CLAY',
       'T_PH_H2O', 'S_SAND', 'S_GRAVEL', 'mgs', 'nure', 'nuav', 'rug',
       'nitrogen', 'bdod', 'cec_x', 'cfvo', 'clay_x', 'phh2o', 'sand', 'silt',
       'soc_x', 'phos', 'smn30cm', 'smc30cm', 'T_C', 'S_C', 'T_OC', 'S_OC',
       'AWT_S_OC', 'AWT_T_OC', 'AWC', 'T_CEC_CLAY', 'T_BULK_DEN', 'S_BULK_DEN',
 

In [140]:
# extarc tge features I want 
keep_cols = [
    'type', 'longitude', 'latitude', 'NDVI', 'NDBI', 'BSI'
] + [f'wc2.1_30s_bio_{i}' for i in range(1, 20)] + ['distriver1','distriver2']+[
    'bulk_density', 'slope', 'tri', 'ph', 'soc_y', 'clay_x', 'cec_x', 'lidar_elevation_value'
]



In [141]:
len(keep_cols)

35

In [142]:
filtered_df = df_merged[keep_cols]

In [143]:
filtered_df.to_csv("/Users/hereagain/Desktop/OpenAItoZ/dataset/allfeatures_train_dataset.csv", index=False)