## 1. Install dependencies, import required libraries and authenticate with Earth Engine

In [None]:
!pip install geemap

In [None]:
from os.path import isfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import geemap
from geemap import ml
import ee
from google.colab import drive

In [None]:
# Initialise the Earth Engine session through geemap; every `ee.*` call further
# down (feature collection, export task) depends on this having succeeded.
geemap.ee_initialize()

## 2. Data Extraction:

* Extract Training and Validation data from Drive
* Sample x-y points uniformly from dataset
* Store them in `x_train, y_train, x_test, y_test` for training and validating the model

In [None]:
# Mount Google Drive inside the Colab filesystem.
drive.mount('/content/gdrive')
# Prefix that is prepended to every dataset folder when the CSV paths are built.
gdrive_pref = '/content/gdrive/My Drive/'

Compute the names of folders to be used

In [None]:
# One dataset folder per area: './AEZ_datasets_train_1' ... './AEZ_datasets_train_10'.
folder_pref = './AEZ_datasets_train_'
areas = np.arange(1, 11)
folders = np.array([f'{folder_pref}{area}' for area in areas])
print(folders)

Using the naming convention compute the names of the files to be used:

In [None]:
# File stems follow the '<year>_<season>' convention, ordered by year and then
# by season in the order listed below.
years = np.arange(2016, 2023)
seasons = np.array(['Rabi_1','Kharif_1','Zaid_1'])
filenames = np.array([f'{year}_{season}' for year in years for season in seasons])
print(filenames)

Build the full path of every dataset file and store the paths in the `dataset_paths` array

In [None]:
# Full CSV path for every (folder, file stem) pair, folder-major order.
dataset_paths = np.array([
    gdrive_pref + folder + '/' + filename + '.csv'
    for folder in folders
    for filename in filenames
])

Append the relevant filenames to train-path array and val-path array

In [None]:
# Split the dataset paths by year: files of VAL_YEAR are held out for
# validation, every other year is used for training.
#
# The year is parsed from the file name itself ('<year>_<season>.csv') instead
# of from a fixed position in the path, so the split keeps working when the
# Drive prefix or the folder depth changes.
VAL_YEAR = 2022

train_paths, val_paths = [], []
for dataset_path in dataset_paths:
    file_name = dataset_path.rsplit('/', 1)[-1]
    year = file_name.split('_')[0]
    if int(year) == VAL_YEAR:
        val_paths.append(dataset_path)
    else:
        train_paths.append(dataset_path)

# Convert once at the end rather than re-allocating an array per path.
train_data_path = np.array(train_paths)
val_data_path = np.array(val_paths)

Extract headers from the first file

In [None]:
# Column names are taken from the first training file that actually exists
# (the loading cells below tolerate missing files, so this one should too).
# nrows=0 parses only the header row instead of loading the whole CSV.
first_existing_path = next((path for path in train_data_path if isfile(path)), None)
if first_existing_path is None:
    raise FileNotFoundError('None of the training dataset files exist')
headers = np.array(pd.read_csv(first_existing_path, nrows=0).columns)

Extract the data from the files and store in train_data and val_data


In [None]:
# Load every existing training CSV and stack the rows into one 2-D array.
# The per-file arrays are collected in a list and stacked once at the end:
# calling np.vstack inside the loop would re-copy all previously loaded rows
# for every additional file.
train_chunks = []
for data_path in train_data_path:
    if isfile(data_path):
        train_chunks.append(np.array(pd.read_csv(data_path)))
    else:
        print(data_path + ' does not exist')

# Same fallback as before when no file was found: an empty 1-D array.
train_data = np.vstack(train_chunks) if train_chunks else np.array([])
del train_chunks  # release the per-file copies
print(train_data.shape)


In [None]:
# Load every existing validation CSV and stack the rows into one 2-D array.
# The per-file arrays are collected in a list and stacked once at the end:
# calling np.vstack inside the loop would re-copy all previously loaded rows
# for every additional file.
val_chunks = []
for data_path in val_data_path:
    if isfile(data_path):
        val_chunks.append(np.array(pd.read_csv(data_path)))
    else:
        print(data_path + ' does not exist')

# Same fallback as before when no file was found: an empty 1-D array.
val_data = np.vstack(val_chunks) if val_chunks else np.array([])
del val_chunks  # release the per-file copies
print(val_data.shape)

Find the columns that contain NaN values (the corrupt headers) and remove them from the data

In [None]:
# Columns that contain at least one NaN, per split.
train_nan_cols = np.flatnonzero(np.isnan(train_data).any(axis=0))
val_nan_cols = np.flatnonzero(np.isnan(val_data).any(axis=0))

# Remove the SAME columns from both splits. Dropping a different set from each
# split would leave train and validation features misaligned with each other
# and with the feature names derived from `corrupt_headers`.
indices = np.union1d(train_nan_cols, val_nan_cols)

# The last column is the label; deleting it would silently turn the last
# feature into the regression target, so fail loudly instead.
if (len(headers) - 1) in indices:
    raise ValueError('The label column contains NaN values; remove those rows before dropping corrupt feature columns')

corrupt_headers = headers[indices]
print(corrupt_headers)

# NOTE: `headers` is not updated here, so this cell must be run exactly once
# after loading; re-running it on already-cleaned data resets corrupt_headers.
train_data = np.delete(train_data, indices, 1)
val_data = np.delete(val_data, indices, 1)

# output the size of the train_data and val_data

print(train_data.shape)
print(val_data.shape)


Extract labels and feature names (required for converting random forest regressor to strings)

In [None]:
# The last header names the label column; the remaining headers, minus the
# ones dropped as corrupt, are the feature names in column order.
labels = headers[-1]
feature_names = np.array(
    [header for header in headers[:-1] if header not in corrupt_headers]
)
print(feature_names)
print(len(feature_names))
print(labels)

Extract the train_data_x, train_data_y, val_data_x, val_data_y from train_data and val_data

In [None]:
# Last column is the label, every other column is a feature.
# NOTE(review): the label is divided by 8 - presumably converting an 8-day
# total into a per-day value; confirm against the dataset definition.
label_divisor = 8

train_data_x, train_data_y = train_data[:, :-1], train_data[:, -1]/label_divisor
val_data_x, val_data_y = val_data[:, :-1], val_data[:, -1]/label_divisor

RUN THIS CELL ONLY IF MEMORY IS RUNNING LOW: IT DELETES VARIABLES THAT ARE NOT REQUIRED IN FURTHER CELLS

In [None]:
# Drop names that no later cell reads. Running this cell a second time raises
# NameError because the names are already gone.
# NOTE: train_data_x / val_data_x were created by basic slicing, so they are
# views that keep the underlying data buffers alive after these names go.
del train_data, val_data
del folder_pref, areas, folders
del years, seasons, filenames
del dataset_paths, train_data_path, val_data_path
PURE TRAIN DATA (SAMPLE RANDOMLY N POINTS FOR TRAINING AND STORE IN x_train AND y_train):

In [None]:
N = 200000        # number of training points to sample
SAMPLE_SEED = 0   # fixed seed so the same sample is drawn on every run

# Draw a random permutation of the row indices and keep only the first N.
# Indexing with just those N indices avoids materialising a shuffled copy of
# the entire training set, which the previous shuffle-then-slice approach did.
rng = np.random.default_rng(SAMPLE_SEED)
sample_indices = rng.permutation(train_data_x.shape[0])[:N]
x_train = train_data_x[sample_indices]
y_train = train_data_y[sample_indices]

#verify the size of y_train
print(len(y_train))

 PURE VALIDATION DATA (SAMPLE RANDOMLY M POINTS FOR VALIDATION AND STORE IN x_test AND y_test):


In [None]:
M = 20000         # number of validation points to sample
SAMPLE_SEED = 0   # fixed seed so the same sample is drawn on every run

# Draw a random permutation of the row indices and keep only the first M.
# Indexing with just those M indices avoids materialising a shuffled copy of
# the entire validation set.
rng = np.random.default_rng(SAMPLE_SEED)
sample_indices = rng.permutation(val_data_x.shape[0])[:M]
x_test = val_data_x[sample_indices]
y_test = val_data_y[sample_indices]

## 3. Train and Assess the model
* Train `rf_model` on x_train, y_train (train data points)
* Get the predictions for train and test data
* Output different metrics on these predictions such as `rmse, nrmse, R2_score`
* Plot the scatter plots of true values v/s predicted values

Use Random Forest Regressor (change the parameters as required) and store the model in rf_model

In [None]:

# Hyper-parameters are grouped in one place so they are easy to tune;
# random_state pins the forest's own randomness.
rf_params = dict(n_estimators=5, max_depth=10, random_state=0, n_jobs=-1)
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(x_train, y_train)


In [None]:
# Predict on the validation sample first (y_pred), then on the training
# sample (y_pred_train); both are used by the metric and plotting cells.
y_pred, y_pred_train = (rf_model.predict(split) for split in (x_test, x_train))

Compute the metrics for the model (mse, rmse, nrmse, nmse)

In [None]:
# Validation-set error metrics, computed up front and reported together.
testsize = x_test.shape[0]
mse = mean_squared_error(y_test, y_pred)
# Baseline: error of an all-zero prediction, used to normalise the MSE.
mse0 = mean_squared_error(y_test, np.zeros(testsize))
rmse = np.sqrt(mse)
ymean = np.mean(y_pred)
# NOTE(review): nrmse is normalised by the mean of the PREDICTIONS; NRMSE is
# more commonly normalised by the mean of the observed values - confirm intent.
nrmse = rmse/ymean

report = [
    f'Test size: {testsize}',
    f'mean of pred: {ymean}',
    f'mse: {mse}',
    f'rmse: {rmse}',
    f'nmse: {mse/mse0}',
    f'nrmse: {nrmse}',
]
print('\n'.join(report))
print(ymean)

Compute R2 score of the model on Train and Validation data

In [None]:

# Coefficient of determination on the training sample and on the validation sample.
r2_train, r2_val = r2_score(y_train, y_pred_train), r2_score(y_test, y_pred)

print(f'r2_train: {r2_train}')
print(f'r2_validation: {r2_val}')

#### Plots for Accuracy Assessment:

In [None]:
# Scatter plot of Actual vs. Predicted (Training Data), drawn on an explicit Axes.
y_actual = y_train
lowest, highest = min(y_actual), max(y_actual)

fig, ax = plt.subplots()
ax.scatter(y_actual, y_pred_train, s=20, alpha=0.7, edgecolors='w', linewidth=0.5)

# 45-degree reference line: points on it are predicted perfectly.
ax.plot([lowest, highest], [lowest, highest], color='red', linestyle='--')

# Fixed axis range, axis labels and title.
ax.set_xlim(0, 700)
ax.set_ylim(0, 700)
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Scatter plot of Actual vs. Predicted (Training Data)')

plt.show()

In [None]:
# Scatter plot of Actual vs. Predicted (Validation Data), drawn on an explicit Axes.
y_actual = y_test
lowest, highest = min(y_actual), max(y_actual)

fig, ax = plt.subplots()
ax.scatter(y_actual, y_pred, s=20, alpha=0.7, edgecolors='w', linewidth=0.5)

# 45-degree reference line: points on it are predicted perfectly.
ax.plot([lowest, highest], [lowest, highest], color='red', linestyle='--')

# Fixed axis range, axis labels and title.
ax.set_xlim(0, 700)
ax.set_ylim(0, 700)
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Scatter plot of Actual vs. Predicted (Validation Data)')

plt.show()

## 4. Uploading trained model to Google Earth Engine:
1. Serialise the random forest model to a list of strings (where each string represents a decision tree)
2. Convert the list of strings to a `ee.FeatureCollection`
3. Export the `ee.FeatureCollection` to Google Earth Engine as an asset

Serialise random forest to strings and store it in rf_strings

In [None]:

# Serialise the fitted forest with geemap's ml helper; the result is iterated
# below, one entry per exported 'tree' property. feature_names must list the
# model's input columns in training order.
# NOTE(review): processes=6 presumably sets the number of worker processes - confirm.
rf_strings = ml.rf_to_strings(rf_model, feature_names, output_mode='regression', processes=6)


To convert the list of strings to a ee.FeatureCollection :
1. Convert the list of strings to a list of dummy ee.Features (with any ~NON NULL~ geometry and property `tree` set as the string representing the tree)
2. Convert this list of features to a ee.FeatureCollections and store it in fc_string

In [None]:
# Every tree string is attached as the 'tree' property of a copy of one
# placeholder point feature; the geometry itself carries no meaning.
dummy_feature = ee.Feature(ee.Geometry.Point([-114.318, 38.985]))
treeStrings = [dummy_feature.set('tree', dt) for dt in rf_strings]
fc_string = ee.FeatureCollection(treeStrings)

Export the `ee.FeatureCollection` to earth engine asset

In [None]:
# Export the serialised model as a table asset; the task description and the
# asset id are both derived from asset_name.
asset_name = 'rf_demo_7'
task_name = f'{asset_name}_task'
asset_path = f'projects/vatsal-stiti/assets/{asset_name}'
task = ee.batch.Export.table.toAsset(
    collection=fc_string,
    description=task_name,
    assetId=asset_path,
)
task.start()

## 5. Downloading the Model in GEE and using it to infer ET:
To download the model from GEE asset and load it as a classifier use this code snippet in Google Earth Engine Editor:
```
var RandomForestasFeatCollection = ee.FeatureCollection(assetName).aggregate_array('tree').aside(print);
var classifier = ee.Classifier.decisionTreeEnsemble(RandomForestasFeatCollection);
```
Replace `assetName` with the path of the asset that contains the model. The code is available in the script at: `Vatsal_Stiti/FinalPipelineScripts/InferenceETFromModelAsset`. Use this script to predict the ET of any area on a given date.