In [1]:
import pandas as pd
import numpy as np
import shap
import os
import json
from upsetplot import UpSet
import xgboost as xgb
import json
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from collections import defaultdict
from itertools import combinations
import shutil
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import math
import seaborn as sns
from scipy.stats import pearsonr
import matplotlib.ticker as mtick
import matplotlib.ticker as ticker
from scipy.stats import gaussian_kde
from scipy.signal import savgol_filter
from matplotlib import gridspec
from sklearn.metrics import mean_squared_error
from scipy import stats
from datetime import datetime
import cartopy.crs as ccrs
import joypy
from osgeo import gdal
import matplotlib as mpl
from cartopy.mpl.ticker import LatitudeFormatter, LongitudeFormatter
import warnings
warnings.filterwarnings("ignore")

In [2]:
def read_tif(tif_file):
    """Read every band of a raster into a NumPy array.

    Parameters
    ----------
    tif_file : str
        Path handed to ``gdal.Open``.

    Returns
    -------
    tuple
        ``(im_data, im_Geotrans, im_proj, rows, cols)``. ``im_data`` is the
        array returned by ``ReadAsArray``; when that array is 3-D its leading
        axis is moved to the end, so multi-band rasters come back as
        ``(rows, cols, bands)``. 2-D arrays are returned unchanged.

    Raises
    ------
    RuntimeError
        If GDAL returns no dataset for ``tif_file``.
    """
    dataset = gdal.Open(tif_file)
    if dataset is None:
        # Without this check the failure shows up as an AttributeError on None.
        raise RuntimeError(f"GDAL could not open raster: {tif_file}")
    cols = dataset.RasterXSize
    rows = dataset.RasterYSize
    im_proj = dataset.GetProjection()
    im_Geotrans = dataset.GetGeoTransform()
    im_data = dataset.ReadAsArray(0, 0, cols, rows)
    if im_data.ndim == 3:
        # Reorder the array already in memory instead of reading the file again.
        im_data = np.moveaxis(im_data, 0, -1)
    dataset = None  # drop the reference so GDAL closes the file
    return im_data, im_Geotrans, im_proj, rows, cols
    
def array_to_geotiff(array, output_path, geo_transform, projection, band_names=None):
    """Write an array to a Float32 GeoTIFF.

    Parameters
    ----------
    array : numpy.ndarray
        Either ``(rows, cols, bands)`` or a 2-D ``(rows, cols)`` array, which
        is written as a single band.
    output_path : str
        Destination file passed to the GTiff driver.
    geo_transform : sequence
        Geotransform passed to ``SetGeoTransform``.
    projection : str
        Projection passed to ``SetProjection``.
    band_names : sequence of str, optional
        Band descriptions, one per band in band order. Extra names are ignored.

    Raises
    ------
    ValueError
        If ``band_names`` is given but has fewer entries than there are bands.
    RuntimeError
        If the driver does not create the output dataset.
    """
    array = np.asarray(array)
    if array.ndim == 2:
        # Treat a plain 2-D raster as one band instead of failing on unpacking.
        array = array[:, :, np.newaxis]
    rows, cols, num_bands = array.shape

    # Check before creating the file so a bad call leaves no half-written output.
    if band_names and len(band_names) < num_bands:
        raise ValueError(
            f"band_names has {len(band_names)} entries but the array has {num_bands} bands"
        )

    driver = gdal.GetDriverByName('GTiff')
    dataset = driver.Create(output_path, cols, rows, num_bands, gdal.GDT_Float32)
    if dataset is None:
        raise RuntimeError(f"GDAL could not create raster: {output_path}")

    dataset.SetGeoTransform(geo_transform)
    dataset.SetProjection(projection)

    band = None
    for band_num in range(num_bands):
        band = dataset.GetRasterBand(band_num + 1)  # GDAL bands are 1-based
        band.WriteArray(array[:, :, band_num])
        if band_names:
            band.SetDescription(band_names[band_num])
        # Flush after the description is set so both are written together.
        band.FlushCache()

    # Release the band before the dataset that owns it.
    band = None
    dataset = None
    return

def get_corner(image_file):
    """Describe the grid of a raster: projection, pixel size, size and extent.

    Returns ``(im_proj, x_res, y_res, x_size, y_size, (x_min, y_min, x_max, y_max))``.
    ``y_res`` is returned exactly as stored in the geotransform (element 5);
    the extent is derived from the origin (elements 0 and 3), the two pixel
    sizes and the raster dimensions. The rotation terms of the geotransform
    are not used.
    """
    src = gdal.Open(image_file)
    transform = src.GetGeoTransform()

    x_min, x_res = transform[0], transform[1]
    y_max, y_res = transform[3], transform[5]
    x_size, y_size = src.RasterXSize, src.RasterYSize

    # Opposite corner: origin plus pixel size times pixel count.
    x_max = x_min + x_res * x_size
    y_min = y_max + y_res * y_size

    extent = (x_min, y_min, x_max, y_max)
    return src.GetProjection(), x_res, y_res, x_size, y_size, extent

def transfer_lulc_pft(array):
    """Collapse land-cover class codes into plant-functional-type (PFT) codes.

    The input is cast to ``int`` and each listed land-cover code is replaced
    by the code of its group; every other value becomes NaN. The result is a
    float array with the same shape as the input.

    Grouping (land-cover codes -> PFT code):
        CPR: [10, 11, 12, 20]   -> 100
        EBF: [51, 52]           -> 200
        DBF: [61, 62]           -> 300
        ENF: [71, 72]           -> 400
        DNF: [81, 82]           -> 500
        MF:  [91, 92]           -> 600
        SHR: [120, 121, 122]    -> 700
        GRA: [130]              -> 800

    Land-cover legend:
        10 Rainfed cropland; 11 Herbaceous cover cropland;
        12 Tree or shrub cover (Orchard) cropland; 20 Irrigated cropland;
        51/52 Open/Closed evergreen broadleaved forest;
        61/62 Open/Closed deciduous broadleaved forest;
        71/72 Open/Closed evergreen needle-leaved forest;
        81/82 Open/Closed deciduous needle-leaved forest;
        91/92 Open/Closed mixed leaf forest (broadleaved and needle-leaved);
        120 Shrubland; 121 Evergreen shrubland; 122 Deciduous shrubland;
        130 Grassland; 140 Lichens and mosses; 150 Sparse vegetation;
        152 Sparse shrubland; 153 Sparse herbaceous; 181 Swamp; 182 Marsh;
        183 Flooded flat; 184 Saline; 185 Mangrove; 186 Salt marsh;
        187 Tidal flat; 190 Impervious surfaces; 200 Bare areas;
        201 Consolidated bare areas; 202 Unconsolidated bare areas;
        210 Water body; 220 Permanent ice and snow; 0 and 250 Filled value.
    """
    lulc_to_pft = (
        (100, [10, 11, 12, 20]),
        (200, [51, 52]),
        (300, [61, 62]),
        (400, [71, 72]),
        (500, [81, 82]),
        (600, [91, 92]),
        (700, [120, 121, 122]),
        (800, [130]),
    )
    codes = array.astype(int)
    pft = np.full(codes.shape, np.nan)
    for pft_code, members in lulc_to_pft:
        pft[np.isin(codes, members)] = pft_code
    return pft

## 1. Merge all environmental variables together

In [None]:
# Stage 1: for every trait map, resample the land cover, vegetation-index,
# climate, topography and soil rasters onto the trait map's grid, then stack
# traits and ancillary layers into one multi-band GeoTIFF.
data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/"
os.makedirs(f"{data_path}1_seasonal_variation_attribution", exist_ok=True)
out_path = f"{data_path}1_seasonal_variation_attribution/"

os.makedirs(f"{out_path}1_environmental_variables", exist_ok=True)
output_path = f"{out_path}1_environmental_variables/"

os.makedirs(f"{out_path}2_merged_traits_environmental_variables", exist_ok=True)
output_path2 = f"{out_path}2_merged_traits_environmental_variables/"

trait_path = f"{data_path}0_trait_maps/1_clouds_masked/"
lulc_path = f"{data_path}0_trait_maps/0_original_lulc/"
vi_path = "/Volumes/ChenLab-1/Fujiang/0_Seasonal_PRISMA_traits/12_RTM_estimation_through_given_LAI/1_LAI_estimation/9_time_series_analysis/NBAR_refl/1_masked_LAI_data/"
climate_path = "/Volumes/ChenLab-1/Fujiang/0_Seasonal_PRISMA_traits/5_Forcing_data/4_clipped_NEON_extent/"
topo_path = "/Volumes/ChenLab-1/Fujiang/0_Seasonal_PRISMA_traits/8_topographic_data/3_clipped_NEON_extent/"
soil_path = "/Volumes/ChenLab-1/Fujiang/0_Seasonal_PRISMA_traits/9_soil_data/3_clipped_NEON_extent/"


folders = ['D01_BART','D01_HARV','D02_SCBI','D03_OSBS','D07_MLBS','D07_ORNL','D08_TALL','D10_CPER','D13_MOAB','D14_JORN','D16_WREF']
vi_variables = ["NDVI", "NIRv","LAI"]
climate_variables = ['dayl', 'prcp', 'srad', 'tmax', 'tmin', 'vp']
topo_variables = ['Elevation', 'Slope', 'Aspect']
soil_variables = ['bdod', 'cec', 'cfvo', 'clay','nitrogen', 'ocd', 'ocs', 'phh20', 'sand', 'silt', 'soc', 'theta_r','theta_s', 'ksat']

# File-name suffix of each resampled layer, in the same order as `all_files` below.
all_variables = ["lulc","VI",'dayl', 'prcp', 'srad', 'tmax', 'tmin', 'vp','elevation', 'slope', 'aspect','bdod', 
                 'cec', 'cfvo', 'clay','nitrogen', 'ocd', 'ocs', 'phh20', 'sand', 'silt', 'soc', 'theta_r','theta_s', 'ksat']

# Band descriptions of the merged stack: trait bands first, then the PFT band
# derived from land cover, then the ancillary layers in `all_variables` order.
# Defined once here because it does not depend on the site or the file.
band_names = ['Chla+b_mean','Chla+b_std', 'Ccar_mean','Ccar_std','EWT_mean','EWT_std','LMA_mean', 'LMA_std', 
              'Nitrogen_mean','Nitrogen_std',"PFTs", "NDVI", "NIRv","LAI", 'dayl', 'prcp', 'srad', 'tmax', 
              'tmin', 'vp', 'elevation', 'slope', 'aspect','bdod', 'cec', 'cfvo', 'clay','nitrogen', 
              'ocd', 'ocs', 'phh20', 'sand', 'silt', 'soc', 'theta_r','theta_s', 'ksat']

for folder in folders:
    os.makedirs(f"{output_path}{folder}", exist_ok=True)
    os.makedirs(f"{output_path2}{folder}", exist_ok=True)
    
    trait_tif_folder = f"{trait_path}{folder}"
    out_tif_path = f"{output_path}{folder}"
    out_tif_path2 = f"{output_path2}{folder}"

    lulc_folder = f"{lulc_path}{folder}"
    vi_folder = f"{vi_path}{folder}"
    climate_folders = [f"{climate_path}{cv}/{folder}" for cv in climate_variables]
    topo_folder = f"{topo_path}{folder}"
    soil_folder = f"{soil_path}{folder}"
    
    file_name = [x for x in os.listdir(trait_tif_folder)
                 if "_all_data_models_traits_masked.tif" in x and "._" not in x and ".aux.xml" not in x]
    for kk, file in enumerate(file_name):
        print(f"{folder}: {kk+1}/{len(file_name)}")
        # The 4th underscore-separated token starts with the date as YYYYMMDD.
        acquisition = file.split("_")[3]
        year = acquisition[0:4]
        if year == "2023":
            # 2023 scenes use the 2022 land-cover file.
            # NOTE(review): presumably no 2023 land-cover product exists - confirm.
            year = "2022"
        date = f"{acquisition[0:4]}-{acquisition[4:6]}-{acquisition[6:8]}"
        prefix = ('_').join(file.split('_')[0:9])
        
        landuse = f"{lulc_folder}/{folder}_{year}_land_cover.tif"
        vi_file = f"{vi_folder}/{prefix}_LAI_VI_masked.tif"
        climate_files = [f"{climate_folder}/{date}_{climate_variables[i]}.tif" for i, climate_folder in enumerate(climate_folders)]
        topo_files = [f"{topo_folder}/{x}.tif" for x in topo_variables]
        soil_files = [f"{soil_folder}/{x}.tif" for x in soil_variables]
        all_files = [landuse] + [vi_file] + climate_files + topo_files + soil_files
        
        #### resampling.
        proj, x_res, y_res, x_size, y_size, bounds = get_corner(f"{trait_tif_folder}/{file}")
        all_out_tif = []
        for ii, variable in enumerate(all_files):
            input_ds = gdal.Open(variable)
            if input_ds is None:
                # Stop here with the offending path instead of failing inside Warp.
                raise FileNotFoundError(f"GDAL could not open ancillary raster: {variable}")
            out_tif = f"{out_tif_path}/{prefix}_{all_variables[ii]}.tif"
            all_out_tif.append(out_tif)
            # Land cover (index 0) is categorical, so it must not be interpolated;
            # all other layers are resampled bilinearly.
            # NOTE(review): bilinear is also applied to 'aspect', a circular
            # quantity - confirm this is acceptable.
            resample_alg = gdal.GRA_NearestNeighbour if ii == 0 else gdal.GRA_Bilinear
            # NOTE(review): a target resolution (xRes/yRes) and a target size
            # (width/height) are both passed although outputBounds plus
            # width/height already fix the grid - confirm the installed GDAL
            # accepts the combination.
            gdal.Warp(out_tif, input_ds, xRes=x_res, yRes=abs(y_res),dstSRS=proj, outputBounds=bounds, 
                      width=x_size, height=y_size, resampleAlg=resample_alg)
            input_ds = None
            out_tif = None
        
        #### start merging.
        im_data, im_Geotrans, im_proj,im_rows, im_cols = read_tif(f"{trait_tif_folder}/{file}")

        # Collect the layers and concatenate once; concatenating inside the
        # loop copied the growing stack on every iteration.
        layers = [im_data]
        for jj, var_file in enumerate(all_out_tif):
            var_data,var_Geotrans, var_proj,var_rows, var_cols = read_tif(var_file)
            if jj == 0:
                var_data = transfer_lulc_pft(var_data)
            if var_data.ndim == 2:
                # Single-band rasters need a band axis; multi-band ones (the
                # vegetation-index file) already have it.
                var_data = var_data[:, :, np.newaxis]
            layers.append(var_data)
        im_data = np.concatenate(layers, axis=2)
        
        out_name = f"{out_tif_path2}/{prefix}_traits_environmental_variables.tif"
        array_to_geotiff(im_data, out_name, im_Geotrans, im_proj, band_names=band_names)

## 2. Convert data to WGS-84

In [None]:
# Stage 2: reproject every merged stack to geographic coordinates (EPSG:4326).
data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}2_merged_traits_environmental_variables/"
os.makedirs(f"{data_path}3_convert_to_wgs84", exist_ok=True)
out_path = f"{data_path}3_convert_to_wgs84/"

folders = ['D01_BART','D01_HARV','D02_SCBI','D03_OSBS','D07_MLBS','D07_ORNL','D08_TALL','D10_CPER','D13_MOAB','D14_JORN','D16_WREF']

for folder in folders:
    trait_tif_folder = f"{in_path}{folder}"
    out_tif_path = f"{out_path}{folder}"
    os.makedirs(out_tif_path, exist_ok=True)

    # Keep GeoTIFFs only; skip "._" files and .aux.xml sidecars.
    file_name = [
        x for x in os.listdir(trait_tif_folder)
        if ".tif" in x and "._" not in x and ".aux.xml" not in x
    ]
    n_files = len(file_name)
    for position, file in enumerate(file_name, start=1):
        print(f"{folder}: {position}/{n_files}")

        # The output keeps the input file name; only the folder differs.
        input_ds = gdal.Open(f"{trait_tif_folder}/{file}")
        output_ds = gdal.Warp(f"{out_tif_path}/{file}", input_ds, dstSRS='EPSG:4326')

        # Dropping the references lets GDAL flush and close both files.
        input_ds = None
        output_ds = None

## 3. Clip

In [161]:
# Stage 3: clip every reprojected stack to a fixed per-site window.
# Each tuple is passed to gdal.Warp's outputBounds, i.e. (minX, minY, maxX, maxY).
bounds = {
    'D01_BART': (-71.45, 43.95, -71.15, 44.166),
    'D01_HARV': (-72.335, 42.415,-72.025, 42.62),
    'D02_SCBI': (-78.285, 38.785, -77.99, 39.005),
    'D03_OSBS': (-82.17, 29.56, -81.87, 29.79),
    'D07_MLBS': (-80.67, 37.265, -80.38, 37.49),
    'D07_ORNL': (-84.458, 35.815, -84.16, 36.055),
    'D08_TALL': (-87.538, 32.84, -87.245, 33.07),
    'D10_CPER': (-104.905, 40.7, -104.587, 40.93),
    'D13_MOAB': (-109.544, 38.135, -109.247, 38.365),
    'D14_JORN': (-107.0, 32.47, -106.725, 32.706),
    'D16_WREF': (-122.11, 45.73, -121.82, 45.93),
}

data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}3_convert_to_wgs84/"
os.makedirs(f"{data_path}4_clipped_data", exist_ok=True)
out_path = f"{data_path}4_clipped_data/"

folders = ['D01_BART','D01_HARV','D02_SCBI','D03_OSBS','D07_MLBS','D07_ORNL','D08_TALL','D10_CPER','D13_MOAB','D14_JORN','D16_WREF']

for folder in folders:
    trait_tif_folder = f"{in_path}{folder}"
    out_tif_path = f"{out_path}{folder}"
    os.makedirs(out_tif_path, exist_ok=True)

    file_name = [
        x for x in os.listdir(trait_tif_folder)
        if "_traits_environmental_variables.tif" in x and "._" not in x and ".aux.xml" not in x
    ]
    for file in file_name:
        # "<name>.tif" becomes "<name>_clipped.tif" in the output folder.
        gdal.Warp(
            f"{out_tif_path}/{file[:-4]}_clipped.tif",
            f"{trait_tif_folder}/{file}",
            format='GTiff',
            outputBounds=bounds[folder],
        )

## 4. Convert to points and save as `*.csv` files

In [None]:
# Stage 4: flatten every clipped stack into one row per pixel, drop pixels
# without usable traits or land cover, and save one CSV per site.
data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}4_clipped_data/"
os.makedirs(f"{data_path}5_convert_to_point", exist_ok=True)
out_path = f"{data_path}5_convert_to_point/"

# Months (two-digit strings) retained per site; sites not listed use `default_month`.
M1 = ["04","05","06","07","08","09","10","11"]
M2 = ["04","05","06","07","08","09","10"]
M3 = ["05","06","07","08","09"]
M4 = ["05","06","07","08","09","10"]
M5 = ["06","07","08","09"]
folder_month_map = {'D14_SRER': M1, 'D07_ORNL': ["04","06","07","08","09","10"], 'D08_TALL': M2, 
                    'D16_WREF': M2, 'D13_MOAB': M3, 'D19_BONA':M3, "D19_HEAL": M5}
default_month = M4

folders = ['D01_BART','D01_HARV','D02_SCBI','D03_OSBS','D07_MLBS','D07_ORNL','D08_TALL','D10_CPER','D13_MOAB','D14_JORN','D16_WREF']
band_names = ['Chla+b_mean','Chla+b_std', 'Ccar_mean','Ccar_std','EWT_mean','EWT_std','LMA_mean', 'LMA_std', 'Nitrogen_mean','Nitrogen_std',
              "PFTs", "NDVI", "NIRv","LAI", 'dayl', 'prcp', 'srad', 'tmax', 'tmin', 'vp', 'elevation', 'slope', 'aspect','bdod', 'cec', 
              'cfvo', 'clay','nitrogen', 'ocd', 'ocs', 'phh20', 'sand', 'silt', 'soc', 'theta_r','theta_s', 'ksat']
PFTs = {100:"CPR", 200:"EBF", 300:"DBF", 400:"ENF", 500:"DNF", 600:"MF", 700:"SHR", 800:"GRA"}

# Trait bands used to screen pixels. The Nitrogen bands are not part of the screen.
trait_columns = ['Chla+b_mean','Chla+b_std', 'Ccar_mean','Ccar_std','EWT_mean','EWT_std','LMA_mean', 'LMA_std']

for folder in folders:
    trait_tif_folder = f"{in_path}{folder}"
    mon = folder_month_map.get(folder, default_month)
    file_name = os.listdir(trait_tif_folder)
    file_name = [x for x in file_name if "_traits_environmental_variables_clipped.tif" in x and "._" not in x and ".aux.xml" not in x]
    file_name = [f for f in file_name if f.split('_')[3][4:6] in mon]
    
    # Collect the per-image tables and concatenate once per site. A fresh list
    # per site also guarantees that no rows leak in from the previous site.
    site_frames = []
    for kk, file in enumerate(file_name):
        print(f"{folder}: {file} -- {kk+1}/{len(file_name)}")
        
        # The 4th underscore-separated token starts with the date as YYYYMMDD.
        acquisition = file.split('_')[3]
        date = f"{acquisition[0:4]}-{acquisition[4:6]}-{acquisition[6:8]}"
        year = acquisition[0:4]
        month = acquisition[4:6]
        input_tif = f"{trait_tif_folder}/{file}"
        
        im_data, im_Geotrans, im_proj,rows, cols = read_tif(input_tif)
        # One row per pixel, one column per band.
        df = pd.DataFrame(im_data.reshape(-1, im_data.shape[2]), columns=band_names)
    
        # Drop pixels where every screened trait is missing, pixels without a
        # PFT, and pixels where any screened trait or the PFT is <= 0.
        df_cleaned = df.dropna(subset=trait_columns, how="all")
        df_cleaned = df_cleaned.dropna(subset = ["PFTs"], how = "any")
        df_cleaned = df_cleaned[~(df_cleaned[trait_columns + ["PFTs"]] <= 0).any(axis=1)]
        # reset_index returns a new frame, so the assignments below do not
        # write into a slice of `df`.
        df_cleaned = df_cleaned.reset_index(drop = True)
        df_cleaned["tmean"] = (df_cleaned["tmin"] + df_cleaned["tmax"])/2
        
        df_cleaned["site"] = folder[-4:]
        df_cleaned["date"] = date
        df_cleaned["year"] = year
        df_cleaned["month"] = month
        # Replace numeric PFT codes by labels. Codes missing from `PFTs` become
        # NaN and are dropped, as the growing-season point extraction does,
        # instead of raising KeyError.
        df_cleaned["PFTs"] = [PFTs.get(code, np.nan) for code in df_cleaned["PFTs"]]
        df_cleaned = df_cleaned.dropna(subset = ["PFTs"], how = "any")
        site_frames.append(df_cleaned)

    if not site_frames:
        # Previously this case saved the previous site's table under this
        # site's name, or raised NameError for the first site.
        print(f"{folder}: no matching images, nothing saved")
        continue

    print(f"start saving {folder}......")
    data = pd.concat(site_frames, axis = 0, ignore_index = True)
    data.to_csv(f"{out_path}{folder}_plant_trait_points_with_environmental_variables.csv", index = False)

## 5. Seasonal variation attribution

### 5.1 Seasonal variation across NEON sites

In [16]:
# Average the per-pixel tables by acquisition date, one block of rows per site.
data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}5_convert_to_point/"

folders = ['D01_BART','D01_HARV','D02_SCBI','D03_OSBS','D07_MLBS','D07_ORNL','D08_TALL','D10_CPER','D13_MOAB','D14_JORN','D16_WREF']

# Date key plus the trait and climate columns that are averaged.
mean_columns = ['date', 'Chla+b_mean', 'Chla+b_std', 'Ccar_mean', 'Ccar_std', 'EWT_mean','EWT_std', 'LMA_mean','LMA_std', 
                'Nitrogen_mean', 'Nitrogen_std', 'dayl', 'prcp', 'srad', 'tmax', 'tmin','tmean','vp']

site_tables = []
for folder in folders:
    points = pd.read_csv(f"{in_path}{folder}_plant_trait_points_with_environmental_variables.csv")
    per_date = points[mean_columns].groupby("date").mean().reset_index()
    per_date["site"] = folder[-4:]  # the last four characters are the site code
    site_tables.append(per_date)

data = pd.concat(site_tables, axis = 0).reset_index(drop = True)
data.to_csv(f"{in_path}1_seasonal_variation_across_sites_mean_by_date.csv", index = False)

In [34]:
"""
Random Forest (traits together)
"""

def RF_model(X,y):
    """Fit a random-forest regressor on (X, y) and return the fitted model.

    The forest uses 100 trees, a fixed seed (``random_state=0``), all
    available cores, out-of-bag scoring enabled, ``min_samples_split=2`` and
    ``max_depth=150``.
    """
    forest_settings = dict(
        n_estimators=100,
        random_state=0,
        oob_score=True,
        n_jobs=-1,
        min_samples_split=2,
        max_depth=150,
    )
    forest = RandomForestRegressor(**forest_settings)
    forest.fit(X, y)
    return forest

# Per site: min-max scale the date-level means, fit one multi-output random
# forest (climate -> traits) and store its feature importances.
data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}5_convert_to_point/"

data = pd.read_csv(f"{in_path}1_seasonal_variation_across_sites_mean_by_date.csv")

predictor_columns = ['dayl', 'prcp', 'srad', 'tmax', 'tmin', 'tmean', 'vp']
# NOTE(review): LMA_mean is not among the targets - confirm this is intended.
target_columns = ["Chla+b_mean","Ccar_mean","EWT_mean","Nitrogen_mean"]

importance_rows = []
for site in data["site"].unique():
    df = data[data["site"] == site]
    df = df.drop(["date","site"], axis=1)
    df_nor = (df - df.min()) / (df.max() - df.min())
    # A column that is constant within the site scales to 0/0 = NaN, which
    # makes the fit fail. Zero such columns, as the per-site-and-PFT analysis
    # in section 5.2 already does.
    df_nor.loc[:, df_nor.isna().all()] = 0
    X = df_nor[predictor_columns]
    y = df_nor[target_columns]

    RF = RF_model(X,y)
    pred = RF.predict(X)
    # This R2 is computed on the rows the model was fitted on, so it measures
    # fit quality rather than predictive skill.
    r2 = r2_score(y, pred)

    # One row per site: an importance per predictor, plus site and R2.
    FI = pd.DataFrame([RF.feature_importances_], columns=predictor_columns)
    FI["site"] = site
    FI["R2"] = r2
    importance_rows.append(FI)

# Concatenate once instead of growing the table inside the loop.
FI_all = pd.concat(importance_rows, axis = 0).reset_index(drop = True)
FI_all.to_csv(f"{in_path}1_feature_importance_RF.csv", index = False)

In [22]:
"""
Variation partitioning analysis
"""

def calculate_r2(Y, X):
    """Fit a linear regression of Y on X and return R2 on the same data."""
    regression = LinearRegression()
    regression.fit(X, Y)
    fitted_values = regression.predict(X)
    return r2_score(Y, fitted_values)

def calculate_combination_r2(Y, groups, combination):
    """Return the R2 of Y regressed on the predictor groups in `combination`.

    `groups` maps a group name to a DataFrame of predictors. The frames of the
    selected groups are joined column-wise, in the order given by
    `combination`, and passed to `calculate_r2`.
    """
    selected_frames = [groups[name] for name in combination]
    predictors = pd.concat(selected_frames, axis = 1)
    return calculate_r2(Y, predictors)

# Per site: regress the scaled traits on every non-empty combination of the
# four climate predictor groups and record the R2 of each combination.
data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}5_convert_to_point/"

data = pd.read_csv(f"{in_path}1_seasonal_variation_across_sites_mean_by_date.csv")

saved_results = {}
for site in data["site"].unique():
    df = data[data["site"] == site]
    df = df.drop(["date","site"], axis=1)
    df_nor = (df - df.min()) / (df.max() - df.min())
    # A column that is constant within the site scales to 0/0 = NaN, which
    # makes the regression fail. Zero such columns, as the per-site-and-PFT
    # analysis in section 5.2 already does.
    df_nor.loc[:, df_nor.isna().all()] = 0
    X = df_nor[['dayl', 'prcp', 'srad', 'tmax', 'tmin', 'tmean', 'vp']]
    groups = {'solar radiation':X[['dayl', "srad"]], 'precipitation':X[['prcp']], 
              'temperature':X[['tmax','tmin','tmean']], 'vapor pressure':X[['vp']]}

    y = df_nor[["Chla+b_mean","Ccar_mean","EWT_mean","Nitrogen_mean"]]
    
    all_groups = list(groups.keys())

    # Despite its name, `shared_r2` stores the R2 of the model fitted on each
    # combination of groups, keyed by the tuple of group names. It does not
    # hold unique or shared fractions.
    shared_r2 = {}
    for i in range(1, len(all_groups) + 1):
        for combo in combinations(all_groups, i):
            shared_r2[combo] = calculate_combination_r2(y, groups, combo)

    # The last combination is the full model, so its R2 is reused here rather
    # than fitting the same regression a second time.
    R2_total = shared_r2[tuple(all_groups)]
    residual_variance = 1 - R2_total
    saved_results[site] = [shared_r2, residual_variance]


def convert_keys_to_strings(data):
    """Return a copy of `data` in which every dict key is replaced by `str(key)`.

    Dicts and lists are walked recursively. Any other value, including
    tuples, is returned unchanged.
    """
    if isinstance(data, list):
        return [convert_keys_to_strings(element) for element in data]
    if not isinstance(data, dict):
        return data
    converted = {}
    for key, value in data.items():
        converted[str(key)] = convert_keys_to_strings(value)
    return converted

# Tuple keys cannot be serialised to JSON, so they are stringified first.
converted_saved_file = convert_keys_to_strings(saved_results)
with open(f"{in_path}1_feature_importance_PA.json", mode='w') as file:
    json.dump(converted_saved_file, fp=file, indent=4)

### 5.2 Seasonal variation across NEON sites and PFTs

In [200]:
# Average the per-pixel tables by acquisition date separately for every PFT
# of every site.
data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}5_convert_to_point/"

folders = ['D01_BART','D01_HARV','D02_SCBI','D03_OSBS','D07_MLBS','D07_ORNL','D08_TALL','D10_CPER','D13_MOAB','D14_JORN','D16_WREF']

# Date key plus the trait and climate columns that are averaged.
mean_columns = ['date', 'Chla+b_mean', 'Chla+b_std', 'Ccar_mean', 'Ccar_std', 'EWT_mean','EWT_std', 'LMA_mean','LMA_std', 
                'Nitrogen_mean', 'Nitrogen_std', 'dayl', 'prcp', 'srad', 'tmax', 'tmin','tmean','vp']

site_tables = []
for folder in folders:
    df = pd.read_csv(f"{in_path}{folder}_plant_trait_points_with_environmental_variables.csv")

    # A fresh list per site guarantees that no rows leak in from the previous site.
    pft_tables = []
    for pft in df["PFTs"].unique():
        df_temp = df.loc[df["PFTs"] == pft, mean_columns]
        grouped_mean = df_temp.groupby("date").mean().reset_index()
        grouped_mean["PFTs"] = pft
        pft_tables.append(grouped_mean)

    if not pft_tables:
        # Previously a site without rows re-used the previous site's table
        # (relabelled with this site's code), or raised NameError for the
        # first site.
        print(f"{folder}: no PFT rows, skipped")
        continue

    data = pd.concat(pft_tables, axis = 0)
    data["site"] = folder[-4:]  # the last four characters are the site code
    site_tables.append(data)

final_data = pd.concat(site_tables, axis = 0).reset_index(drop = True)
final_data.to_csv(f"{in_path}2_seasonal_variation_across_sites_and_PFTs.csv", index = False)

In [37]:
"""
Random Forest
"""

def RF_model(X,y):
    """Fit a random-forest regressor on (X, y) and return the fitted model.

    The forest uses 100 trees, a fixed seed (``random_state=0``), all
    available cores, out-of-bag scoring enabled, ``min_samples_split=2`` and
    ``max_depth=150``.
    """
    forest_settings = dict(
        n_estimators=100,
        random_state=0,
        oob_score=True,
        n_jobs=-1,
        min_samples_split=2,
        max_depth=150,
    )
    forest = RandomForestRegressor(**forest_settings)
    forest.fit(X, y)
    return forest


# Per site and PFT: min-max scale the date-level means, fit one multi-output
# random forest (climate -> traits) and store its feature importances.
data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}5_convert_to_point/"

data = pd.read_csv(f"{in_path}2_seasonal_variation_across_sites_and_PFTs.csv")

predictor_columns = ['dayl', 'prcp', 'srad', 'tmax', 'tmin', 'tmean', 'vp']
target_columns = ["Chla+b_mean","Ccar_mean","EWT_mean","Nitrogen_mean"]
# A site/PFT group is modelled only when it has more than this many dates.
min_dates = 4

importance_rows = []
for site in data["site"].unique():
    for PFT in data["PFTs"].unique():
        df = data[(data["site"] == site)&(data["PFTs"] == PFT)]
        if len(df) <= min_dates:
            continue
        df = df.drop(["date","site", "PFTs"], axis=1)
        df_nor = (df - df.min()) / (df.max() - df.min())
        # Columns that are constant within the group scale to NaN; zero them.
        df_nor.loc[:, df_nor.isna().all()] = 0
        X = df_nor[predictor_columns]
        y = df_nor[target_columns]

        RF = RF_model(X,y)
        pred = RF.predict(X)
        # This R2 is computed on the rows the model was fitted on.
        r2 = r2_score(y, pred)

        # One row per group: an importance per predictor, plus identifiers and R2.
        FI = pd.DataFrame([RF.feature_importances_], columns=predictor_columns)
        FI["site"] = site
        FI["PFTs"] = PFT
        FI["R2"] = r2
        importance_rows.append(FI)

if not importance_rows:
    # Previously this case failed with a NameError on FI_all.
    raise ValueError(f"no site/PFT group has more than {min_dates} dates")

# Concatenate once instead of growing the table inside the loop.
FI_all = pd.concat(importance_rows, axis = 0).reset_index(drop = True)
FI_all.to_csv(f"{in_path}2_feature_importance_across_sites&PFTs_RF.csv", index = False)

In [24]:
"""
Variation partitioning analysis
"""

def calculate_r2(Y, X):
    """Fit a linear regression of Y on X and return R2 on the same data."""
    regression = LinearRegression()
    regression.fit(X, Y)
    fitted_values = regression.predict(X)
    return r2_score(Y, fitted_values)

def calculate_combination_r2(Y, groups, combination):
    """Return the R2 of Y regressed on the predictor groups in `combination`.

    `groups` maps a group name to a DataFrame of predictors. The frames of the
    selected groups are joined column-wise, in the order given by
    `combination`, and passed to `calculate_r2`.
    """
    selected_frames = [groups[name] for name in combination]
    predictors = pd.concat(selected_frames, axis = 1)
    return calculate_r2(Y, predictors)

# Per site and PFT: regress the scaled traits on every non-empty combination
# of the four climate predictor groups and record each combination's R2.
data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}5_convert_to_point/"

data = pd.read_csv(f"{in_path}2_seasonal_variation_across_sites_and_PFTs.csv")
start_var = True
saved_results = {}
for site in data["site"].unique():
    for PFT in data["PFTs"].unique():
        df = data[(data["site"] == site)&(data["PFTs"] == PFT)]
        if not len(df) > 4:
            # Groups with four dates or fewer are skipped.
            continue

        df = df.drop(["date","site","PFTs"], axis=1)
        df_nor = (df - df.min()) / (df.max() - df.min())
        # Columns that are constant within the group scale to NaN; zero them.
        df_nor.loc[:, df_nor.isna().all()] = 0

        X = df_nor[['dayl', 'prcp', 'srad', 'tmax', 'tmin', 'tmean', 'vp']]
        y = df_nor[["Chla+b_mean","Ccar_mean","EWT_mean","Nitrogen_mean"]]
        groups = {
            'solar radiation': X[['dayl', "srad"]],
            'precipitation': X[['prcp']],
            'temperature': X[['tmax','tmin','tmean']],
            'vapor pressure': X[['vp']],
        }
        all_groups = list(groups.keys())

        # R2 of the model fitted on each combination of groups, keyed by the
        # tuple of group names.
        shared_r2 = {
            combo: calculate_combination_r2(y, groups, combo)
            for size in range(1, len(all_groups) + 1)
            for combo in combinations(all_groups, size)
        }

        # Variance left unexplained by the model that uses all four groups.
        R2_total = calculate_combination_r2(y, groups, all_groups)
        residual_variance = 1 - R2_total
        saved_results[f"{site} & {PFT}"] = [shared_r2, residual_variance]


def convert_keys_to_strings(data):
    """Return a copy of `data` in which every dict key is replaced by `str(key)`.

    Dicts and lists are walked recursively. Any other value, including
    tuples, is returned unchanged.
    """
    if isinstance(data, list):
        return [convert_keys_to_strings(element) for element in data]
    if not isinstance(data, dict):
        return data
    converted = {}
    for key, value in data.items():
        converted[str(key)] = convert_keys_to_strings(value)
    return converted

# Tuple keys cannot be serialised to JSON, so they are stringified first.
converted_saved_file = convert_keys_to_strings(saved_results)
with open(f"{in_path}2_feature_importance_across_sites&PFTs_PA.json", mode='w') as file:
    json.dump(converted_saved_file, fp=file, indent=4)

### 5.3 Spatial variation across NEON sites

In [None]:
# Three two-digit months per site. The seasonal mean images are computed for
# exactly these months, in this order.
pheno = {
    "D01_BART": ['05', '08', '10'],
    "D01_HARV": ['05', '07', '10'],
    "D02_SCBI": ['05', '08', '10'],
    "D03_OSBS": ['05', '06', '10'],
    "D07_MLBS": ['05', '08', '10'],
    "D07_ORNL": ['04', '06', '10'],
    "D08_TALL": ['04', '07', '10'],
    "D10_CPER": ['05', '06', '10'],
    "D13_MOAB": ['05', '07', '09'],
    "D14_JORN": ['05', '08', '10'],
    "D16_WREF": ['04', '07', '10'],
}

def nanmean_images(image_list):
    """Average equally shaped arrays element-wise, ignoring NaN values.

    The arrays are stacked along a new last axis and reduced with
    ``np.nanmean`` over that axis, so the result has the shape of one input.
    """
    return np.nanmean(np.stack(image_list, axis=-1), axis=-1)
def get_corner_coordinates(geotrans, cols, rows):
    """Return ``(ul_x, ul_y, lr_x, lr_y)`` for a raster.

    The upper-left corner is the geotransform origin (elements 0 and 3). The
    lower-right corner applies the full affine transform, including the
    rotation terms (elements 2 and 4), to pixel ``(cols, rows)``.
    """
    origin_x, origin_y = geotrans[0], geotrans[3]
    far_x = origin_x + cols * geotrans[1] + rows * geotrans[2]
    far_y = origin_y + cols * geotrans[4] + rows * geotrans[5]
    return origin_x, origin_y, far_x, far_y

# Build one mean image per site and per month listed in `pheno`, from the
# merged (not yet reprojected) trait + environment stacks.
data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}2_merged_traits_environmental_variables/"
os.makedirs(f"{data_path}6_spatial_variation_attribution", exist_ok=True)
out_path = f"{data_path}6_spatial_variation_attribution/"

folders = ['D01_BART','D01_HARV','D02_SCBI','D03_OSBS','D07_MLBS','D07_ORNL','D08_TALL','D10_CPER','D13_MOAB','D14_JORN','D16_WREF']
for folder in folders:
    os.makedirs(f"{out_path}{folder}", exist_ok=True)
    trait_tif_folder = f"{in_path}{folder}"
    out_tif_path = f"{out_path}{folder}"
    mon = pheno[folder]
    file_name = os.listdir(trait_tif_folder)
    file_name = [x for x in file_name if "traits_environmental_variables.tif" in x and "._" not in x and ".aux.xml" not in x]
    # Pair every file with its month (characters 4:6 of the 4th
    # underscore-separated token of the file name).
    lists = []
    for file in file_name:
        month = file.split('_')[3][4:6]
        dicts = (month, file)
        lists.append(dicts)
    # Group the file names by month: {month: [file, ...]}.
    dictionary = {}
    for key, value in lists:
        if key in dictionary:
            dictionary[key].append(value)
        else:
            dictionary[key] = [value]
    keys = mon
    for key in keys:
        # Raises KeyError when the site has no image for a month listed in `pheno`.
        values = dictionary[key]
        
        # Union extent of all images of this month.
        ul_x_all,lr_y_all, lr_x_all, ul_y_all = [],[],[],[]
        for image in values:
            im_data, im_Geotrans, im_proj,rows, cols = read_tif(f"{trait_tif_folder}/{image}")
            ul_x, ul_y, lr_x, lr_y = get_corner_coordinates(im_Geotrans, cols, rows)
            ul_x_all.append(ul_x)
            lr_y_all.append(lr_y)
            lr_x_all.append(lr_x)
            ul_y_all.append(ul_y)
        ul_x, lr_y, lr_x, ul_y = min(ul_x_all),min(lr_y_all), max(lr_x_all), max(ul_y_all)
        
        # Warp every image onto the union extent. The temporary copies are
        # written to the output folder under the original file names.
        for image in values:
            input_tif = f"{trait_tif_folder}/{image}"
            output_tif = f"{out_tif_path}/{image}"
            gdal.Warp(output_tif, input_tif, format = 'GTiff', outputBounds=(ul_x, lr_y, lr_x, ul_y))
        
        image_list = []   
        for image in values:
            im_data, im_Geotrans, im_proj,rows, cols = read_tif(f"{out_tif_path}/{image}")
            # Every value <= 0, in every band, is treated as missing.
            # NOTE(review): presumably this removes the fill values that the
            # warp writes outside each image's footprint - confirm. It also
            # masks environmental bands ('prcp', 'tmin', 'slope', 'aspect',
            # 'NDVI', ...) wherever they are legitimately zero or negative,
            # which would bias their means - confirm this is intended.
            im_data = np.where(im_data <= 0, np.nan, im_data)
            print(folder, key, image, im_data.shape)
            image_list.append(im_data)
        # NaN-aware per-pixel, per-band mean over the images of this month.
        mean_image = nanmean_images(image_list)
        out_tif = f"{out_tif_path}/{key}.tif"
        band_names = ['Chla+b_mean','Chla+b_std', 'Ccar_mean','Ccar_std','EWT_mean','EWT_std','LMA_mean', 'LMA_std', 
                      'Nitrogen_mean','Nitrogen_std',"PFTs", "NDVI", "NIRv","LAI", 'dayl', 'prcp', 'srad', 'tmax', 
                      'tmin', 'vp', 'elevation', 'slope', 'aspect','bdod', 'cec', 'cfvo', 'clay','nitrogen', 
                      'ocd', 'ocs', 'phh20', 'sand', 'silt', 'soc', 'theta_r','theta_s', 'ksat']
        # The georeferencing comes from the last warped image that was read.
        array_to_geotiff(mean_image, out_tif, im_Geotrans, im_proj, band_names=band_names)
        out_tif = None
        # Remove the temporary warped copies; only "<month>.tif" remains.
        for image in values:
            os.remove(f"{out_tif_path}/{image}") 

In [7]:
# Reproject the seasonal mean images to EPSG:4326, writing "<name>_wgs84.tif"
# next to each input.
data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}6_spatial_variation_attribution/"
out_path = f"{data_path}6_spatial_variation_attribution/"

folders = ['D01_BART','D01_HARV','D02_SCBI','D03_OSBS','D07_MLBS','D07_ORNL','D08_TALL','D10_CPER','D13_MOAB','D14_JORN','D16_WREF']

for folder in folders:
    trait_tif_folder = f"{in_path}{folder}"
    out_tif_path = f"{out_path}{folder}"
    
    # Input and output share one folder. Files that already carry "_wgs84"
    # (outputs of an earlier run, and their clipped versions) must be skipped,
    # otherwise a re-run reprojects them again ("05_wgs84_wgs84.tif", ...) and
    # the later clip and point-extraction steps pick up duplicates.
    file_name = os.listdir(trait_tif_folder)
    file_name = [x for x in file_name
                 if x.endswith(".tif") and "_wgs84" not in x and "._" not in x and ".aux.xml" not in x]
    for file in file_name:
        in_tif = f"{trait_tif_folder}/{file}"
        out_tif = f"{out_tif_path}/{file[:-4]}_wgs84.tif"

        input_ds = gdal.Open(in_tif)
        output_ds = gdal.Warp(out_tif, input_ds, dstSRS='EPSG:4326')
        
        # Dropping the references lets GDAL flush and close both files.
        input_ds = None
        output_ds = None

# Clip the reprojected seasonal images to the fixed per-site windows.
# Each tuple is passed to gdal.Warp's outputBounds, i.e. (minX, minY, maxX, maxY).
bounds = {
    'D01_BART': (-71.45, 43.95, -71.15, 44.166),
    'D01_HARV': (-72.335, 42.415,-72.025, 42.62),
    'D02_SCBI': (-78.285, 38.785, -77.99, 39.005),
    'D03_OSBS': (-82.17, 29.56, -81.87, 29.79),
    'D07_MLBS': (-80.67, 37.265, -80.38, 37.49),
    'D07_ORNL': (-84.458, 35.815, -84.16, 36.055),
    'D08_TALL': (-87.538, 32.84, -87.245, 33.07),
    'D10_CPER': (-104.905, 40.7, -104.587, 40.93),
    'D13_MOAB': (-109.544, 38.135, -109.247, 38.365),
    'D14_JORN': (-107.0, 32.47, -106.725, 32.706),
    'D16_WREF': (-122.11, 45.73, -121.82, 45.93),
}

data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}6_spatial_variation_attribution/"
out_path = f"{data_path}6_spatial_variation_attribution/"

folders = ['D01_BART','D01_HARV','D02_SCBI','D03_OSBS','D07_MLBS','D07_ORNL','D08_TALL','D10_CPER','D13_MOAB','D14_JORN','D16_WREF']

for folder in folders:
    trait_tif_folder = f"{in_path}{folder}"
    out_tif_path = f"{out_path}{folder}"

    # The listing is taken before any output is written; outputs land in the
    # same folder as the inputs.
    file_name = [
        x for x in os.listdir(trait_tif_folder)
        if "_wgs84.tif" in x and "._" not in x and ".aux.xml" not in x
    ]
    for file in file_name:
        # "<name>_wgs84.tif" becomes "<name>_wgs84_clipped.tif".
        gdal.Warp(
            f"{out_tif_path}/{file[:-4]}_clipped.tif",
            f"{trait_tif_folder}/{file}",
            format='GTiff',
            outputBounds=bounds[folder],
        )

In [27]:
# Flatten the clipped seasonal mean images into one table with one row per
# pixel, labelled with site and growing-season stage.
data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}6_spatial_variation_attribution/"
out_path = f"{data_path}5_convert_to_point/"

# For each site the three months correspond, position by position, to `seasons`.
pheno = {"D01_BART": ['05', '08', '10'], "D01_HARV": ['05', '07', '10'], "D02_SCBI": ['05', '08', '10'], "D03_OSBS": ['05', '06', '10'], 
         "D07_MLBS": ['05', '08', '10'], "D07_ORNL": ['04', '06', '10'], "D08_TALL": ['04', '07', '10'], "D10_CPER": ['05', '06', '10'], 
         "D13_MOAB": ['05', '07', '09'], "D14_JORN": ['05', '08', '10'], "D16_WREF": ['04', '07', '10']}
seasons = ["Early growing season", "Peak growing season", "Post-peak growing season"]
folders = ['D01_BART','D01_HARV','D02_SCBI','D03_OSBS','D07_MLBS','D07_ORNL','D08_TALL','D10_CPER','D13_MOAB','D14_JORN','D16_WREF']
band_names = ['Chla+b_mean','Chla+b_std', 'Ccar_mean','Ccar_std','EWT_mean','EWT_std','LMA_mean', 'LMA_std', 'Nitrogen_mean','Nitrogen_std',
              "PFTs", "NDVI", "NIRv","LAI", 'dayl', 'prcp', 'srad', 'tmax', 'tmin', 'vp', 'elevation', 'slope', 'aspect','bdod', 'cec', 
              'cfvo', 'clay','nitrogen', 'ocd', 'ocs', 'phh20', 'sand', 'silt', 'soc', 'theta_r','theta_s', 'ksat']
PFTs = {100:"CPR", 200:"EBF", 300:"DBF", 400:"ENF", 500:"DNF", 600:"MF", 700:"SHR", 800:"GRA"}

# Trait bands used to screen pixels. The Nitrogen bands are not part of the screen.
trait_columns = ['Chla+b_mean','Chla+b_std', 'Ccar_mean','Ccar_std','EWT_mean','EWT_std','LMA_mean', 'LMA_std']

# Collect the per-image tables and concatenate once at the end.
season_frames = []
for folder in folders:
    trait_tif_folder = f"{in_path}{folder}"
    file_name = os.listdir(trait_tif_folder)
    file_name = [x for x in file_name if "_clipped.tif" in x and "._" not in x and ".aux.xml" not in x]
    
    for kk, file in enumerate(file_name):
        print(f"{folder}: {file} -- {kk+1}/{len(file_name)}")
        
        # Seasonal files are named "<month>_wgs84_clipped.tif".
        month = file.split("_")[0]
        
        input_tif = f"{trait_tif_folder}/{file}"
        im_data, im_Geotrans, im_proj,rows, cols = read_tif(input_tif)
        # One row per pixel, one column per band.
        df = pd.DataFrame(im_data.reshape(-1, im_data.shape[2]), columns=band_names)
    
        # Drop pixels where every screened trait is missing, pixels without a
        # PFT, and pixels where any screened trait or the PFT is <= 0.
        df_cleaned = df.dropna(subset=trait_columns, how="all")
        df_cleaned = df_cleaned.dropna(subset = ["PFTs"], how = "any")
        df_cleaned = df_cleaned[~(df_cleaned[trait_columns + ["PFTs"]] <= 0).any(axis=1)]
        # reset_index returns a new frame, so the assignments below do not
        # write into a slice of `df`.
        df_cleaned = df_cleaned.reset_index(drop = True)
        df_cleaned["tmean"] = (df_cleaned["tmin"] + df_cleaned["tmax"])/2
        
        df_cleaned["site"] = folder[-4:]
        # The position of the month in the site's `pheno` list selects the
        # season label. This raises ValueError for a month that is not listed.
        df_cleaned["season"] = seasons[pheno[folder].index(month)]
        # The PFT band here is an average over several images, so it can hold
        # values that are not valid class codes; those rows are dropped.
        df_cleaned["PFTs"] = [PFTs.get(code, np.nan) for code in df_cleaned["PFTs"]]
        df_cleaned = df_cleaned.dropna(subset = ["PFTs"], how = "any")
        
        season_frames.append(df_cleaned)

if not season_frames:
    # Previously this case failed with a NameError on `data`.
    raise ValueError(f"no '_clipped.tif' images found under {in_path}")

print(f"start saving......")
data = pd.concat(season_frames, axis = 0, ignore_index = True)
data.to_csv(f"{out_path}3_growing_seasons_plant_trait_points_with_environmental_variables.csv", index = False)

In [37]:
"""
Random Forest
"""

def RF_model(X, y, n_estimators=100, max_depth=150, random_state=0, n_jobs=-1):
    """Fit a random-forest regressor on (X, y) and return the fitted estimator.

    Parameters
    ----------
    X : feature table passed straight to ``RandomForestRegressor.fit``.
    y : target column(s) passed straight to ``RandomForestRegressor.fit``.
    n_estimators : number of trees (default 100).
    max_depth : maximum tree depth (default 150).
    random_state : seed that makes the fit reproducible (default 0).
    n_jobs : parallel workers, ``-1`` uses all cores (default -1).

    Returns
    -------
    The fitted ``RandomForestRegressor``; out-of-bag scoring is enabled, so
    ``oob_score_`` is available on the returned object.
    """
    RF = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state, oob_score=True,
                               n_jobs=n_jobs, min_samples_split=2, max_depth=max_depth)
    RF.fit(X, y)
    return RF

data_path = "/Users/fji/Desktop/NBAR_refl_with_LAI/1_seasonal_variation_attribution/"
in_path = f"{data_path}5_convert_to_point/"
data = pd.read_csv(f"{in_path}3_growing_seasons_plant_trait_points_with_environmental_variables.csv")

# Predictor columns, defined once so the model input and the labels of the
# importance table cannot drift apart.
feature_cols = ['dayl', 'prcp', 'srad', 'tmax', 'tmin', 'tmean', 'vp', 'elevation', 'slope', 'aspect', 'bdod', 'cec', 'cfvo', 'clay',
                'nitrogen', 'ocd', 'ocs', 'phh20', 'sand', 'silt', 'soc', 'theta_r','theta_s', 'ksat']
# Trait columns predicted jointly by one multi-output forest.
target_cols = ["Chla+b_mean","Ccar_mean","EWT_mean","Nitrogen_mean"]

# One single-row importance frame per (site, season); concatenated once after the loop.
FI_frames = []
for site in data["site"].unique():
    for season in data["season"].unique():
        df = data[(data["site"] == site)&(data["season"] == season)]
        print(site, season, len(df))
        df = df.drop(["site", "PFTs", "season"], axis=1)
        # Min-max scale every column within this site/season subset.
        df_nor = (df - df.min()) / (df.max() - df.min())
        # Columns that are entirely NaN after scaling (constant or all-missing
        # columns, where the range is 0 or NaN) are set to 0 so that the row-wise
        # dropna below does not discard the whole subset.
        df_nor.loc[:, df_nor.isna().all()] = 0
        df_nor = df_nor.dropna()
        if df_nor.empty:
            # Nothing left to fit; skip instead of failing inside RF.fit.
            print(f"skip {site} {season}: no complete rows")
            continue

        X = df_nor[feature_cols]
        y = df_nor[target_cols]

        RF = RF_model(X,y)
        FI = pd.DataFrame([RF.feature_importances_], columns=feature_cols)
        FI["site"] = site
        FI["season"] = season
        FI_frames.append(FI)

if not FI_frames:
    raise ValueError("no site/season subset produced a fitted model")

FI_all = pd.concat(FI_frames, axis = 0, ignore_index = True)
FI_all.to_csv(f"{in_path}3_feature_importance_growing_seasons_RF.csv", index = False)

BART Post-peak growing season 388497
BART Peak growing season 289240
BART Early growing season 500033
HARV Post-peak growing season 434730
HARV Peak growing season 506655
HARV Early growing season 502502
SCBI Post-peak growing season 500487
SCBI Peak growing season 558636
SCBI Early growing season 549445
OSBS Post-peak growing season 547534
OSBS Peak growing season 591829
OSBS Early growing season 575030
MLBS Post-peak growing season 455872
MLBS Peak growing season 579654
MLBS Early growing season 630074
ORNL Post-peak growing season 596691
ORNL Peak growing season 576515
ORNL Early growing season 508253
TALL Post-peak growing season 563873
TALL Peak growing season 235278
TALL Early growing season 525445
CPER Post-peak growing season 670051
CPER Peak growing season 684679
CPER Early growing season 701715
MOAB Post-peak growing season 544889
MOAB Peak growing season 599119
MOAB Early growing season 566701
JORN Post-peak growing season 682967
JORN Peak growing season 686061
JORN Early gr