In [46]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [20]:
# Load the paired leaf-trait / leaf-spectra table that the extraction cells filter.
data = pd.read_csv("../0_datasets/Paired leaf traits and leaf spectra dataset.csv")
# Replace missing instrument labels with '' so the .str.contains() instrument
# filters get a boolean for every row. The result is assigned back rather than
# calling fillna(inplace=True) on the column selection: that chained in-place
# form is deprecated and leaves `data` unchanged under pandas Copy-on-Write.
data['Instruments'] = data['Instruments'].fillna('')

### Chla+b

In [25]:
# Keep rows with a positive Chla+b value, print how many come from each
# instrument, then narrow to instruments whose name contains "ASD".
has_trait = data["Chla+b"] > 0
df = data.loc[has_trait]
instrument_counts = df["Instruments"].value_counts()
print(instrument_counts)
df = df.loc[df["Instruments"].str.contains("ASD")]

ASD FieldSpec 4                2949
ASD FieldSpec 3                1965
SVC HR-1024i                    560
PSR 3500+                       402
PSR+                            368
Lambda-19 Spectrophotometer     320
ASD FieldSpec                   276
Name: Instruments, dtype: int64


In [35]:
### sites dataset extraction
# Draw `sample_size` rows from every site that has more than `sample_size`
# rows, stack the samples and write them to the Chla+b sites CSV.
sample_size = 200
# One seeded generator for the whole cell so a re-run reproduces the same draw.
rng = np.random.RandomState(42)
site_samples = []
# sort=False keeps the sites in order of first appearance in df.
for site, temp in df.groupby("Site ID", sort=False):
    if len(temp)>sample_size:
        print(site, len(temp))
        # NOTE(review): replace=True is kept from the original, so one row can
        # be drawn more than once even though the group is larger than
        # sample_size -- confirm that duplicate rows are intended.
        site_samples.append(temp.sample(n=sample_size, replace=True, random_state=rng))
if not site_samples:
    # Fail loudly instead of exporting a df_final left over from another cell.
    raise ValueError(f"no site has more than {sample_size} rows; nothing to export")
# Single concat instead of growing the frame inside the loop.
df_final = pd.concat(site_samples, axis = 0).reset_index(drop = True)
df_final['ID'] = np.arange(len(df_final))
df_final.to_csv('../0_datasets/Chla+b_dataset_sites.csv', index = False)

Site#2 276
Site#3 397
Site#4 211
Site#67 1210
Site#83 1131
Site#5 831
Site#89 224
Site#104 734


In [43]:
### PFT dataset extraction
# Draw `sample_size` rows from every PFT value that has more than
# `sample_size` rows, stack the samples and write them to the Chla+b PFT CSV.
sample_size = 1000
# One seeded generator for the whole cell so a re-run reproduces the same draw.
rng = np.random.RandomState(42)
pft_samples = []
# sort=False keeps the PFT values in order of first appearance in df.
for pft, temp in df.groupby("PFT", sort=False):
    if len(temp)>sample_size:
        print(pft,len(temp))
        # NOTE(review): replace=True is kept from the original, so one row can
        # be drawn more than once even though the group is larger than
        # sample_size -- confirm that duplicate rows are intended.
        pft_samples.append(temp.sample(n=sample_size, replace=True, random_state=rng))
if not pft_samples:
    # Fail loudly instead of exporting a df_final left over from another cell.
    raise ValueError(f"no PFT has more than {sample_size} rows; nothing to export")
# Single concat instead of growing the frame inside the loop.
df_final = pd.concat(pft_samples, axis = 0).reset_index(drop = True)
df_final['ID'] = np.arange(len(df_final))
df_final.to_csv('../0_datasets/Chla+b_dataset_PFT.csv', index = False)

Deciduous broadleaf forests 1078
Grasslands 1629
Croplands 2341


In [56]:
### temporal dataset extraction
# Label every row of Dataset#3, Dataset#4 and Dataset#8 with a season derived
# from its day of year (DOY), then write the table to the Chla+b temporal CSV.
# Per dataset: (last DOY labelled early growing season,
#               first DOY labelled post-peak season).
season_bounds = {
    "Dataset#3": (165, 240),
    "Dataset#4": (151, 243),
    "Dataset#8": (175, 243),
}
# .copy() makes seasonal_df an independent frame, so the column assignments
# below never act on a slice of df (the notebook-wide warnings filter would
# otherwise hide the resulting SettingWithCopyWarning).
seasonal_df = df[df["Dataset ID"].isin(list(season_bounds))].copy()
seasonal_df["Sample date"] = seasonal_df["Sample date"].astype(int).astype(str)
seasonal_df["DOY"] = [datetime.strptime(x, "%Y%m%d").timetuple().tm_yday for x in seasonal_df["Sample date"].tolist()]

for dataset_id, (early_end, post_start) in season_bounds.items():
    in_dataset = seasonal_df["Dataset ID"] == dataset_id
    doy = seasonal_df["DOY"]
    # Both boundary days are inclusive for the outer seasons; peak is strictly between.
    seasonal_df.loc[in_dataset & (doy <= early_end), 'season'] = 'early growing season'
    seasonal_df.loc[in_dataset & (doy >= post_start), 'season'] = 'post-peak season'
    seasonal_df.loc[in_dataset & (doy > early_end) & (doy < post_start), 'season'] = 'peak growing season'

seasonal_df.reset_index(drop = True,inplace = True)
seasonal_df['ID'] = np.arange(len(seasonal_df))
seasonal_df.to_csv('../0_datasets/Chla+b_dataset_temporal.csv', index = False)

### Ccar

In [77]:
# Keep rows with a positive Ccar value, print how many come from each
# instrument, then narrow to instruments whose name contains "ASD".
has_trait = data["Ccar"] > 0
df = data.loc[has_trait]
instrument_counts = df["Instruments"].value_counts()
print(instrument_counts)
df = df.loc[df["Instruments"].str.contains("ASD")]

### sites dataset extraction
# Draw `sample_size` rows from every site that has more than `sample_size`
# rows, stack the samples and write them to the Ccar sites CSV.
sample_size = 200
# One seeded generator for this extraction so a re-run reproduces the same draw.
rng = np.random.RandomState(42)
site_samples = []
# sort=False keeps the sites in order of first appearance in df.
for site, temp in df.groupby("Site ID", sort=False):
    if len(temp)>sample_size:
        print(site, len(temp))
        # NOTE(review): replace=True is kept from the original, so one row can
        # be drawn more than once even though the group is larger than
        # sample_size -- confirm that duplicate rows are intended.
        site_samples.append(temp.sample(n=sample_size, replace=True, random_state=rng))
if not site_samples:
    # Fail loudly instead of exporting a df_final left over from another cell.
    raise ValueError(f"no site has more than {sample_size} rows; nothing to export")
# Single concat instead of growing the frame inside the loop.
df_final = pd.concat(site_samples, axis = 0).reset_index(drop = True)
df_final['ID'] = np.arange(len(df_final))
df_final.to_csv('../0_datasets/Ccar_dataset_sites.csv', index = False)

### PFT dataset extraction
# Draw `sample_size` rows from every PFT value that has more than
# `sample_size` rows, stack the samples and write them to the Ccar PFT CSV.
sample_size = 700
# One seeded generator for this extraction so a re-run reproduces the same draw.
rng = np.random.RandomState(42)
pft_samples = []
# sort=False keeps the PFT values in order of first appearance in df.
for pft, temp in df.groupby("PFT", sort=False):
    if len(temp)>sample_size:
        print(pft,len(temp))
        # NOTE(review): replace=True is kept from the original, so one row can
        # be drawn more than once even though the group is larger than
        # sample_size -- confirm that duplicate rows are intended.
        pft_samples.append(temp.sample(n=sample_size, replace=True, random_state=rng))
if not pft_samples:
    # Fail loudly instead of exporting a df_final left over from the previous step.
    raise ValueError(f"no PFT has more than {sample_size} rows; nothing to export")
# Single concat instead of growing the frame inside the loop.
df_final = pd.concat(pft_samples, axis = 0).reset_index(drop = True)
df_final['ID'] = np.arange(len(df_final))
df_final.to_csv('../0_datasets/Ccar_dataset_PFT.csv', index = False)

### temporal dataset extraction
# Label every row of Dataset#3, Dataset#4 and Dataset#8 with a season derived
# from its day of year (DOY), then write the table to the Ccar temporal CSV.
# Per dataset: (last DOY labelled early growing season,
#               first DOY labelled post-peak season).
season_bounds = {
    "Dataset#3": (165, 240),
    "Dataset#4": (151, 243),
    "Dataset#8": (175, 243),
}
# .copy() makes seasonal_df an independent frame, so the column assignments
# below never act on a slice of df (the notebook-wide warnings filter would
# otherwise hide the resulting SettingWithCopyWarning).
seasonal_df = df[df["Dataset ID"].isin(list(season_bounds))].copy()
seasonal_df["Sample date"] = seasonal_df["Sample date"].astype(int).astype(str)
seasonal_df["DOY"] = [datetime.strptime(x, "%Y%m%d").timetuple().tm_yday for x in seasonal_df["Sample date"].tolist()]

for dataset_id, (early_end, post_start) in season_bounds.items():
    in_dataset = seasonal_df["Dataset ID"] == dataset_id
    doy = seasonal_df["DOY"]
    # Both boundary days are inclusive for the outer seasons; peak is strictly between.
    seasonal_df.loc[in_dataset & (doy <= early_end), 'season'] = 'early growing season'
    seasonal_df.loc[in_dataset & (doy >= post_start), 'season'] = 'post-peak season'
    seasonal_df.loc[in_dataset & (doy > early_end) & (doy < post_start), 'season'] = 'peak growing season'

seasonal_df.reset_index(drop = True,inplace = True)
seasonal_df['ID'] = np.arange(len(seasonal_df))
seasonal_df.to_csv('../0_datasets/Ccar_dataset_temporal.csv', index = False)

ASD FieldSpec 4                1765
ASD FieldSpec 3                 911
SVC HR-1024i                    560
PSR 3500+                       403
Lambda-19 Spectrophotometer     320
ASD FieldSpec                   274
Name: Instruments, dtype: int64
Site#2 274
Site#3 423
Site#4 211
Site#83 1131
Site#104 734
Deciduous broadleaf forests 881
Grasslands 798
Croplands 1131


### EWT

In [81]:
# Keep rows with a positive EWT value, print how many come from each
# instrument, then narrow to instruments whose name contains "SVC".
has_trait = data["EWT"] > 0
df = data.loc[has_trait]
instrument_counts = df["Instruments"].value_counts()
print(instrument_counts)
df = df.loc[df["Instruments"].str.contains("SVC")]

### sites dataset extraction
# Draw `sample_size` rows from every site that has more than `sample_size`
# rows, stack the samples and write them to the EWT sites CSV.
sample_size = 100
# One seeded generator for this extraction so a re-run reproduces the same draw.
rng = np.random.RandomState(42)
site_samples = []
# sort=False keeps the sites in order of first appearance in df.
for site, temp in df.groupby("Site ID", sort=False):
    if len(temp)>sample_size:
        print(site, len(temp))
        # NOTE(review): replace=True is kept from the original, so one row can
        # be drawn more than once even though the group is larger than
        # sample_size -- confirm that duplicate rows are intended.
        site_samples.append(temp.sample(n=sample_size, replace=True, random_state=rng))
if not site_samples:
    # Fail loudly instead of exporting a df_final left over from another cell.
    raise ValueError(f"no site has more than {sample_size} rows; nothing to export")
# Single concat instead of growing the frame inside the loop.
df_final = pd.concat(site_samples, axis = 0).reset_index(drop = True)
df_final['ID'] = np.arange(len(df_final))
df_final.to_csv('../0_datasets/EWT_dataset_sites.csv', index = False)

### PFT dataset extraction
# Draw `sample_size` rows from every PFT value that has more than
# `sample_size` rows, stack the samples and write them to the EWT PFT CSV.
sample_size = 180
# One seeded generator for this extraction so a re-run reproduces the same draw.
rng = np.random.RandomState(42)
pft_samples = []
# sort=False keeps the PFT values in order of first appearance in df.
for pft, temp in df.groupby("PFT", sort=False):
    if len(temp)>sample_size:
        print(pft,len(temp))
        # NOTE(review): replace=True is kept from the original, so one row can
        # be drawn more than once even though the group is larger than
        # sample_size -- confirm that duplicate rows are intended.
        pft_samples.append(temp.sample(n=sample_size, replace=True, random_state=rng))
if not pft_samples:
    # Fail loudly instead of exporting a df_final left over from the previous step.
    raise ValueError(f"no PFT has more than {sample_size} rows; nothing to export")
# Single concat instead of growing the frame inside the loop.
df_final = pd.concat(pft_samples, axis = 0).reset_index(drop = True)
df_final['ID'] = np.arange(len(df_final))
df_final.to_csv('../0_datasets/EWT_dataset_PFT.csv', index = False)

                               1066
SVC HR-1024i                    721
ASD FieldSpec 3                 630
Lambda-19 Spectrophotometer     330
PSR 3500+                       302
ASD FieldSpec                   276
SVC HR-1024tm                   256
Name: Instruments, dtype: int64
Site#6 104
Site#13 102
Site#70 256
Site#82 184
Deciduous broadleaf forests 467
Grasslands 263
Croplands 184


### LMA

In [83]:
# Keep rows with a positive LMA value, print how many come from each
# instrument, then narrow to instruments whose name contains "ASD".
has_trait = data["LMA"] > 0
df = data.loc[has_trait]
instrument_counts = df["Instruments"].value_counts()
print(instrument_counts)
df = df.loc[df["Instruments"].str.contains("ASD")]

### sites dataset extraction
# Draw `sample_size` rows from every site that has more than `sample_size`
# rows, stack the samples and write them to the LMA sites CSV.
sample_size = 400
# One seeded generator for this extraction so a re-run reproduces the same draw.
rng = np.random.RandomState(42)
site_samples = []
# sort=False keeps the sites in order of first appearance in df.
for site, temp in df.groupby("Site ID", sort=False):
    if len(temp)>sample_size:
        print(site, len(temp))
        # NOTE(review): replace=True is kept from the original, so one row can
        # be drawn more than once even though the group is larger than
        # sample_size -- confirm that duplicate rows are intended.
        site_samples.append(temp.sample(n=sample_size, replace=True, random_state=rng))
if not site_samples:
    # Fail loudly instead of exporting a df_final left over from another cell.
    raise ValueError(f"no site has more than {sample_size} rows; nothing to export")
# Single concat instead of growing the frame inside the loop.
df_final = pd.concat(site_samples, axis = 0).reset_index(drop = True)
df_final['ID'] = np.arange(len(df_final))
df_final.to_csv('../0_datasets/LMA_dataset_sites.csv', index = False)

### PFT dataset extraction
# Draw `sample_size` rows from every PFT value that has more than
# `sample_size` rows, stack the samples and write them to the LMA PFT CSV.
sample_size = 200
# One seeded generator for this extraction so a re-run reproduces the same draw.
rng = np.random.RandomState(42)
pft_samples = []
# sort=False keeps the PFT values in order of first appearance in df.
for pft, temp in df.groupby("PFT", sort=False):
    if len(temp)>sample_size:
        print(pft,len(temp))
        # NOTE(review): replace=True is kept from the original, so one row can
        # be drawn more than once even though the group is larger than
        # sample_size -- confirm that duplicate rows are intended.
        pft_samples.append(temp.sample(n=sample_size, replace=True, random_state=rng))
if not pft_samples:
    # Fail loudly instead of exporting a df_final left over from the previous step.
    raise ValueError(f"no PFT has more than {sample_size} rows; nothing to export")
# Single concat instead of growing the frame inside the loop.
df_final = pd.concat(pft_samples, axis = 0).reset_index(drop = True)
df_final['ID'] = np.arange(len(df_final))
df_final.to_csv('../0_datasets/LMA_dataset_PFT.csv', index = False)

### temporal dataset extraction
# Label every row of Dataset#3, Dataset#4 and Dataset#8 with a season derived
# from its day of year (DOY), then write the table to the LMA temporal CSV.
# Per dataset: (last DOY labelled early growing season,
#               first DOY labelled post-peak season).
season_bounds = {
    "Dataset#3": (165, 240),
    "Dataset#4": (151, 243),
    "Dataset#8": (175, 243),
}
# .copy() makes seasonal_df an independent frame, so the column assignments
# below never act on a slice of df (the notebook-wide warnings filter would
# otherwise hide the resulting SettingWithCopyWarning).
seasonal_df = df[df["Dataset ID"].isin(list(season_bounds))].copy()
seasonal_df["Sample date"] = seasonal_df["Sample date"].astype(int).astype(str)
seasonal_df["DOY"] = [datetime.strptime(x, "%Y%m%d").timetuple().tm_yday for x in seasonal_df["Sample date"].tolist()]

for dataset_id, (early_end, post_start) in season_bounds.items():
    in_dataset = seasonal_df["Dataset ID"] == dataset_id
    doy = seasonal_df["DOY"]
    # Both boundary days are inclusive for the outer seasons; peak is strictly between.
    seasonal_df.loc[in_dataset & (doy <= early_end), 'season'] = 'early growing season'
    seasonal_df.loc[in_dataset & (doy >= post_start), 'season'] = 'post-peak season'
    seasonal_df.loc[in_dataset & (doy > early_end) & (doy < post_start), 'season'] = 'peak growing season'

seasonal_df.reset_index(drop = True,inplace = True)
seasonal_df['ID'] = np.arange(len(seasonal_df))
seasonal_df.to_csv('../0_datasets/LMA_dataset_temporal.csv', index = False)

ASD FieldSpec 3                16500
PSR 3500+                      11030
SVC HR-1024i                    8237
PSR+                            3402
ASD FieldSpec 4                 1743
ASD FieldSpec Pro FR            1106
                                1068
SVC HR-2014i                     841
SVC HR-1024i, PSR Plus           342
Lambda-19 Spectrophotometer      330
ASD FieldSpec                    276
SVC HR-1024tm                    256
ASD FieldSpec Pro                238
SVC LC-RP-Pro                     48
Name: Instruments, dtype: int64
Site#3 403
Site#33 9795
Site#73 460
Site#75 458
Site#76 756
Site#77 462
Site#78 588
Site#83 1117
Site#84 735
Site#5 807
Site#93 494
Site#104 630
Deciduous broadleaf forests 11206
Shrublands 491
Vine 226
Evergreen broadleaf forests 559
Grasslands 5520
Evergreen needleleaf forests 336
Croplands 1117


In [100]:
# Read the exported CSVs back and print a short summary for each trait.
tr_name = ["Chla+b", "Ccar", "EWT", "LMA"]
for tr in tr_name:
    df1 = pd.read_csv(f"../0_datasets/{tr}_dataset_sites.csv")
    locations = []
    # Start from an empty slice so len(temp) below is 0 for a file without
    # sites, instead of reporting a `temp` left over from an earlier cell.
    temp = df1.iloc[0:0]
    for site in df1["Site ID"].unique():
        temp = df1[df1["Site ID"] == site]
        # Mean latitude/longitude of the site's rows, rounded to 2 decimals.
        coordinate = (round(temp["Latitude"].mean(),2),round(temp["Longitude"].mean(),2))
        locations.append(coordinate)
    # Printed fields: trait, total rows, rows of the last site visited, site coordinates.
    print(tr,len(df1),len(temp),locations)
    df2 = pd.read_csv(f"../0_datasets/{tr}_dataset_PFT.csv")
    print(tr, len(df2), df2["PFT"].value_counts())
    # The EWT extraction shown in this notebook writes no temporal CSV, so skip it.
    if tr!="EWT":
        df3 = pd.read_csv(f"../0_datasets/{tr}_dataset_temporal.csv")
        print(tr,len(df3),df3["season"].value_counts())
    print("--------------------------------------------------------------")

Chla+b 1600 200 [(47.49, -0.53), (41.36, -70.58), (42.54, -72.17), (45.86, -96.52), (43.08, -89.42), (45.4, -93.2), (52.22, 0.05), (49.01, 8.42)]
Chla+b 3000 Deciduous broadleaf forests    1000
Grasslands                     1000
Croplands                      1000
Name: PFT, dtype: int64
Chla+b 608 peak growing season     278
post-peak season        259
early growing season     71
Name: season, dtype: int64
--------------------------------------------------------------
Ccar 1000 200 [(47.49, -0.53), (41.36, -70.58), (42.54, -72.17), (43.08, -89.42), (49.01, 8.42)]
Ccar 2100 Deciduous broadleaf forests    700
Grasslands                     700
Croplands                      700
Name: PFT, dtype: int64
Ccar 634 post-peak season        285
peak growing season     278
early growing season     71
Name: season, dtype: int64
--------------------------------------------------------------
EWT 400 100 [(45.54, -73.34), (45.63, -73.47), (51.23, 3.04), (40.86, -72.87)]
EWT 540 Deciduous broadleaf