In [100]:
import pandas as pd
import zipfile
import numpy as np
from pathlib import Path
import re
from typing import Any
import xarray as xr

Extract the site names from the file names of all zip archives:

In [72]:
# Folder that holds the downloaded FLUXNET zip archives.
fluxnet_zip_folder = Path("/home/bart/Data/EXCITED/fluxnet/zips")
fluxnet_zip_files = [*fluxnet_zip_folder.glob("*.zip")]

# Put one archive name per line so a single regex pass can collect every
# site ID (the capture group) at once.
fluxnet_filenames = "\n".join(archive.name for archive in fluxnet_zip_files)
regex_sitename = "AMF_([A-Z]{2}-.{3})_FLUXNET"
sitenames = re.compile(regex_sitename).findall(fluxnet_filenames)
print(sitenames[:5])

['US-Rws', 'US-ARM', 'US-CS4', 'US-Tw3', 'US-Hn3']


Read the latitude and longitude of a site from the metadata workbook (shown here for the first site):

In [76]:
def read_site_properties(
    metadata_file: Path,
    sitename: str,
    properties: list[str],
) -> dict[str, Any]:
    """Look up metadata values for one site in the metadata workbook.

    The workbook is loaded with ``pd.read_excel`` and must provide the
    columns ``SITE_ID``, ``VARIABLE`` and ``DATAVALUE``.

    Args:
        metadata_file: Path to the Excel workbook.
        sitename: Value to match in the ``SITE_ID`` column.
        properties: Values to look up in the ``VARIABLE`` column.

    Returns:
        Mapping of each requested property to the first non-missing
        ``DATAVALUE`` found for the site, as stored in the workbook (no type
        conversion is applied).

    Raises:
        IndexError: If a requested property has no value for the site (this
            includes the case where the site itself is absent).
    """
    df_meta = pd.read_excel(metadata_file)

    # Select with a boolean mask on just the columns that are needed. The
    # ``where(...).dropna()`` idiom would also discard rows that merely have
    # a blank cell in an unrelated column, and upcasts integer columns.
    df_site = df_meta.loc[
        df_meta["SITE_ID"] == sitename, ["VARIABLE", "DATAVALUE"]
    ]

    data = {}
    for prop in properties:
        values = df_site.loc[df_site["VARIABLE"] == prop, "DATAVALUE"].dropna()
        if values.empty:
            # Same exception type as the former bare ``[0]`` lookup, but with
            # a message that names what is missing.
            raise IndexError(
                f"No value for property {prop!r} of site {sitename!r} "
                f"in {metadata_file}"
            )
        data[prop] = values.to_numpy()[0]
    return data

# Try the lookup on the first site found above; the bare ``metadata`` at the
# end makes the notebook display the resulting dict.
metadata_file = Path("/home/bart/Data/EXCITED/fluxnet/AMF_AA-Flx_FLUXNET-BIF_CCBY4_20221210.xlsx")
metadata = read_site_properties(
    metadata_file,
    sitenames[0],
    ["LOCATION_LAT", "LOCATION_LONG"],
)
metadata

{'LOCATION_LAT': '43.1675', 'LOCATION_LONG': '-116.7132'}

In [None]:
def read_site_csv(
    sitename: str,
    fluxnet_zip_folder: Path,
    resolution: str = "HH",
) -> xr.Dataset:
    """Load the quality-filtered NEE time series of one site as a Dataset.

    The site's ``AMF_<sitename>_FLUXNET_FULLSET_*.zip`` archive is located in
    ``fluxnet_zip_folder`` and the CSV member with the requested resolution
    tag is read from it.

    Args:
        sitename: Site ID as it appears in the archive file name.
        fluxnet_zip_folder: Folder containing the FLUXNET zip archives.
        resolution: Resolution tag of the CSV member inside the archive
            (``"HH"`` selects ``..._FULLSET_HH_...csv``).

    Returns:
        Dataset with the variables ``NEE_VUT_REF`` and ``NEE_VUT_REF_QC`` on
        a ``time`` dimension, plus a leading ``site`` dimension of length 1.
        Only rows with ``NEE_VUT_REF_QC`` below 3 and no missing values are
        kept.

    Raises:
        FileNotFoundError: If no archive for ``sitename`` exists in the folder.
        KeyError: If the archive has no CSV member for ``resolution``.
    """
    site_zip_pattern = f"AMF_{sitename}_FLUXNET_FULLSET_*.zip"
    # Sort so the choice is deterministic if several archives match.
    site_zipfiles = sorted(fluxnet_zip_folder.glob(site_zip_pattern))
    if not site_zipfiles:
        raise FileNotFoundError(
            f"No archive matching {site_zip_pattern!r} in {fluxnet_zip_folder}"
        )
    site_zipfile = site_zipfiles[0]

    # The CSV member carries a resolution tag that the archive name lacks:
    # ``..._FULLSET_2016-2018_3-5.zip`` holds ``..._FULLSET_HH_2016-2018_3-5.csv``.
    site_csv_fname = (
        site_zipfile.stem.replace("_FULLSET_", f"_FULLSET_{resolution}_", 1)
        + ".csv"
    )

    required_labels = ["NEE_VUT_REF", "NEE_VUT_REF_QC"]
    # Context managers close both the archive and the member after reading;
    # only the needed columns are parsed.
    with zipfile.ZipFile(site_zipfile) as z, z.open(site_csv_fname) as f:
        df = pd.read_csv(f, usecols=["TIMESTAMP_START", *required_labels])

    # Timestamps are stored as YYYYMMDDHHMM; parse them so the resulting
    # ``time`` coordinate is datetime-typed.
    df["TIMESTAMP_START"] = pd.to_datetime(
        df["TIMESTAMP_START"], format="%Y%m%d%H%M"
    )
    df_req = df.set_index("TIMESTAMP_START")[required_labels]

    # Blank out rows whose QC flag is 3 or higher, then drop every row with a
    # missing value.
    df_req_qc = df_req.where(df_req["NEE_VUT_REF_QC"] < 3, np.nan).dropna()

    # The conversion result must be kept: it is the dataset that is returned.
    ds_site = df_req_qc.to_xarray()
    ds_site = ds_site.rename({"TIMESTAMP_START": "time"})
    ds_site = ds_site.expand_dims("site")

    return ds_site


In [None]:
# The metadata workbook is the same for every site, so its path is set once.
metadata_file = Path("/home/bart/Data/EXCITED/fluxnet/AMF_AA-Flx_FLUXNET-BIF_CCBY4_20221210.xlsx")

for site in sitenames[:1]:
    # Flux data for this site, labelled with its name along the "site" dim.
    ds_site = read_site_csv(sitename=site, fluxnet_zip_folder=fluxnet_zip_folder)
    ds_site["sitename"] = ("site", [site])

    metadata = read_site_properties(
        metadata_file,
        site,
        ["LOCATION_LAT", "LOCATION_LONG"],
    )

    # The metadata values are converted with float() before being stored.
    ds_site["latitude"] = ("site", [float(metadata["LOCATION_LAT"])])
    ds_site["longitude"] = ("site", [float(metadata["LOCATION_LONG"])])


In [91]:
# Exploratory read of a single site's CSV straight from its zip archive.
archive = "/home/bart/Data/EXCITED/fluxnet/zips/AMF_AR-TF1_FLUXNET_FULLSET_2016-2018_3-5.zip"
file = "AMF_AR-TF1_FLUXNET_FULLSET_HH_2016-2018_3-5.csv"

# Open the archive in a context manager as well, so its file handle is closed
# once the CSV has been read instead of staying open for the kernel's lifetime.
with zipfile.ZipFile(archive) as z, z.open(file) as f:
    df = pd.read_csv(f)

# Parse TIMESTAMP_START with the YYYYMMDDHHMM format and use it as the index.
df["TIMESTAMP_START"] = pd.to_datetime(df["TIMESTAMP_START"], format="%Y%m%d%H%M")
df = df.set_index("TIMESTAMP_START")
df.keys()

Index(['TIMESTAMP_END', 'TA_F_MDS', 'TA_F_MDS_QC', 'TA_ERA', 'TA_F', 'TA_F_QC',
       'SW_IN_POT', 'SW_IN_F_MDS', 'SW_IN_F_MDS_QC', 'SW_IN_ERA',
       ...
       'GPP_DT_CUT_SE', 'GPP_DT_CUT_05', 'GPP_DT_CUT_16', 'GPP_DT_CUT_25',
       'GPP_DT_CUT_50', 'GPP_DT_CUT_75', 'GPP_DT_CUT_84', 'GPP_DT_CUT_95',
       'RECO_SR', 'RECO_SR_N'],
      dtype='object', length=216)

In [98]:
# Columns to keep: the NEE_VUT_REF values and their QC flag.
required_labels = ["NEE_VUT_REF", "NEE_VUT_REF_QC"]
df_req = df.loc[:, required_labels]

# Rows whose QC flag is not below 3 become NaN and are then dropped, along
# with any row that already had a missing value.
df_req_qc = df_req.where(df_req["NEE_VUT_REF_QC"].lt(3), np.nan).dropna()

# Convert to xarray and rename the index dimension to "time".
ds_site = df_req_qc.to_xarray().rename({"TIMESTAMP_START": "time"})

In [99]:
# Add a length-1 "site" dimension and attach a placeholder name along it;
# the bare ``ds_site`` at the end displays the dataset.
ds_site = ds_site.expand_dims(dim="site")
ds_site["name"] = ("site", ["test"])
ds_site
