# Filter Data for US-MMS Site
This notebook loads a multi-feature dataset stored in a Parquet file and filters it to retain only the records corresponding to the US-MMS AmeriFlux location.

In [1]:
import pandas as pd
from pathlib import Path

path= "/home/jovyan/research_code/Transformers/temportal_fusion_transformers/data/CSIFMETEO/BDT_50_20/sorted_BDT_50_20_merged_1982_2021.parquet"
# Path to your dataset
parquet_path = Path(path)

# Load the dataset
# This expects latitude, longitude, and time columns
# along with any other features.
df = pd.read_parquet(parquet_path)
print('Rows before filtering:', len(df))

Rows before filtering: 339590536


In [2]:
df

Unnamed: 0,time,location,sif_clear_inst,tmin,tmax,radiation,precipitation,latitude,longitude,soil,photoperiod,swvl1
0,1982-01-15,0,0.154806,278.1250,278.8125,693568.0,0.009155,50.0,-127.25,6.0,8.314672,-1.320243e-05
1,1982-01-16,0,0.152109,275.6250,279.5000,1691776.0,0.039856,50.0,-127.25,6.0,8.350347,-1.043081e-06
2,1982-01-17,0,0.149411,275.0625,277.1250,3658560.0,0.028259,50.0,-127.25,6.0,8.387132,-1.043081e-06
3,1982-01-18,0,0.146714,271.5625,277.0000,5744640.0,0.000000,50.0,-127.25,6.0,8.425000,-1.043081e-06
4,1982-01-19,0,0.144017,272.1875,276.8125,5476224.0,0.000000,50.0,-127.25,6.0,8.463925,1.111627e-05
...,...,...,...,...,...,...,...,...,...,...,...,...
339590531,2021-12-27,23265,0.713788,283.7500,289.1250,8559360.0,0.000000,20.0,103.75,9.0,10.796718,-1.233816e-05
339590532,2021-12-28,23265,0.711810,281.9375,293.6250,14631424.0,0.000000,20.0,103.75,9.0,10.799443,-5.960464e-08
339590533,2021-12-29,23265,0.709832,282.5625,294.4375,16476800.0,0.000000,20.0,103.75,9.0,10.802567,1.221895e-05
339590534,2021-12-30,23265,0.707855,284.8125,292.6250,9267328.0,0.000061,20.0,103.75,9.0,10.806089,1.221895e-05


In [3]:
# Coordinates for US-MMS (Morgan Monroe State Forest)
US_MMS_LAT = 39.3232
US_MMS_LON = -86.4137

tolerance = 0.25  # adjust depending on grid resolution

site_df = df[(df['latitude'].sub(US_MMS_LAT).abs() <= tolerance) &
             (df['longitude'].sub(US_MMS_LON).abs() <= tolerance)].copy()

print('Rows after filtering:', len(site_df))

Rows after filtering: 58384


In [4]:
site_df

Unnamed: 0,time,location,sif_clear_inst,tmin,tmax,radiation,precipitation,latitude,longitude,soil,photoperiod,swvl1
193294828,1982-01-15,13243,0.106985,247.8750,271.5625,7538240.0,0.000977,39.50,-86.50,4.0,9.504436,-1.320243e-05
193294829,1982-01-16,13243,0.105275,248.4375,273.1875,11439232.0,0.004761,39.50,-86.50,4.0,9.527534,-1.043081e-06
193294830,1982-01-17,13243,0.103565,240.5000,256.9375,6629952.0,0.000916,39.50,-86.50,4.0,9.551383,-1.043081e-06
193294831,1982-01-18,13243,0.101855,257.0000,270.3750,8585728.0,0.000000,39.50,-86.50,4.0,9.575972,-1.043081e-06
193294832,1982-01-19,13243,0.100145,263.9375,274.2500,2473216.0,0.000122,39.50,-86.50,4.0,9.601283,1.111627e-05
...,...,...,...,...,...,...,...,...,...,...,...,...
196141043,2021-12-27,13437,0.077710,280.7500,292.2500,3886656.0,0.003540,39.25,-86.25,4.0,9.251109,-1.233816e-05
196141044,2021-12-28,13437,0.077557,278.5000,286.8750,1294464.0,0.016479,39.25,-86.25,4.0,9.257565,-5.960464e-08
196141045,2021-12-29,13437,0.077403,278.6250,282.0000,3664384.0,0.000488,39.25,-86.25,4.0,9.264963,1.221895e-05
196141046,2021-12-30,13437,0.077249,277.6875,282.9375,3648448.0,0.002014,39.25,-86.25,4.0,9.273299,1.221895e-05


In [5]:
# Save the filtered data if needed
out_path = parquet_path.with_name(parquet_path.stem + '_US_MMS.parquet')
site_df.to_parquet(out_path, index=False)
print('Saved filtered site data to', out_path)

Saved filtered site data to /home/jovyan/research_code/Transformers/temportal_fusion_transformers/data/CSIFMETEO/BDT_50_20/sorted_BDT_50_20_merged_1982_2021_US_MMS.parquet
