https://aws.amazon.com/marketplace/pp/prodview-unsyxr6yzlp62?sr=0-4&ref_=beagle&applicationId=AWSMPContessa#overview

NOAA Terrestrial Climate Data Records was accessed on DATE (placeholder: fill in the actual access date) from https://registry.opendata.aws/noaa-cdr-terrestrial.

In [None]:
import os
import boto3
from botocore import UNSIGNED
from botocore.config import Config
from datetime import datetime

def list_s3_files(bucket_name, year):
    """
    Collect the keys of every object stored under ``data/<year>/`` in a bucket.

    Requests are sent unsigned, so no AWS credentials are required.

    Parameters:
    - bucket_name (str): The name of the S3 bucket.
    - year (int): Year whose ``data/{year}/`` prefix is listed.

    Returns:
    - list: Object keys found under that prefix, in listing order.
    """
    anonymous = Config(signature_version=UNSIGNED)
    client = boto3.client('s3', config=anonymous)
    result_pages = client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name,
        Prefix=f'data/{year}/',
    )
    # A page may have no 'Contents' entry, hence the empty-list default.
    return [
        entry['Key']
        for result_page in result_pages
        for entry in result_page.get('Contents', [])
    ]

def download_public_s3_file(bucket_name, object_key, local_file_path):
    """
    Download a file from a public S3 bucket without AWS credentials, ensuring the directory exists.

    Parameters:
    - bucket_name (str): The name of the S3 bucket.
    - object_key (str): Key of the object to download.
    - local_file_path (str): Destination path of the downloaded file. Its
      parent directory is created when missing; a bare filename (no
      directory part) is saved relative to the current working directory.

    Returns:
    - None. A confirmation line is printed once the download finishes.
    """
    # os.path.dirname() yields '' for a bare filename such as 'file.nc', and
    # os.makedirs('') raises FileNotFoundError, so only create a directory
    # when the path actually names one.
    parent_dir = os.path.dirname(local_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Unsigned requests: the bucket is read anonymously.
    s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    s3.download_file(bucket_name, object_key, local_file_path)
    print(f"File downloaded to {local_file_path}")

def download_files_for_year(bucket_name, year, local_dir_base):
    """
    Download all files for a given year from a public S3 bucket to a local directory.

    Each object listed for the year is saved directly inside local_dir_base
    under its base name (the part of the key after the last '/').

    Parameters:
    - bucket_name (str): The name of the S3 bucket.
    - year (int): Year whose files are downloaded.
    - local_dir_base (str): Local directory that receives the files.
    """
    for file_key in list_s3_files(bucket_name, year):
        file_name = file_key.rsplit('/', 1)[-1]
        if not file_name:
            # A key ending in '/' (presumably a "folder" placeholder object)
            # has no file name; downloading it would target the directory
            # path itself and fail, so skip it.
            continue
        local_file_path = os.path.join(local_dir_base, file_name)
        download_public_s3_file(bucket_name, file_key, local_file_path)

# Example usage: download every file listed for one year from the NDVI bucket
# into a folder named after that year.
# NOTE(review): local_dir_base is an absolute, machine-specific path — adjust
# it before running this cell in another environment.
bucket_name = 'noaa-cdr-ndvi-pds'
year = 2023
local_dir_base = f'/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline/data/01_raw/NDVI/{year}'
download_files_for_year(bucket_name, year, local_dir_base)

In [112]:
# import os
# import boto3
# from botocore import UNSIGNED
# from botocore.config import Config
# from concurrent.futures import ThreadPoolExecutor
# from datetime import datetime

# def list_s3_files(bucket_name, year):
#     """
#     List all files in an S3 bucket for a specific year.

#     Parameters:
#     - bucket_name (str): The name of the S3 bucket.
#     - year (int): Year to filter the files by.

#     Returns:
#     - list: A list of object keys that match the year.
#     """
#     s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
#     paginator = s3.get_paginator('list_objects_v2')
#     pages = paginator.paginate(Bucket=bucket_name, Prefix=f'data/{year}/')

#     file_keys = []
#     for page in pages:
#         for obj in page.get('Contents', []):
#             file_keys.append(obj['Key'])
#     return file_keys

# def download_public_s3_file(bucket_name, object_key, local_file_path):
#     """
#     Download a file from a public S3 bucket without AWS credentials, ensuring the directory exists.
#     """
#     os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
#     s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
#     s3.download_file(bucket_name, object_key, local_file_path)
#     print(f"File downloaded to {local_file_path}")

# def download_files_for_year(bucket_name, year, local_dir_base):
#     """
#     Download all files for a given year from a public S3 bucket to a local directory.
#     """
#     file_keys = list_s3_files(bucket_name, year)
#     with ThreadPoolExecutor() as executor:
#         futures = []
#         for file_key in file_keys:
#             file_name = file_key.split('/')[-1]
#             local_file_path = os.path.join(local_dir_base, str(year), file_name)
#             futures.append(executor.submit(download_public_s3_file, bucket_name, file_key, local_file_path))
        
#         # Wait for all downloads to complete
#         for future in futures:
#             future.result()

# # Example usage
# bucket_name = 'noaa-cdr-ndvi-pds'
# local_dir_base = '/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline/data/01_raw/NDVI/'

# for year in range(2001, 2002):
#     year_dir = os.path.join(local_dir_base, str(year))
#     download_files_for_year(bucket_name, year, year_dir)


In [98]:
import xarray as xr

# Load the NDVI file for 2023-01-01. decode_times=False leaves the 'time'
# values as the raw numbers stored in the file instead of decoding them.
ds = xr.open_dataset(
    '/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline/data/01_raw/NDVI/2023/VIIRS-Land_v001-preliminary_NPP13C1_S-NPP_20230101_c20230109131619.nc',
    decode_times=False,
)

ds


In [99]:
import pandas as pd

# Manually convert the numeric 'time' values of ds into timestamps.
# NOTE(review): assumes the values are 'days since 1981-01-01 00:00:00' —
# confirm against the time variable's units attribute before reusing this
# on other files.
reference_time = pd.Timestamp('1981-01-01')
time_delta = pd.to_timedelta(ds['time'].values, unit='D')  # numbers read as day offsets
decoded_time = reference_time + time_delta

# Attach the decoded time back to the dataset
ds['time'] = ('time', decoded_time)


In [100]:
ds

In [101]:
# Remove the 'crs', 'TIMEOFDAY' and 'QA' variables; everything else in the
# dataset is kept.
ds = ds.drop_vars(["crs", "TIMEOFDAY", "QA"])

In [102]:
# Flatten the xarray.Dataset into a pandas table, then turn the index that
# to_dataframe() builds into ordinary columns.
pandas_df = ds.to_dataframe()
pandas_df = pandas_df.reset_index()

In [103]:
import polars as pl 

In [104]:
# Convert the pandas table into a polars DataFrame.
polars_df = pl.from_pandas(pandas_df)

In [105]:
# Bounding box of interest as inclusive (min, max) pairs
lat_range = (-12, 22)  # South to North
lon_range = (23, 52)   # West to East

# Keep only the rows whose coordinates lie inside the box, edges included
filtered_df = polars_df.filter(
    pl.col('latitude').is_between(lat_range[0], lat_range[1], closed='both')
    & pl.col('longitude').is_between(lon_range[0], lon_range[1], closed='both')
)

In [107]:
filtered_df

latitude,longitude,time,nv,lat_bnds,lon_bnds,NDVI
f32,f32,datetime[ns],i64,f32,f32,f64
21.974998,23.024994,2023-01-01 00:00:00,0,21.999998,22.999994,0.1215
21.974998,23.024994,2023-01-01 00:00:00,1,21.949999,23.049994,0.1215
21.974998,23.074997,2023-01-01 00:00:00,0,21.999998,23.049997,0.1233
21.974998,23.074997,2023-01-01 00:00:00,1,21.949999,23.099997,0.1233
21.974998,23.125,2023-01-01 00:00:00,0,21.999998,23.1,0.1256
…,…,…,…,…,…,…
-11.975006,51.875,2023-01-01 00:00:00,1,-12.000006,51.900002,
-11.975006,51.925003,2023-01-01 00:00:00,0,-11.950006,51.900002,
-11.975006,51.925003,2023-01-01 00:00:00,1,-12.000006,51.950005,
-11.975006,51.974991,2023-01-01 00:00:00,0,-11.950006,51.949989,


In [108]:
# Drop every row that contains a null in any column (in the table shown for
# filtered_df, these are the rows with no NDVI value).
clean_df = filtered_df.drop_nulls()

In [109]:
clean_df

latitude,longitude,time,nv,lat_bnds,lon_bnds,NDVI
f32,f32,datetime[ns],i64,f32,f32,f64
21.974998,23.024994,2023-01-01 00:00:00,0,21.999998,22.999994,0.1215
21.974998,23.024994,2023-01-01 00:00:00,1,21.949999,23.049994,0.1215
21.974998,23.074997,2023-01-01 00:00:00,0,21.999998,23.049997,0.1233
21.974998,23.074997,2023-01-01 00:00:00,1,21.949999,23.099997,0.1233
21.974998,23.125,2023-01-01 00:00:00,0,21.999998,23.1,0.1256
…,…,…,…,…,…,…
-11.975006,40.474991,2023-01-01 00:00:00,1,-12.000006,40.499992,0.0038
-11.975006,49.224991,2023-01-01 00:00:00,0,-11.950006,49.199989,0.4724
-11.975006,49.224991,2023-01-01 00:00:00,1,-12.000006,49.249992,0.4724
-11.975006,49.274994,2023-01-01 00:00:00,0,-11.950006,49.249992,0.5601
