In [2]:
import re
import os
import shutil
import requests
import polars as pl
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import rasterio
from affine import Affine
import numpy as np

In [6]:

def fetch_file_links(year_url, file_type):
    try:
        response = requests.get(year_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a')
        file_links = [year_url + link['href'] for link in links if link['href'].endswith('.asc')]
        filtered_links = [link for link in file_links if file_type in link]
        return filtered_links
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch file links. Error: {e}")
        return []

def download_file(url, destination_folder):
    filename = url.split("/")[-1]
    destination_path = os.path.join(destination_folder, filename)
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(destination_path, 'wb') as f:
            f.write(response.content)
        print(f"File downloaded successfully: {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download the file: {filename}. Error: {e}")

def append_and_process_files(source_dir, transform, row_start, row_end, col_start, col_end):
    master_df = pl.DataFrame()
    for filename in os.listdir(source_dir):
        if filename.endswith('.asc'):
            file_path = os.path.join(source_dir, filename)
            with rasterio.open(file_path) as src:
                data = src.read(1)  # Read the first band
                filtered_data = data[row_start:row_end, col_start:col_end]
            df = create_polars_dataframe(file_path, filtered_data, row_start, row_end, col_start, col_end, transform)
            master_df = master_df.vstack(df) if not master_df.is_empty() else df
    return master_df


def create_polars_dataframe(file_name: str, data: np.ndarray, row_start: int, row_end: int, col_start: int, col_end: int, transform: Affine):
    """
    Creates a Polars DataFrame from filtered raster data with columns for time, latitude, longitude, and EDDI values.

    Parameters:
    - file_name (str): The name of the file, used to extract the time information.
    - data (numpy.ndarray): The filtered raster data array.
    - row_start (int): The starting index for rows of the filtered data.
    - row_end (int): The ending index for rows of the filtered data.
    - col_start (int): The starting index for columns of the filtered data.
    - col_end (int): The ending index for columns of the filtered data.
    - transform (Affine): The affine transformation used to convert indices to geographic coordinates.

    Returns:
    - pl.DataFrame: A Polars DataFrame with columns for time, latitude, longitude, and EDDI values.
    """
    # Extract the date from the file name using regular expressions
    date_match = re.search(r'\d{8}', file_name)
    if date_match:
        date = date_match.group(0)
        # Transform the date to 'yyyy-mm-dd' format
        formatted_date = f"{date[:4]}-{date[4:6]}-{date[6:]}"
    else:
        raise ValueError("Date not found in file name.")

    # Generate latitude and longitude arrays
    lon_arr = []
    lat_arr = []
    for row in range(row_start, row_end):
        for col in range(col_start, col_end):
            lon, lat = transform * (col, row)
            lon_arr.append(lon)
            lat_arr.append(lat)

    # Flatten the data array to match the latitude and longitude arrays
    eddi_values = data.flatten()

    # Create a Polars DataFrame
    df = pl.DataFrame({
        "time": [formatted_date] * len(eddi_values),
        "lat": lat_arr,
        "lon": lon_arr,
        "eddi": eddi_values
    })

    return df


from rasterio.transform import Affine
import numpy as np

def geographic_to_index(lon, lat, transform):
    """
    Convert geographic coordinates to raster indices.

    Parameters:
    - lon (float): The longitude to convert.
    - lat (float): The latitude to convert.
    - transform (Affine): The affine transform associated with the raster, defining how geographic coordinates are transformed into raster indices.

    Returns:
    - (int, int): The zero-based column index and row index corresponding to the given longitude and latitude.
    """
    col, row = ~transform * (lon, lat)
    return int(col), int(row)

def download_and_process_raster_data(base_url, start_year, end_year, file_type, destination_folder):
    # Define the affine transform for a raster where each pixel represents a 0.5x0.5 degree area
    transform = Affine(0.5, 0.0, -180, 0.0, -0.5, 90)

    # Define geographic bounds and calculate index ranges
    lat_range = (-12, 22)  # South to North
    lon_range = (23, 52)  # West to East
    col_start, row_end = geographic_to_index(lon_range[0], lat_range[0], transform)
    col_end, row_start = geographic_to_index(lon_range[1], lat_range[1], transform)

    for year in range(start_year, end_year + 1):
        year_url = f"{base_url}{year}/"
        year_destination_folder = os.path.join(destination_folder, str(year))
        
        if not os.path.exists(year_destination_folder):
            os.makedirs(year_destination_folder)
        
        file_links = fetch_file_links(year_url, file_type)
        with ThreadPoolExecutor(max_workers=8) as executor:
            executor.map(lambda url: download_file(url, year_destination_folder), file_links)
        
        # Process and append raster data to a Polars DataFrame
        master_df = append_and_process_files(year_destination_folder, transform, row_start, row_end, col_start, col_end)
        
        # Save the DataFrame as a Parquet file
        parquet_path = os.path.join(destination_folder, f"EDDI_{file_type}_{year}.parquet")
        master_df.write_parquet(parquet_path)
        print(f"Data for year {year} processed and saved as Parquet at {parquet_path}.")
        
        shutil.rmtree(year_destination_folder)  # Optional: remove the directory after processing

# Example usage
base_url = "https://downloads.psl.noaa.gov/Projects/EDDI/global_archive/NCEP/"
destination_folder = "/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline/data/01_raw/eddi"
start_year = 2011
end_year = 2024
file_type = "12mn"
download_and_process_raster_data(base_url, start_year, end_year, file_type, destination_folder)


File downloaded successfully: EDDI_12mn_20110108.asc
File downloaded successfully: EDDI_12mn_20110105.asc
File downloaded successfully: EDDI_12mn_20110102.asc
File downloaded successfully: EDDI_12mn_20110101.asc
File downloaded successfully: EDDI_12mn_20110106.asc
File downloaded successfully: EDDI_12mn_20110103.asc
File downloaded successfully: EDDI_12mn_20110107.asc
File downloaded successfully: EDDI_12mn_20110104.asc
File downloaded successfully: EDDI_12mn_20110109.asc
File downloaded successfully: EDDI_12mn_20110110.asc
File downloaded successfully: EDDI_12mn_20110113.asc
File downloaded successfully: EDDI_12mn_20110112.asc
File downloaded successfully: EDDI_12mn_20110111.asc
File downloaded successfully: EDDI_12mn_20110114.asc
File downloaded successfully: EDDI_12mn_20110115.asc
File downloaded successfully: EDDI_12mn_20110119.asc
File downloaded successfully: EDDI_12mn_20110120.asc
File downloaded successfully: EDDI_12mn_20110121.asc
File downloaded successfully: EDDI_12mn_201101

In [23]:
# Define the path to your Parquet file
parquet_file_path = "/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline/data/01_raw/eddi/EDDI_12mn_1981.parquet"

# Use Polars to read the Parquet file
df = pl.read_parquet(parquet_file_path)



In [24]:
df

time,lat,lon,eddi
str,f64,f64,f32
"""1981-01-08""",22.0,23.0,-0.17379
"""1981-01-08""",22.0,23.5,-0.17379
"""1981-01-08""",22.0,24.0,-0.17379
"""1981-01-08""",22.0,24.5,-0.17379
"""1981-01-08""",22.0,25.0,-0.17379
…,…,…,…
"""1981-12-31""",-11.5,49.5,-999.0
"""1981-12-31""",-11.5,50.0,-999.0
"""1981-12-31""",-11.5,50.5,-999.0
"""1981-12-31""",-11.5,51.0,-999.0


In [25]:
import polars as pl
import os

def merge_parquet_files(directory_path: str, output_file_path: str) -> None:
    """
    Merges multiple Parquet files located in a specified directory into a single Parquet file.

    Args:
    directory_path (str): The path to the directory containing the Parquet files.
    output_file_path (str): The path where the merged Parquet file will be saved.

    Returns:
    None
    """

    # List all Parquet files in the directory
    parquet_files = [file for file in os.listdir(directory_path) if file.endswith('.parquet')]

    # Read each Parquet file into a DataFrame and store in a list
    dataframes = [pl.read_parquet(os.path.join(directory_path, file)) for file in parquet_files]

    # Concatenate all DataFrames vertically
    merged_df = pl.concat(dataframes)

    # Write the merged DataFrame to a new Parquet file
    merged_df.write_parquet(output_file_path)

    print(f"Merged file saved to {output_file_path}")

# Specify the directory containing your Parquet files and the output file path
directory_path = '/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline/data/01_raw/eddi'
output_file_path = '/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline/data/01_raw/merged_eddi_data.parquet'

# Call the function to merge the files
merge_parquet_files(directory_path, output_file_path)


Merged file saved to /workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline/data/01_raw/merged_eddi_data.parquet


In [3]:
# Define the path to your Parquet file
parquet_file_path = "/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline/data/01_raw/merged_eddi_data.parquet"

# Use Polars to read the Parquet file
df = pl.read_parquet(parquet_file_path)

In [4]:
df

time,lat,lon,eddi
datetime[μs],f64,f64,f32
1981-01-01 00:00:00,22.0,23.0,-0.115403
1981-01-01 00:00:00,22.0,23.5,-0.057442
1981-01-01 00:00:00,22.0,24.0,-0.115403
1981-01-01 00:00:00,22.0,24.5,-0.17379
1981-01-01 00:00:00,22.0,25.0,-0.17379
…,…,…,…
2024-04-29 00:00:00,-11.5,49.5,-999.0
2024-04-29 00:00:00,-11.5,50.0,-999.0
2024-04-29 00:00:00,-11.5,50.5,-999.0
2024-04-29 00:00:00,-11.5,51.0,-999.0


In [6]:
df = df.sort(by="time")

In [7]:
df

time,lat,lon,eddi
datetime[μs],f64,f64,f32
1981-01-01 00:00:00,22.0,23.0,-0.115403
1981-01-01 00:00:00,22.0,23.5,-0.057442
1981-01-01 00:00:00,22.0,24.0,-0.115403
1981-01-01 00:00:00,22.0,24.5,-0.17379
1981-01-01 00:00:00,22.0,25.0,-0.17379
…,…,…,…
2024-04-29 00:00:00,-11.5,49.5,-999.0
2024-04-29 00:00:00,-11.5,50.0,-999.0
2024-04-29 00:00:00,-11.5,50.5,-999.0
2024-04-29 00:00:00,-11.5,51.0,-999.0


In [12]:
output_file_path = '/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline/data/01_raw/merged_eddi_data.parquet'

In [13]:
df.write_parquet(output_file_path)

In [19]:
import polars as pl
import plotly.express as px

def visualize_eddi_map(df, specific_date):
    """
    Visualizes EDDI values on a map for a specific date, excluding missing data represented by -999.0.

    Parameters:
    - df (pl.DataFrame): The DataFrame containing time, latitude, longitude, and EDDI values.
    - specific_date (str): The specific date to visualize in "YYYY-MM-DD" format.

    Returns:
    - None: Displays an interactive map.
    """
    # Check and convert 'time' column to datetime if necessary
    # if df.schema()['time'] != pl.datatypes.Datetime:
    #     df = df.with_column(pl.col('time').str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S"))

    # Convert the specific date string to datetime object
    # specific_datetime = pl.datetime.strptime(specific_date + " 00:00:00", "%Y-%m-%d %H:%M:%S")

    # Filter the DataFrame for the specific date and exclude missing data
    date_filter = "1981-01-01"

    # Filter the DataFrame
    filtered_df = df.filter(pl.col("time").dt.date() == pl.lit(specific_date))

    # Convert Polars DataFrame to Pandas DataFrame for plotting
    pd_df = filtered_df.to_pandas()

    # Create a scatter map
    fig = px.scatter_geo(pd_df,
                         lat="lat",
                         lon="lon",
                         color="eddi",
                         hover_name="eddi",
                         projection="natural earth",
                         title=f"EDDI Visualization on Map for {specific_date}",
                         color_continuous_scale=px.colors.sequential.Plasma)
    
    # Update the layout to better fit the map presentation
    fig.update_layout(geo=dict(showland=True, landcolor="lightgrey"),
                      margin={"r":0,"t":50,"l":0,"b":0})

    fig.show()

# Example usage, ensure that 'df' is already your loaded DataFrame with proper columns
specific_date = "1981-01-08 00:00:00"
visualize_eddi_map(df, specific_date)


ComputeError: cannot compare 'date/datetime/time' to a string value (create native python { 'date', 'datetime', 'time' } or compare to a temporal column)

In [66]:
def regrid_polars_df(df, start_lat=-12.0, start_lon=23.125, new_lat_res=0.5, new_lon_res=0.625):
    """
    Regrids a Polars DataFrame with latitude and longitude to a common specified grid, starting from specified latitude and longitude.

    Parameters:
    - df (pl.DataFrame): Polars DataFrame to be regridded.
    - start_lat (float): Starting latitude for the grid.
    - start_lon (float): Starting longitude for the grid.
    - new_lat_res (float): The new resolution for latitude.
    - new_lon_res (float): The new resolution for longitude.

    Returns:
    - pl.DataFrame: A DataFrame containing the regridded data.

    Example of usage:
    regridded_df = regrid_polars_df(polars_df)
    """
    # Calculate global min and max of latitude and longitude
    lat_min = df.select(pl.min('lat')).to_numpy().item()
    lat_max = df.select(pl.max('lat')).to_numpy().item()
    lon_min = df.select(pl.min('lon')).to_numpy().item()
    lon_max = df.select(pl.max('lon')).to_numpy().item()

    # Adjust the start points if they are outside the current data range
    start_lat = max(lat_min, start_lat)
    start_lon = max(lon_min, start_lon)

    # Create common latitude and longitude grids starting from the specified points
    common_lat = np.arange(start_lat, lat_max + new_lat_res, new_lat_res)
    common_lon = np.arange(start_lon, lon_max + new_lon_res, new_lon_res)

    # Function to find nearest grid point
    def find_nearest(array, value):
        idx = (np.abs(array - value)).argmin()
        return array[idx]

    # Map each lat and lon in the DataFrame to the nearest grid point
    df = df.with_columns([
        df['lat'].map_elements(lambda x: find_nearest(common_lat, x), return_dtype=float).alias('new_lat'),
        df['lon'].map_elements(lambda x: find_nearest(common_lon, x), return_dtype=float).alias('new_lon')
    ]).drop(['lat', 'lon']).rename({'new_lat': 'lat', 'new_lon': 'lon'})

    # Optional: Aggregate data if necessary
    df = df.group_by(['time', 'lat', 'lon']).agg(pl.col('eddi').mean().alias('eddi'))

    # Sort by time, latitude, and longitude for better organization
    df = df.sort(['time', 'lat', 'lon'])

    return df

# Usage example (assuming df is your DataFrame loaded with data)
regridded_df = regrid_polars_df(eddi_clean_df)


In [67]:
regridded_df

time,lat,lon,eddi
datetime[μs],f64,f64,f32
2001-01-01 00:00:00,-12.0,23.125,-0.761296
2001-01-01 00:00:00,-12.0,23.75,-0.88392
2001-01-01 00:00:00,-12.0,24.375,-0.926714
2001-01-01 00:00:00,-12.0,25.0,-1.019687
2001-01-01 00:00:00,-12.0,25.625,-1.019687
…,…,…,…
2024-04-13 00:00:00,21.5,38.75,0.567209
2024-04-13 00:00:00,21.5,39.375,0.437543
2024-04-13 00:00:00,21.5,40.0,0.199132
2024-04-13 00:00:00,21.5,43.125,-0.500068


In [68]:
regridded_df.write_parquet("/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline/data/01_raw/eddi.parquet")

# Helper functions for debugging 

In [None]:
import requests

def download_file(url, destination_filename):
    """
    Download a file from a specified URL and save it locally.

    Parameters:
    - url (str): URL of the file to download.
    - destination_filename (str): Name of the file to save the downloaded content.

    Returns:
    - None
    """
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Write the content of the response to a file
        with open(destination_filename, 'wb') as f:
            f.write(response.content)
        print("File downloaded successfully!")
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")

# URL of the file to be downloaded
file_url = "https://downloads.psl.noaa.gov/Projects/EDDI/global_archive/NCEP/2002/EDDI_06mn_20020612.asc"

# Name of the file to save locally
local_filename = "/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline/data/01_raw/EDDI_12mn_20020612.asc"

# Call the function to download the file
download_file(file_url, local_filename)


In [None]:
import rasterio
from rasterio.transform import from_origin

def load_ascii_grid(file_path: str):
    """
    Load an ASCII grid file using rasterio and return its data along with metadata.

    This function opens the specified ASCII grid file, reads its contents,
    and returns both the grid data as a numpy ndarray and the metadata profile
    associated with the raster dataset.

    Parameters:
    - file_path (str): The file path to the ASCII grid file.

    Returns:
    - tuple:
        - data (numpy.ndarray): The raster data read from the file.
        - profile (dict): The metadata profile of the raster dataset, which includes
          details about the data format, dimensions, and georeferencing.

    Raises:
    - FileNotFoundError: If the file does not exist at the specified path.
    - rasterio.errors.RasterioIOError: If there is an error opening the file with rasterio.
    """

    with rasterio.open(file_path, mode='r') as src:
        data = src.read(1)  # Read the first and only band
        profile = src.profile

    return data, profile

# Example usage
file_path = '/workspace/soil-ml-modeling-pipeline/ml-modeling-pipeline/data/01_raw/EDDI_12mn_20020612.asc'
try:
    data, profile = load_ascii_grid(file_path)
    print("Data loaded successfully.")
    print("Raster profile:", profile)
except Exception as e:
    print("An error occurred:", e)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_raster(data, nodata_value, title="Raster Data Visualization"):
    """
    Plots the raster data using matplotlib, masking the 'nodata' values.

    Parameters:
    - data (numpy.ndarray): The raster data to plot.
    - nodata_value (float): The value that indicates no data in the raster.
    - title (str): The title of the plot.
    """
    # Mask the no-data values
    data_masked = np.ma.masked_where(data == nodata_value, data)

    plt.figure(figsize=(12, 6))
    plt.imshow(data_masked, cmap='viridis', interpolation='nearest')
    plt.colorbar(label='Data values')
    plt.title(title)
    plt.xlabel('Longitude Index')
    plt.ylabel('Latitude Index')
    plt.show()

# Example usage
plot_raster(data, -999.0)


In [None]:
from rasterio.transform import Affine
import numpy as np

def geographic_to_index(lon, lat, transform):
    """
    Convert geographic coordinates to raster indices.

    Parameters:
    - lon (float): The longitude to convert.
    - lat (float): The latitude to convert.
    - transform (Affine): The affine transform associated with the raster, defining how geographic coordinates are transformed into raster indices.

    Returns:
    - (int, int): The zero-based column index and row index corresponding to the given longitude and latitude.
    """
    col, row = ~transform * (lon, lat)
    return int(col), int(row)

def filter_raster_data(data, row_start, row_end, col_start, col_end):
    """
    Filter the raster data array to include only the data within specified index ranges.

    Parameters:
    - data (numpy.ndarray): The full raster data array.
    - row_start (int): The starting index for rows.
    - row_end (int): The ending index for rows.
    - col_start (int): The starting index for columns.
    - col_end (int): The ending index for columns.

    Returns:
    - numpy.ndarray: The filtered raster data array.
    """
    return data[row_start:row_end, col_start:col_end]

# Define the affine transform for a raster where each pixel represents a 0.5x0.5 degree area
transform = Affine(0.5, 0.0, -180, 0.0, -0.5, 90)

# Define geographic bounds and calculate index ranges
lat_range = (-12, 22)  # South to North
lon_range = (23, 52)  # West to East
col_start, row_end = geographic_to_index(lon_range[0], lat_range[0], transform)
col_end, row_start = geographic_to_index(lon_range[1], lat_range[1], transform)

# Assuming 'data' is your loaded raster data
filtered_data = filter_raster_data(data, row_start, row_end, col_start, col_end)
print(filtered_data)
