# In [None]:
import xarray as xr
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import glob
import os

def process_multiple_merra2_files_washington(directory_path):
    """Build hourly and daily TOTANGSTR tables for the Washington state box.

    Every file in ``directory_path`` whose name matches
    ``MERRA2_400.tavg1_2d_aer_Nx.*`` is opened with xarray. The ``TOTANGSTR``
    variable is cut down to the latitude/longitude box defined below and laid
    out with one row per grid cell and one column per timestamp.

    The latitude/longitude masks are computed once from the first file and
    reused for all the others, so every file is assumed to share that grid.
    Files that fail to open or process are reported and skipped.

    Args:
        directory_path: Directory containing the MERRA-2 files.

    Returns:
        A ``(combined_hourly, combined_daily)`` tuple of DataFrames. Both
        start with ``latitude`` and ``longitude`` columns followed by the
        value columns in ascending order: one per timestamp for the hourly
        table, one per calendar date for the daily table (mean of that
        date's timestamps within a single file). If no file matches the
        pattern, both frames are empty apart from the two column headers;
        if no file could be processed, both hold only the coordinate columns.
    """
    # Washington state boundaries
    WA_LAT_MIN, WA_LAT_MAX = 45.5, 49.0
    WA_LON_MIN, WA_LON_MAX = -124.8, -116.9

    # Get all MERRA2 files in the directory
    file_pattern = os.path.join(directory_path, "MERRA2_400.tavg1_2d_aer_Nx.*")
    files = sorted(glob.glob(file_pattern))

    print(f"Found {len(files)} files to process")

    # Nothing to read: return empty tables instead of failing on files[0]
    if not files:
        print(f"No files match {file_pattern}; returning empty results")
        empty = pd.DataFrame(columns=['latitude', 'longitude'])
        return empty, empty.copy()

    # Read the grid from the first file and derive the Washington masks
    with xr.open_dataset(files[0]) as ds:
        lats = ds['lat'].values
        lons = ds['lon'].values

    wa_lat_mask = (lats >= WA_LAT_MIN) & (lats <= WA_LAT_MAX)
    wa_lon_mask = (lons >= WA_LON_MIN) & (lons <= WA_LON_MAX)
    wa_lats = lats[wa_lat_mask]
    wa_lons = lons[wa_lon_mask]
    n_cells = len(wa_lats) * len(wa_lons)

    # One row per grid cell, latitude-major (all longitudes of the first
    # latitude, then the next latitude, ...). The reshape below relies on
    # this exact ordering.
    base_df = pd.DataFrame({
        'latitude': np.repeat(wa_lats, len(wa_lons)),
        'longitude': np.tile(wa_lons, len(wa_lats)),
    })

    hourly_data_frames = []
    daily_data_frames = []

    for idx, file in enumerate(files, 1):
        print(f"Processing file {idx}/{len(files)}: {os.path.basename(file)}", end='\r')

        try:
            # The context manager closes the file even when processing fails
            with xr.open_dataset(file) as ds:
                # Axis 1 is masked by latitude and axis 2 by longitude, i.e.
                # the variable is treated as (time, lat, lon)
                totangstr = ds['TOTANGSTR'].values[:, wa_lat_mask, :][:, :, wa_lon_mask]
                # Timestamps exactly as stored; no timezone conversion is applied
                times = pd.to_datetime(ds['time'].values)

            # (time, lat, lon) -> (cell, time), same cell order as base_df
            flat = totangstr.reshape(len(times), n_cells).T
            hourly_df = pd.DataFrame(flat, columns=times)

            # Average the columns that fall on the same calendar date
            daily_df = hourly_df.T.groupby(lambda ts: ts.date()).mean().T

            hourly_data_frames.append(hourly_df)
            daily_data_frames.append(daily_df)

        except Exception as e:
            # Best effort: report the bad file and carry on with the rest
            print(f"\nError processing file {file}: {str(e)}")
            continue

    print("\nCombining all data...")

    # Every file failed: pd.concat([]) would raise, so return coordinates only
    if not hourly_data_frames:
        print("No files could be processed; returning coordinates only")
        return base_df.copy(), base_df.copy()

    def _combine(frames):
        """Join per-file frames side by side behind the coordinate columns."""
        merged = pd.concat(frames, axis=1)
        # Overlapping files would yield repeated labels; keep the last one
        merged = merged.loc[:, ~merged.columns.duplicated(keep='last')]
        merged = merged.sort_index(axis=1)
        return pd.concat([base_df, merged], axis=1)

    combined_hourly = _combine(hourly_data_frames)
    combined_daily = _combine(daily_data_frames)

    return combined_hourly, combined_daily

# Example run: point this at the folder holding the downloaded files
directory_path = "Downloads/M2T1NXAER_5.12.4-20250117_084759"  # Replace with your directory path
hourly_df, daily_df = process_multiple_merra2_files_washington(directory_path)

# Write both tables to disk, leaving out the row index
print("Saving results to CSV files...")
hourly_df.to_csv("washington_merra2_hourly_data.csv", index=False)
daily_df.to_csv("washington_merra2_daily_data.csv", index=False)

# Summarise what was produced
print("\nHourly data shape:", hourly_df.shape)
print("Daily data shape:", daily_df.shape)
print("\nNumber of grid points:", hourly_df.shape[0])
print("\nSample of hourly data:", hourly_df.head(), sep="\n")
print("\nSample of daily data:", daily_df.head(), sep="\n")

# Show the spatial extent so the bounding box can be checked by eye
print("\nLatitude range:", hourly_df.latitude.min(), "to", hourly_df.latitude.max())
print("Longitude range:", hourly_df.longitude.min(), "to", hourly_df.longitude.max())

# In [None]:
import xarray as xr
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import glob
import os

def process_multiple_merra2_files_washington(directory_path):
    """Build hourly and daily BCCMASS tables for the Washington state box.

    Every file in ``directory_path`` whose name matches
    ``MERRA2_400.tavg1_2d_aer_Nx.*`` is opened with xarray. The ``BCCMASS``
    variable is cut down to the latitude/longitude box defined below and laid
    out with one row per grid cell and one column per timestamp.

    The coordinate columns come from the first file that is processed
    successfully; value rows from later files are aligned to them by row
    position, so all files are expected to share that grid. Files that fail
    to open or process are reported and skipped.

    Args:
        directory_path: Directory containing the MERRA-2 files.

    Returns:
        A ``(combined_hourly, combined_daily)`` tuple of DataFrames. Both
        start with ``latitude`` and ``longitude`` columns followed by the
        value columns in ascending order: one per timestamp for the hourly
        table, one per calendar date for the daily table (mean of that
        date's timestamps within a single file). If no file matches the
        pattern, or none can be processed, both frames are empty apart from
        the two column headers.
    """
    # Washington state boundaries
    WA_LAT_MIN, WA_LAT_MAX = 45.5, 49.0
    WA_LON_MIN, WA_LON_MAX = -124.8, -116.9

    # Get all MERRA2 files in the directory
    file_pattern = os.path.join(directory_path, "MERRA2_400.tavg1_2d_aer_Nx.*")
    files = sorted(glob.glob(file_pattern))

    print(f"Found {len(files)} files to process")

    # Nothing to read: return empty tables instead of failing later on [0]
    if not files:
        print(f"No files match {file_pattern}; returning empty results")
        empty = pd.DataFrame(columns=['latitude', 'longitude'])
        return empty, empty.copy()

    coords_df = None  # latitude/longitude rows of the first good file
    hourly_dfs = []
    daily_dfs = []

    for idx, file in enumerate(files, 1):
        print(f"Processing file {idx}/{len(files)}: {os.path.basename(file)}")

        try:
            # The context manager closes the file even when processing fails
            with xr.open_dataset(file) as ds:
                lats = ds['lat'].values
                lons = ds['lon'].values

                # Filter coordinates for Washington state
                wa_lat_mask = (lats >= WA_LAT_MIN) & (lats <= WA_LAT_MAX)
                wa_lon_mask = (lons >= WA_LON_MIN) & (lons <= WA_LON_MAX)

                # Extract BCCMASS values for Washington only. Axis 1 is
                # masked by latitude and axis 2 by longitude, i.e. the
                # variable is treated as (time, lat, lon).
                bccmass = ds['BCCMASS'].values[:, wa_lat_mask, :][:, :, wa_lon_mask]

                # Timestamps exactly as stored; no timezone conversion is applied
                time_utc = pd.to_datetime(ds['time'].values)

            wa_lats = lats[wa_lat_mask]
            wa_lons = lons[wa_lon_mask]

            # (time, lat, lon) -> (cell, time); cells are latitude-major,
            # matching the coordinate rows built below
            flat = bccmass.reshape(len(time_utc), len(wa_lats) * len(wa_lons)).T
            hourly_values = pd.DataFrame(flat, columns=time_utc)

            # Average the columns that fall on the same calendar date
            daily_values = hourly_values.T.groupby(lambda ts: ts.date()).mean().T

            if coords_df is None:
                coords_df = pd.DataFrame({
                    'latitude': np.repeat(wa_lats, len(wa_lons)),
                    'longitude': np.tile(wa_lons, len(wa_lats)),
                })

            hourly_dfs.append(hourly_values)
            daily_dfs.append(daily_values)

        except Exception as e:
            # Best effort: report the bad file and carry on with the rest
            print(f"Error processing file {file}: {str(e)}")
            continue

    # Combine all DataFrames
    print("Combining all data...")

    # Every file failed: there is no first frame to build on
    if coords_df is None:
        print("No files could be processed; returning empty results")
        empty = pd.DataFrame(columns=['latitude', 'longitude'])
        return empty, empty.copy()

    def _combine(frames):
        """Join per-file frames side by side behind the coordinate columns."""
        merged = pd.concat(frames, axis=1)
        # When files overlap, the values from the later file win
        merged = merged.loc[:, ~merged.columns.duplicated(keep='last')]
        # Sort the value columns and keep exactly the first file's rows
        merged = merged.sort_index(axis=1).reindex(coords_df.index)
        return pd.concat([coords_df, merged], axis=1)

    combined_hourly = _combine(hourly_dfs)
    combined_daily = _combine(daily_dfs)

    return combined_hourly, combined_daily

# Example run: point this at the folder holding the downloaded files
directory_path = "Downloads/M2T1NXAER_5.12.4-20250117_084759"  # Replace with your directory path
hourly_df, daily_df = process_multiple_merra2_files_washington(directory_path)

# Write both tables to disk, leaving out the row index
print("Saving results to CSV files...")
hourly_df.to_csv("washington_merra2_bcmassdensity_hourly_data.csv", index=False)
daily_df.to_csv("washington_merra2_bcmassdensity_daily_data.csv", index=False)

# Summarise what was produced
print("\nHourly data shape:", hourly_df.shape)
print("Daily data shape:", daily_df.shape)
print("\nNumber of grid points:", hourly_df.shape[0])
print("\nSample of hourly data:", hourly_df.head(), sep="\n")
print("\nSample of daily data:", daily_df.head(), sep="\n")

# Show the spatial extent so the bounding box can be checked by eye
print("\nLatitude range:", hourly_df.latitude.min(), "to", hourly_df.latitude.max())
print("Longitude range:", hourly_df.longitude.min(), "to", hourly_df.longitude.max())

