In [1]:
import pandas as pd
import icartt
import os
import warnings
import re
from datetime import datetime
import csv
from datetime import datetime, timedelta
from netCDF4 import Dataset
import numpy as np
from scipy import stats
import glob
from math import pi
import ast

In [2]:
#SECTION 2: Create index bins

def create_index_bin(old_bin_diams, new_bin_diams):
    """
    Map positions in ``old_bin_diams`` onto groups delimited by ``new_bin_diams``.

    For every new diameter, the highest position whose old diameter is strictly
    smaller is located (-1 when there is none). Each group then holds the
    positions that follow the previous diameter's cut-off, up to and including
    the current cut-off. A final group collects every position left over.

    Parameters:
    old_bin_diams (list): Original bin diameters.
    new_bin_diams (list): Desired new bin diameters.

    Returns:
    list: ``len(new_bin_diams) + 1`` lists of positions into ``old_bin_diams``;
          the last list is the leftover (overflow) group.
    """
    # Highest old position lying strictly below each new diameter (-1 if none)
    cutoffs = [
        max((pos for pos, diam in enumerate(old_bin_diams) if diam < limit), default=-1)
        for limit in new_bin_diams
    ]

    groups = []
    lower = 0  # first position not yet handed to a group
    for cutoff in cutoffs:
        groups.append(list(range(lower, cutoff + 1)))
        lower = cutoff + 1

    # Whatever lies past the last cut-off goes into the trailing group
    groups.append(list(range(lower, len(old_bin_diams))))

    return groups

def bin_name_list(num_bins, bin_type="Aerosol"):
    """
    Build sequential bin column names, numbered from 1.

    Args:
        num_bins (int): How many names to produce.
        bin_type (str): "Aerosol" yields "bin1", "bin2", ...; any other value
            yields "cbin1", "cbin2", ...

    Returns:
        list: The generated names as strings, in ascending order.
    """
    prefix = "bin" if bin_type == "Aerosol" else "cbin"
    return [f"{prefix}{number}" for number in range(1, num_bins + 1)]
    

#SECTION 3: Consolidate the bins to reduced # of bins
def consolidate_bins(df, index_bins, bin_names, new_bin_count, bin_type="Aerosol"):
    """
    Consolidate original bins into new bins, replacing the original bin columns
    starting at the position of the first original bin column.

    Each new bin is the row-wise sum of the old bin columns assigned to it.
    A row whose source columns are all missing stays missing in the new bin.
    After insertion, missing values are replaced by 0 in every row where at
    least one bin column of this type holds data; rows with no bin data at all
    stay missing.

    Parameters:
    df (DataFrame): DataFrame containing the original bin data (not modified)
    index_bins (list): Lists of indices into bin_names, one list per new bin
    bin_names (list): Original bin column names
    new_bin_count (int): Number of new bins to create
    bin_type (str): "Aerosol" names the new columns bin1..binN; any other
        value names them cbin1..cbinN (see bin_name_list)

    Returns:
    DataFrame: A copy of df with the original bin columns replaced by the new
    consolidated bin columns

    Raises:
    ValueError: If index_bins holds more groups than new_bin_count, because
        the surplus groups would have no output column.
    """
    if len(index_bins) > new_bin_count:
        raise ValueError(
            f"index_bins has {len(index_bins)} groups but only "
            f"{new_bin_count} new bins were requested"
        )

    # Create a copy of the DataFrame to avoid modifying the original
    result_df = df.copy()

    # Find position of first bin column to maintain ordering
    all_columns = list(df.columns)
    if bin_names and bin_names[0] in all_columns:
        first_bin_position = all_columns.index(bin_names[0])
    else:
        first_bin_position = 0  # Default to beginning if not found

    # Calculate the new bin values before modifying the DataFrame
    new_bin_names = bin_name_list(new_bin_count, bin_type)
    new_bin_values = {}

    for position, new_bin_name in enumerate(new_bin_names):
        # New bins beyond the supplied groups have no source columns
        indices = index_bins[position] if position < len(index_bins) else []

        # Ignore indices that do not point at an existing old bin name
        old_bin_columns = [bin_names[i] for i in indices if 0 <= i < len(bin_names)]

        if old_bin_columns:
            source = df[old_bin_columns]
            # sum() skips missing values, so an all-missing row would become 0;
            # mask those rows back to missing
            new_bin_values[new_bin_name] = source.sum(axis=1).mask(source.isna().all(axis=1))
        else:
            # No old bins feed this new bin. Use float NaN rather than pd.NA so
            # the column stays numeric like the summed bins instead of object.
            new_bin_values[new_bin_name] = pd.Series(
                float("nan"), index=df.index, dtype="float64"
            )

    # Drop the original bin columns
    result_df = result_df.drop(columns=bin_names)

    # Insert new bin columns at the original positions
    for offset, new_bin_name in enumerate(new_bin_names):
        insert_position = first_bin_position + offset
        # Make sure we don't try to insert beyond the DataFrame's length
        if insert_position <= len(result_df.columns):
            result_df.insert(insert_position, new_bin_name, new_bin_values[new_bin_name])
        else:
            # If we run out of space, append to the end
            result_df[new_bin_name] = new_bin_values[new_bin_name]

    # Fill NaN values with 0 when other bins in the same row have data.
    # The prefix rule mirrors bin_name_list: "Aerosol" -> bin, otherwise cbin.
    prefix = "bin" if bin_type == "Aerosol" else "cbin"
    all_new_bin_cols = [
        col for col in result_df.columns
        if isinstance(col, str) and col.startswith(prefix) and col[len(prefix):].isdigit()
    ]

    if all_new_bin_cols:
        # Rows in which at least one bin column holds data
        has_any_data_mask = result_df[all_new_bin_cols].notna().any(axis=1)

        for col in all_new_bin_cols:
            # Only fill NaN with 0 where the row has some bin data
            result_df.loc[has_any_data_mask, col] = result_df.loc[has_any_data_mask, col].fillna(0)

    return result_df

In [3]:
# Diameter cut points handed to create_index_bin as the new aerosol bins.
# NOTE(review): units are not stated in this notebook — confirm against the data.
NEW_AEROSOL_BINS = [
    150, 169.8, 192.1, 217.5, 246.1, 278.6, 315.3,
    356.8, 403.9, 457.1, 517.3, 585.5, 662.7, 750,
]
# Diameter cut points handed to create_index_bin as the new cloud bins.
NEW_CLOUD_BINS = [3, 8.3, 13.5, 18.8, 24, 29.3, 34.5, 39.8, 45]

# Directory that receives the "<campaign>_binned.csv" files
OUTPUT_PATH = r"C:\Users\haika\Desktop\May_Research\may_datasets\binned_data"
# Directory holding the per-campaign "<campaign>.csv" input data
INPUT_PATH = r"C:\Users\haika\Desktop\May_Research\may_datasets\unit_corrected_data"
# Directory holding the per-campaign bin info CSV files
BIN_INFO_PATH = r"C:\Users\haika\Desktop\May_Research\may_datasets\bin_info_list"





In [4]:
def read_bin_info_files():
    """
    Load every ``*.csv`` file found directly under BIN_INFO_PATH.

    The campaign name is the file name without its extension. A file that
    fails to load is reported with a printed message and left out of the
    result; the remaining files are still processed.

    Returns:
        dict: campaign name -> DataFrame read from that campaign's CSV file.
    """
    loaded = {}
    pattern = os.path.join(BIN_INFO_PATH, "*.csv")

    for csv_path in glob.glob(pattern):
        # "dir/ACE-ENA.csv" -> "ACE-ENA"
        stem, _extension = os.path.splitext(os.path.basename(csv_path))

        try:
            loaded[stem] = pd.read_csv(csv_path)
            print(f"Loaded bin info for campaign: {stem}")
        except Exception as e:
            # Best effort: report the failure and move on to the next file
            print(f"Error reading {csv_path}: {e}")

    return loaded

def parse_bin_diameters(bin_diams_str):
    """
    Parse a bin diameters cell into a list of floats.

    Accepted inputs:
      * a Python-literal list or tuple, e.g. "[1, 2.5, 3]"
      * comma-separated numbers, e.g. "1, 2.5, 3"
      * a single number, as text ("5") or as a numeric cell value (5.0)
      * an actual list or tuple of numbers

    Missing values (None, NaN, pd.NA) and blank strings give an empty list.
    Anything that cannot be converted prints a warning and gives an empty list.

    Parameters:
    bin_diams_str: Raw cell value, normally a string.

    Returns:
    list: The diameters as floats, or [] when empty or unparseable.
    """
    if isinstance(bin_diams_str, (list, tuple)):
        parsed = bin_diams_str
    else:
        if bin_diams_str is None or pd.isna(bin_diams_str):
            return []

        # str() so that a numeric cell value does not break .strip()
        text = str(bin_diams_str).strip()
        if text == '':
            return []

        try:
            # Try to evaluate as a Python literal (list, tuple or number)
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a literal: fall back to plain comma-separated values
            parsed = text.split(',')

    # A lone number is a one-element diameter list
    if isinstance(parsed, (int, float)) and not isinstance(parsed, bool):
        parsed = [parsed]

    if isinstance(parsed, (list, tuple)):
        try:
            return [
                float(item.strip()) if isinstance(item, str) else float(item)
                for item in parsed
            ]
        except (TypeError, ValueError):
            pass  # fall through to the warning below

    print(f"Warning: Could not parse bin diameters: {bin_diams_str}")
    return []

def get_existing_bin_columns(df, bin_type):
    """
    List the bin columns of ``df`` for one bin type, in numeric order.

    For "Aerosol" the columns are those named "bin<digits>" (bin1, bin2, ...);
    for any other bin type they are those named "cbin<digits>".

    Parameters:
    df (DataFrame): Frame whose column names are inspected.
    bin_type (str): "Aerosol" or another value (treated as cloud).

    Returns:
    list: Matching column names sorted by their numeric suffix.
    """
    prefix = 'bin' if bin_type == "Aerosol" else 'cbin'
    skip = len(prefix)

    matches = [
        name for name in df.columns
        if name.startswith(prefix) and name[skip:].isdigit()
    ]

    # Numeric sort so that e.g. bin10 follows bin9 rather than bin1
    return sorted(matches, key=lambda name: int(name[skip:]))

def ensure_minimum_bins(df, bin_type):
    """
    Ensure minimum number of bin columns exist, creating empty ones if needed.

    For "Aerosol" the columns bin1..bin15 are guaranteed; for any other bin
    type the columns cbin1..cbin10 are guaranteed. Columns that already exist
    are left untouched; missing ones are appended and filled with pd.NA.

    Note: ``df`` is modified in place and the same object is returned.

    Parameters:
    df (DataFrame): Frame to extend.
    bin_type (str): "Aerosol" or another value (treated as cloud).

    Returns:
    DataFrame: The same ``df`` object, now containing the required columns.
    """
    if bin_type == "Aerosol":
        min_bins, prefix = 15, "bin"
    else:  # Cloud
        min_bins, prefix = 10, "cbin"

    # Add missing bins with NA values
    for i in range(1, min_bins + 1):
        col_name = f"{prefix}{i}"
        if col_name not in df.columns:
            df[col_name] = pd.NA

    return df

def process_campaign_binning():
    """
    Re-bin every campaign that has both a bin info file and a data file.

    For each campaign returned by read_bin_info_files, the matching
    "<campaign>.csv" is read from INPUT_PATH. Each row of the bin info table
    (columns 'Type' and 'bin_diams') is then handled in turn:

      * no parseable diameters, or no existing bin columns of that type:
        only the minimum set of bin columns is ensured;
      * otherwise the existing bin columns are consolidated onto
        NEW_AEROSOL_BINS (type "Aerosol") or NEW_CLOUD_BINS (any other type),
        followed by the same minimum-column check.

    The result is written to OUTPUT_PATH as "<campaign>_binned.csv". Progress
    is printed throughout; an error in one campaign is printed and the loop
    moves on to the next campaign.
    """
    banner = "="*50
    print(banner)
    print("PROCESSING CAMPAIGN BINNING:")
    print(banner)

    # The output directory may not exist yet
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    campaigns = read_bin_info_files()

    for campaign_name, bin_info_df in campaigns.items():
        print(f"\nProcessing campaign: {campaign_name}")

        # The data file shares its base name with the bin info file
        source_csv = os.path.join(INPUT_PATH, f"{campaign_name}.csv")
        if not os.path.exists(source_csv):
            print(f"  No data file found: {campaign_name}.csv")
            continue

        try:
            df = pd.read_csv(source_csv)
            print(f"  Loaded data with {len(df)} rows and {len(df.columns)} columns")

            # One bin info row per bin type (Aerosol and Cloud)
            for _, info_row in bin_info_df.iterrows():
                bin_type = info_row['Type']
                raw_diams = info_row['bin_diams']

                print(f"  Processing {bin_type} bins...")

                old_bin_diams = parse_bin_diameters(raw_diams)
                if not old_bin_diams:
                    print(f"    No bin diameters found for {bin_type}, ensuring minimum bins...")
                    df = ensure_minimum_bins(df, bin_type)
                    continue

                existing_bin_cols = get_existing_bin_columns(df, bin_type)
                if not existing_bin_cols:
                    print(f"    No existing {bin_type} bin columns found")
                    df = ensure_minimum_bins(df, bin_type)
                    continue

                # Target diameters and the minimum column count for this type
                if bin_type == "Aerosol":
                    new_bin_diams, floor_count = NEW_AEROSOL_BINS, 15
                else:  # Cloud
                    new_bin_diams, floor_count = NEW_CLOUD_BINS, 10
                new_bin_count = max(floor_count, len(new_bin_diams) + 1)  # +1 for overflow bin

                # Which old bin positions feed each new bin
                index_bins = create_index_bin(old_bin_diams, new_bin_diams)

                print(f"    Consolidating {len(existing_bin_cols)} old bins into {new_bin_count} new bins")

                df = consolidate_bins(df, index_bins, existing_bin_cols, new_bin_count, bin_type)

                # Ensure minimum bins after consolidation
                df = ensure_minimum_bins(df, bin_type)

            # Save the binned data
            output_filename = f"{campaign_name}_binned.csv"
            df.to_csv(os.path.join(OUTPUT_PATH, output_filename), index=False)

            print(f"  Saved binned data to: {output_filename}")

        except Exception as e:
            # Report the failure and carry on with the remaining campaigns
            print(f"  Error processing {campaign_name}: {e}")

In [5]:
# Run the binning process
if __name__ == "__main__":
    # Run the full pipeline, then print the closing banner
    process_campaign_binning()

    print("\n" + "="*50, "BINNING PROCESS COMPLETE!", "="*50, sep="\n")

PROCESSING CAMPAIGN BINNING:
Loaded bin info for campaign: ACE-ENA
Loaded bin info for campaign: ACMEV
Loaded bin info for campaign: BBOP
Loaded bin info for campaign: CACTI
Loaded bin info for campaign: CARES
Loaded bin info for campaign: GOAMAZON
Loaded bin info for campaign: ISDAC
Loaded bin info for campaign: TCAP2012
Loaded bin info for campaign: TCAP2013

Processing campaign: ACE-ENA
  Loaded data with 546787 rows and 76 columns
  Processing Aerosol bins...
    Consolidating 30 old bins into 15 new bins


  result_df.loc[has_any_data_mask, col] = result_df.loc[has_any_data_mask, col].fillna(0)
  result_df.loc[has_any_data_mask, col] = result_df.loc[has_any_data_mask, col].fillna(0)
  result_df.loc[has_any_data_mask, col] = result_df.loc[has_any_data_mask, col].fillna(0)


  Processing Cloud bins...
    Consolidating 21 old bins into 10 new bins
  Saved binned data to: ACE-ENA_binned.csv

Processing campaign: ACMEV
  Loaded data with 55849 rows and 142 columns
  Processing Aerosol bins...
    Consolidating 87 old bins into 15 new bins


  result_df.loc[has_any_data_mask, col] = result_df.loc[has_any_data_mask, col].fillna(0)


  Processing Cloud bins...
    Consolidating 30 old bins into 10 new bins
  Saved binned data to: ACMEV_binned.csv

Processing campaign: BBOP
  Loaded data with 64847 rows and 146 columns
  Processing Aerosol bins...
    Consolidating 91 old bins into 15 new bins
  Processing Cloud bins...
    Consolidating 30 old bins into 10 new bins
  Saved binned data to: BBOP_binned.csv

Processing campaign: CACTI
  Loaded data with 38596 rows and 145 columns
  Processing Aerosol bins...
    Consolidating 99 old bins into 15 new bins
  Processing Cloud bins...
    Consolidating 21 old bins into 10 new bins
  Saved binned data to: CACTI_binned.csv

Processing campaign: CARES
  Loaded data with 2838 rows and 141 columns
  Processing Aerosol bins...
    Consolidating 96 old bins into 15 new bins
  Processing Cloud bins...
    Consolidating 20 old bins into 10 new bins
  Saved binned data to: CARES_binned.csv

Processing campaign: GOAMAZON


  result_df.loc[has_any_data_mask, col] = result_df.loc[has_any_data_mask, col].fillna(0)
  result_df.loc[has_any_data_mask, col] = result_df.loc[has_any_data_mask, col].fillna(0)


  Loaded data with 331893 rows and 154 columns
  Processing Aerosol bins...
    Consolidating 99 old bins into 15 new bins
  Processing Cloud bins...
    Consolidating 30 old bins into 10 new bins
  Saved binned data to: GOAMAZON_binned.csv

Processing campaign: ISDAC
  Loaded data with 441569 rows and 154 columns
  Processing Aerosol bins...
    Consolidating 99 old bins into 15 new bins
  Processing Cloud bins...
    Consolidating 30 old bins into 10 new bins
  Saved binned data to: ISDAC_binned.csv

Processing campaign: TCAP2012
  Loaded data with 15523 rows and 70 columns
  Processing Aerosol bins...
    No bin diameters found for Aerosol, ensuring minimum bins...
  Processing Cloud bins...
    Consolidating 30 old bins into 10 new bins
  Saved binned data to: TCAP2012_binned.csv

Processing campaign: TCAP2013
  Loaded data with 18901 rows and 154 columns
  Processing Aerosol bins...
    Consolidating 99 old bins into 15 new bins
  Processing Cloud bins...
    Consolidating 30 old 