In [1]:
import os
import sys
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
import scipy.odr as odr
import os
import sys
import arrow
import numpy as np
import pandas as pd
from math import pi
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import seaborn as sns
from datetime import datetime
from numpy import nan_to_num
from matplotlib.colors import LogNorm
from matplotlib.ticker import ScalarFormatter
import os
import sys
import arrow
import numpy as np
import pandas as pd
from math import pi
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import seaborn as sns
from datetime import date
from numpy import nan_to_num
from matplotlib.colors import LogNorm
from matplotlib.ticker import ScalarFormatter
import pandas as pd
from datetime import datetime, timedelta
import glob

In [2]:
# Input / output locations. Raw string literals keep the Windows backslashes
# literal; the input folder is scanned for *.csv files and each output folder
# receives one file per campaign.
RAW_PATH = r"C:\Users\haika\Desktop\May_Research\may_datasets\wavelength_corrected_data"
RESTRICTED_OUTPUT_PATH = r"C:\Users\haika\Desktop\May_Research\may_datasets\restricted_campaigns"
COMPREHENSIVE_OUTPUT_PATH = r"C:\Users\haika\Desktop\May_Research\may_datasets\comprehensive_campaigns"


# Column layout of the restricted export, in output order.
RESTRICTED_COLUMNS = [
    "Organization", "Campaign", "UTC", "Date",
    "Latitude", "Longitude", "Altitude",
    "Temperature", "Rel_humidity", "Pressure",
    "BC_Mass",
    *(f"bin{i}" for i in range(1, 16)),    # bin1 .. bin15
    "Sc450_total", "Sc550_total", "Sc700_total",
    "Abs470_total", "Abs532_total", "Abs660_total",
]

# Column layout of the comprehensive export: every restricted column plus
# the additional columns listed below, in output order.
COMPREHENSIVE_COLUMNS = [
    "Organization", "Campaign", "UTC", "Date",
    "Latitude", "Longitude", "Altitude",
    "Temperature", "Rel_humidity", "Pressure",
    "U", "V", "W",
    "Supersaturation", "Number_Concentration", "CNgt3nm", "CNgt10nm",
    "BC_Mass", "LWC",
    *(f"cbin{i}" for i in range(1, 11)),   # cbin1 .. cbin10
    *(f"bin{i}" for i in range(1, 16)),    # bin1 .. bin15
    "Sc450_total", "Sc550_total", "Sc700_total",
    "Abs470_total", "Abs532_total", "Abs660_total",
]

In [3]:
def filter_dataframe_columns(df, required_columns):
    """
    Return a new dataframe holding exactly the required columns, in order.

    Columns listed in ``required_columns`` that are absent from ``df`` are
    created and filled with NaN for every row. The result always has the same
    rows and index as ``df`` -- including the case where none of the required
    columns exist in the input.

    Parameters:
    df (DataFrame): Input dataframe; it is not modified.
    required_columns (list): List of required column names in desired order.
        Repeated names are collapsed to their first occurrence.

    Returns:
    DataFrame: Filtered dataframe with only required columns
    """
    # Collapse repeated names while keeping first-seen order, so the result
    # has one column per requested name.
    ordered_columns = list(dict.fromkeys(required_columns))

    # reindex selects, orders and NaN-fills in one step while keeping the row
    # index. Building the frame column by column from an empty DataFrame
    # would yield zero rows whenever no required column is present.
    return df.reindex(columns=ordered_columns)

def extract_campaign_name(filename):
    """
    Derive the campaign name from a data file name.

    The file extension is dropped first; then, if the remaining stem ends
    with one of the known processing suffixes, that single suffix is cut off.
    At most one suffix is removed per call.

    Parameters:
    filename (str): Input filename

    Returns:
    str: Campaign name
    """
    stem, _extension = os.path.splitext(filename)

    known_suffixes = (
        '_wavelengthcorrected',
        '_binned',
        '_unit_corrected',
        '_processed',
        '_final',
    )

    # First suffix (in the order above) that terminates the stem; the empty
    # string means "nothing to strip".
    trailing = next((tail for tail in known_suffixes if stem.endswith(tail)), '')

    return stem[:len(stem) - len(trailing)]

def _export_campaign_subset(df, campaign_name, required_columns, output_dir, label, drop_incomplete=False):
    """
    Write one column-filtered version of a campaign dataframe to CSV.

    Parameters:
    df (DataFrame): Campaign data as loaded from the input file.
    campaign_name (str): Used as the prefix of the output file name.
    required_columns (list): Columns to keep, in output order.
    output_dir (str): Directory that receives "<campaign_name>_<label>.csv".
    label (str): Version name used in log lines and in the file name.
    drop_incomplete (bool): When True, rows containing any missing value are
        removed before saving.
    """
    print(f"  Creating {label} version...")
    subset_df = filter_dataframe_columns(df, required_columns)
    missing_cols = [col for col in required_columns if col not in df.columns]

    if drop_incomplete:
        rows_before = len(subset_df)
        subset_df = subset_df.dropna()
        rows_after = len(subset_df)
        rows_removed = rows_before - rows_after
        print(f"    Removed {rows_removed} rows with missing values ({rows_after}/{rows_before} rows remaining)")

    existing_count = len(required_columns) - len(missing_cols)
    print(f"    Existing columns: {existing_count}/{len(required_columns)}")
    if missing_cols:
        print(f"    Missing columns (filled with NaN): {missing_cols}")
        if drop_incomplete and rows_before > 0:
            # An absent column is missing in every row, so dropna() above has
            # necessarily removed the whole dataset; make that visible.
            print("    Warning: all rows were dropped because required columns are missing from the input")

    output_filename = f"{campaign_name}_{label}.csv"
    subset_df.to_csv(os.path.join(output_dir, output_filename), index=False)
    print(f"    Saved: {output_filename}")


def process_campaign_files():
    """
    Process all campaign files in RAW_PATH and create restricted and comprehensive versions.

    For every "*.csv" file found in RAW_PATH (processed in sorted order) the
    file is loaded and two CSVs are written:

    - "<campaign>_restricted.csv" in RESTRICTED_OUTPUT_PATH: only
      RESTRICTED_COLUMNS, with rows containing any missing value removed.
    - "<campaign>_comprehensive.csv" in COMPREHENSIVE_OUTPUT_PATH: only
      COMPREHENSIVE_COLUMNS, all rows kept.

    A failure on one file is reported with its traceback and does not stop
    the remaining files from being processed.
    """
    import traceback

    print("="*60)
    print("PROCESSING CAMPAIGN FILES:")
    print("="*60)

    # Create output directories if they don't exist
    os.makedirs(RESTRICTED_OUTPUT_PATH, exist_ok=True)
    os.makedirs(COMPREHENSIVE_OUTPUT_PATH, exist_ok=True)

    # glob returns matches in arbitrary order; sort so runs are reproducible.
    campaign_files = sorted(glob.glob(os.path.join(RAW_PATH, "*.csv")))

    if not campaign_files:
        print("No CSV files found in RAW_PATH")
        return

    print(f"Found {len(campaign_files)} files to process")

    for file_path in campaign_files:
        filename = os.path.basename(file_path)
        campaign_name = extract_campaign_name(filename)

        print(f"\nProcessing: {filename}")
        print(f"  Campaign name: {campaign_name}")

        try:
            # Read the campaign data
            df = pd.read_csv(file_path)
            print(f"  Loaded data with {len(df)} rows and {len(df.columns)} columns")

            _export_campaign_subset(
                df, campaign_name, RESTRICTED_COLUMNS, RESTRICTED_OUTPUT_PATH,
                "restricted", drop_incomplete=True,
            )
            _export_campaign_subset(
                df, campaign_name, COMPREHENSIVE_COLUMNS, COMPREHENSIVE_OUTPUT_PATH,
                "comprehensive",
            )

        except Exception as e:
            # Deliberate best effort: report the failure and move on to the
            # next file instead of aborting the whole batch.
            print(f"  Error processing {filename}: {e}")
            traceback.print_exc()

def display_summary():
    """
    Print the column requirements and the list of input files.

    Shows the RESTRICTED and COMPREHENSIVE column lists with their sizes, the
    columns that only the comprehensive list contains, and, for every "*.csv"
    file in RAW_PATH, the two output file names derived from it.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("COLUMN REQUIREMENTS SUMMARY:")
    print(rule)

    column_sets = (
        ("RESTRICTED", RESTRICTED_COLUMNS),
        ("COMPREHENSIVE", COMPREHENSIVE_COLUMNS),
    )
    for set_name, names in column_sets:
        print(f"\n{set_name} columns ({len(names)} total):")
        print(", ".join(names))

    print("\nAdditional columns in COMPREHENSIVE (not in RESTRICTED):")
    print(", ".join(name for name in COMPREHENSIVE_COLUMNS if name not in RESTRICTED_COLUMNS))

    # List each input file together with the outputs it will produce
    pending_files = glob.glob(os.path.join(RAW_PATH, "*.csv"))
    print(f"\nFiles to process ({len(pending_files)} total):")
    for pending_path in pending_files:
        source_name = os.path.basename(pending_path)
        stem = extract_campaign_name(source_name)
        print(f"  {source_name} → {stem}_restricted.csv, {stem}_comprehensive.csv")

# Entry point: print the column/file overview, run the export, then print a
# closing banner.
if __name__ == "__main__":
    display_summary()
    process_campaign_files()

    # One call, three lines of output: blank line + rule, message, rule.
    print("\n" + "=" * 60, "CAMPAIGN FILTERING COMPLETE!", "=" * 60, sep="\n")


COLUMN REQUIREMENTS SUMMARY:

RESTRICTED columns (32 total):
Organization, Campaign, UTC, Date, Latitude, Longitude, Altitude, Temperature, Rel_humidity, Pressure, BC_Mass, bin1, bin2, bin3, bin4, bin5, bin6, bin7, bin8, bin9, bin10, bin11, bin12, bin13, bin14, bin15, Sc450_total, Sc550_total, Sc700_total, Abs470_total, Abs532_total, Abs660_total

COMPREHENSIVE columns (50 total):
Organization, Campaign, UTC, Date, Latitude, Longitude, Altitude, Temperature, Rel_humidity, Pressure, U, V, W, Supersaturation, Number_Concentration, CNgt3nm, CNgt10nm, BC_Mass, LWC, cbin1, cbin2, cbin3, cbin4, cbin5, cbin6, cbin7, cbin8, cbin9, cbin10, bin1, bin2, bin3, bin4, bin5, bin6, bin7, bin8, bin9, bin10, bin11, bin12, bin13, bin14, bin15, Sc450_total, Sc550_total, Sc700_total, Abs470_total, Abs532_total, Abs660_total

Additional columns in COMPREHENSIVE (not in RESTRICTED):
U, V, W, Supersaturation, Number_Concentration, CNgt3nm, CNgt10nm, LWC, cbin1, cbin2, cbin3, cbin4, cbin5, cbin6, cbin7, cbin8