In [1]:
import pandas as pd
import numpy as np
from igor2 import binarywave
import os
from pathlib import Path

In [4]:
def convert_igor_timestamp(timestamp_value):
    """
    Convert Igor Pro timestamp(s) to pandas datetime.

    Igor Pro uses seconds since 1904-01-01 00:00:00; the result is labelled UTC.

    Parameters
    ----------
    timestamp_value : number or array-like of numbers
        Seconds elapsed since the Igor epoch. A scalar, list, NumPy array or
        pandas Series is accepted.

    Returns
    -------
    pandas.Timestamp, pandas.DatetimeIndex or pandas.Series
        Timezone-aware (UTC) datetime(s) mirroring the kind of input given.
        Missing inputs (NaN) become NaT instead of raising.
    """
    igor_epoch = pd.Timestamp('1904-01-01 00:00:00', tz='UTC')
    # pd.to_timedelta (unlike pd.Timedelta(seconds=...)) maps NaN to NaT and is
    # vectorized, so a whole column can be converted in a single call.
    return igor_epoch + pd.to_timedelta(timestamp_value, unit='s')

def read_single_ibw(filepath):
    """Load one Igor binary wave (.ibw) file into a pandas DataFrame.

    A 1-D wave becomes a single column named after the file stem; any other
    wave is handed to ``pd.DataFrame`` as-is. Any failure is reported on
    stdout and ``None`` is returned instead of raising.
    """
    try:
        # The numeric payload lives under data['wave']['wData'].
        samples = binarywave.load(filepath)['wave']['wData']

        # The file name (without extension) doubles as the column label.
        column_label = Path(filepath).stem

        is_vector = samples.ndim == 1
        frame = pd.DataFrame({column_label: samples}) if is_vector else pd.DataFrame(samples)
        return frame
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None

def process_instrument_folder(instrument_path, instrument_name):
    """
    Process all IBW files in an instrument folder.

    Every ``*.ibw`` file becomes one column named after the file stem
    (multi-dimensional waves are flattened). A wave whose name contains
    "time" (case-insensitive) is treated as the Igor timestamp wave: it is
    converted to ``Date`` and ``UTC`` columns and then removed. All remaining
    columns are prefixed with ``<instrument_name>_``.

    Parameters
    ----------
    instrument_path : str or pathlib.Path
        Folder holding the instrument's ``.ibw`` files.
    instrument_name : str
        Prefix applied to every data column.

    Returns
    -------
    pandas.DataFrame or None
        The instrument table, or ``None`` when the folder has no IBW files,
        none of them could be read, or no timestamp wave was found.
    """
    print(f"Processing instrument: {instrument_name}")

    # Sort (case-insensitively) so column order, and which wave wins when several
    # names contain "time", do not depend on filesystem enumeration order.
    ibw_files = sorted(Path(instrument_path).glob("*.ibw"), key=lambda p: p.name.upper())

    if not ibw_files:
        print(f"  No IBW files found in {instrument_path}")
        return None

    print(f"  Found {len(ibw_files)} IBW files")

    # Read every wave into its own single-column frame.
    dataframes = []
    timestamp_column = None

    for filepath in ibw_files:
        try:
            data = binarywave.load(filepath)
            wave_data = data['wave']['wData']

            # Column name comes from the file name.
            col_name = Path(filepath).stem

            # Multi-dimensional waves are flattened into one long column.
            values = wave_data if wave_data.ndim == 1 else wave_data.flatten()
            df_temp = pd.DataFrame({col_name: values})

            # A wave with "time" in its name is taken as the timestamp wave;
            # if several match, the last one (in sorted order) is used.
            if 'time' in col_name.lower():
                timestamp_column = col_name
                print(f"    Found timestamp column: {col_name}")

            dataframes.append(df_temp)
            print(f"    Loaded: {col_name} ({len(df_temp)} rows)")

        except Exception as e:
            # Best effort: one unreadable file must not abort the instrument.
            print(f"    Error reading {filepath}: {e}")
            continue

    if not dataframes:
        print(f"  No valid data found for {instrument_name}")
        return None

    # Column-wise concat aligns on the row index; shorter waves are padded
    # with NaN (outer join is already the default).
    instrument_df = pd.concat(dataframes, axis=1)

    if not timestamp_column or timestamp_column not in instrument_df.columns:
        print(f"    Warning: No timestamp column found for {instrument_name}")
        return None

    print(f"    Converting timestamps from {timestamp_column}")
    # Same epoch as convert_igor_timestamp, applied to the whole column at
    # once: far faster than a per-row apply and NaN-safe (NaN -> NaT).
    igor_epoch = pd.Timestamp('1904-01-01 00:00:00', tz='UTC')
    # Cast so the conversion sees a plain float column whatever dtype the
    # wave was stored with.
    seconds = instrument_df[timestamp_column].astype('float64')
    utc_datetime = igor_epoch + pd.to_timedelta(seconds, unit='s')

    # Rows without a timestamp cannot be merged by Date/UTC later on.
    has_time = utc_datetime.notna()
    if not has_time.all():
        print(f"    Warning: dropping {int((~has_time).sum())} rows without a timestamp")
        instrument_df = instrument_df.loc[has_time].reset_index(drop=True)
        utc_datetime = utc_datetime.loc[has_time].reset_index(drop=True)

    # Replace the raw timestamp wave by the Date / UTC merge keys.
    instrument_df = instrument_df.drop(columns=[timestamp_column]).assign(
        Date=utc_datetime.dt.date,
        UTC=utc_datetime.dt.time,
    )

    # Rename columns with instrument prefix (except Date and UTC)
    rename_dict = {
        col: f"{instrument_name}_{col}"
        for col in instrument_df.columns
        if col not in ('Date', 'UTC')
    }
    instrument_df = instrument_df.rename(columns=rename_dict)

    print(f"    Final columns: {instrument_df.columns.tolist()}")
    print(f"    Data shape: {instrument_df.shape}")

    return instrument_df

def process_campaign_data(campaign_path, output_path):
    """
    Process all instrument folders in a campaign and merge them.

    Each sub-directory of ``campaign_path`` is handed to
    ``process_instrument_folder``; the resulting tables are outer-merged on
    the ``Date`` and ``UTC`` columns and written to
    ``<output_path>/CALNEX_raw.csv``.

    Parameters
    ----------
    campaign_path : str or pathlib.Path
        Folder containing one sub-folder per instrument.
    output_path : str or pathlib.Path
        Folder for the CSV; it is created if it does not exist.

    Returns
    -------
    pandas.DataFrame or None
        The merged table, or ``None`` when the campaign folder is missing,
        is not a directory, has no sub-folders, or no instrument produced
        data.
    """
    campaign_folder = Path(campaign_path)

    if not campaign_folder.exists():
        print(f"Campaign path does not exist: {campaign_path}")
        return None
    if not campaign_folder.is_dir():
        # iterdir() below would raise on a plain file.
        print(f"Campaign path is not a directory: {campaign_path}")
        return None

    # Instrument folders are the sub-directories; sort them (case-insensitively)
    # so the merged column order is reproducible across filesystems.
    instrument_folders = sorted(
        (d for d in campaign_folder.iterdir() if d.is_dir()),
        key=lambda d: d.name.upper(),
    )

    if not instrument_folders:
        print(f"No instrument folders found in {campaign_path}")
        return None

    print(f"Found {len(instrument_folders)} instrument folders:")
    for folder in instrument_folders:
        print(f"  - {folder.name}")

    # Process each instrument folder, keeping only the ones that yield data.
    all_instrument_data = []

    for instrument_folder in instrument_folders:
        instrument_name = instrument_folder.name
        instrument_df = process_instrument_folder(instrument_folder, instrument_name)

        if instrument_df is not None:
            all_instrument_data.append(instrument_df)
            print(f"✓ Successfully processed {instrument_name}")
        else:
            print(f"✗ Failed to process {instrument_name}")
        print("-" * 50)

    if not all_instrument_data:
        print("No valid instrument data found!")
        return None

    # Outer-merge so a timestamp present in any instrument is kept.
    print("Merging all instrument data...")
    merged_df = all_instrument_data[0]

    for i, df in enumerate(all_instrument_data[1:], 1):
        print(f"  Merging instrument {i+1}/{len(all_instrument_data)}...")
        merged_df = pd.merge(merged_df, df, on=['Date', 'UTC'], how='outer')

    # Put the merge keys first, data columns after in their existing order.
    other_columns = [col for col in merged_df.columns if col not in ('Date', 'UTC')]
    merged_df = merged_df[['Date', 'UTC'] + other_columns]

    print(f"Final merged dataset shape: {merged_df.shape}")
    print(f"Final columns: {merged_df.columns.tolist()}")

    # Make sure the target folder exists so the (long) run is not lost to a
    # missing directory at the very last step.
    os.makedirs(output_path, exist_ok=True)
    # NOTE(review): the file name is fixed to the CALNEX campaign regardless
    # of campaign_path — confirm whether other campaigns should differ.
    output_file = os.path.join(output_path, "CALNEX_raw.csv")
    merged_df.to_csv(output_file, index=False)
    print(f"Saved to: {output_file}")

    return merged_df

In [None]:
# Main execution: run the full CALNEX pipeline and print a short summary.
if __name__ == "__main__":
    # Paths (raw strings so the Windows backslashes are taken literally)
    campaign_path = r"C:\Users\haika\Desktop\May_Research\campaign_data\CALNEX"
    output_path = r"C:\Users\haika\Desktop\May_Research\may_datasets\raw_campaigns"

    # Create output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Process all campaign data
    print("="*60)
    print("PROCESSING CALNEX CAMPAIGN DATA")
    print("="*60)

    final_df = process_campaign_data(campaign_path, output_path)

    if final_df is not None:
        # Data columns carry an "<instrument>_" prefix; Date/UTC are the keys.
        instrument_cols = [col for col in final_df.columns if '_' in col and col not in ['Date', 'UTC']]
        # Count distinct prefixes, not columns, so the figure really is the
        # number of instruments. Assumes instrument folder names contain no
        # underscore (true for the folders listed in this campaign's run).
        instrument_names = {col.split('_', 1)[0] for col in instrument_cols}

        print("\n" + "="*60)
        print("PROCESSING COMPLETE!")
        print("="*60)
        print(f"Final dataset shape: {final_df.shape}")
        print(f"Date range: {final_df['Date'].min()} to {final_df['Date'].max()}")
        print(f"Number of instruments: {len(instrument_names)}")

        # Show sample of the data
        print("\nSample of merged data:")
        print(final_df.head())

        # Show data completeness: share of non-missing rows per data column
        print("\nData completeness by instrument:")
        for col in instrument_cols:
            completeness = (final_df[col].notna().sum() / len(final_df)) * 100
            print(f"  {col}: {completeness:.1f}%")
    else:
        print("Failed to process campaign data!")

PROCESSING CALNEX CAMPAIGN DATA
Found 13 instrument folders:
  - AircraftExt
  - AircraftMet
  - AircraftMis
  - AircraftPos
  - AMS
  - CCN
  - CloudProbes
  - CRDExt
  - LWC
  - NAerosol
  - PAS
  - PSAP
  - SP2
Processing instrument: AircraftExt
  Found 3 IBW files
    Found timestamp column: AOCTimewave_all
    Loaded: AOCTimewave_all (462480 rows)
    Loaded: WindDir_smooth_all (462480 rows)
    Loaded: WindSpd_smooth_all (462480 rows)
    Converting timestamps from AOCTimewave_all
    Final columns: ['AircraftExt_WindDir_smooth_all', 'AircraftExt_WindSpd_smooth_all', 'Date', 'UTC']
    Data shape: (462480, 4)
✓ Successfully processed AircraftExt
--------------------------------------------------
Processing instrument: AircraftMet
  Found 12 IBW files
    Loaded: AmbTemp_all (462480 rows)
    Found timestamp column: AOCTimewave_all
    Loaded: AOCTimewave_all (462480 rows)
    Loaded: DewPtTempTDL_all (462480 rows)
    Loaded: DewPtTemp_all (462480 rows)
    Loaded: H2Omr_all (462