In [1]:
import pandas as pd
import icartt
import os
import warnings
import re
from datetime import datetime
import csv
from datetime import datetime, timedelta
from netCDF4 import Dataset

In [2]:
# NEEDED CONSTANTS
# Position of every metadata field inside one instrument's entry of
# file_info_list (each entry is a list; these are the list indices).
FILE_MAP = {
    field: position
    for position, field in enumerate((
        "path_to_instrument_folder",
        "has_header_info",
        "has_header",
        "header_start",
        "time_header_name",
        "time_format",
        "separator",
        "file_type",
        "column_names",
    ))
}

# Campaign whose folder and file-info CSV the notebook works on.
CAMPAIGN_NAME = "BBOP"

In [3]:
def date_extractor(file_path):
    """
    Extract a date from a data file's name and return it as 'YYYYMMDD'.

    Only the base name of ``file_path`` is inspected. The patterns below are
    tried in order and the first one that yields a date wins, so the order
    matters when a name could match several of them:

      1. Eight digits starting with '20' (e.g. '20230813'), returned as-is
         without calendar validation.
      2. 'YYMmmDD' text date (e.g. '13Aug06' -> 2013-08-06).
      3. Flight code 'F<n>' / 'f<n>', mapped to a day offset from F08.
      4. Six leading digits read as MMDDYY, then (if that is not a valid
         date) as YYMMDD.
      5. 'DD-Mmm-YY' or 'DD_Mmm_YY' (e.g. '18-Apr-08', '18_Apr_08').

    Args:
        file_path (str): File name or full path.

    Returns:
        str | None: The date as 'YYYYMMDD', or None when no pattern matches.
    """
    filename = os.path.basename(file_path)

    # Case 1: Match full numeric date like 20230813
    match_full_date = re.search(r'20\d{6}', filename)
    if match_full_date:
        return match_full_date.group(0)

    # Case 2: Match '13Aug06' pattern (YYMmmDD format, not DDMmmYY)
    match_text_date = re.search(r'\d{2}[A-Za-z]{3}\d{2}', filename)
    if match_text_date:
        try:
            parsed_date = datetime.strptime(match_text_date.group(0), "%y%b%d")
            return parsed_date.strftime("%Y%m%d")
        except ValueError:
            pass  # not a real month/day; fall through to the next pattern

    # Case 3: Match 'Fxx' or 'Fx' flight codes, anchored at F08 = 2008-03-31.
    # NOTE(review): this anchor looks specific to one 2008 campaign, and the
    # pattern matches any 'f' followed by a digit - confirm it is wanted for
    # other campaigns.
    match_flight = re.search(r'[Ff](\d{1,2})', filename)
    if match_flight:
        flight_num = int(match_flight.group(1))
        base_date = datetime(2008, 3, 31)  # F08 = March 31, 2008
        days_offset = flight_num - 8       # F08 is day 0
        final_date = base_date + timedelta(days=days_offset)
        return final_date.strftime("%Y%m%d")

    # Case 4: Six leading digits. MMDDYY is tried first (e.g. 022214 ->
    # Feb 22, 2014); YYMMDD is the fallback when that is not a valid date.
    match_six_digits = re.match(r'(\d{6})', filename)
    if match_six_digits:
        for date_format in ("%m%d%y", "%y%m%d"):
            try:
                parsed_date = datetime.strptime(match_six_digits.group(1), date_format)
                return parsed_date.strftime("%Y%m%d")
            except ValueError:
                continue

    # Case 5: Match '18-Apr-08' or '18_Apr_08'. The pieces are re-joined with
    # '-' so that underscore-separated names parse with the same format.
    match_dmy = re.search(r'(\d{1,2})[-_](\w{3})[-_](\d{2})', filename)
    if match_dmy:
        try:
            parsed_date = datetime.strptime("-".join(match_dmy.groups()), "%d-%b-%y")
            return parsed_date.strftime("%Y%m%d")
        except ValueError:
            pass

    return None


def get_instrument_path(campaign_path, instrument_name):
    """
    Build the path of an instrument's folder inside a campaign folder.

    Raises:
        FileNotFoundError: If the joined path does not exist.
    """
    candidate = os.path.join(campaign_path, instrument_name)
    if os.path.exists(candidate):
        return candidate
    raise FileNotFoundError(f"Instrument path '{candidate}' does not exist.")


def conv_to_df(file_path, instr, file_info_list, max_lines_to_scan=250):
    """
    Read one instrument data file into a DataFrame.

    The way the file is read is driven by the instrument's entry in
    ``file_info_list`` (a 9-item list: folder, has_header_info, has_header,
    header_start, time_header_name, time_format, separator, file_type,
    column_names):

      * file_type 'cdf-cdp-isdac': netCDF file; returns 'time', 'timevec',
        'CDP_n' plus one column per bin of 'CDP_conc', labelled from
        'CDP_bin_mid' as '<int>um'.
      * file_type 'cdf-convair-isdac': netCDF file; every 1-D variable
        becomes a column and qualifying 2-D variables are split into
        '<name>_<i>' columns.
      * anything else: delimited text read with ``pd.read_csv``. When
        ``has_header_info`` is true, the first ``max_lines_to_scan`` lines
        are searched for lines containing ``time_header_name`` as a field and
        ``header_start`` ('last' or a 1-based occurrence number) selects
        which of those lines is where parsing starts.

    Args:
        file_path (str): Path of the file to read.
        instr (str): Key of the instrument in ``file_info_list``.
        file_info_list (dict): Instrument name -> 9-item metadata list.
        max_lines_to_scan (int): How many leading lines to search for the header.

    Returns:
        pd.DataFrame: The parsed data.

    Raises:
        ValueError: If the requested header line cannot be located or
            ``header_start`` is neither 'last' nor a positive integer.
        Exception: Whatever ``pd.read_csv`` raises is re-raised after logging.
    """
    # Unpack metadata (the folder path and time format are not needed here)
    (_, has_header_info, has_header, header_start, time_header_name,
     _, separator, file_type, col_names) = file_info_list[instr]

    print(f"\nReading {file_path} for instrument {instr} with file type {file_type}")
    print(f"Time header to detect: {time_header_name}")
    print(f"Expected columns: {col_names}")

    file_kind = file_type.lower()

    # Special handling for CDP CDF files
    if file_kind == "cdf-cdp-isdac":
        # The context manager closes the netCDF handle even if a variable is missing.
        with Dataset(file_path, mode='r') as nc:
            # Extract time and scalar data
            data_dict = {
                'time': nc.variables['time'][:],
                'timevec': nc.variables['timevec'][:],
                'CDP_n': nc.variables['CDP_n'][:]
            }

            # Extract bin concentrations and the bin mid-points used as labels
            CDP_conc = nc.variables['CDP_conc'][:]
            bin_mid = nc.variables['CDP_bin_mid'][:]

        # Integer bin labels like "2um", "3um", etc.
        bin_labels = [f"{int(midv)}um" for midv in bin_mid]

        # Add binned columns
        for i, label in enumerate(bin_labels):
            data_dict[label] = CDP_conc[:, i]

        return pd.DataFrame(data_dict)

    # Handle CDF for Convair ISDAC
    if file_kind == "cdf-convair-isdac":
        data_dict = {}
        length_check = None

        with Dataset(file_path, mode='r') as nc:
            for var_name, var in nc.variables.items():
                data = var[:]
                if data.ndim == 1:
                    data_dict[var_name] = data
                    # The first 1-D variable fixes the expected row count.
                    length_check = len(data) if length_check is None else length_check
                elif data.ndim == 2 and data.shape[0] == length_check:
                    # NOTE(review): a 2-D variable that appears before any 1-D
                    # variable is skipped because length_check is still None.
                    for i in range(data.shape[1]):
                        col_name = f"{var_name}_{i}"
                        data_dict[col_name] = data[:, i]

        return pd.DataFrame(data_dict)

    # Proceed with default CSV reading logic
    skiprows = 0
    # Separators written as regex escapes need re.split / the python engine.
    is_regex = '\\s' in separator or '\\t' in separator

    if has_header_info:
        # Indices of the scanned lines that contain the time column name as a field
        match_lines = []
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                if i >= max_lines_to_scan:
                    break
                line = line.strip()
                if not line:
                    continue
                if is_regex:
                    header_fields = [x.strip() for x in re.split(separator, line)]
                else:
                    header_fields = [x.strip() for x in line.split(separator)]

                if time_header_name in header_fields:
                    match_lines.append(i)

        if header_start == "last":
            if match_lines:
                skiprows = match_lines[-1]
            else:
                raise ValueError(f"No header found in first {max_lines_to_scan} lines for {file_path}")
        elif header_start.isdigit():
            occurrence = int(header_start)  # 1-based occurrence number
            if occurrence < 1:
                # "0" would otherwise index match_lines[-1] (or raise IndexError).
                raise ValueError(f"Invalid header_start value: {header_start}")
            if len(match_lines) >= occurrence:
                skiprows = match_lines[occurrence - 1]
            else:
                raise ValueError(f"Only found {len(match_lines)} matches, but header_start={header_start} in {file_path}")
        else:
            raise ValueError(f"Invalid header_start value: {header_start}")

    # NOTE(review): if data rows carry more fields than the header line,
    # pandas promotes the leading columns to the index; confirm the inputs or
    # consider index_col=False.
    try:
        df = pd.read_csv(
            file_path,
            skiprows=skiprows,
            header=0 if has_header else None,
            names=col_names if not has_header else None,
            sep=separator,
            engine='python' if is_regex else 'c'
        )
    except Exception as e:
        print(f"⚠️ Error reading {file_path}: {e}")
        raise  # bare raise keeps the original traceback

    return df


def normalize_date_time(df, date_str, time_col, time_format, year_col=None):
    """
    Return a copy of ``df`` with 'UTC' (seconds since midnight) and 'Date' columns prepended.

    The input frame is not modified. Values that cannot be converted (or an
    unrecognised ``time_format``) produce a missing 'UTC' value rather than
    an error. Successfully converted 'UTC' values are truncated to whole
    seconds. For the split-column formats '[Hour][Minute][Second]' and
    '[HH][MM][SS]', rows with a non-numeric time part are dropped.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        date_str (str): Value written into every row of the 'Date' column.
        time_col (str): Column containing time values (ignored for the
            split-column formats, which read their own fixed column names).
        time_format (str): One of 'S', 'H.hh', 'HH:MM:SS', 'HH:MM:SS.sss',
            'MM/DD/YY HH:MM:SS', 'MM/DD/YY HH:MM:SS.sss',
            'YYYY-MM-DD HH:MM:SS', 'G1' (seconds since 1904-01-01),
            'G2' (seconds since 1970-01-01), '2008+S', 'HHMMSS', 'Year+DOY',
            '2008+DOY', '[Hour][Minute][Second]', '[HH][MM][SS]'.
        year_col (str): Column holding the year; needed for 'Year+DOY'.
            Without it every 'UTC' value for that format is missing.

    Returns:
        pd.DataFrame: New DataFrame with 'UTC' and 'Date' as the first two columns.
    """
    # Work on a private copy so the caller's frame never gains or loses columns.
    df = df.copy()

    # Clock-string formats handled by datetime.strptime
    strptime_formats = {
        "HH:MM:SS": "%H:%M:%S",
        "HH:MM:SS.sss": "%H:%M:%S.%f",
        "MM/DD/YY HH:MM:SS": "%m/%d/%y %H:%M:%S",
        "MM/DD/YY HH:MM:SS.sss": "%m/%d/%y %H:%M:%S.%f",
        "YYYY-MM-DD HH:MM:SS": "%Y-%m-%d %H:%M:%S",
    }
    # Formats that count seconds from a fixed epoch
    epoch_bases = {
        "G1": datetime(1904, 1, 1),
        "G2": datetime(1970, 1, 1),
        "2008+S": datetime(2008, 1, 1),
    }
    # Formats whose time is spread over three columns: (hours, minutes, seconds)
    split_columns = {
        "[Hour][Minute][Second]": ("Hour", "Minute", "Second"),
        "[HH][MM][SS]": ("HH", "MM", "SS"),
    }

    def second_of_day(moment):
        # Whole seconds since midnight of a datetime (sub-second part dropped).
        return moment.hour * 3600 + moment.minute * 60 + moment.second

    def convert_to_seconds(t, year=None):
        # Convert one raw time value; any failure yields None (missing).
        try:
            if time_format == "S":
                return float(t)
            if time_format == "H.hh":
                return int(float(t) * 3600)
            if time_format in strptime_formats:
                dt = datetime.strptime(t.strip(), strptime_formats[time_format])
                return dt.hour * 3600 + dt.minute * 60 + dt.second + dt.microsecond / 1e6
            if time_format in epoch_bases:
                return second_of_day(epoch_bases[time_format] + timedelta(seconds=float(t)))
            if time_format == "HHMMSS":
                stamp = int(float(t))
                hours = stamp // 10000
                minutes = (stamp % 10000) // 100
                seconds = stamp % 100
                return hours * 3600 + minutes * 60 + seconds
            if time_format == "Year+DOY":
                doy = float(t)
                # DOY 1.0 is midnight on Jan 1, hence the "- 1".
                return second_of_day(datetime(int(year), 1, 1) + timedelta(days=doy - 1))
            if time_format == "2008+DOY":
                doy = float(t)
                return second_of_day(datetime(2008, 1, 1) + timedelta(days=doy - 1))
            return None
        except Exception:
            # Unparseable value -> missing. Exception (not a bare except) so
            # KeyboardInterrupt / SystemExit still propagate.
            return None

    print(df.columns)
    print(df.head())

    if time_format in split_columns:
        hour_col, minute_col, second_col = split_columns[time_format]
        parts = [hour_col, minute_col, second_col]
        for part in parts:
            df[part] = pd.to_numeric(df[part], errors="coerce")

        # Drop rows where any of the time parts is missing (non-numeric originally)
        df = df[df[parts].notna().all(axis=1)].copy()

        df["UTC"] = df[hour_col] * 3600 + df[minute_col] * 60 + df[second_col]
    elif time_format == "Year+DOY":
        if year_col is None or year_col not in df.columns:
            warnings.warn(
                "time_format 'Year+DOY' needs a valid year_col; "
                "all UTC values will be missing",
                stacklevel=2,
            )
            years = [None] * len(df)
        else:
            years = df[year_col]
        # Pair values positionally so duplicate index labels cannot break the lookup.
        df["UTC"] = [convert_to_seconds(t, year) for t, year in zip(df[time_col], years)]
    elif time_format in ["G2", "2008+DOY"]:
        df["UTC"] = [convert_to_seconds(t) for t in df[time_col]]
    else:
        df["UTC"] = df[time_col].apply(convert_to_seconds)

    df["Date"] = date_str

    # Truncate to whole seconds; leave missing values untouched.
    df["UTC"] = df["UTC"].apply(lambda x: int(x) if pd.notna(x) else x)
    cols = ["UTC", "Date"] + [col for col in df.columns if col not in ["UTC", "Date"]]
    return df[cols]



def read_file_info_list(csv_path):
    """
    Load the per-instrument file metadata saved as CSV.

    Each CSV row becomes one dictionary entry keyed by its 'instrument_name'.
    The value is a list in this order: folder path, has_header_info (bool),
    has_header (bool), header_start, time_header_name, time_format,
    separator, file_type, column_names (list obtained by splitting on ';').

    Args:
        csv_path (str): Path to the CSV file.

    Returns:
        dict: Instrument name -> metadata list.
    """
    def _is_true(text):
        # Flags are stored as text; only 'true' (any case, padded or not) is True.
        return text.strip().lower() == 'true'

    info_by_instrument = {}

    with open(csv_path, mode="r", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            name = record["instrument_name"]
            info_by_instrument[name] = [
                record["path_to_instrument_folder"],
                _is_true(record["has_header_info"]),
                _is_true(record["has_header"]),
                record["header_start"],
                record["time_header_name"],
                record["time_format"],
                record["separator"],
                record["file_type"],
                record["column_names"].split(";"),
            ]

    return info_by_instrument

def avg_same_datetime(df):
    """
    Collapse rows that share the same (UTC, Date) pair into their mean.

    Every column is coerced to numeric first: rows whose 'UTC' or 'Date' is
    not numeric are discarded, and non-numeric cells elsewhere become NaN.
    The input frame is left untouched.

    Raises:
        ValueError: If 'UTC' or 'Date' is not a column of ``df``.
    """
    key_cols = ['UTC', 'Date']
    if any(key not in df.columns for key in key_cols):
        raise ValueError("DataFrame must contain 'UTC' and 'Date' columns.")

    numeric = df.copy()

    # Keys first: anything that is not a number turns into NaN and is dropped.
    for key in key_cols:
        numeric[key] = pd.to_numeric(numeric[key], errors='coerce')
    numeric = numeric.dropna(subset=key_cols)

    # Remaining columns: keep the rows, but non-numeric cells become NaN.
    value_cols = [name for name in numeric.columns if name not in key_cols]
    for name in value_cols:
        numeric[name] = pd.to_numeric(numeric[name], errors='coerce')

    # One output row per distinct (UTC, Date), holding the column means.
    return numeric.groupby(key_cols, as_index=False).mean(numeric_only=True)


In [4]:
# Root of the local research tree. Set the MAY_RESEARCH_ROOT environment
# variable to run the notebook on another machine; the default is the
# original author's location.
RESEARCH_ROOT = os.environ.get("MAY_RESEARCH_ROOT", r"C:\Users\haika\Desktop\May_Research")

# Campaign folder: holds one sub-folder per instrument
campaign_path = os.path.join(RESEARCH_ROOT, "campaign_data", CAMPAIGN_NAME)

# Folder and file holding the saved per-instrument file metadata
file_info_path = os.path.join(RESEARCH_ROOT, "may_datasets", "file_info_list")
file_info_output_path = os.path.join(file_info_path, f"{CAMPAIGN_NAME}_fileinfo.csv")

# Read file_info_list back from the CSV
file_info_list = read_file_info_list(file_info_output_path)

print(file_info_list)

# Instrument folders that we're using (one per CSV row)
INSTRUMENT_NAMES = tuple(file_info_list.keys())


{'wcm': ['C:\\Users\\haika\\Desktop\\May_Research\\campaign_data\\BBOP\\wcm', True, True, 'last', 'Start_UTC', 'S', ',', 'ict', ['Start_UTC', 'WCM_TWC', 'WCM_083', 'WCM_021']], 'uhsas': ['C:\\Users\\haika\\Desktop\\May_Research\\campaign_data\\BBOP\\uhsas', True, True, 'last', 'Start_Time_(UTC)', 'S', ',', 'ict', ['Start_Time_(UTC)', '0.061', '0.063', '0.065', '0.067', '0.068', '0.070', '0.072', '0.075', '0.077', '0.079', '0.081', '0.084', '0.086', '0.088', '0.091', '0.094', '0.096', '0.099', '0.102', '0.105', '0.108', '0.111', '0.114', '0.118', '0.121', '0.124', '0.128', '0.132', '0.135', '0.139', '0.143', '0.147', '0.152', '0.156', '0.161', '0.165', '0.170', '0.175', '0.180', '0.185', '0.190', '0.196', '0.202', '0.207', '0.213', '0.220', '0.226', '0.232', '0.239', '0.246', '0.253', '0.260', '0.268', '0.276', '0.284', '0.292', '0.300', '0.309', '0.318', '0.327', '0.336', '0.346', '0.356', '0.366', '0.377', '0.388', '0.399', '0.410', '0.422', '0.434', '0.447', '0.460', '0.473', '0.487'

In [5]:
# Save file_info_list to CSV so it can be reloaded in a later session
file_info_path = rf"C:\Users\haika\Desktop\May_Research\may_datasets\file_info_list"
file_info_output_path = os.path.join(file_info_path, rf"{CAMPAIGN_NAME}_fileinfo.csv")

# CSV columns: the instrument name followed by the metadata fields in the
# same order as they are stored in each file_info_list entry (see FILE_MAP).
header = ["instrument_name", *FILE_MAP]

# One row per instrument; the list of column names is flattened into a
# single semicolon-separated cell.
with open(file_info_output_path, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(header)
    for instr_name, values in file_info_list.items():
        row = [instr_name, *values[:8], ";".join(values[8])]
        writer.writerow(row)

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\haika\\Desktop\\May_Research\\may_datasets\\file_info_list\\ACE-ENA_fileinfo.csv'

In [5]:
# instrument_data will look like this:
#
#                date_1      date_2      date_3   .....
# instrument_1:   df_11      df_12       df_13    .....
# instrument_2:   df_21      df_22       df_23    .....
# instrument_3:   df_31      df_32       df_33    .....
# .....
#
# i.e. instrument_data[instrument][date] -> raw DataFrame of that file.
# all_dates collects every date seen, in first-seen order.

instrument_data = {}
all_dates = []

for instr in INSTRUMENT_NAMES:
    folder = get_instrument_path(campaign_path, instr)
    instrument_data[instr] = {}

    # sorted() makes the load order deterministic, so when two files share a
    # date it is always the same one that ends up stored.
    for file_name in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file_name)

        # Only regular files can be parsed; skip sub-folders.
        if not os.path.isfile(file_path):
            continue

        # Extract YYYYMMDD date from the file name
        date_str = date_extractor(file_name)

        # A file without a recognisable date cannot be keyed by date; storing
        # it under None would pollute all_dates and let such files overwrite
        # each other.
        if date_str is None:
            print(f"Skipping {file_path}: no date found in file name")
            continue

        # add new found date into all_dates
        if date_str not in all_dates:
            all_dates.append(date_str)

        # NOTE(review): only one file per instrument and date is kept.
        if date_str in instrument_data[instr]:
            print(f"Warning: several '{instr}' files for {date_str}; {file_name} replaces the earlier one")

        df = conv_to_df(file_path, instr, file_info_list, max_lines_to_scan=250)
        instrument_data[instr][date_str] = df



Reading C:\Users\haika\Desktop\May_Research\campaign_data\BBOP\wcm\wcm_g1_20130715174627_R1_L1_bbop001s.ict for instrument wcm with file type ict
Time header to detect: Start_UTC
Expected columns: ['Start_UTC', 'WCM_TWC', 'WCM_083', 'WCM_021']

Reading C:\Users\haika\Desktop\May_Research\campaign_data\BBOP\wcm\wcm_g1_20130717161024_R1_L1_bbop001s.ict for instrument wcm with file type ict
Time header to detect: Start_UTC
Expected columns: ['Start_UTC', 'WCM_TWC', 'WCM_083', 'WCM_021']

Reading C:\Users\haika\Desktop\May_Research\campaign_data\BBOP\wcm\wcm_g1_20130719163853_R1_L1_bbop001s.ict for instrument wcm with file type ict
Time header to detect: Start_UTC
Expected columns: ['Start_UTC', 'WCM_TWC', 'WCM_083', 'WCM_021']

Reading C:\Users\haika\Desktop\May_Research\campaign_data\BBOP\wcm\wcm_g1_20130723164725_R1_L1_bbop001s.ict for instrument wcm with file type ict
Time header to detect: Start_UTC
Expected columns: ['Start_UTC', 'WCM_TWC', 'WCM_083', 'WCM_021']

Reading C:\Users\ha

  df = pd.read_csv(



Reading C:\Users\haika\Desktop\May_Research\campaign_data\BBOP\iwg1\aaf.iwg1001s.g1.bbop.20130809a.a4.txt for instrument iwg1 with file type txt
Time header to detect: Date_Time
Expected columns: ['IWG1', 'Date_Time', 'Lat', 'Lon', 'GPS_MSL_Alt', 'WGS_84_Alt', 'Press_Alt', 'Radar_Alt', 'Grnd_Spd', 'True_Airspeed', 'Indicated_Airspeed', 'Mach_Number', 'Vert_Velocity', 'True_Hdg', 'Track', 'Drift', 'Pitch', 'Roll', 'Side_slip', 'Angle_of_Attack', 'Ambient_Temp', 'Dew_Point', 'Total_Temp', 'Static_Press', 'Dynamic_Press', 'Cabin_Pressure', 'Wind_speed', 'Wind_Dir', 'Vert_Wind_Spd', 'Solar_Zenith', 'Sun_Elev_AC', 'Sun_Az_Grd', 'Sun_Az_AC', 'Flag_qc', 'Flag_ac', 'Flag_Diluter', 'Flag_cloud', 'Flag_cloudP', 'RH_water', 'RH_ice', 'Theta', 'Cabin_Temperature', 'Q_Dilution', 'Q_bypass', 'D_ratio', 'Leg_num']

Reading C:\Users\haika\Desktop\May_Research\campaign_data\BBOP\iwg1\aaf.iwg1001s.g1.bbop.20130813a.a4.txt for instrument iwg1 with file type txt
Time header to detect: Date_Time
Expected 

In [43]:
# Sanity check: show the earliest dated DataFrame of every instrument
for instr in INSTRUMENT_NAMES:
    print(f"Instrument: {instr}")

    date_dict = instrument_data.get(instr, {})
    # Files whose name had no parseable date are stored under None; ignore them.
    valid_dates = [d for d in date_dict if d is not None]

    if valid_dates:
        # Dates are 'YYYYMMDD' strings, so the smallest string is the earliest date.
        first_date = min(valid_dates)
        df = date_dict[first_date]

        print(f"  First Date: {first_date}, DataFrame shape: {df.shape}")
        print(df.head(10))
        print()
    else:
        print("  No valid dates or data available")
        print()

Instrument: fcdp
  First Date: 20181104, DataFrame shape: (14001, 26)
   start_time  total_number_concentration  number_concentration_0.0-1.5  \
0     47076.0                    0.000000                           0.0   
1     47077.0                  115.921997                           0.0   
2     47078.0                  116.185997                           0.0   
3     47079.0                  123.745003                           0.0   
4     47080.0                  248.078003                           0.0   
5     47081.0                  223.912003                           0.0   
6     47082.0                  116.370003                           0.0   
7     47083.0                  216.106995                           0.0   
8     47084.0                  107.039001                           0.0   
9     47085.0                  124.389999                           0.0   

   number_concentration_1.5-3.0  number_concentration_3.0-4.5  \
0                      0.000000        

In [6]:
# Give every loaded DataFrame the common 'UTC' / 'Date' columns, then average
# rows that share the same timestamp. Results replace the raw frames in
# instrument_data.
for instrument, date_dict in instrument_data.items():
    for date_str, df in date_dict.items():
        file_info = file_info_list[instrument]
        # Name and format of this instrument's time column
        time_col, time_format = (
            file_info[FILE_MAP[field]] for field in ("time_header_name", "time_format")
        )
        print(f"Processing {instrument} for date {date_str} with time column '{time_col}' and format '{time_format}'")

        # Step 1: prepend UTC/Date (a copy is passed in, the raw frame is not touched)
        date_dict[date_str] = normalize_date_time(df.copy(), date_str, time_col, time_format)

        # Step 2: one row per (UTC, Date)
        date_dict[date_str] = avg_same_datetime(date_dict[date_str])


Processing wcm for date 20130715 with time column 'Start_UTC' and format 'S'
Index(['Start_UTC', 'WCM_TWC', 'WCM_083', 'WCM_021'], dtype='object')
   Start_UTC   WCM_TWC   WCM_083   WCM_021
0      63987  0.001979  0.016892  0.085679
1      63988 -0.003096  0.013574  0.075955
2      63989 -0.007287  0.002945  0.062541
3      63990  0.001266  0.013546  0.083598
4      63991 -0.001278  0.012347  0.084984
Processing wcm for date 20130717 with time column 'Start_UTC' and format 'S'
Index(['Start_UTC', 'WCM_TWC', 'WCM_083', 'WCM_021'], dtype='object')
   Start_UTC   WCM_TWC   WCM_083   WCM_021
0      58224 -0.006263  0.018465  0.086253
1      58225  0.001643  0.027415  0.089697
2      58226  0.007487  0.031723  0.094997
3      58227  0.003946  0.025435  0.086764
4      58228  0.003838  0.028573  0.090143
Processing wcm for date 20130719 with time column 'Start_UTC' and format 'S'
Index(['Start_UTC', 'WCM_TWC', 'WCM_083', 'WCM_021'], dtype='object')
   Start_UTC   WCM_TWC   WCM_083   WCM_021


  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing uhsas for date 20130723 with time column 'Start_Time_(UTC)' and format 'S'
Index(['Start_Time_(UTC)', '0.061', '0.063', '0.065', '0.067', '0.068',
       '0.070', '0.072', '0.075', '0.077',
       ...
       '0.789', 'Num_Conc_(#/cm^3)', 'Area_Conc_(um^2/cm^3)',
       'Vol_Conc_(um^3/cm^3)', 'Counts', 'Noise', 'CPC', 'Total', 'Max_Flag',
       'Cloud_Flag'],
      dtype='object', length=101)
   Start_Time_(UTC)   0.061   0.063   0.065   0.067   0.068   0.070   0.072  \
0             74518 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0   
1             74519 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0   
2             74520 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0   
3             74521 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0   
4             74522 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0   

    0.075   0.077  ...   0.789  Num_Conc_(#/cm^3)  Area_Conc_(um^2/cm^3)  \
0 -9999.0 -9999.0  ... -9999.0           

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing uhsas for date 20130730 with time column 'Start_Time_(UTC)' and format 'S'
Index(['Start_Time_(UTC)', '0.061', '0.063', '0.065', '0.067', '0.068',
       '0.070', '0.072', '0.075', '0.077',
       ...
       '0.789', 'Num_Conc_(#/cm^3)', 'Area_Conc_(um^2/cm^3)',
       'Vol_Conc_(um^3/cm^3)', 'Counts', 'Noise', 'CPC', 'Total', 'Max_Flag',
       'Cloud_Flag'],
      dtype='object', length=101)
   Start_Time_(UTC)     0.061    0.063    0.065    0.067    0.068    0.070  \
0             71113  1019.512  779.324  932.555  752.524  651.471  542.541   
1             71114   620.579  536.568  656.411  686.101  521.001  374.488   
2             71115   681.798  734.909  794.087  713.622  740.326  517.716   
3             71116   682.144  748.257  709.798  676.207  618.449  507.687   
4             71117   622.771  859.437  809.461  853.008  707.255  599.001   

     0.072    0.075    0.077  ...  0.789  Num_Conc_(#/cm^3)  \
0  428.195  175.348   95.343  ...    0.0           6711.111 

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing uhsas for date 20130813 with time column 'Start_Time_(UTC)' and format 'S'
Index(['Start_Time_(UTC)', '0.061', '0.063', '0.065', '0.067', '0.068',
       '0.070', '0.072', '0.075', '0.077',
       ...
       '0.789', 'Num_Conc_(#/cm^3)', 'Area_Conc_(um^2/cm^3)',
       'Vol_Conc_(um^3/cm^3)', 'Counts', 'Noise', 'CPC', 'Total', 'Max_Flag',
       'Cloud_Flag'],
      dtype='object', length=101)
                                                               Start_Time_(UTC)  \
64382 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   
64383 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   
64384 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   
64385 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   
64386 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   

                                                                0.061   0.063  \
64382 -9999.

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing uhsas for date 20130821 with time column 'Start_Time_(UTC)' and format 'S'
Index(['Start_Time_(UTC)', '0.061', '0.063', '0.065', '0.067', '0.068',
       '0.070', '0.072', '0.075', '0.077',
       ...
       '0.789', 'Num_Conc_(#/cm^3)', 'Area_Conc_(um^2/cm^3)',
       'Vol_Conc_(um^3/cm^3)', 'Counts', 'Noise', 'CPC', 'Total', 'Max_Flag',
       'Cloud_Flag'],
      dtype='object', length=101)
                                                               Start_Time_(UTC)  \
75201 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   
75202 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   
75203 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   
75204 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   
75205 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   

                                                                0.061   0.063  \
75201 -9999.

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing uhsas for date 20130905 with time column 'Start_Time_(UTC)' and format 'S'
Index(['Start_Time_(UTC)', '0.061', '0.063', '0.065', '0.067', '0.068',
       '0.070', '0.072', '0.075', '0.077',
       ...
       '0.789', 'Num_Conc_(#/cm^3)', 'Area_Conc_(um^2/cm^3)',
       'Vol_Conc_(um^3/cm^3)', 'Counts', 'Noise', 'CPC', 'Total', 'Max_Flag',
       'Cloud_Flag'],
      dtype='object', length=101)
                                                                     Start_Time_(UTC)  \
71259 1017.410 1376.945 1606.436 1285.149 856.766  571.177  349.700           145.344   
71260 2041.975 1873.090 2084.964 1464.808 1523.037 1156.607 772.046           261.004   
71261 2314.557 2522.175 2196.138 2280.666 2128.470 1240.582 782.138           361.409   
71262 1732.509 2382.200 2490.482 2614.805 1992.385 1443.758 636.432           301.642   
71263 3750.955 3323.828 4283.700 3897.933 2919.998 1822.410 736.656           341.702   

                                                         

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing uhsas for date 20131004 with time column 'Start_Time_(UTC)' and format 'S'
Index(['Start_Time_(UTC)', '0.061', '0.063', '0.065', '0.067', '0.068',
       '0.070', '0.072', '0.075', '0.077',
       ...
       '0.789', 'Num_Conc_(#/cm^3)', 'Area_Conc_(um^2/cm^3)',
       'Vol_Conc_(um^3/cm^3)', 'Counts', 'Noise', 'CPC', 'Total', 'Max_Flag',
       'Cloud_Flag'],
      dtype='object', length=101)
                                                               Start_Time_(UTC)  \
61929 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   
61930 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   
61931 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   
61932 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   
61933 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0 -9999.0           -9999.0   

                                                                0.061   0.063  \
61929 -9999.

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing uhsas for date 20131018 with time column 'Start_Time_(UTC)' and format 'S'
Index(['Start_Time_(UTC)', '0.061', '0.063', '0.065', '0.067', '0.068',
       '0.070', '0.072', '0.075', '0.077',
       ...
       '0.789', 'Num_Conc_(#/cm^3)', 'Area_Conc_(um^2/cm^3)',
       'Vol_Conc_(um^3/cm^3)', 'Counts', 'Noise', 'CPC', 'Total', 'Max_Flag',
       'Cloud_Flag'],
      dtype='object', length=101)
                                                               Start_Time_(UTC)  \
68590 133.511 267.023 133.511 296.692 442.495 356.030 283.371           162.121   
68591 266.975 457.672 266.975 355.967 427.160 457.672 348.702           200.231   
68592 399.113 114.032 239.468 118.256 364.903 240.735 195.484           104.530   
68593 266.359 152.205 292.995 493.257 593.600 481.983 684.923           351.974   
68594 0.000   38.238  53.533  79.308  198.838 191.190 229.428           181.630   

                                                                 0.061  \
68590 133.511 267.0

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing sp2 for date 20131004 with time column 'SP2_time' and format 'HH:MM:SS'
Index(['filenotes', 'SP2_datetime_in_sec', 'SP2_date', 'SP2_time',
       'SP2_rBC_conc', 'SP2_Dmin', 'SP2_Dgeo', 'SP2_Dmax', 'SP2_cnts_0',
       'SP2_cnts_1',
       ...
       'SP2_cnts_190', 'SP2_cnts_191', 'SP2_cnts_192', 'SP2_cnts_193',
       'SP2_cnts_194', 'SP2_cnts_195', 'SP2_cnts_196', 'SP2_cnts_197',
       'SP2_cnts_198', 'SP2_cnts_199'],
      dtype='object', length=208)
                                           filenotes  SP2_datetime_in_sec  \
0  rBC (refractory Black Carbon)  rBC mass conc a...           3463751550   
1                                                              3463751560   
2                            Time resolution: 60 sec           3463751570   
3                                 Uncertainty: ~ 25%           3463751580   
4                                       SP2 Unit: 24           3463751590   

     SP2_date  SP2_time  SP2_rBC_conc  SP2_Dmin  SP2_Dgeo  SP2_Dma

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing sp2 for date 20131011 with time column 'SP2_time' and format 'HH:MM:SS'
Index(['filenotes', 'SP2_datetime_in_sec', 'SP2_date', 'SP2_time',
       'SP2_rBC_conc', 'SP2_Dmin', 'SP2_Dgeo', 'SP2_Dmax', 'SP2_cnts_0',
       'SP2_cnts_1',
       ...
       'SP2_cnts_190', 'SP2_cnts_191', 'SP2_cnts_192', 'SP2_cnts_193',
       'SP2_cnts_194', 'SP2_cnts_195', 'SP2_cnts_196', 'SP2_cnts_197',
       'SP2_cnts_198', 'SP2_cnts_199'],
      dtype='object', length=208)
                                           filenotes  SP2_datetime_in_sec  \
0  rBC (refractory Black Carbon)  rBC mass conc a...           3464364740   
1                                                              3464364750   
2                            Time resolution: 60 sec           3464364760   
3                                 Uncertainty: ~ 25%           3464364770   
4                                       SP2 Unit: 24           3464364780   

     SP2_date  SP2_time  SP2_rBC_conc  SP2_Dmin  SP2_Dgeo  SP2_Dma

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index

Processing sp2 for date 20131021 with time column 'SP2_time' and format 'HH:MM:SS'
Index(['filenotes', 'SP2_datetime_in_sec', 'SP2_date', 'SP2_time',
       'SP2_rBC_conc', 'SP2_Dmin', 'SP2_Dgeo', 'SP2_Dmax', 'SP2_cnts_0',
       'SP2_cnts_1',
       ...
       'SP2_cnts_190', 'SP2_cnts_191', 'SP2_cnts_192', 'SP2_cnts_193',
       'SP2_cnts_194', 'SP2_cnts_195', 'SP2_cnts_196', 'SP2_cnts_197',
       'SP2_cnts_198', 'SP2_cnts_199'],
      dtype='object', length=208)
                                           filenotes  SP2_datetime_in_sec  \
0  rBC (refractory Black Carbon)  rBC mass conc a...           3465226920   
1                                                              3465226930   
2                            Time resolution: 60 sec           3465226940   
3                                 Uncertainty: ~ 25%           3465226950   
4                                       SP2 Unit: 24           3465226960   

     SP2_date  SP2_time  SP2_rBC_conc  SP2_Dmin  SP2_Dgeo  SP2_Dma

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing sp2 for date 20130719 with time column 'SP2_time' and format 'HH:MM:SS'
Index(['filenotes', 'SP2_datetime_in_sec', 'SP2_date', 'SP2_time',
       'SP2_rBC_conc', 'SP2_Dmin', 'SP2_Dgeo', 'SP2_Dmax', 'SP2_cnts_0',
       'SP2_cnts_1',
       ...
       'SP2_cnts_190', 'SP2_cnts_191', 'SP2_cnts_192', 'SP2_cnts_193',
       'SP2_cnts_194', 'SP2_cnts_195', 'SP2_cnts_196', 'SP2_cnts_197',
       'SP2_cnts_198', 'SP2_cnts_199'],
      dtype='object', length=208)
                                           filenotes  SP2_datetime_in_sec  \
0  rBC (refractory Black Carbon)  rBC mass conc a...           3457096750   
1                                                              3457096760   
2                            Time resolution: 60 sec           3457096770   
3                                 Uncertainty: ~ 25%           3457096780   
4                                       SP2 Unit: 24           3457096790   

     SP2_date  SP2_time  SP2_rBC_conc  SP2_Dmin  SP2_Dgeo  SP2_Dma

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing sp2 for date 20130730 with time column 'SP2_time' and format 'HH:MM:SS'
Index(['filenotes', 'SP2_datetime_in_sec', 'SP2_date', 'SP2_time',
       'SP2_rBC_conc', 'SP2_Dmin', 'SP2_Dgeo', 'SP2_Dmax', 'SP2_cnts_0',
       'SP2_cnts_1',
       ...
       'SP2_cnts_190', 'SP2_cnts_191', 'SP2_cnts_192', 'SP2_cnts_193',
       'SP2_cnts_194', 'SP2_cnts_195', 'SP2_cnts_196', 'SP2_cnts_197',
       'SP2_cnts_198', 'SP2_cnts_199'],
      dtype='object', length=208)
                                           filenotes  SP2_datetime_in_sec  \
0  rBC (refractory Black Carbon)  rBC mass conc a...           3458052680   
1                                                              3458052690   
2                            Time resolution: 60 sec           3458052700   
3                                 Uncertainty: ~ 25%           3458052710   
4                                       SP2 Unit: 24           3458052720   

     SP2_date  SP2_time  SP2_rBC_conc  SP2_Dmin  SP2_Dgeo  SP2_Dma

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing sp2 for date 20130814 with time column 'SP2_time' and format 'HH:MM:SS'
Index(['filenotes', 'SP2_datetime_in_sec', 'SP2_date', 'SP2_time',
       'SP2_rBC_conc', 'SP2_Dmin', 'SP2_Dgeo', 'SP2_Dmax', 'SP2_cnts_0',
       'SP2_cnts_1',
       ...
       'SP2_cnts_190', 'SP2_cnts_191', 'SP2_cnts_192', 'SP2_cnts_193',
       'SP2_cnts_194', 'SP2_cnts_195', 'SP2_cnts_196', 'SP2_cnts_197',
       'SP2_cnts_198', 'SP2_cnts_199'],
      dtype='object', length=208)
                                           filenotes  SP2_datetime_in_sec  \
0  rBC (refractory Black Carbon)  rBC mass conc a...           3459344060   
1                                                              3459344070   
2                            Time resolution: 60 sec           3459344080   
3                                 Uncertainty: ~ 25%           3459344090   
4                                       SP2 Unit: 24           3459344100   

     SP2_date  SP2_time  SP2_rBC_conc  SP2_Dmin  SP2_Dgeo  SP2_Dma

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing sp2 for date 20130822 with time column 'SP2_time' and format 'HH:MM:SS'
Index(['filenotes', 'SP2_datetime_in_sec', 'SP2_date', 'SP2_time',
       'SP2_rBC_conc', 'SP2_Dmin', 'SP2_Dgeo', 'SP2_Dmax', 'SP2_cnts_0',
       'SP2_cnts_1',
       ...
       'SP2_cnts_190', 'SP2_cnts_191', 'SP2_cnts_192', 'SP2_cnts_193',
       'SP2_cnts_194', 'SP2_cnts_195', 'SP2_cnts_196', 'SP2_cnts_197',
       'SP2_cnts_198', 'SP2_cnts_199'],
      dtype='object', length=208)
                                           filenotes  SP2_datetime_in_sec  \
0  rBC (refractory Black Carbon)  rBC mass conc a...           3460038580   
1                                                              3460038590   
2                            Time resolution: 60 sec           3460038600   
3                                 Uncertainty: ~ 25%           3460038610   
4                                       SP2 Unit: 24           3460038620   

     SP2_date  SP2_time  SP2_rBC_conc  SP2_Dmin  SP2_Dgeo  SP2_Dma

  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)
  grouped = df_clean.groupby(['UTC', 'Date'], as_index=False).mean(numeric_only=True)


Processing sp2 for date 20130911 with time column 'SP2_time' and format 'HH:MM:SS'
Index(['filenotes', 'SP2_datetime_in_sec', 'SP2_date', 'SP2_time',
       'SP2_rBC_conc', 'SP2_Dmin', 'SP2_Dgeo', 'SP2_Dmax', 'SP2_cnts_0',
       'SP2_cnts_1',
       ...
       'SP2_cnts_190', 'SP2_cnts_191', 'SP2_cnts_192', 'SP2_cnts_193',
       'SP2_cnts_194', 'SP2_cnts_195', 'SP2_cnts_196', 'SP2_cnts_197',
       'SP2_cnts_198', 'SP2_cnts_199'],
      dtype='object', length=208)
                                           filenotes  SP2_datetime_in_sec  \
0  rBC (refractory Black Carbon)  rBC mass conc a...           3461764630   
1                                                              3461764640   
2                            Time resolution: 60 sec           3461764650   
3                                 Uncertainty: ~ 25%           3461764660   
4                                       SP2 Unit: 24           3461764670   

     SP2_date  SP2_time  SP2_rBC_conc  SP2_Dmin  SP2_Dgeo  SP2_Dma

In [45]:
# Sanity check: preview the earliest-dated DataFrame of every instrument
for instr in INSTRUMENT_NAMES:
    print(f"Instrument: {instr}")

    # Instruments without an entry in instrument_data fall back to an empty mapping
    date_dict = instrument_data.get(instr, {})
    # Keys that are None are excluded before picking a date
    valid_dates = [key for key in date_dict if key is not None]

    if valid_dates:
        # The smallest key is the first date in sorted order
        first_date = min(valid_dates)
        df = date_dict[first_date]

        print(f"  First Date: {first_date}, DataFrame shape: {df.shape}")
        print(df.head(10))
    else:
        print("  No valid dates or data available")

    # Blank line between instruments
    print()

Instrument: fcdp
  First Date: 20181104, DataFrame shape: (14001, 28)
     UTC      Date  start_time  total_number_concentration  \
0  47076  20181104     47076.0                    0.000000   
1  47077  20181104     47077.0                  115.921997   
2  47078  20181104     47078.0                  116.185997   
3  47079  20181104     47079.0                  123.745003   
4  47080  20181104     47080.0                  248.078003   
5  47081  20181104     47081.0                  223.912003   
6  47082  20181104     47082.0                  116.370003   
7  47083  20181104     47083.0                  216.106995   
8  47084  20181104     47084.0                  107.039001   
9  47085  20181104     47085.0                  124.389999   

   number_concentration_0.0-1.5  number_concentration_1.5-3.0  \
0                           0.0                      0.000000   
1                           0.0                      0.000000   
2                           0.0                     

In [7]:
# Destination directory for the per-instrument CSV files (created on demand)
output_folder = rf"C:\Users\haika\Desktop\May_Research\DOE_combiner_and_processing\{CAMPAIGN_NAME}_combined_instruments_csv"
os.makedirs(output_folder, exist_ok=True)

# For each instrument: stack all of its per-date frames and write one CSV
for instr, date_dict in instrument_data.items():
    # Frames without any rows/columns are left out of the concatenation
    non_empty_dfs = list(filter(lambda frame: not frame.empty, date_dict.values()))

    if non_empty_dfs:
        # Vertical stack with a fresh 0..n-1 index
        combined_instr_df = pd.concat(non_empty_dfs, ignore_index=True)

        # Write the stacked table next to the other instruments' files
        output_path = os.path.join(output_folder, f"{instr}_combined.csv")
        combined_instr_df.to_csv(output_path, index=False)

        # Report the resulting shape so the export can be eyeballed
        print(f"{instr} combined shape: {combined_instr_df.shape} -> saved to {output_path}")
    else:
        print(f"Skipping {instr}: no data found.")

wcm combined shape: (326069, 6) -> saved to C:\Users\haika\Desktop\May_Research\DOE_combiner_and_processing\BBOP_combined_instruments_csv\wcm_combined.csv
uhsas combined shape: (83479, 103) -> saved to C:\Users\haika\Desktop\May_Research\DOE_combiner_and_processing\BBOP_combined_instruments_csv\uhsas_combined.csv
sp2 combined shape: (33936, 210) -> saved to C:\Users\haika\Desktop\May_Research\DOE_combiner_and_processing\BBOP_combined_instruments_csv\sp2_combined.csv
psap combined shape: (322488, 19) -> saved to C:\Users\haika\Desktop\May_Research\DOE_combiner_and_processing\BBOP_combined_instruments_csv\psap_combined.csv
pass combined shape: (221508, 11) -> saved to C:\Users\haika\Desktop\May_Research\DOE_combiner_and_processing\BBOP_combined_instruments_csv\pass_combined.csv
iwg1 combined shape: (326035, 48) -> saved to C:\Users\haika\Desktop\May_Research\DOE_combiner_and_processing\BBOP_combined_instruments_csv\iwg1_combined.csv
cpc combined shape: (326035, 6) -> saved to C:\Users\ha

In [8]:
# MERGE ALL INSTRUMENTS AND DATES INTO ONE DATAFRAME
#
# Reads `instrument_data` ({instrument: {date: DataFrame}}) and produces:
#   all_dates         - every date key seen across the instruments, sorted
#   combined_by_date  - {date: outer merge of all instruments on Date + UTC}
#   final_combined_df - all dates stacked vertically, sorted by Date then UTC
from functools import reduce

# Step 1: Collect all dates.
# The sort key keeps the ordering of ordinary date keys but tolerates a None
# key (sorted last) instead of raising TypeError when None is compared to a str.
all_dates = sorted(
    {date for data in instrument_data.values() for date in data},
    key=lambda date: (date is None, str(date)),
)

# Step 2: Merge instruments horizontally by date
combined_by_date = {}

for date in all_dates:
    dfs_for_date = []

    for instr, date_dict in instrument_data.items():
        if date not in date_dict:
            continue

        # Work on a copy so the frames stored in instrument_data stay untouched
        df = date_dict[date].copy()

        # Coerce UTC to numeric FIRST, then drop missing values. Coercion turns
        # unparseable entries into NaN; dropping afterwards removes both the
        # originally missing and the unparseable timestamps. pandas.merge matches
        # null keys against each other, so leftover NaN keys would pair unrelated
        # rows across instruments.
        df["UTC"] = pd.to_numeric(df["UTC"], errors="coerce")
        df = df.dropna(subset=["UTC"])

        # Avoid column name collisions by prefixing only non-shared columns
        df = df.rename(columns={
            col: f"{instr}_{col}" for col in df.columns if col not in ["Date", "UTC"]
        })

        dfs_for_date.append(df)

    # Merge all instrument DataFrames for this date on Date+UTC (outer join keeps
    # timestamps that only some instruments recorded)
    if dfs_for_date:
        merged = reduce(
            lambda left, right: pd.merge(left, right, on=["Date", "UTC"], how="outer"),
            dfs_for_date,
        )
        combined_by_date[date] = merged

# Step 3: Combine all dates vertically and sort
if not combined_by_date:
    # pd.concat would raise an opaque "No objects to concatenate" here
    raise ValueError("No instrument data available to merge: instrument_data has no dated DataFrames.")

final_combined_df = (
    pd.concat(list(combined_by_date.values()), ignore_index=True)
    .sort_values(by=["Date", "UTC"])
    .reset_index(drop=True)
)

In [9]:
# Destination folders for the campaign-level datasets.
# Only RAW_CAMPAIGNS_PATH is used in this cell; the other two are defined here
# for use elsewhere (presumably by later cells - verify).
RAW_CAMPAIGNS_PATH = r'C:\Users\haika\Desktop\May_Research\may_datasets\raw_campaigns'
COMPREHENSIVE_CAMPAIGNS_PATH = r'C:\Users\haika\Desktop\May_Research\may_datasets\comprehensive_campaigns'
RESTRICTED_CAMPAIGNS_PATH = r'C:\Users\haika\Desktop\May_Research\may_datasets\restricted_campaigns'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists first (same pattern as the per-instrument export cell).
os.makedirs(RAW_CAMPAIGNS_PATH, exist_ok=True)

# Write the fully merged campaign table as "<CAMPAIGN_NAME>_raw.csv" without the index column
final_combined_df.to_csv(os.path.join(RAW_CAMPAIGNS_PATH, f"{CAMPAIGN_NAME}_raw.csv"), index=False)