INSTRUCTIONS

1. Line 16 - 25 (Cell 1): Change file_list to the folder paths of each instrument you want to combine. Each folder is assumed to contain only .ict files, all from the same campaign.
2. Line 27 (Cell 1): Change final_file_name to a name that you want for the output file.
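3. Line 29 (Cell 1): Change TIME_COL_NAMES to the substrings that may appear in the names of the time columns in your dataset. Write them in lowercase: column names are lowercased before they are compared.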


ASSUMPTIONS (READ THIS TOO)
1. Assumes that the time column in each .ict file has a name containing one of the TIME_COL_NAMES keywords (case-insensitive); the first matching column in each file is used. This is the main way we synchronize rows across different instrument files.
2. Assumes that every instrument file uses the same time step between rows (one second). We have not implemented time-averaging. ASIA-AQ seems to have consistent time steps across all instruments.

In [3]:
#                                              #
#  INITIALIZES DICTS TO GATHER ALL FILE NAMES  #
#                                              #
import pandas as pd
import icartt
import os
import warnings
import re
#Silence every warning for the rest of the session (no category or module filter is given)
warnings.filterwarnings('ignore')

#NOTE(review): os.getcwd() is already the working directory, so this call leaves it unchanged
os.chdir(os.getcwd())

#Instrument folders to combine; every entry returned by os.listdir for a folder is treated as a data file
file_list = [
    r"campaign_data\ASIA-AQ\DC-8\Bennet.Ryan\MetNav",
    r"campaign_data\ASIA-AQ\DC-8\Moore.Richard\UHSAS",
    r"campaign_data\ASIA-AQ\DC-8\Moore.Richard\OPTICAL",
    r"campaign_data\ASIA-AQ\DC-8\Moore.Richard\SP2",
    r"campaign_data\ASIA-AQ\DC-8\Moore.Richard\CDP",
    r"campaign_data\ASIA-AQ\DC-8\Moore.Richard\CPSPD",
    r"campaign_data\ASIA-AQ\DC-8\Moore.Richard\CCN",
    r"campaign_data\ASIA-AQ\DC-8\Yum.Seongsoo\CPC"
]

final_file_name = "test.csv"  #path handed to DataFrame.to_csv for the combined output

TIME_COL_NAMES = ["time", "utc", "start"]  #substrings searched for in lowercased column names to find the time column

#from "ASIAAQ-LARGE-CDP_DC8_20240206_R0", return "20240206"
def get_date(file_name):
    """Return the first run of eight consecutive digits found in *file_name*.

    Args:
        file_name: File name (or path) to search, e.g.
            "ASIAAQ-LARGE-CDP_DC8_20240206_R0".

    Returns:
        The matched digits as a string, e.g. "20240206".

    Raises:
        ValueError: If *file_name* contains no run of eight digits.
    """
    match = re.search(r"\d{8}", file_name)

    # Fail loudly with the offending name instead of hitting an unbound local.
    if match is None:
        raise ValueError(f"No 8-digit date found in file name: {file_name!r}")

    return match.group()

# file_dict maps each instrument folder to the joined paths of every entry inside it
file_dict = {}

#List each instrument folder and store folder/entry paths (populates file_dict)
for file_path in file_list:
    all_files = os.listdir(file_path)
    file_dict[file_path] = [os.path.join(file_path, file) for file in all_files]

# date_dict maps the date string found in a path to all paths carrying that date
date_dict = {}

#Group every collected path by the date returned from get_date (populates date_dict)
for instrument_path, files in file_dict.items():
    for file in files:
        curr_date = get_date(file)
        date_dict.setdefault(curr_date, []).append(file)

In [4]:
#                                                #
# COMBINES THE Different Variables by date first #
#                                                #

#Get the values present in ALL of the given lists
def condense_time(time_list):
    """Return the sorted values common to every sequence in *time_list*.

    Args:
        time_list: Sequence of iterables of hashable, mutually comparable
            values.

    Returns:
        A sorted list of the values that occur in every iterable. An empty
        *time_list* yields an empty list.
    """
    # Nothing to intersect; set.intersection() would raise with no operands.
    if not time_list:
        return []

    # Seed with the first sequence and intersect with all the others.
    common_time = set(time_list[0]).intersection(*time_list[1:])

    return sorted(common_time)

#read one .ict file into a DataFrame
def ict_to_df(filename):
    """Build an icartt.Dataset from *filename* and return its data as a DataFrame.

    Args:
        filename: Path passed straight to icartt.Dataset.

    Returns:
        A pandas DataFrame constructed from the dataset's full data slice.
    """
    dataset = icartt.Dataset(filename)
    return pd.DataFrame(dataset.data[:])

#return the name of the first column that matches a TIME_COL_NAMES keyword
def _find_time_column(df):
    """Return the first column of *df* whose lowercased name contains a time keyword.

    Raises:
        ValueError: If no column name contains any TIME_COL_NAMES keyword.
    """
    for col in df.columns:
        lowered = str(col).lower()
        if any(keyword in lowered for keyword in TIME_COL_NAMES):
            return col

    raise ValueError(
        f"No time column matching {TIME_COL_NAMES} found in columns: {list(df.columns)}"
    )


#combine each instrument file column wise (left to right)
def combine_column_wise(files, curr_date):
    """Outer-join the instrument files of one date on their time columns.

    Args:
        files: Paths of the .ict files to combine; each is read with ict_to_df.
        curr_date: Value written into the leading 'date' column of the result.

    Returns:
        A DataFrame with a 'date' column first, then the columns of every
        file joined on time. The time column keeps the name it has in the
        first file. An empty *files* list yields a frame holding only 'date'.

    Raises:
        ValueError: If a file has no column matching TIME_COL_NAMES.
    """
    #convert each instrument file to df
    dfs = [ict_to_df(file) for file in files]
    df_combined = pd.DataFrame()

    if dfs:
        #the first df seeds the result and fixes the name of the time column
        df_combined = dfs[0]
        time_label1 = _find_time_column(df_combined)

        for curr_df in dfs[1:]:
            time_label2 = _find_time_column(curr_df)

            #outer join on the two time columns so no timestamp from either side is lost
            df_combined = pd.merge(df_combined, curr_df, left_on=time_label1, right_on=time_label2, how='outer')

            if time_label1 != time_label2:
                #rows that exist only in the right file have no left time value;
                #copy it over before dropping the right time column
                if time_label1 in df_combined.columns:
                    df_combined[time_label1] = df_combined[time_label1].fillna(df_combined[time_label2])
                df_combined.drop(columns=[time_label2], inplace=True)

    #reset_index returns a new frame, so the result has to be kept
    df_combined = df_combined.reset_index(drop=True)

    #add date column at the start
    df_combined.insert(0, 'date', curr_date)

    return df_combined

#one combined df per date, collected in date order
df_combines = []

total_dates = len(date_dict)

#Join the files of each date column-wise, reporting progress as we go
for date_counter, curr_date in enumerate(sorted(date_dict), start=1):
    files = date_dict[curr_date]
    df_combined_date = combine_column_wise(files, curr_date)
    df_combines.append(df_combined_date)

    print(f"Combined date {date_counter} / {total_dates}")

#give every per-date frame a fresh 0..n-1 index
df_combines = [df_curr.reset_index(drop=True) for df_curr in df_combines]

#stack all dates row-wise into one frame
df_final = pd.concat(df_combines, axis=0, ignore_index=True)

#write the result to final_file_name without the index column
df_final.to_csv(final_file_name, index=False)

                                                                                                                                                                                                       

Combined date 1 / 21
Combined date 2 / 21
Combined date 3 / 21
Combined date 4 / 21
Combined date 5 / 21
Combined date 6 / 21
Combined date 7 / 21
Combined date 8 / 21
Combined date 9 / 21
Combined date 10 / 21
Combined date 11 / 21
Combined date 12 / 21
Combined date 13 / 21
Combined date 14 / 21
Combined date 15 / 21
Combined date 16 / 21
Combined date 17 / 21
Combined date 18 / 21
Combined date 19 / 21
Combined date 20 / 21
Combined date 21 / 21
