INSTRUCTIONS

1. Line 16 - 25 (Cell 1): Edit `file_list` so it contains the folder path of each instrument you want to combine. Each folder is assumed to contain only .ict files, all from the same campaign.
2. Line 27 (Cell 1): Change final_file_name to a name that you want for the output file.


ASSUMPTIONS (READ THIS TOO)
1. Assumes that the name of the time column in each .ict file contains the string "Time" (case-insensitive). This is the main way we synchronize rows across different instrument files.
2. Assumes that the time step between consecutive rows is the same in every file (one second). We have not implemented time-averaging. ASIA-AQ seems to have consistent time steps across all instruments.

In [5]:
#                                              #
#  INITIALIZES DICTS TO GATHER ALL FILE NAMES  #
#                                              #
import pandas as pd
import icartt
import os
import warnings
import re
#ignore warnings
warnings.filterwarnings('ignore')

#Get list of file_paths (instruments) you want to combine.
#Paths are relative to the current working directory and are built with
#os.path.join so the same list works on Windows, macOS and Linux.
file_list = [
    os.path.join("campaign_data", "ASIA-AQ", "DC-8", "Bennet.Ryan", "MetNav"),
    os.path.join("campaign_data", "ASIA-AQ", "DC-8", "Moore.Richard", "UHSAS"),
    os.path.join("campaign_data", "ASIA-AQ", "DC-8", "Moore.Richard", "OPTICAL"),
    os.path.join("campaign_data", "ASIA-AQ", "DC-8", "Moore.Richard", "SP2"),
]

#name of the CSV file the combined data is written to
final_file_name = "ASIA-AQ.csv"

#from "ASIAAQ-LARGE-CDP_DC8_20240206_R0", return 20240206
def get_date(file_name):
    """Return the first run of eight consecutive digits found in ``file_name``.

    Args:
        file_name: File name or path expected to embed a date such as
            ``20240206``.

    Returns:
        The eight-digit substring, as a string.

    Raises:
        ValueError: If ``file_name`` contains no run of eight digits.
    """
    match = re.search(r"\d{8}", file_name)

    if match is None:
        # Fail with a message that names the offending file instead of an
        # UnboundLocalError on the return statement.
        raise ValueError(f"No 8-digit date found in file name: {file_name!r}")

    return match.group()

# <Key: Val> -> <Instrument Path : List of files under instrument path>
#Only .ict files are collected, so stray files in a folder are skipped.
#Names are sorted so the result does not depend on the order os.listdir returns.
file_dict = {
    file_path: [
        os.path.join(file_path, file)
        for file in sorted(os.listdir(file_path))
        if file.lower().endswith(".ict")
    ]
    for file_path in file_list
}

# <Key: Val> -> <Date : List of files under Date>
date_dict = {}
# <Key: Val> -> <Date : Set of instrument paths that have a file for Date>
instruments_by_date = {}

#populates date_dict (files keep the instrument order of file_list)
for instrument_path, files in file_dict.items():
    for file in files:
        curr_date = get_date(file)
        date_dict.setdefault(curr_date, []).append(file)
        instruments_by_date.setdefault(curr_date, set()).add(instrument_path)

#get max list length found in date_dict (0 when no files were found)
max_length = max((len(v) for v in date_dict.values()), default=0)

#filter date_dict to only have dates present in ALL instrument types.
#Checks the instruments themselves rather than the file count, so a date is
#kept only when every instrument folder contributed at least one file.
date_dict = {
    k: v
    for k, v in date_dict.items()
    if len(instruments_by_date[k]) == len(file_dict)
}


In [11]:
#                                                #
# COMBINES THE Different Variables by date first #
#                                                #

#Get the time values present in ALL of the given lists
def condense_time(time_list):
    """Return the sorted values common to every sequence in ``time_list``.

    Args:
        time_list: A list of iterables of hashable, mutually comparable
            values (one iterable per instrument file).

    Returns:
        A sorted list of the values that appear in every iterable. An empty
        list is returned when ``time_list`` itself is empty.
    """
    # set.intersection() needs at least one set, so handle "no inputs" here.
    if not time_list:
        return []

    # Convert all lists to sets
    sets = [set(times) for times in time_list]

    # Find the intersection of all sets
    common_time = set.intersection(*sets)

    # Return the sorted result
    return sorted(common_time)

#change ict to df
def ict_to_df(filename):
    """Open ``filename`` with ``icartt.Dataset`` and return its data as a DataFrame.

    Args:
        filename: Path handed to ``icartt.Dataset``.

    Returns:
        A ``pandas.DataFrame`` built from the dataset's ``data[:]``.
    """
    dataset = icartt.Dataset(filename)
    return pd.DataFrame(dataset.data[:])

#combine each instrument file column wise (left to right)
def combine_column_wise(files, curr_date):
    """Combine the instrument files of one date side by side.

    Each file is loaded with ``ict_to_df``. Rows whose time value is not
    present in every file are dropped, then the remaining rows are
    concatenated column-wise by position.

    Args:
        files: Paths of the instrument files that belong to ``curr_date``.
        curr_date: Value written to the leading ``date`` column of every row.

    Returns:
        A DataFrame with a ``date`` column followed by the columns of each
        file, in the order given by ``files``.

    Raises:
        ValueError: If a file has no column whose name contains "time"
            (case-insensitive).
    """
    #convert each instrument file to df
    dfs = [ict_to_df(file) for file in files]

    #collect exactly one time column per file so time_list stays parallel to dfs
    time_list = []
    for file, df in zip(files, dfs):
        # Find the column that contains "time" in its name (case-insensitive)
        time_columns = [col for col in df.columns if "time" in str(col).lower()]

        if not time_columns:
            # Without a time column the file cannot be synchronized; skipping
            # it would pair the remaining time columns with the wrong files.
            raise ValueError(f"No time column found in file: {file!r}")

        time_list.append(df[time_columns[0]])

    common_time = condense_time(time_list)

    #Filter the times that are outside of common_time in each instrument file,
    #then drop the index so rows line up by position
    dfs_filtered = [
        df[times.isin(common_time)].reset_index(drop=True)
        for df, times in zip(dfs, time_list)
    ]

    #put date at the start of each row
    df_combined = pd.concat(dfs_filtered, axis=1)
    df_combined.insert(0, 'date', curr_date)

    return df_combined

#one column-wise combined df per date, visited in sorted date order
df_combines = [
    combine_column_wise(date_dict[curr_date], curr_date).reset_index(drop=True)
    for curr_date in sorted(date_dict)
]

#stack the per-date dfs on top of each other (row-wise)
df_final = pd.concat(df_combines, axis=0, ignore_index=True)

#output to csv
df_final.to_csv(final_file_name, index=False)

