# Data processing - Level 1
Code written by Radiance and Yolanda (with the help of ChatGPT)

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib ipympl

In [None]:
import logging
from helikite.config import load_config
from helikite.constants import constants

# Console logging level and config file location both come from the package constants.
logging.basicConfig(level=constants.LOGLEVEL_CONSOLE)
config = load_config(constants.INPUTS_FOLDER / constants.CONFIG_FILE)

# define your input and output directories here
input_level0_dir = config.processing_dir / "Level0"
output_level1_dir = config.processing_dir / "Level1"
# Create the Level1 output folder (and any missing parents); no error if it already exists.
output_level1_dir.mkdir(parents=True, exist_ok=True)

## Load data

In [None]:
from helikite.metadata.utils import load_parquet

df_level0, metadata = load_parquet(input_level0_dir / f"level0_{config.flight_basename}.parquet")

In [None]:
metadata


**Example commands to output different individual elements of the dataset**  

*To use individual fields, just use the object (.) notation, for example*  
  
print(metadata.flight_date)  
print(metadata.landing_time)  
metadata.takeoff_time

flight_computer_columns = [col for col in df_level0.columns if col.startswith("flight_computer_")]  
print(flight_computer_columns)  

smart_tether_columns = [col for col in df_level0.columns if col.startswith("smart_tether_")]  
print(smart_tether_columns)

## DataProcessor class

In [None]:
from helikite.classes.output_schemas import OutputSchemas
from helikite.classes.data_processing_level1 import DataProcessorLevel1

data_processor = DataProcessorLevel1(getattr(OutputSchemas, config.output_schema), df_level0, metadata)
data_processor.state()

## Processing missing instruments
**If no measurements are available for an instrument, add columns with NaNs into the dataset.**

In [None]:
data_processor.add_missing_columns()

## Outlier removal

In [None]:
outliers_file = output_level1_dir / f"level1_{config.flight_basename}_outliers.csv"

`data_processor.detect_outliers()` marks points that are outside the interquartile range (IQR) using the provided IQR factor.
 When columns are not specified, default columns are used. When `acceptable_ranges` are not specified, default ranges are applied.

In [None]:
data_processor.detect_outliers(outliers_file, columns=None, acceptable_ranges=None, iqr_factor=5)
data_processor.state()

```data_processor.choose_outliers()``` loads all the data, plots an individual variable, and then saves a CSV of any outliers that are chosen as True. This outlier CSV will be later used by ```data_processor.set_outliers_to_nan()``` to mask the values in the original dataframe.

_Note: No data is changed in the input dataframe._

First, check the **'flight_computer_pressure'** against **'flight_computer_time'** as well as **'flight_computer_Out1_T'**, **'flight_computer_Out2_T'**, **'flight_computer_Out1_H'** and **'flight_computer_Out2_H'**.
Then check **'smart_tether_Wind (m/s)'**. The WD values corresponding to the WS outliers will automatically also be set as outliers – no need to manually select **'smart_tether_Wind (degrees)'** outliers (if ```use_coupled_columns``` is ```True```).
If needed, remove **'flight_computer_Lat'** outliers. The Long values corresponding to the Lat outliers will automatically also be set as outliers – no need to manually select **'flight_computer_Long'** outliers (if ```use_coupled_columns``` is ```True```).

### Coupled columns
To check which columns are coupled, see output of ```data_processor.state()```. To add new groups of coupled columns to an instrument, pass the list of all the groups to argument ```coupled_columns```  of the
instrument instance. For example:
```python
flight_computer_v1 = FlightComputerV1(
    name="flight_computer",
    ...
    coupled_columns=[
        ('flight_computer_TEMP1', 'flight_computer_RH1'),
        ('flight_computer_TEMP2', 'flight_computer_RH2'),
    ]
)
```
 For the coupled columns to be updated, restart the kernel and rerun the cells.

In [None]:
data_processor.choose_outliers(y=f"{data_processor.reference_instrument.name}_pressure", outliers_file=outliers_file, use_coupled_columns=True)
# data_processor.choose_outliers(y="pops_pressure", outliers_file=outliers_file, use_coupled_columns=True)
# data_processor.choose_outliers(y="cpc_DateTime", outliers_file=outliers_file, use_coupled_columns=True)

If coupled columns were not specified when choosing outliers, ensure consistency manually.
If coupled columns were specified, this adjustment is applied automatically.

In [None]:
# Manual alternative to `use_coupled_columns=True`: copy the outlier flags of one column onto its
# coupled column and write the CSV back. Uncomment only if coupling was not applied automatically.
# import pandas as pd   # required by the lines below; pandas is not imported in the cells shown above
# outliers = pd.read_csv(outliers_file, index_col=0, parse_dates=True)
# outliers["smart_tether_Wind (degrees)"] = outliers["smart_tether_Wind (m/s)"]   # Remove WD values corresponding to outlying WS
# outliers["flight_computer_Long"] = outliers["flight_computer_Lat"]              # Remove Long values corresponding to outlying Lat
# outliers.to_csv(outliers_file, date_format="%Y-%m-%d %H:%M:%S")                 # Save corresponding outliers into the csv file

In [None]:
# Mask the values flagged in the outlier CSV with NaN in the processor's dataframe.
data_processor.set_outliers_to_nan()
# data_processor.df.loc["2025-02-15 09:47:40":"2025-02-15 09:47:50", 'smart_tether_Wind (m/s)']   # Print time range to control if values replaced by NaN

In [None]:
# Set GPS data in case of missing FC files
# NOTE(review): the fill values look like degrees-minutes readings (DDMM.mmmm), i.e. the same
# format that is later passed to convert_gps_coordinates — confirm they match the campaign site.
data_processor.fillna_if_all_missing({"flight_computer_Lat": 7039.724, "flight_computer_Long": 817.1591})

In [None]:
data_processor.state()

It is possible to access and modify the data directly in cases where certain operations are not yet supported by the data processor.

If an operation becomes part of the standard workflow, it is strongly encouraged to add it to the data processor instead.

In [None]:
#data_processor.df['flight_computer_pressure'] = data_processor.df['pops_pressure']

### Outlier removal double check
Plot variables with possible removed outliers.

In [None]:
data_processor.plot_outliers_check()

Check position of balloon compared to the station based on GPS coordinates.

Transformation of degrees-minutes coordinates (DM) into decimal degrees (DD) coordinates.
Addition of 'latitude_dd' and 'longitude_dd' into df.

In [None]:
# Convert the flight computer degrees-minutes readings to decimal degrees
# (adds 'latitude_dd' and 'longitude_dd' to the dataframe).
# NOTE(review): lat_dir / lon_dir presumably select the hemisphere (South / West here) — confirm.
data_processor.convert_gps_coordinates(lat_col='flight_computer_Lat', lon_col='flight_computer_Long',
                                       lat_dir='S', lon_dir='W')
# NOTE(review): center_coords is presumably (latitude, longitude) of the station in decimal degrees — confirm.
data_processor.plot_gps_on_map(center_coords=(-70.6587, -8.2850), zoom_start=14)

## T and RH averaging

Averages flight computer temperature and humidity data from the two T/RH sensors.
If one of the sensors has more than K NaNs, only the other sensor is taken into account; K is the NaN threshold, set via the `nan_threshold` argument of `T_RH_averaging`.

Plots T and RH as a function of pressure.
Smart Tether is also plotted as an indication but not taken into account for the averaging.

Adds 'Average_Temperature' and 'Average_RH' into df.

In [None]:
# if columns are not specified, takes the default flight computer columns
data_processor.T_RH_averaging(columns_t=None, columns_rh=None, nan_threshold=400)
data_processor.plot_T_RH(save_path=output_level1_dir / f"Level1_{config.flight_basename}_T_RH_averaging.png")

**In case the standard T and RH averaging is not working, and the ST measurements need to be taken into account, update the cell above by:**

Applying any required corrections to the smart tether T and RH measurements.
```python
data_processor.df['smart_tether_T (deg C)_corr'] = data_processor.df['smart_tether_T (deg C)'].ffill().bfill() - 0.2
data_processor.df['smart_tether_%RH_corr'] = data_processor.df['smart_tether_%RH'].ffill().bfill() - 6.1
```

Specifying the columns to average in the function call, including smart tether T and RH measurements:
```python
data_processor.T_RH_averaging(
    columns_t=['flight_computer_Out1_T', 'flight_computer_Out2_T', 'smart_tether_T (deg C)_corr'],
    columns_rh=['flight_computer_Out1_H', 'flight_computer_Out2_H', 'smart_tether_%RH_corr'],
    nan_threshold=400,
)
```

## Altitude calculation

Adds 'DateTime', 'Pressure_ground', 'Temperature_ground' and 'Altitude' into df.

**When FC started at the balloon height and not on the sledge:**
specify the height of the sledge in meters in the `offset_to_add` parameter

In [None]:
data_processor.altitude_calculation_barometric(offset_to_add=0)
data_processor.df.head()

In [None]:
data_processor.plot_altitude()

## CO2 data processing

In [None]:
from helikite.instruments import co2

data_processor.normalize(co2, min_threshold=200, max_threshold=500)
data_processor.plot_raw_and_normalized_data(co2)

## STAP data processing

In [None]:
from helikite.instruments import stap

data_processor.normalize(stap)
data_processor.plot_raw_and_normalized_data(stap)

## POPS data processing
### POPS total concentration calculation

Calculate the total concentration and dN/dlogDP for each bin

From Pohorsky et al. (2024) it appeared that particles with diameters between 142 and 186 nm (bins 0 to 2) are wrongly detected by the POPS as total particle concentration increases. This phenomenon can be explained by electronic noise from the detector, where fringes on the edge of the Gaussian signal are perceived as smaller particles by the software. It was therefore decided to only consider data for particles larger than 186 nm as the error induced by the first three bins is too high.

*dN_pops = pops_bX / popsflow_mean = dN*  
*pops_total_conc = sum of dN_pops*  
*pops_bX_dlogDp = dN/dlogDp*  

Adds 'pops_total_conc' and 'pops_bX_dlogDp' into df.

In [None]:
from helikite.instruments import pops

data_processor.calculate_derived(pops)
data_processor.plot_raw_and_normalized_data(pops)  # no normalized data yet -> only raw data will be plotted

### Setting POPS outliers to NaNs
**Set `pops_has_outliers = True` if there are outliers in POPS measurements**

If an observation being an outlier in one bin implies that observations in other bins are also outliers, set `use_coupled_columns=True`. In this case, marking an outlier in one column will automatically mark the corresponding values in the coupled columns as well.

If you want to mark outliers in a single column only, set `use_coupled_columns=False`.

See `pops.py` for the definition of the POPS instrument to check which columns are coupled:

```python
pops = POPS(
    name="pops",
    ...
    coupled_columns=[
        ...
    ]
)
```
If the coupled columns need to be updated, modify `coupled_columns` and rerun the notebook for the changes to take effect.

After the outliers are selected and the file is created, run
`data_processor.set_outliers_to_nan()` (next cell) to update the data frame accordingly.

In [None]:
# set to True to choose outlier regions in POPS data
pops_has_outliers = False

if pops_has_outliers:
    outliers_pops_file = output_level1_dir / f"level1_{config.flight_basename}_outliers_pops.csv"
    data_processor.choose_outliers(y="Altitude", outliers_file=outliers_pops_file, use_coupled_columns=True, instruments=[pops])

In [None]:
if pops_has_outliers:
    data_processor.set_outliers_to_nan()

In [None]:
# TODO: check 2025-02-12 07:57:25 + update coupled columns ? or change choose_outliers
# Optional spot check of a single POPS bin over a short time window.
# The previous draft referenced `pd` and `df`, neither of which is defined in this notebook,
# and its bare expressions sat inside an `if` block where the notebook never renders them.
# NOTE(review): the window below is on 2025-02-10 while the TODO mentions 2025-02-12 — confirm
# which period is meant before enabling.
inspect_pops_window = False  # set to True to display the window below

pops_window = None  # stays None (no cell output) while the check is disabled
if inspect_pops_window:
    pops_window = data_processor.df.loc["2025-02-10 13:47:00":"2025-02-10 13:47:15", 'pops_b3_dlogDp']

# Trailing top-level expression so the selection is rendered as the cell output.
pops_window

### Normalization of POPS concentrations to standard temperature and pressure (STP)

at 0°C (273.15 K) and 1 atm (1013.25 hPa).  

$C_{\text{STP}} = C_{\text{measured}} \times \left( \frac{P_{\text{measured}}}{P_{\text{STP}}} \right) \times \left( \frac{T_{\text{STP}}}{T_{\text{measured}}} \right)$  


       
Adds 'pops_total_conc_stp' and 'pops_bX_dlogDp_stp' into df.

In [None]:
data_processor.normalize(pops)
data_processor.plot_raw_and_normalized_data(pops)

### Plot POPS size distribution and total concentration  

STP normalized bin concentrations and total concentration

In [None]:
data_processor.plot_distribution(pops)

## mSEMS data processing
### mSEMS total concentration calculation

*msems_inverted_Bin_ConcX = dN/dlogDp*  
*msems_inverted_dN_Bin_ConcX = conc × dlogDp*  
*msems_inverted_dN_totalconc = sum of msems_inverted_dN_Bin_ConcX*  

Adds 'msems_inverted_dN_Bin_ConcX' and 'msems_inverted_dN_totalconc' into df.

In [None]:
from helikite.instruments import msems_inverted

data_processor.calculate_derived(msems_inverted)
data_processor.plot_raw_and_normalized_data(msems_inverted)  # no normalized data yet -> only raw data will be plotted

### Setting mSEMS outliers to NaNs
**Set `msems_has_outliers = True` if there are outliers in msems measurements**

If an observation being an outlier in one bin implies that observations in other bins are also outliers, set `use_coupled_columns=True`. In this case, marking an outlier in one column will automatically mark the corresponding values in the coupled columns as well.

If you want to mark outliers in a single column only, set `use_coupled_columns=False`.

See `msems.py` for the definition of the msems instrument to check which columns are coupled:

```python
msems = msems(
    name="msems",
    ...
    coupled_columns=[
        ...
    ]
)
```
If the coupled columns need to be updated, modify `coupled_columns` and rerun the notebook for the changes to take effect.

After the outliers are selected and the file is created, run
`data_processor.set_outliers_to_nan()` (next cell) to update the data frame accordingly.

In [None]:
# set to True to choose outlier regions in mSEMS data
msems_has_outliers = False

if msems_has_outliers:
    outliers_msems_file = output_level1_dir / f"level1_{config.flight_basename}_outliers_msems.csv"
    data_processor.choose_outliers(y="Altitude", outliers_file=outliers_msems_file, use_coupled_columns=True, instruments=[msems_inverted])

In [None]:
if msems_has_outliers:
    data_processor.set_outliers_to_nan()

In [None]:
# TODO
# NOTE(review): unfinished placeholder — `np` is imported but not used in this cell, and the
# time bounds below are only assigned (never applied to the data) inside a disabled branch.
import numpy as np

if False:
    # Define time range of "bad" mSEMS measurements
    start_time = "2025-01-27 17:55"
    end_time = "2025-01-27 19:45"

### Normalization of mSEMS concentrations to standard temperature and pressure (STP)

at 0°C (273.15 K) and 1 atm (1013.25 hPa).

$C_{\text{STP}} = C_{\text{measured}} \times \left( \frac{P_{\text{measured}}}{P_{\text{STP}}} \right) \times \left( \frac{T_{\text{STP}}}{T_{\text{measured}}} \right)$

Adds 'msems_inverted_Bin_ConcX_stp' and 'msems_inverted_dN_totalconc_stp' to df.

In [None]:
data_processor.normalize(msems_inverted)
data_processor.plot_raw_and_normalized_data(msems_inverted)

### Plot mSEMS size distribution and total concentration

STP normalized bin concentrations and total concentration

In [None]:
data_processor.plot_distribution(msems_inverted, time_start=None, time_end=None)

## mCDA data processing
### mCDA bin concentrations, total concentration and normalization per bin width

For bins 1 to 256 :  
*mcda_dataB X = raw counts*  
*mcda_dataB X_dN = counts / (flow rate * sampling interval) = concentration*  
*mcda_dN_totalconc = sum of mcda_dataBX_dN*  
*mcda_dataB X_dN_dlogDp = dN/dlogDp = mcda_dataBX_dN / dlogDp*

Adds 'mcda_dataB X_dN', 'mcda_dN_totalconc' and 'mcda_dataB X_dN_dlogDp' into df.

In [None]:
from helikite.instruments import mcda

data_processor.calculate_derived(mcda)
data_processor.plot_raw_and_normalized_data(mcda)  # no normalized data yet -> only raw data will be plotted

### Setting mCDA outliers to NaNs
**Set `mcda_has_outliers = True` if there are outliers in mcda measurements**

If an observation being an outlier in one bin implies that observations in other bins are also outliers, set `use_coupled_columns=True`. In this case, marking an outlier in one column will automatically mark the corresponding values in the coupled columns as well.

If you want to mark outliers in a single column only, set `use_coupled_columns=False`.

See `mcda.py` for the definition of the mcda instrument to check which columns are coupled:

```python
mcda = mcda(
    name="mcda",
    ...
    coupled_columns=[
        ...
    ]
)
```
If the coupled columns need to be updated, modify `coupled_columns` and rerun the notebook for the changes to take effect.

After the outliers are selected and the file is created, run
`data_processor.set_outliers_to_nan()` (next cell) to update the data frame accordingly.

In [None]:
# set to True to choose outlier regions in mCDA data
mcda_has_outliers = False

if mcda_has_outliers:
    outliers_mcda_file = output_level1_dir / f"level1_{config.flight_basename}_outliers_mcda.csv"
    data_processor.choose_outliers(y="Altitude", outliers_file=outliers_mcda_file, use_coupled_columns=True, instruments=[mcda])

In [None]:
if mcda_has_outliers:
    data_processor.set_outliers_to_nan()

### Normalization of mCDA concentrations to standard temperature and pressure (STP)

at 0°C (273.15 K) and 1 atm (1013.25 hPa).  

$C_{\text{STP}} = C_{\text{measured}} \times \left( \frac{P_{\text{measured}}}{P_{\text{STP}}} \right) \times \left( \frac{T_{\text{STP}}}{T_{\text{measured}}} \right)$  

Adds 'mcda_dataB X_dN_dlogDp_stp' and 'mcda_dN_totalconc_stp' to df.

In [None]:
data_processor.normalize(mcda)
data_processor.plot_raw_and_normalized_data(mcda)

### Plot mCDA size distribution and total concentration

STP normalized bin concentrations and total concentration

In [None]:
data_processor.plot_distribution(mcda)

**Vertical droplet size distribution**

In [None]:
data_processor.plot_vertical_distribution(mcda)

## CPC3007 data processing

In [None]:
from helikite.instruments import cpc

data_processor.plot_raw_and_normalized_data(cpc)  # no normalized data yet -> only raw data will be plotted

### Setting CPC3007 outliers to NaNs

**Set `cpc_has_outliers = True` if there are outliers in cpc measurements**

After the outliers in `cpc_totalconc_raw` are selected and the file is created, run
`data_processor.set_outliers_to_nan()` (next cell) to update the data frame accordingly.

In [None]:
# set to True to choose outlier regions in CPC3007 data
cpc_has_outliers = False

if cpc_has_outliers:
    outliers_cpc_file = output_level1_dir / f"level1_{config.flight_basename}_outliers_cpc.csv"
    data_processor.choose_outliers(y="Altitude", outliers_file=outliers_cpc_file, use_coupled_columns=True, instruments=[cpc])

In [None]:
if cpc_has_outliers:
    data_processor.set_outliers_to_nan()

### Normalization of CPC3007 concentrations to standard temperature and pressure (STP)

at 0°C (273.15 K) and 1 atm (1013.25 hPa).  

$C_{\text{STP}} = C_{\text{measured}} \times \left( \frac{P_{\text{measured}}}{P_{\text{STP}}} \right) \times \left( \frac{T_{\text{STP}}}{T_{\text{measured}}} \right)$  


       
Adds 'CPC_total_N_stp' into df.

In [None]:
data_processor.normalize(cpc)
data_processor.plot_raw_and_normalized_data(cpc)

## Filter data check

In [None]:
# NO FILT DATA FROM FC --> READ IN THE FILT FILE AND PASTE IT INTO DF WITH CORRECT NAMES !!!!!

In [None]:
# TODO: "250127A3.TXT"
# NOTE(review): disabled draft — `pd` and `DATA_FLIGHT_DIRPATH` are not defined in any cell shown
# in this notebook; import pandas and define the flight data directory before enabling.
if False:
    # Read the tab-delimited filter log, skipping its first 13 rows.
    filt = pd.read_csv(DATA_FLIGHT_DIRPATH / "250127A3.TXT", skiprows=13, delimiter='\t')  # or use sep=',' if it's CSV

    # Build a datetime index from the separate date and time text columns.
    date_str = filt['#YY/MM/DD'].str.strip()
    time_str = filt['HR:MN:SC'].str.strip()
    combined = date_str + ' ' + time_str
    filt['datetime'] = pd.to_datetime(combined, format='%y/%m/%d %H:%M:%S')
    filt = filt.set_index('datetime', drop=False)
    # Prefix every column name (including the retained 'datetime' column) with 'filter_'.
    filt.columns = 'filter_' + filt.columns.astype(str)

    # NOTE(review): a bare expression nested in an `if` block is not rendered as cell output.
    filt

In [None]:
# NOTE(review): disabled draft — `df` is not defined in any cell shown in this notebook
# (presumably data_processor.df is meant — confirm) and `filt` only exists if the disabled
# filter-file cell was run.
if False:
    # Left join keeps every row of df and adds the filter columns where the index matches.
    df = df.join(filt, how='left')
    df

In [None]:
from helikite.processing.post.level1 import filter_data

# NOTE(review): disabled — `df` is not defined in any cell shown in this notebook
# (presumably data_processor.df is meant — confirm before enabling).
if False:
    filter_data(df)

**In case of broken filters, replace filter positions by 1.**

In [None]:
# NOTE(review): disabled — `df` is not defined in any cell shown in this notebook
# (presumably data_processor.df is meant — confirm before enabling).
if False:
    # Overwrite the whole columns: filter position forced to 1, pump power forced to 0.
    df['filter_cur_pos'] = 1
    df['filter_pump_pw'] = 0

In [None]:
# NOTE(review): disabled — `df` is not defined in any cell shown in this notebook
# (presumably data_processor.df is meant — confirm before enabling).
if False:
    # Remap filter positions 0, 2 and 4 to position 1; other values are left untouched.
    df.loc[df['filter_cur_pos'] == 0, 'filter_cur_pos'] = 1
    df.loc[df['filter_cur_pos'] == 2, 'filter_cur_pos'] = 1
    df.loc[df['filter_cur_pos'] == 4, 'filter_cur_pos'] = 1

In [None]:
# NOTE(review): disabled draft — `pd` and `df` are not defined in any cell shown in this notebook
# (presumably pandas and data_processor.df are meant — confirm before enabling).
if False:
    broken_filter_start = pd.Timestamp("2025-02-11 15:44:14")
    broken_filter_end = pd.Timestamp("2025-02-11 16:11:39")
    # Position 0 is remapped to 1 for the whole flight first ...
    df.loc[df['filter_cur_pos'] == 0, 'filter_cur_pos'] = 1

    # ... then the window between the two timestamps (inclusive) is forced to position 2 / pump power 37.
    df.loc[(df.index >= broken_filter_start) & (df.index <= broken_filter_end), 'filter_cur_pos'] = 2.0
    df.loc[(df.index >= broken_filter_start) & (df.index <= broken_filter_end), 'filter_pump_pw'] = 37.0
    # NOTE(review): a bare expression nested in an `if` block is not rendered as cell output.
    df['filter_cur_pos']

## Data quicklooks
### Flight profile
Variables plotted in the flight profile are normally defined in the output schema, so they remain consistent within a single campaign. See `helikite/classes/output_schemas.py`.

By default, no variables are explicitly provided to the function, and the values from the output schema are used. If you want to change the variables being plotted, you can create a custom list of `FlightProfileVariable` objects and pass it via the `variables` argument.

In the example below, we take a copy of the original list of variables for the campaign and replace the bounds defined for the 5th variable.

```python
import dataclasses

custom_variables = data_processor.output_schema.flight_profile_variables.copy()
custom_variables[4] = dataclasses.replace(custom_variables[4], x_divider=20, x_bounds=(0, 120))

data_processor.plot_flight_profiles(config.flight_basename, save_path, variables=custom_variables)
```
In another example, we take a copy and replace the first variable with longitude in degrees.

```python
custom_variables[0] = FlightProfileVariable(
    column_name="longitude_dd",
    plot_kwargs=dict(color="brown", linewidth=3.0, marker='.', linestyle="none"),
    x_bounds=(-8.29, -8.26),
    x_divider=0.01,
    x_label="Longitude (dd)",
)

data_processor.plot_flight_profiles(config.flight_basename, save_path, variables=custom_variables)
```

In [None]:
save_path = output_level1_dir / f'Level1_{config.flight_basename}_Flight_{config.flight}.png'

data_processor.plot_flight_profiles(config.flight_basename, save_path, variables=None)

### Size distributions

In [None]:
save_path = output_level1_dir / f'Level1_{config.flight_basename}_SizeDistr_Flight_{config.flight}.png'

data_processor.plot_size_distr(config.flight_basename, save_path, time_start=None, time_end=None)

## Level 1
**Save file containing all the columns (processed)**

In [None]:
data_processor.export_data(filepath=output_level1_dir / f'level1_{config.flight_basename}.csv')

# Random code bits
### Remove outliers from the Smart Tether WS and WD datapoints

This is a **sliding-window median filter** used for **outlier detection and removal**.
- Look at a window of neighboring values around each data point (10 neighboring values)
- Compare the current point to the median of this window.
- If the point is significantly different (>35% away from the median), it's treated as an outlier and **replaced with NaN**.

Applied on WS, the corresponding WD datapoints are then also removed.

In [None]:
from helikite.instruments.smart_tether import wind_outlier_removal

# set to True to apply the filter on WS and WD data
apply_wind_filter = False

if apply_wind_filter:
    df_filtered = wind_outlier_removal(data_processor.df)

**IF THE FILTER APPLIES CORRECTLY** : save the filtered WS and WD data back into the original dataframe

In [None]:
# Runs only when the wind filter cell above was enabled, since `df_filtered` is created there.
if apply_wind_filter:
    # Save the filtered data back into the original dataframe
    data_processor.df['smart_tether_Wind (m/s)'] = df_filtered['smart_tether_Wind (m/s)']
    data_processor.df['smart_tether_Wind (degrees)'] = df_filtered['smart_tether_Wind (degrees)']
    print("Filtered data saved to the original dataframe.")

### Metadata dictionary

In [None]:
# Otherwise, to make a dictionary from the metadata:
metadata_dict = metadata.model_dump()

# Then use it as a normal Python dictionary
metadata_dict['flight_date']

### GPS coordinate check

In [None]:
import matplotlib.pyplot as plt

# Longitude on the left y-axis, latitude on a twinned right y-axis, both against time.
fig, ax1 = plt.subplots(figsize=(12, 4))
ax2 = ax1.twinx()

gps_frame = data_processor.df
for axis, column, label, color in (
    (ax1, 'flight_computer_Long', 'Longitude', 'tab:blue'),
    (ax2, 'flight_computer_Lat', 'Latitude', 'tab:green'),
):
    # Colour the axis label and tick labels like the trace they belong to.
    axis.set_ylabel(label, color=color)
    axis.plot(gps_frame.index, gps_frame[column], color=color)
    axis.tick_params(axis='y', labelcolor=color)

# Shared x-axis label and grid are attached to the primary axes only.
ax1.set_xlabel('Time')
ax1.grid(True)

plt.title('Longitude and Latitude over Time')
plt.show()

In [None]:
from helikite.processing.post.outliers import convert_gps_coordinates

lon_col, lat_col = 'flight_computer_Long', 'flight_computer_Lat'

df_coord = data_processor.df[[lon_col, lat_col]].copy()
df_coord = convert_gps_coordinates(df_coord, lat_col, lon_col, lat_dir="S", lon_dir="W")

df_coord['latitude_dd']

### Export TAPIR data for Delphine

In [None]:
# set to True to export TAPIR data
export_tapir_data = False

# Stays None (no cell output) unless the export is enabled.
df_tapir = None

if export_tapir_data:
    # Create a new DataFrame with the same DateTime index as df:
    # the context columns needed alongside TAPIR plus every 'tapir_' column.
    tapir_columns = [col for col in data_processor.df.columns if col.startswith('tapir_')]
    df_tapir = data_processor.df.loc[
        :, ['Altitude', 'Average_Temperature', 'Temperature_ground', 'mcda_dN_totalconc_stp'] + tapir_columns
    ].copy()
    # Round the exported context columns to two decimals ('Temperature_ground' is left as is).
    for rounded_col in ('Altitude', 'Average_Temperature', 'mcda_dN_totalconc_stp'):
        df_tapir[rounded_col] = df_tapir[rounded_col].round(2)

# Trailing top-level expression: an expression nested inside the `if` block above would not be
# rendered by the notebook, so the preview is placed here instead.
df_tapir

In [None]:
import os

# Writes the TAPIR export: a block of "# key: value" header lines, a blank line, then the
# `df_tapir` table as CSV. Requires the previous cell to have run with export_tapir_data = True.
if export_tapir_data:
    # Example metadata as a dictionary (you can adjust this to your actual metadata object)
    metadata_lines = {
        'Flight date' : metadata.flight_date,
        'Flight number' : metadata.flight,
        'Takeoff time' : metadata.takeoff_time,
        'Landing time' : metadata.landing_time,
        'Average_Temperature (in °C)' : 'average T from two temperature sensors',
        'Temperature_ground (in K)' : 'extrapolated ground temperature based on T at takeoff and landing',
        'mcda_dN_totalconc_stp (cm-3)' : 'droplet total concentration',
        'Note' : 'there are two "peaks" in the temperature profile, I am however not yet sure if they are significant or outliers'
    }

    # Construct the dynamic filename
    filename = f"{metadata.flight_date}_Flight{metadata.flight}_TAPIR.txt"

    # Output directory for TAPIR exports, created on demand inside the Level1 folder
    output_dir_tapir = output_level1_dir / "tapir"
    output_dir_tapir.mkdir(parents=True, exist_ok=True)

    # Combine the path and filename
    output_path = os.path.join(output_dir_tapir, filename)

    # Save the file. The encoding is set explicitly because the header contains a non-ASCII
    # character ("°"); without it the bytes written depend on the machine's locale encoding.
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        for key, value in metadata_lines.items():
            f.write(f"# {key}: {value}\n")
        f.write("\n")
        df_tapir.to_csv(f, index=True)