# Identify balloon maneuvering and create data segments

Author: Brian Green (briangre@stanford.edu)

In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from notebook_utils import before_and_after_flights
from tqdm.notebook import tqdm
from scipy import interpolate

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    tqdm.pandas()

%matplotlib inline

# Number of values used in the running mean of the altitude. It should be odd,
# because the running mean is centered on each sample.
RMEAN_LENG = 61
# The time it takes the smoothed (running mean) altitude to change by DZ_THRESH.
# A faster change means the balloon is suspected of taking off, landing or
# depressurizing. DZ_THRESH / VERTVEL_THRESH_DT is used later in the notebook as
# the vertical-velocity threshold (VERTVEL_THRESH).
VERTVEL_THRESH_DT = pd.Timedelta(4, 'hours')
# (meters) Any continuous stretch of data violating VERTVEL_THRESH over a height
# range greater than DZ_THRESH will be deleted.
DZ_THRESH = 1000
# The minimum segment length (duration between first and last sample).
MIN_SEG_LEN = pd.Timedelta(1, 'hours')
# (meters) The minimum segment mean altitude.
SEG_MIN_ALT = 10000

### Load the data from the previous step

In [2]:
# Read the flight table written by the previous processing step and remember how
# many rows it had, so the fraction of retained data can be reported at the end.
flights = pd.read_feather('temp_data/1_flights_sorted.feather')
n_data_orig = flights.shape[0]

## Optional: sometimes the COSMIC code struggles to analyze all the years at once;
## when that happens, process one subset of flights at a time by uncommenting the
## lines below.
## As written, the bounds keep flights whose first sample falls between
## 2014-07-01 (inclusive) and 2014-12-01 (exclusive); adjust them per subset.
#flight_time_start = flights.groupby('flight_id').progress_apply(lambda f: f.time.iloc[0])
#flight_ids = flight_time_start[flight_time_start < pd.Timestamp('2014-12-01 00:00:00')]
#flight_ids = flight_ids[flight_ids >= pd.Timestamp('2014-07-01 00:00:00')]
#flights = flights[flights.flight_id.isin(flight_ids.index)]
#n_data_orig = flights.shape[0]
## NOTE(review): `flights_temp` is not defined anywhere in this notebook; this
## line presumably should read `flights.flight_id.nunique()` -- confirm before use.
#flights_temp.flight_id.nunique()

### Split up the flights into segments when the balloon's maneuvering system is turned on (ACS or propeller)

In [4]:
def segments_from_maneuvering(flight):
    """Split one flight into segments separated by maneuvering periods.

    A new segment starts every time the ``maneuvering`` column drops from 1 to 0
    (i.e. each time maneuvering stops). Rows recorded while maneuvering are
    removed from the result.

    Segment IDs are made unique across successive calls through the function
    attribute ``segments_from_maneuvering.counter``, which is advanced by the
    number of segments found in each flight. Because of that counter the
    function is stateful: calling it again continues numbering where the
    previous call stopped.

    Parameters
    ----------
    flight : pandas.DataFrame
        Data for a single flight, in time order, with a 0/1 ``maneuvering``
        column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows with ``maneuvering == 0`` plus a
        ``segment_id`` column. The input frame is left untouched.
    """
    # Create a counter so that the segment ID is incremented across the flights,
    # creating unique segment IDs
    if "counter" not in segments_from_maneuvering.__dict__:
        segments_from_maneuvering.counter = 0

    # Each time maneuvering stops (1 -> 0, a diff of -1), a new segment begins.
    # The per-flight IDs start at zero.
    maneuvering_stops = flight['maneuvering'].diff() == -1
    local_id = maneuvering_stops.astype(int).cumsum()
    n_segments = int(local_id.max()) + 1 if len(local_id) else 0

    # Keep only the rows without maneuvering. Work on an explicit copy so the
    # group handed over by groupby.apply is never modified and no assignment is
    # made on a filtered view (which triggers SettingWithCopyWarning).
    not_maneuvering = (flight['maneuvering'] == 0).to_numpy()
    result = flight.loc[not_maneuvering].copy()

    # Apply the counter and increment it
    result['segment_id'] = (
        local_id.to_numpy()[not_maneuvering] + segments_from_maneuvering.counter
    )
    segments_from_maneuvering.counter = segments_from_maneuvering.counter + n_segments

    return result

# --- Step 1: drop flights whose total duration is below MIN_SEG_LEN ---
print('Calculating flight length:')
num_before = flights.flight_id.nunique()
flight_length = flights.groupby('flight_id').progress_apply(
    lambda grp: grp.time.iloc[-1] - grp.time.iloc[0]
)
flights_tooshort = flight_length.loc[flight_length < MIN_SEG_LEN]
flights = flights.loc[~flights.flight_id.isin(flights_tooshort.index)].reset_index(drop=True)
print(f'Removed {num_before-flights.flight_id.nunique()}/{num_before} flights that are too short')
print(' ')

# --- Step 2: merge the two actuator columns into a single 0/1 indicator ---
# A row counts as maneuvering when either the ACS or the propeller is active.
flights['maneuvering'] = 0
for actuator_col in ('acs', 'propeller_on'):
    flights.loc[flights[actuator_col] > 0, 'maneuvering'] = 1
flights = flights.drop(columns=['acs', 'propeller_on'])

# --- Step 3: cut every flight into segments in which maneuvering is zero ---
print('Splitting the flights into segments:')
flights = (
    flights.groupby('flight_id')
    .progress_apply(segments_from_maneuvering)
    .reset_index(drop=True)
)
print(f'Number of segments: {flights.segment_id.nunique()}')
print(' ')

# --- Step 4: drop segments whose duration is below MIN_SEG_LEN ---
print('Calculating segment length:')
num_before = flights.segment_id.nunique()
segment_length = flights.groupby('segment_id').progress_apply(
    lambda grp: grp.time.iloc[-1] - grp.time.iloc[0]
)
segments_tooshort = segment_length.loc[segment_length < MIN_SEG_LEN]
flights = flights.loc[~flights.segment_id.isin(segments_tooshort.index)].reset_index(drop=True)
print(f'Removed {num_before-flights.segment_id.nunique()}/{num_before} segments that are too short')
print(' ')

Calculating flight length:


  0%|          | 0/7 [00:00<?, ?it/s]

Removed 0/7 flights that are too short
 
Splitting the flights into segments:


  0%|          | 0/7 [00:00<?, ?it/s]

Number of segments: 11634
 
Calculating segment length:


  0%|          | 0/11634 [00:00<?, ?it/s]

Removed 9262/11634 segments that are too short
 


### Look for times when the balloon might have deviated from isopycnic behavior (takeoffs/landings, depressurizations, etc.)

This is done by calculating the running mean altitude, then looking for instances when it changed rapidly enough to be flagged as suspicious.

In [5]:
def altitude_runningmean(segment):
    """Add the smoothed altitude and smoothed vertical velocity to a segment.

    Two columns are added:

    * ``alt_rmean`` -- centered running mean of ``altitude`` over RMEAN_LENG
      samples. Near the ends of the series, where the full window does not fit,
      the window is truncated at the series boundary (it still reaches
      RMEAN_LENG // 2 samples towards the interior).
    * ``w_smooth`` -- time derivative of ``alt_rmean`` from a weighted centered
      difference that accounts for uneven time steps. The first and last
      samples have no centered difference and are NaN.

    Parameters
    ----------
    segment : pandas.DataFrame
        Data for one segment, in time order, with ``time`` (datetime) and
        ``altitude`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``segment`` with ``alt_rmean`` and ``w_smooth`` added. The
        input frame is left untouched.

    Raises
    ------
    ValueError
        If RMEAN_LENG is not a positive odd integer (a centered window needs an
        odd number of samples).
    """
    window = int(RMEAN_LENG)
    if window < 1 or window % 2 == 0:
        raise ValueError(f'RMEAN_LENG must be a positive odd integer, got {RMEAN_LENG}')
    half = window // 2

    # Work on a copy so the group handed over by groupby.apply is not modified.
    segment = segment.copy()

    # --- Running mean of the altitude ---
    alt = segment['altitude'].to_numpy(dtype=float)
    n_pts = alt.size
    rmean = np.empty(n_pts)

    # Interior points: the full centered window fits.
    if n_pts >= window:
        rmean[half:n_pts - half] = np.convolve(
            alt, np.full(window, 1.0 / window), mode='valid'
        )

    # Edge points: truncate the centered window at the series boundary. The
    # window for sample j spans j-half .. j+half, so at the left edge it ends at
    # index i+half *inclusive* (slice end i+half+1), and mirrored on the right.
    # (Stopping one sample short makes the window size jump by two where the
    # edge treatment meets the interior.)
    for i in range(min(half, n_pts)):
        rmean[i] = alt[:i + half + 1].mean()
        j = n_pts - 1 - i
        rmean[j] = alt[max(0, j - half):].mean()
    segment['alt_rmean'] = rmean

    # --- Vertical velocity from the running mean ---
    # The time steps are uneven, so use a weighted centered difference: the
    # backward and forward one-sided slopes are each weighted by the length of
    # the *other* interval.
    dt = (segment['time'].diff() / pd.Timedelta(1, 'seconds')).to_numpy(dtype=float)
    dz = segment['alt_rmean'].diff().to_numpy(dtype=float)
    dt_back = dt[1:-1]
    dt_fwd = dt[2:]
    w_back = dz[1:-1] / dt_back
    w_fwd = dz[2:] / dt_fwd

    # Build the complete column first and assign it in one step. Writing through
    # `segment.w_smooth.iloc[...] = ...` is chained assignment, which does not
    # update the frame when pandas copy-on-write is active.
    w_smooth = np.full(n_pts, np.nan)
    w_smooth[1:-1] = (w_back * dt_fwd + w_fwd * dt_back) / (dt_back + dt_fwd)
    segment['w_smooth'] = w_smooth

    return segment

# --- Drop segments with too few samples for the running mean ---
# A segment needs at least 2*RMEAN_LENG data points to be kept.
print('Calculating segment length:')
num_before = flights.segment_id.nunique()
segment_size = flights.groupby('segment_id').progress_apply(lambda grp: grp.time.shape[0])
segments_tooshort = segment_size.loc[segment_size < 2*RMEAN_LENG]
flights = flights.loc[~flights.segment_id.isin(segments_tooshort.index)].reset_index(drop=True)
print(f'Removed {num_before-flights.segment_id.nunique()}/{num_before} segments that are too short')
print(' ')

# --- Running mean altitude and smoothed vertical velocity, per segment ---
print('Calculating smoothed altitude and vertical velocity:')
flights = (
    flights.groupby('segment_id')
    .progress_apply(altitude_runningmean)
    .reset_index(drop=True)
)

# --- Tag samples whose smoothed vertical velocity exceeds VERTVEL_THRESH ---
# The threshold is DZ_THRESH divided by VERTVEL_THRESH_DT expressed in seconds.
# Ascending (+1) and descending (-1) instances are flagged separately.
temp_dt = VERTVEL_THRESH_DT/pd.Timedelta(1,'seconds')
VERTVEL_THRESH = DZ_THRESH/temp_dt
flights['vertvel_flag'] = 0
for direction in (1, -1):
    flights.loc[direction*flights.w_smooth > VERTVEL_THRESH, 'vertvel_flag'] = direction

# w_smooth is only needed for the flagging above
flights = flights.drop(columns=['w_smooth'])

Calculating segment length:


  0%|          | 0/2372 [00:00<?, ?it/s]

Removed 715/2372 segments that are too short
 
Calculating smoothed altitude and vertical velocity:


  0%|          | 0/1657 [00:00<?, ?it/s]

### Interrogate the flagged times, looking for intervals when the altitude changed more than DZ_THRESH

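If the smoothed altitude changes by more than DZ_THRESH across a flagged interval (padded by VERTVEL_THRESH_DT/2 before and after it), the data within that padded interval are flagged for deletion.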

The idea here is to detect large (> ~1km) secular changes in altitude over short (hours) time intervals and delete them (without accidentally deleting large amplitude GWs). This usually works, but sometimes suspicious data aren't deleted because the altitude change isn't large enough. These data are flagged, so if they are included in a GW packet, that packet can be looked at to see if it makes sense.

In [6]:
def flag_largedz(segment):
    """Remove rapid, large altitude excursions from a segment and re-split it.

    The segment is divided into "chunks" of consecutive samples sharing the same
    ``vertvel_flag`` value. For every chunk with a non-zero flag, the time
    window from VERTVEL_THRESH_DT/2 before its first sample (inclusive) to
    VERTVEL_THRESH_DT/2 after its last sample (exclusive) is examined; if
    ``alt_rmean`` spans more than DZ_THRESH inside that window, all samples in
    the window are marked ``flagged = 1``.

    Flagged samples are removed, and a new segment starts each time a flagged
    stretch ends. New segment IDs are made unique across successive calls
    through the function attribute ``flag_largedz.counter``, so the function is
    stateful.

    Parameters
    ----------
    segment : pandas.DataFrame
        Data for one segment, in time order, with ``time``, ``alt_rmean`` and
        ``vertvel_flag`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the unflagged rows, with the added columns
        ``chunk_id``, ``flagged`` (all zero after filtering) and
        ``segment_id_new``. The input frame is left untouched.
    """
    # Create a counter so that the new segment ID is incremented across the
    # segments, creating unique segment IDs
    if "counter" not in flag_largedz.__dict__:
        flag_largedz.counter = 0

    # Work on a copy so the group handed over by groupby.apply is not modified.
    segment = segment.copy()

    # Create data "chunks" that are identified by their vertvel_flag value
    # (an ascending chunk is followed by a neutral chunk, is followed by a
    # descending chunk, etc.). The first diff is NaN, so chunk IDs start at 1.
    segment['chunk_id'] = (segment['vertvel_flag'].diff() != 0).astype(int).cumsum()

    # Check if alt_rmean changed too much across each flagged chunk
    half_pad = VERTVEL_THRESH_DT / 2
    times = segment['time']
    flagged = np.zeros(len(segment), dtype='int64')
    for _, chunk in segment.groupby('chunk_id', sort=False):
        if chunk['vertvel_flag'].iloc[0] == 0:
            continue  # only analyze flagged chunks
        # Include the interval VERTVEL_THRESH_DT/2 before and after the chunk
        t_start = chunk['time'].iloc[0] - half_pad
        t_stop = chunk['time'].iloc[-1] + half_pad
        # Positional mask, so the result does not depend on index uniqueness
        in_window = ((times >= t_start) & (times < t_stop)).to_numpy()
        window_alt = segment.loc[in_window, 'alt_rmean']
        if window_alt.max() - window_alt.min() > DZ_THRESH:
            flagged[in_window] = 1
    segment['flagged'] = flagged

    # Each time a flagged stretch ends (1 -> 0, a diff of -1), a new segment
    # begins. The per-call IDs start at zero.
    segment['segment_id_new'] = (segment['flagged'].diff() == -1).astype(int).cumsum()
    n_segments = int(segment['segment_id_new'].max()) + 1 if len(segment) else 0

    # Delete the flagged data points; copy so the assignment below is made on an
    # independent frame rather than on a filtered view.
    segment = segment.loc[segment['flagged'] == 0].copy()

    # Apply the counter and increment it
    segment['segment_id_new'] = segment['segment_id_new'] + flag_largedz.counter
    flag_largedz.counter = flag_largedz.counter + n_segments

    return segment


# --- Remove stretches where the smoothed altitude changes too much ---
# flag_largedz drops the offending samples and assigns fresh segment IDs
# (segment_id_new), which then replace the old segment_id column.
num_before = flights.segment_id.nunique()
print('Flagging times when the smoothed altitude changes too much and splitting data into segments')
flights = (
    flights.groupby('segment_id')
    .progress_apply(flag_largedz)
    .reset_index(drop=True)
)
flights = flights.drop(columns=['segment_id']).rename(columns={'segment_id_new': 'segment_id'})
print(f'Number of segments before: {num_before}; number of segments now: {flights.segment_id.nunique()}')
print(' ')

# --- Drop segments whose duration is below MIN_SEG_LEN ---
print('Calculating segment length:')
num_before = flights.segment_id.nunique()
segment_length = flights.groupby('segment_id').progress_apply(
    lambda grp: grp.time.iloc[-1] - grp.time.iloc[0]
)
segments_tooshort = segment_length.loc[segment_length < MIN_SEG_LEN]
flights = flights.loc[~flights.segment_id.isin(segments_tooshort.index)].reset_index(drop=True)
print(f'Removed {num_before-flights.segment_id.nunique()}/{num_before} segments that are too short')
print(' ')

# --- Drop segments whose mean altitude is below SEG_MIN_ALT ---
print('Calculating segment mean altitude:')
num_before = flights.segment_id.nunique()
segment_alt = flights.groupby('segment_id').progress_apply(lambda grp: grp.altitude.mean())
segments_toolow = segment_alt.loc[segment_alt < SEG_MIN_ALT]
flights = flights.loc[~flights.segment_id.isin(segments_toolow.index)].reset_index(drop=True)
print(f'Removed {num_before-flights.segment_id.nunique()}/{num_before} segments that are too low')
print(' ')

# --- Report the fraction of the original data points that survived ---
print(f'Of original {n_data_orig} data points, {100*flights.shape[0]/n_data_orig}% are retained')

Flagging times when the smoothed altitude changes too much and splitting data into segments


  0%|          | 0/1657 [00:00<?, ?it/s]

Number of segments before: 1657; number of segments now: 1651
 
Calculating segment length:


  0%|          | 0/1651 [00:00<?, ?it/s]

Removed 0/1651 segments that are too short
 
Calculating segment mean altitude:


  0%|          | 0/1651 [00:00<?, ?it/s]

Removed 0/1651 segments that are too low
 
Of original 986795 data points, 60.6904169559027% is retained


### Save the data

In [7]:
# Strip the helper columns that later steps do not need and expose the
# vertical-velocity flag under its final name, 'suspicious_motion'.
flights = (
    flights
    .drop(columns=['maneuvering', 'alt_rmean', 'chunk_id', 'flagged'])
    .rename(columns={'vertvel_flag': 'suspicious_motion'})
)

# NOTE(review): this writes '2_segments_2014.feather', whereas the merge cell at
# the end of the notebook reads '3_segments_<years>.feather' -- confirm which
# prefix is intended.
flights.to_feather('temp_data/2_segments_2014.feather')

# Show the time span covered by the saved data
(flights.time.min(), flights.time.max())

In [None]:
# If the years are done separately and need to be combined, make sure the
# segment IDs are unique across the combined table.
yearly_files = [
    'temp_data/3_segments_2011-2013.feather',
    'temp_data/3_segments_2014.feather',
    'temp_data/3_segments_2015.feather',
    'temp_data/3_segments_2016.feather',
    'temp_data/3_segments_2017.feather',
    'temp_data/3_segments_2018.feather',
]

# Read the files in order. The first file keeps its IDs; every later file is
# shifted so that its IDs start just above the largest (already shifted) ID of
# the file before it.
frames = []
id_offset = 0
for path in yearly_files:
    frame = pd.read_feather(path)
    frame['segment_id'] = frame['segment_id'] + id_offset
    id_offset = frame['segment_id'].max() + 1
    frames.append(frame)

flights = pd.concat(frames, ignore_index = True)

flights.to_feather('temp_data/3_segments_cosmic_2011-2018.feather')