# Reduce catalog based on SEDs
## January 2024
### Primary author: Bethany Ludwig

In [1]:
import pandas as pd 
import numpy as np 
import os

np.seterr(invalid='ignore')

# Functions 
def check_stars(df):
    # Make sure we have all the discovery stars 
    discovery_names = ['Star_1','Star_2','Star_3','Star_4','Star_5','Star_6','Star_7','Star_8','Star_9','Star_10','Star_11','Star_12','Star_13','Star_14','Star_16','Star_17','Star_18','Star_20','Star_21','Star_22','Star_23','Star_24','Star_25']
    c = 0
    for star in discovery_names:
        if star not in df.discovery_name.unique():
            print (star)
            c += 1
    if c == 0:
        print ("All Remaining Discovery Stars After Magnitude Reductions Have Been Found")
# Paths 
data_dir = os.getenv("DATADIR")
directory = data_dir + '/0_SUMS_Catalogs/CandidateCatalog/'
synth_photom_file = data_dir + '1_Models/StrippedStars/photometry_CMFGEN_composites.txt'

## Load in photometry

In [2]:
# Read in magnitude reduced files 
df = pd.read_csv(directory+'1_magnitude_reduced.csv') 
print(df.galaxy.value_counts())

# Make a key
df['key'] = np.arange(df.shape[0])

# Rename filters for simplicity, we can rename them back at the end
df = df.rename(columns={'uvw2_dered':'w2','uvm2_dered':'m2','uvw1_dered':'w1',
                                          'U_dered':'u','B_dered':'b','V_dered':'v','I_dered':'i',
                                          'uvw2 - b':'w2 - b', 'uvw2 - v':'w2 - v', 'uvw2 - i':'w2 - i',
                                          'uvw1 - b':'w1 - b', 'uvw1 - v':'w1 - v', 'uvw1 - i':'w1 - i', 
                                          'uvm2 - b':'m2 - b', 'uvm2 - v':'m2 - v', 'uvm2 - i':'m2 - i'})
# Select keys
optical_columns = ['u','b','v','i']
uv_columns = ['w2','m2','w1']
color_columns = ['w2 - b', 'w2 - v', 'w2 - i',
                'm2 - b', 'm2 - v', 'm2 - i',
                'w1 - b', 'w1 - v', 'w1 - i']
max_mag_err = 0.217


# Print how many stars we have
print("How many stars we have: ",df.shape[0])

######################################################################################################
# "We use the DGL+23 composite stripped star plus MS model grid described in §5.1.2 to describe the  #
# “expected” properties of such systems [binaries containing stripped stars]."                       #
######################################################################################################
names = ['Minit_strip','M_MS','frac_MS','U','B','V','R','I','UVM2','UVW2','UVW1']
sf = pd.read_csv(synth_photom_file,comment='#',delimiter='\t',names=names)

# Make dictionary of the minimum and maximum difference between different filters
min_diff = {'w2 - m2': np.min(sf.UVW2-sf.UVM2) - max_mag_err,
            'm2 - w1': np.min(sf.UVM2-sf.UVW1) - max_mag_err,
            'w1 - u': np.min(sf.UVW1-sf.U) - max_mag_err,
            'u - b': np.min(sf.U-sf.B) - max_mag_err,
            'b - v': np.min(sf.B-sf.V) - max_mag_err,
            'v - i': np.min(sf.V-sf.I) - max_mag_err,
            # Additional UV - Optical Filters
            'w1 - b': np.min(sf.UVW1-sf.B) - max_mag_err,
            'm2 - u': np.min(sf.UVM2-sf.U) - max_mag_err,
            'm2 - b': np.min(sf.UVM2-sf.B) - max_mag_err,
           }

max_diff = {'w2 - m2': np.max(sf.UVW2-sf.UVM2) + max_mag_err,
            'm2 - w1': np.max(sf.UVM2-sf.UVW1) + max_mag_err,
            'w1 - u': np.max(sf.UVW1-sf.U) + max_mag_err,
            'u - b': np.max(sf.U-sf.B) + max_mag_err,
            'b - v': np.max(sf.B-sf.V) + max_mag_err,
            'v - i': np.max(sf.V-sf.I) + max_mag_err,
            # Additional UV - Optical Filters
            'w1 - b': np.max(sf.UVW1-sf.B) + max_mag_err,
            'm2 - u': np.max(sf.UVM2-sf.U) + max_mag_err,
            'm2 - b': np.max(sf.UVM2-sf.B) + max_mag_err,
           }

check_stars(df)

galaxy
lmc    8898
smc    4445
Name: count, dtype: int64
How many stars we have:  13343
All Remaining Discovery Stars After Magnitude Reductions Have Been Found


#### If blue due to one filter, check that that filter is not super different from adjacent filters

# UV Optical Source Mismatches 
#### #1 Remove sources that have a uv-optical mismatch 
"We first exclude sources for which the UV and optical magnitudes each show very little spread in observed magnitude within their respective groups, while the two groups themselves exhibit significant separation from each other.
We consider the optical and/or UV photometry to show little variation if the standard deviation of the magnitudes
is less than 0.217 mag (the maximum statistical error we allowed for a photometric point to be considered in our
analysis) and we consider the UV and optical magnitudes to be “significantly separated” if the means of each group are separated by >1 AB mag in excess of the maximum photometric error."

In [3]:
size = df.shape[0]
# Mismatch #1: UV and Optical components of SED are fairly flat but there's a large discrepency between them

# If all the mags are within the max allowed error, we consider it 'flat'
flat_std = max_mag_err
flat_mag_jump = 1. + max_mag_err

flat_jump_index = []

for ind,row in df.iterrows():
    uv = row[uv_columns].values
    opt = row[optical_columns].values
    
    if np.nanstd(uv) < flat_std and np.nanstd(opt) < flat_std:
        # If you have a flat sed look for a big jump 
        if np.abs(np.nanmean(uv) - np.nanmean(opt)) > flat_mag_jump:
            flat_jump_index.append(row['key'])
            
# Remove from sample 
df = df[~df['key'].isin(flat_jump_index)]

print(f'How many we have: ',df.shape[0])
print(f'How many we lost: ',size - df.shape[0])
check_stars(df)  

How many we have:  13043
How many we lost:  300
All Remaining Discovery Stars After Magnitude Reductions Have Been Found


#### #2 Remove sources that have a uv-optical mismatch 
We additionally exclude sources where the most adjacent filters between the UV and optical groups (typically
UVW1 and U-band, but if either are missing we utilize UVM2/B-band) are separated by an amount that is more
than 0.217 mag larger than the biggest separation found between those filters in the helium plus MS star composite
model grid. When carrying out this analysis, we allow for the possibility of a single poorly estimated magnitude by
checking the next most adjacent magnitude difference as well (i.e. if the UVW1-U color of a source is too large, but
UVW1-B color is within the range allowed by the composite model grid we do not exclude it at this stage).

In [4]:
size = df.shape[0]
# Mismatch #2: Check either UVW1 or UVW2 against U or B to see if there's a large jump 
def not_in_bin(row,uv_filter,optical_filter):
    diff = row[uv_filter] - row[optical_filter]
    if diff < min_diff[f'{uv_filter} - {optical_filter}'] or diff > max_diff[f'{uv_filter} - {optical_filter}']:
        return True
    return False

def exists(row,Filter):
    return np.isfinite(row[Filter])

mismatch_index = []

for i,r in df.iterrows():

    # Ideal case: UVW1, U, and B, all exist. 
    if exists(r,'w1') and exists(r,'u') and exists(r,'b'):
        # If UVW1 - U and UVW1 - B are both not in synthetic bin then drop
        if not_in_bin(r,'w1','u') and not_in_bin(r,'w1','b'):
            mismatch_index.append(r['key'])
    
    # UVW1 and U exist, but B does not.
    if exists(r,'w1') and exists(r,'u') and not exists(r,'b'):
        # If UVW1 - U is not in synthetic bin then drop
        if not_in_bin(r,'w1','u'):
            mismatch_index.append(r['key'])
            
    # UVW1 and B exist, but U does not. 
    if exists(r,'w1') and exists(r,'b') and not exists(r,'u'):
        # If UVW1 - B is not in synthetic bin then drop
        if not_in_bin(r,'w1','b'):
            mismatch_index.append(r['key'])    
    
    # UVM2, U, and B, exist, but UVW1 does not. 
    if exists(r,'m2') and exists(r,'u') and exists(r,'b') and not exists(r,'w1'):
        # If UVM2 - U and UVM2 - B are both not in synthetic bin then drop
        if not_in_bin(r,'m2','u') and not_in_bin(r,'m2','b'):
            mismatch_index.append(r['key']) 
            
    # UVM2 and U exist, but UVW1 and B do not.
    if exists(r,'m2') and exists(r,'u') and not exists(r,'w1') and not exists(r,'b'):
        # If UVM2 - U is not in synthetic bin then drop
        if not_in_bin(r,'m2','u'):
            mismatch_index.append(r['key'])     

    # UVM2 and B exist, but UVW1 and U do not. 
    if exists(r,'m2') and exists(r,'b') and not exists(r,'w1') and not exists(r,'u'):
        # If UVM2 - B is not in synthetic bin then drop
        if not_in_bin(r,'m2','b'):
            mismatch_index.append(r['key'])    
            
df = df[~df['key'].isin(mismatch_index)]

print(f'How many we have: ',df.shape[0])
print(f'How many were categorized as mismatched: ',size - df.shape[0])
check_stars(df)  

How many we have:  11629
How many were categorized as mismatched:  1414
All Remaining Discovery Stars After Magnitude Reductions Have Been Found


# Optically-red SEDs:
When examining the SEDs in our sample, we found some sources that appear to progressively decrease in flux from the UV to blue optical bands, but then increase in flux again when moving to progressively redder optical bands (resulting in a “V”-like SED). While these may be interesting sources in their own right, they are not consistent with expectations for helium star plus MS star binaries with absolute magnitudes in the range of our search. We therefore remove any sources that increase in flux between
two or more adjacent optical filters (i.e. from U to B, B to V, and/or V to I).

In [5]:
size = df.shape[0]
red_index = []

for ind, row in df.iterrows():
    y = np.array(row[optical_columns].values).astype(float)
    # Calculate derivatives
    d = np.diff(y)
    # If derivatives are negative 2 or more times (indicating positive slope for mags) then save
    n = len(d[np.isfinite(d) & (d<0)])
    if n >= 2:
        red_index.append(row['key'])

df = df[~df['key'].isin(red_index)].reset_index(drop=True)

print(f'How many we have: ',df.shape[0])
print(f'How many we lost: ',size-df.shape[0])
check_stars(df)  

How many we have:  7707
How many we lost:  3922
All Remaining Discovery Stars After Magnitude Reductions Have Been Found


# Poor-quality photometric points:

In [6]:
vf = df.copy()

def not_in_bin(row,uv_filter,optical_filter):
    diff = row[uv_filter] - row[optical_filter]
    if diff < min_diff[f'{uv_filter} - {optical_filter}'] or diff > max_diff[f'{uv_filter} - {optical_filter}']:
        return True
    return False

variable_index = []
filters = ['w2','m2','w1','u','b','v','i']
for ind,row in vf.iterrows():
    # Get the sed
    sed = row[filters]
    # Take the derivatives         
    div = np.abs(np.diff(sed))
    # Compare to allowed differences 
    compare = [not_in_bin(row,f1,f2) for f1,f2 in zip(filters[:-1],filters[1:])]
    # Where do the bumps occur? 
    loc = np.where(compare)[0]
    # How many bumps are there?
    n_bumps = len(loc)
    if n_bumps == 0: 
        continue
    # "If more than three colors were flagged as suspect, we reject the source from our sample due to either having a poor
    # quality SED or varying dramatically from expectations for stripped star binaries"
    if n_bumps >= 3:
        variable_index.append(row.key)
        
        
    # "If two adjacent colors were flagged, then we remove the middle photometric 
    # point and assess whether the source would still be considered bluewards of the ZAMS based
    # on the method in Section 5.3."
    if n_bumps == 2 and (loc[1] == loc[0] - 1 or loc[1] == loc[0] + 1): 
        # UVM2 is connected
        if loc[0] == 0 and loc[1] == 1:
            vf.loc[vf.key==row.key,color_columns[1]] = np.nan
        # UVW1 is connected
        if loc[0] == 1 and loc[1] == 2:
            vf.loc[vf.key==row.key,color_columns[2]] = np.nan
        # B is connected, zero out related colors
        if loc[0] == 3 and loc[1] == 4:
            vf.loc[vf.key==row.key,color_columns[4]] = np.nan
        # V is connected, zero out related colors
        if loc[0] == 4 and loc[1] == 5:
            vf.loc[vf.key==row.key,color_columns[5]] = np.nan
            
    # "If only one color (or two non-adjacent colors) are flagged, then we remove both relevant magnitudes and assess
    # whether the source would still be considered bluewards of the ZAMS." 
    else:
        # UVW2 and UVM2 
        if 0 in loc:
            vf.loc[vf.key==row.key,color_columns[0] + color_columns[1]] = np.nan
        # UVM2 and UVW1 
        if 1 in loc: 
            vf.loc[vf.key==row.key,color_columns[1] + color_columns[2]] = np.nan
        # UVW1 and U (we don't color by U)
        if 2 in loc: 
            vf.loc[vf.key==row.key,color_columns[2]] = np.nan
        # U and B (we don't color by U)
        if 3 in loc: 
            vf.loc[vf.key==row.key,color_columns[4]] = np.nan
        # B and V 
        if 4 in loc: 
            vf.loc[vf.key==row.key,color_columns[4] + color_columns[5]] = np.nan
        # V and I 
        if 5 in loc: 
            vf.loc[vf.key==row.key,color_columns[5] + color_columns[6]] = np.nan
            
# Recalculate to see what is still blue 
n_blue = vf[color_columns].isin(['blue']).sum(axis=1)

# Find out which rows are no longer blue 
nolongerblue = vf.loc[n_blue == 0,'key']

# Combine Variable Index and No Longer Blue, Remove Duplicates
variable_index = np.sort(np.unique(np.append(variable_index,nolongerblue)))

# Drop variables
df = df[~df['key'].isin(variable_index)]
print(f'How many we have: ',df.shape[0])
print(f'How many we lost: ',len(variable_index))
check_stars(df)  

How many we have:  7027
How many we lost:  680
All Remaining Discovery Stars After Magnitude Reductions Have Been Found


# Faint I-band photometry: 
When examining the remaining sources in our sample, we noted multiple occasions where the MCPS I-band photometry appears significantly fainter than would be expected from extrapolating the rest of the optical SED. We note that while all of these objects would likely have V-I colors that fall with the overall range of composite model grid (due to the previous criteria for inclusion), this does not necessarily imply that it is consistent with expectations based on the rest of the SED. We therefore test whether the remaining objects would still be considered bluewards of the ZAMS if we remove the I-band. We remove any objects
that only show a UV excess due to this band.


In [7]:
# Change all color combos with 'i' to not be blue. 
i_df = df.copy()

for color in ['w2 - i','w1 - i', 'm2 - i']:
    i_df[color] = i_df[color].replace('blue',np.nan)

# Recalculate to see what is still blue 
n_blue = i_df[color_columns].isin(['blue']).sum(axis=1)

# Find out which rows are no longer blue 
bad_i_index = i_df.loc[n_blue == 0,'key']

# Drop variables
df = df[~df['key'].isin(bad_i_index)]
print(f'How many we have: ',df.shape[0])
print(f'How many we lost: ',len(bad_i_index))
check_stars(df)  

How many we have:  5184
How many we lost:  1843
Star_17
Star_25


# Clean Up

In [8]:
#Reread in and calculate the number of jumps for everything: 
df = pd.read_csv(directory+'1_magnitude_reduced.csv') 

# Rename filters for simplicity, we can rename them back at the end
df = df.rename(columns={'uvw2_dered':'w2','uvm2_dered':'m2','uvw1_dered':'w1',
                                          'U_dered':'u','B_dered':'b','V_dered':'v','I_dered':'i',
                                          'uvw2 - b':'w2 - b', 'uvw2 - v':'w2 - v', 'uvw2 - i':'w2 - i',
                                          'uvw1 - b':'w1 - b', 'uvw1 - v':'w1 - v', 'uvw1 - i':'w1 - i', 
                                          'uvm2 - b':'m2 - b', 'uvm2 - v':'m2 - v', 'uvm2 - i':'m2 - i'})

#Recalculate the number of bumps for this sample and save it as a column: 
filters = ['w2','m2','w1','u','b','v','i']
n_bumps = []
for ind,row in df.iterrows():
    # Get the sed
    sed = row[filters]
    # Take the derivatives         
    div = np.abs(np.diff(sed))
    # Compare to allowed differences 
    compare = [not_in_bin(row,f1,f2) for f1,f2 in zip(filters[:-1],filters[1:])]
    # Where do the bumps occur? 
    loc = np.where(compare)[0]
    # How many bumps are there?
    n_bumps.append(len(loc))

# Do any cuts overlap?
drop_index = [flat_jump_index , mismatch_index , red_index , variable_index , bad_i_index]
drop_name = ['flat_jump','mismatch','red','variable','bad_i']

# Re-Read in magnitude reduced files 
df = pd.read_csv(directory+'1_magnitude_reduced.csv') 

# Make a key
df['key'] = np.arange(df.shape[0])

# Add the number of bumps to the dataframe
df['n_bumps'] = n_bumps


# Make a cut column 
df['cut'] = ''

# Fill the cut column 
for index, name in zip(drop_index,drop_name):
    df.loc[df['key'].isin(index),'cut'] = name

print(f'{df[df.cut == ""].shape[0]} stars remaining')
print(f'{df[df.cut != ""].shape[0]} stars cut')
print(df[df.cut == ""].galaxy.value_counts())


# Save the dataframe
df.to_csv(directory+'2_sed_reduced.csv',index=False)
print('Saved')

5184 stars remaining
8159 stars cut
galaxy
lmc    3130
smc    2054
Name: count, dtype: int64
Saved
