### In Topcat I crossmatched the Step 5 Photometry Catalog with both Gaia EDR3 and Simbad  
Here I recombine them, format, and calculate some gaia cut offs.

In [1]:
import glob 
import pandas as pd
from astropy.coordinates import SkyCoord
import astropy.units as u
import numpy as np

def google_sheet_format(val):
    if val == 'none':
        return val
    simbad_url_name = val.replace(' ','+')
    url = f'https://simbad.u-strasbg.fr/simbad/sim-id?Ident={simbad_url_name}&NbIdent=1&Radius=1&Radius.unit=arcsec&submit=submit+id'
    return f'=HYPERLINK("{url}","{val}")'


galaxy = 'smc'

data_dir = '/home/bethany/Projects/0_Data/'

df = pd.read_csv(data_dir + f'0_SUMS_Catalogs/CompleteCatalog/Step5/{galaxy}_colors.csv')

gaia = pd.read_csv(data_dir + f'0_SUMS_Catalogs/CompleteCatalog/Crossmatched/{galaxy}_step5_gaia_topcatcrossmatch.csv')

simbad = pd.read_csv(data_dir + f'0_SUMS_Catalogs/CompleteCatalog/Crossmatched/{galaxy}_step5_simbad_topcatcrossmatch.csv')

# Rename some columns to match what I expect: 
vizier_gaia_cols = ['RAJ2000','DEJ2000','Plx','e_Plx','pmRA','pmDE','e_pmRA','e_pmDE','epsi','gofAL','Gmag','angDist']
gaia_cols = ['gaia_ra','gaia_dec','parallax','parallax_error','pmra','pmdec','pmra_error','pmdec_error','astrometric_excess_noise','astrometric_gof_al','phot_g_mean_mag','gaia_match_distance']
gaia = gaia.rename(columns=dict(zip(vizier_gaia_cols,gaia_cols)))
simbad.rename(columns={'angDist':'simbad_match_distance','ra_x':'simbad_ra','dec_x':'simbad_dec'},inplace=True)
simbad_cols = ['simbad_ra','simbad_dec','main_id','main_type','otype','sp_type','simbad_match_distance']
 
df.rename(columns={'Unnamed: 0':'col1'},inplace=True)

# Only keep the columns I care about 
gaia = gaia[['col1'] + gaia_cols]
simbad = simbad[['col1'] + simbad_cols]

# Use the index column to merge 
df = df.merge(gaia,how='left',on='col1')
df = df.merge(simbad,how='left',on='col1')


  simbad = pd.read_csv(data_dir + f'0_SUMS_Catalogs/CompleteCatalog/Crossmatched/{galaxy}_step5_simbad_topcatcrossmatch.csv')


In [4]:
#######################
# Calculate Gaia Chi2 #
#######################

cov = np.load(data_dir+f'7_GAIA/{galaxy}_cov4_w_error_2d_v3.npy')
mu = np.load(data_dir+f'7_GAIA/{galaxy}_cov_medians4_w_error_2d_v3.npy')

df['gaia_chi2'] = 0

for ind, row in df.iterrows():
    pm = np.array([row['pmra'],row['pmdec']])
    pm_error = np.array([row['pmra_error'],row['pmdec_error']])
    chi2 = np.dot(pm-mu, np.dot(pm-mu,np.linalg.inv(cov+np.diag(pm_error**2))))
    df.loc[ind,'gaia_chi2'] = chi2

  df.loc[ind,'gaia_chi2'] = chi2


In [None]:
#############
# Gaia Cuts #
#############

# If the Gaia G band magnitude is more than 1.5 mags away from B or V magnitude then it may not be the same source, in which case we shouldn't drop it. 
# Phot_g_mean_mag is the Gaia G band magnitude. It is in the Vega system and needs to be converted to AB. 
# These values are from Table 5.2 https://gea.esac.esa.int/archive/documentation/GEDR3/Data_processing/chap_cu5pho/cu5pho_sec_photProc/cu5pho_ssec_photCal.html
ZP_AB = 25.8010
ZP_Vega = 25.6874
df['phot_g_mean_mag_AB'] = df['phot_g_mean_mag'] - (ZP_AB - ZP_Vega)
mcps_filter = []
no_mcps_filter_counter = 0
for ind, row in df.iterrows():
    if ~np.isnan(row['e_V']):
        mcps_filter.append(row['V'])
    elif ~np.isnan(row['e_B']):
        mcps_filter.append(row['B'])
    else:
        mcps_filter.append(np.nan)
        no_mcps_filter_counter += 1

# Things we should drop due to parallax requirement
df['gaia_px_cut'] = 'no'
df.loc[(df.astrometric_gof_al < 3) & (df.parallax > 4 * df.parallax_error),'gaia_px_cut'] = 'yes'

# Things we should drop due to proper motions
df['gaia_pm_cut'] = 'no'
df.loc[(df.astrometric_gof_al < 3) & (df.gaia_chi2 > 10.6),'gaia_pm_cut'] = 'yes'

# Are there cases where gaia data exists but gaia_chi2 is nan?
check = df.loc[(df.gaia_chi2.isnull()) & (df.pmra.notnull()),'gaia_chi2'].shape[0]
if check > 0:
    print('There are cases where gaia data exists but gaia_chi2 is nan')
# Are there cases where gaia_chi2 = 0?
check = df.loc[df.gaia_chi2 == 0,'gaia_chi2'].shape[0]
if check > 0:
    print('There are cases where gaia_chi2 = 0')

In [4]:
#####################
# Simbad Formatting #
#####################


# Make all nans under otype or sptype be 'none'
df.loc[df.main_id.isnull(),'main_id'] = 'none'
df.loc[df.main_type.isnull(),'main_type'] = 'none'
df.loc[df.sp_type.isnull(),'sp_type'] = 'none'

# Make clickable links
df['simbad_link'] = df['main_id'].apply(lambda x: google_sheet_format(x))

# Strip empty strings
df['main_type'] = df['main_type'].str.strip()
df['sp_type'] = df['sp_type'].str.strip()

# Strip any brackets or parentheses from sp_type
df['sp_type'] = df['sp_type'].str.replace('[','')
df['sp_type'] = df['sp_type'].str.replace('(','')


# Make an object type group to have a cleaner column 
df['simbad_group'] = df['main_type']

# If the word _Candidate_ is in the name, change the group to Candidate
df.loc[df['simbad_group'].str.contains('Candidate'), 'simbad_group'] = 'Candidate'

# Make an 'unclassfied' group for things that are not named after a specific type of star, like LongPeriodV, EMLine, etc
df.loc[df['simbad_group'].isin(['LongPeriodV*','EmLine*','Unknown',
                                'SB*','EllipVar', 'HighPM*','NearIR','IR','Infrared','MidIR','FarIR','Cluster*',
                                'UV','PulsV*','Association','Radio','cmRad','Galaxy','ISM','EmObj','MolCld',
                                'HIIReg','X','GroupG','','OpenCluster','nan','Eruptive*','Nova','Cloud','Maser','Seyfert1',
                                ]),'simbad_group'] = 'Vague'
# Rename * to star
df.loc[df['simbad_group'] == '*','simbad_group'] = 'Star'

# Rename ** to binary
df.loc[df['simbad_group'] == '**','simbad_group'] = 'Binary'

# Replace any remaining * with '' 
df['simbad_group'] = df['simbad_group'].str.replace('*','')

# Put things in nicer groups 
df.loc[df.main_type == 'AGB*','simbad_group'] = 'RGB/AGB'
df.loc[df.main_type == 'Be*','simbad_group'] = 'Be'
df.loc[df.main_type == 'BlueSG','simbad_group'] = 'BSG/YSG'
df.loc[df.main_type == 'C*','simbad_group'] = 'RGB/AGB'
df.loc[df.main_type == 'ClassicalCep','simbad_group'] = 'Cepheid'
df.loc[df.main_type == 'HighMassXBin','simbad_group'] = 'HMXB'
df.loc[df.main_type == 'Mira','simbad_group'] = 'RGB/AGB'
df.loc[df.main_type == 'PlanetaryNeb','simbad_group'] = 'PN'
df.loc[df.main_type == 'RRLyrae','simbad_group'] = 'RR Lyrae'
df.loc[df.main_type == 'RVTauV*','simbad_group'] = 'RGB/AGB'
df.loc[df.main_type == 'RedSG','simbad_group'] = 'RSG'
df.loc[df.main_type == 'S*','simbad_group'] = 'RGB/AGB' 
df.loc[df.main_type == 'Supergiant','simbad_group'] = 'BSG/YSG'
df.loc[df.main_type == 'Type2Cep','simbad_group'] = 'Cepheid'
df.loc[df.main_type == 'WolfRayet*','simbad_group'] = 'WR'
df.loc[df.main_type == 'YellowSG','simbad_group'] = 'BSG/YSG'
df.loc[df.main_type == 'delSctV*','simbad_group'] = 'Delta Scuti'
df.loc[df.main_type == 'post-AGB*','simbad_group'] = 'Post-AGB'


# Make a sptype group that is just the first letter of the spectral type
df['simbad_sptype'] = df['sp_type'].str[0]

# Save the final dataframe
df.to_csv(data_dir + f'0_SUMS_Catalogs/CompleteCatalog/Crossmatched/{galaxy}_step5_crossmatch.csv')

In [5]:
# How many things have no GAIA data? 
print(f'For the Entire Catalog, in the {galaxy.upper()}: ')
print(f'{df[df.pmra.isna()].shape[0] / df.shape[0] * 100:.2f}% have no Gaia data')
print(f'{df[df.gaia_pm_cut == "yes"].shape[0] / df.shape[0] * 100:.2f}% have Gaia PM cuts')
print(f'{df[df.gaia_px_cut == "yes"].shape[0] / df.shape[0] * 100:.2f}% have Gaia PX cuts')
# How many have both?
print(f'{df[(df.gaia_pm_cut == "yes") & (df.gaia_px_cut == "yes")].shape[0] / df.shape[0] * 100:.2f}% have both cuts')


For the Entire Catalog, in the SMC: 
4.07% have no Gaia data
5.06% have Gaia PM cuts
0.80% have Gaia PX cuts
0.61% have both cuts


In [8]:
# Some Other Tests


# ✨ How many sources do not have gaia data? 
# Either parallax, pmra, or pmdec should be a number for it to be a match. 
df.loc[(df.parallax.isna()) | (df.pmra.isna()) | (df.pmdec.isna()),'No Gaia Data'] = 'True'
 
# ✨ Check if phot_g_mean_mag is available 
df.loc[(df.phot_g_mean_mag.isna()) & (df['No Gaia Data'] != 'True'),'No G Mag'] = 'True'

# ✨ Check if phot_g_mean_mag is within 1.5 mag of V band, if that is available, otherwise use B band.
# Phot_g_mean_mag is the Gaia G band magnitude. It is in the Vega system and needs to be converted to AB. 
# These values are from Table 5.2 https://gea.esac.esa.int/archive/documentation/GEDR3/Data_processing/chap_cu5pho/cu5pho_sec_photProc/cu5pho_ssec_photCal.html
ZP_AB = 25.8010
ZP_Vega = 25.6874
df['phot_g_mean_mag_AB'] = df['phot_g_mean_mag'] - (ZP_AB - ZP_Vega)
mcps_filter = []
no_mcps_filter_counter = 0
for ind, row in df.iterrows():
    if ~np.isnan(row['e_V']):
        mcps_filter.append(row['V'])
    elif ~np.isnan(row['e_B']):
        mcps_filter.append(row['B'])
    else:
        mcps_filter.append(np.nan)
        no_mcps_filter_counter += 1

# Make sure there is available mcps data - this number should be 0, will need to change algorithm if it isn't. 
print(f"Sources without V or B band data: {no_mcps_filter_counter}")

# Calculate the difference in magnitudes between Gaia G and MCPS V or B band.
df['mag_diff'] = np.abs(df['phot_g_mean_mag_AB'] - mcps_filter)
df.loc[(df['mag_diff'].notna()) & (df['mag_diff'] > 1.5), 'Gaia and MCPS Mag Diff > 1.5'] = 'True'

# ✨ How many sources with gaia data have astrometric_gof_al > 3? 
# This is a quality flag that indicates the goodness of fit of the astrometric solution.
df.loc[df.astrometric_gof_al > 3, 'Questionable Fit: astrometric_gof_al > 3'] = 'True'

# ✨ How many sources with gaia data have gaia chi2 > 10.6? 
df.loc[df.gaia_chi2 > 10.6, 'Possible Foreground: chi2 > 10.6'] = 'True'

# ✨ How many sources with gaia data have parallax > 4 * parallax_error? (Anna choose 5 but I think because her sources are so bright)
df.loc[df.parallax > 4 * df.parallax_error, 'Parallax Measured: px > 4*px_err'] = 'True'

conditions = ['No Gaia Data','No G Mag','Gaia and MCPS Mag Diff > 1.5','Questionable Fit: astrometric_gof_al > 3','Possible Foreground: chi2 > 10.6','Parallax Measured: px > 4*px_err']

# Does not have gaia
print('Sources without Gaia G band magnitude: ', df.loc[(df['No G Mag'] == 'True'),conditions].shape[0])
print('No proper motion or parallax data: ', df.loc[(df['No Gaia Data'] == 'True'),conditions].shape[0]/df.shape[0]*100,'% of the sources')
print('Gaia G band magnitude is not within 1.5 mag of MCPS V or B band',df.loc[(df['Gaia and MCPS Mag Diff > 1.5'] == 'True'),conditions].shape[0])
no_gaia =  df.loc[(df['No Gaia Data'] == 'True') | (df['No G Mag'] == 'True') | (df['Gaia and MCPS Mag Diff > 1.5'] == 'True'),conditions].copy()
print('No Gaia data: ',no_gaia.shape[0],' which is ',no_gaia.shape[0]/df.shape[0]*100,'% of the sources')
print('Questionable Fit: astrometric_gof_al > 3: ', df.loc[(df['Questionable Fit: astrometric_gof_al > 3'] == 'True'),conditions].shape[0])

# Does have gaia
gaia_df = df[(df['No Gaia Data'] != 'True') & (df['No G Mag'] != 'True') & (df['Gaia and MCPS Mag Diff > 1.5'] != 'True')].reset_index(drop=True)
check_1 = gaia_df.loc[(gaia_df['Questionable Fit: astrometric_gof_al > 3'] != 'True') & (gaia_df['Possible Foreground: chi2 > 10.6'] == 'True'),conditions]
print('Not consistant with proper motions in the galaxies. (Good Fit)',check_1.shape[0])
check_2 = gaia_df.loc[(gaia_df['Questionable Fit: astrometric_gof_al > 3'] == 'True') & (gaia_df['Possible Foreground: chi2 > 10.6'] == 'True'),conditions]
print('Not consistant with proper motions in the galaxies but the fit was bad.',check_2.shape[0])


# Does have gaia and was fit well 
good_fit_size = gaia_df.loc[(gaia_df['Questionable Fit: astrometric_gof_al > 3'] != 'True') & (gaia_df['Parallax Measured: px > 4*px_err'] == 'True'),conditions].shape[0]
print('Parallax Measured: px > 4*px_err (Good Fit): ',good_fit_size/df.shape[0]*100,'% of the sources')
good_px = gaia_df.loc[(gaia_df['Questionable Fit: astrometric_gof_al > 3'] != 'True') & (gaia_df['Parallax Measured: px > 4*px_err'] == 'True')].parallax
print('Parallax values if fit was good: min: ',np.min(good_px),' mean: ',np.mean(good_px),' max: ',np.max(good_px))
# Foreground - Where parallax condition overlaps with chi2 condition
chi2 = gaia_df.loc[(gaia_df['Questionable Fit: astrometric_gof_al > 3'] != 'True') & (gaia_df['Parallax Measured: px > 4*px_err'] == 'True') & (gaia_df['Possible Foreground: chi2 > 10.6'] == 'True'),conditions].shape[0] 
print('Percent dropped due to chi2 condition: ',chi2/df.shape[0]*100,'% of the sources')
# Foreground - Combined 
foreground = gaia_df.loc[(gaia_df['Questionable Fit: astrometric_gof_al > 3'] != 'True') & ((gaia_df['Parallax Measured: px > 4*px_err'] == 'True') | (gaia_df['Possible Foreground: chi2 > 10.6'] == 'True')),conditions]
print('Foreground: ',foreground.shape[0],' which is ',foreground.shape[0]/df.shape[0]*100,'% of the sources')

Sources without V or B band data: 23
Sources without Gaia G band magnitude:  106
No proper motion or parallax data:  4.0670256149330575 % of the sources
Gaia G band magnitude is not within 1.5 mag of MCPS V or B band 4161
No Gaia data:  12730  which is  4.83365102026868 % of the sources
Questionable Fit: astrometric_gof_al > 3:  73802
Not consistant with proper motions in the galaxies. (Good Fit) 13122
Not consistant with proper motions in the galaxies but the fit was bad. 32567
Parallax Measured: px > 4*px_err (Good Fit):  0.797381550869146 % of the sources
Parallax values if fit was good: min:  0.0532  mean:  1.017385  max:  10.4172
Percent dropped due to chi2 condition:  0.6041114511584814 % of the sources
Foreground:  13631  which is  5.175765676141585 % of the sources
