# Matching O'Ryan+23 to COSMOS
Notebook that uses a dictionary comprehension to cross-match the O'Ryan+23 sample against the COSMOS 2015 catalogue. For each source, the primary is the catalogue object closest to the O'Ryan+23 coordinates, and the secondary is the other catalogue object within 10 arcseconds whose photometric redshift is closest to the primary's.

In [1]:
import pandas as pd
import sys
import time
from tqdm import tqdm
tqdm.pandas()

from astroquery.irsa import Irsa
from astropy.coordinates import SkyCoord
import astropy.units as u

  import sys


In [2]:
# Local data directory; the input catalogue is read from here.
folder = 'C:/Users/oryan/Documents/mergers-in-cosmos/data'

In [3]:
# Load the interacting-galaxy catalogue, using the first CSV column as the index.
catalogue_path = f'{folder}/interacting-catalogue.csv'
df = pd.read_csv(catalogue_path, index_col = 0)

Catalogue keyword = cosmos2015

### Test Calling IRSA

In [61]:
# Show the catalogue row of the source used to test the IRSA call.
df[df['SourceID'] == 4000705532455]

Unnamed: 0,SourceID,RA,Dec,int_prediction,references,status
21,4000705532455,150.679664,2.196615,0.970096,['2007ApJS..172...99C'],Referenced


In [62]:
# Look the test source up by SourceID rather than by row position, so this
# cell keeps pointing at the same galaxy if the catalogue is re-ordered.
test_row = df.query('SourceID == 4000705532455').iloc[0]

# NOTE(review): this uses frame='fk5', whereas get_table() builds its SkyCoord
# without a frame argument -- confirm which frame the catalogue coordinates
# are actually in.
test_coords = SkyCoord(
    ra = test_row.RA * u.deg,
    dec = test_row.Dec * u.deg,
    frame = 'fk5',
)

In [64]:
# Cone search of the 'cosmos2015' catalogue within 10 arcsec of the test position.
table = Irsa.query_region(test_coords, catalog = 'cosmos2015', radius = 10 * u.arcsec)

In [67]:
# Display the matches ordered from smallest to largest 'dist'.
table.to_pandas().sort_values(by = 'dist')

Unnamed: 0,ra,dec,clon,clat,id,x_image,y_image,errx2_image,erry2_image,errxy_image,...,sfr_best,ssfr_med,ssfr_med_min68,ssfr_med_max68,ssfr_best,l_nu,l_r,l_k,dist,angle
0,150.679866,2.196563,10h02m43.17s,02d11m47.63s,590539,9199.92676,21737.71094,0.00013,0.00013,-2e-05,...,0.508,-8.712,-8.865,-8.463,-8.847,28.634,28.886,28.866,0.769299,105.397685
1,150.679538,2.197317,10h02m43.09s,02d11m50.34s,589285,9207.79785,21755.80078,0.01444,0.02512,-0.00135,...,0.789,-8.121,-8.411,-7.713,-7.337,28.683,28.434,28.339,2.54721,350.10284
2,150.681207,2.19629,10h02m43.49s,02d11m46.65s,588578,9167.76758,21731.17383,0.01434,0.01414,0.00109,...,0.831,-8.202,-8.495,-7.706,-7.827,28.841,28.726,28.485,5.689852,102.038284
3,150.678579,2.19818,10h02m42.86s,02d11m53.45s,589860,9230.81738,21776.5,0.0142,0.01264,0.00279,...,-1.011,-8.708,-9.133,-8.25,-9.169,27.183,27.608,27.602,6.830792,325.292674
4,150.678607,2.19488,10h02m42.87s,02d11m41.57s,588065,9230.10156,21697.30664,0.0014,0.00108,0.00018,...,0.643,-8.576,-8.676,-8.441,-8.628,28.762,28.928,28.79,7.318688,211.155582
5,150.681176,2.195221,10h02m43.48s,02d11m42.80s,587840,9168.50293,21705.50977,0.01196,0.0104,0.00201,...,0.652,-7.939,-8.48,-7.603,-7.623,28.618,28.438,28.371,7.422798,132.724881
6,150.677449,2.197266,10h02m42.59s,02d11m50.16s,589240,9257.91797,21754.55664,0.0183,0.02594,0.00919,...,-1.71,-8.99,-9.662,-8.36,-9.499,26.456,27.005,27.056,8.28823,286.296855
7,150.681572,2.195329,10h02m43.58s,02d11m43.19s,588687,9159.00391,21708.11133,0.00107,0.00161,6e-05,...,1.4,-9.114,-9.185,-9.032,-9.124,29.509,29.836,29.893,8.300201,124.042322
8,150.680917,2.198739,10h02m43.42s,02d11m55.46s,590230,9174.74414,21789.93164,0.01509,0.01674,0.00264,...,0.068,-8.133,-8.546,-7.691,-7.342,27.964,27.716,27.623,8.866661,30.661156
9,150.68184,2.195189,10h02m43.64s,02d11m42.68s,588102,9152.58105,21704.75391,0.00071,0.00108,0.00015,...,-1.055,-8.625,-8.745,-8.435,-8.666,27.074,27.26,27.124,9.381154,123.298481


### Finding a Valid Query

In [6]:
# Search box as [ra_min, ra_max, dec_min, dec_max] in degrees: +/- 2 deg about
# (RA, Dec) = (150.11916667, 2.20583333). The upper Dec bound previously added
# the Dec value to itself instead of adding 2, which made the box asymmetric.
limits_cosmos = [150.11916667 - 2, 150.11916667 + 2, 2.20583333 - 2, 2.20583333 + 2]

I get everything from the Catalogue! The columns I want are:

    1. id - Running Object Number
    2. SSFR_BEST - The Best Fit log Specific SFR Using BC03 Templates. Taken at the minimum Chi Squared
    3. SSFR_MED_MAX68 - Upper limit on the 68% Confidence Interval
    4. SSFR_MED_MIN68 - Lower limit on the 68% confidence interval.
    5. SSFR_MED - log sSFR from BC03 best-fit template. median of the PDF
    6. SFR_BEST - log SFR from BC03 best-fit template. Taken at the minimum chi2
    7. SFR_MED_MAX68 -  upper limit, 68% confidence level
    8. SFR_MED_MIN68 -  lower limit, 68% confidence level
    9. SFR_MED -  log SFR from BC03 best-fit template. median of the PDF
    10. MASS_BEST - 	 log Stellar mass from BC03 best-fit template
    11. MASS_MED_MAX68 - 	 upper limit, 68% confidence level
    12. MASS_MED_MIN68 -  lower limit, 68% confidence level
    13. MASS_MED - log Stellar mass from BC03 best-fit template. median of the PDF
    14. AGE -  BC03 age
    15. TYPE - 	 Type: 0 = galaxy, 1 = star, 2 = X-ray source
    16. ZPDF - 	 photo-z measured using the galaxy templates (note: the code below reads the `photoz` column of the IRSA table)
    17. FLAG_DEEP - 1: Ultra-deep stripes, 0: deep stripes
    18. FLAG_SHALLOW - Shallow Flag
    19. l_r - 	 log(dust corr lum in erg/s/Hz) in r filter
    20. l_k - 	 log(dust corr lum in erg/s/Hz) in K filter
    21. l_nu -  log(dust corr lum in erg/s/Hz) in NUV filter
    22. dist - I have assumed this is the separation between my coordinates and the catalogue ones.

In [68]:
def convert_results(table):
    """Flatten a cone-search result into one primary/secondary record.

    The primary is the row with the smallest ``dist``. The secondary is
    chosen among the rows whose ``id`` differs from the primary's, as the one
    whose ``photoz`` is closest to the primary's ``photoz``.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain every column named in ``columns`` below.

    Returns
    -------
    dict
        Keys are ``'<column>_1'`` (primary) followed by ``'<column>_2'``
        (secondary), in the order of ``columns``. Secondary values stay
        ``None`` when no other source is present; every value is ``None``
        when ``table`` is empty.
    """
    columns = (
        'id',
        'ssfr_best',
        'ssfr_med_max68',
        'ssfr_med_min68',
        'ssfr_med',
        'sfr_best',
        'sfr_med_max68',
        'sfr_med_min68',
        'sfr_med',
        'mass_best',
        'mass_med_max68',
        'mass_med_min68',
        'mass_med',
        'age',
        'type',
        'photoz',
        'flag_deep',
        'flag_shallow',
        'l_r',
        'l_k',
        'l_nu',
        'dist',
    )

    # Pre-fill every output key so all records share the same keys even when
    # no secondary (or no match at all) exists.
    export_dict = {
        f'{column}_{suffix}': None
        for suffix in (1, 2)
        for column in columns
    }

    # Guard: .iloc[0] on an empty frame would raise IndexError.
    if len(table) == 0:
        return export_dict

    # Primary: smallest 'dist'.
    primary = table.sort_values('dist', ascending = True)[list(columns)].iloc[0]
    for column in columns:
        export_dict[f'{column}_1'] = primary[column]

    # Candidates for the secondary: everything except the primary itself.
    others = table[table['id'] != primary['id']]
    if len(others) == 0:
        return export_dict

    # Secondary: smallest absolute photo-z offset from the primary.
    # sort_values places NaN offsets last by default.
    z_offset = (others['photoz'] - primary['photoz']).abs()
    secondary = (
        others
        .assign(abs_z_diff = z_offset)
        .sort_values('abs_z_diff', ascending = True)[list(columns)]
        .iloc[0]
    )
    for column in columns:
        export_dict[f'{column}_2'] = secondary[column]

    return export_dict

In [69]:
def get_table(ra, dec, reg_limits):
    """Query the 'cosmos2015' catalogue around one position and flatten the result.

    Parameters
    ----------
    ra, dec : float
        Position in degrees (both are multiplied by ``u.deg``).
    reg_limits : sequence
        ``[ra_min, ra_max, dec_min, dec_max]``; positions outside this box are
        not queried. This argument was previously ignored in favour of the
        global ``limits_cosmos``.

    Returns
    -------
    dict or str
        The record built by ``convert_results`` on success, otherwise one of
        the sentinels ``'outwith_cosmos'`` (outside ``reg_limits``),
        ``'null'`` (query returned no rows) or ``'Failed'`` (the table could
        not be converted to pandas after five attempts).
    """
    ra_min, ra_max, dec_min, dec_max = reg_limits[:4]
    if ra < ra_min or ra > ra_max or dec < dec_min or dec > dec_max:
        return 'outwith_cosmos'

    coord = SkyCoord(ra = ra * u.deg, dec = dec * u.deg)

    # NOTE(review): the network query sits outside the retry loop below, so an
    # error raised here still propagates to the caller -- confirm whether the
    # retry was meant to cover the query as well.
    table = Irsa.query_region(coord, catalog = 'cosmos2015', radius = 10 * u.arcsec)
    if len(table) == 0:
        return 'null'

    # for/else: the else branch runs only when no attempt hit `break`. The old
    # `attempt == 4` check also reported failure when the final attempt succeeded.
    for _ in range(5):
        try:
            table_df = table.to_pandas()
        except Exception:
            time.sleep(5)
        else:
            break
    else:
        return 'Failed'

    table_red = table_df[[
        'id',
        'ssfr_best',
        'ssfr_med_max68',
        'ssfr_med_min68',
        'ssfr_med',
        'sfr_best',
        'sfr_med_max68',
        'sfr_med_min68',
        'sfr_med',
        'mass_best',
        'mass_med_max68',
        'mass_med_min68',
        'mass_med',
        'age',
        'type',
        'photoz',
        'flag_deep',
        'flag_shallow',
        'l_r',
        'l_k',
        'l_nu',
        'dist',
    ]]

    table_dict = convert_results(table_red)

    # Short pause between consecutive calls.
    time.sleep(0.01)

    return table_dict

In [70]:
# Map each SourceID to its {'RA': ..., 'Dec': ...} pair.
dict_red = df.set_index('SourceID')[['RA', 'Dec']].to_dict(orient = 'index')

In [94]:
# Query the catalogue once per source, with a progress bar; values are either
# the record dict or one of get_table's string sentinels.
dict_results = {
    source_id : get_table(position['RA'], position['Dec'], limits_cosmos)
    for source_id, position in tqdm(dict_red.items())
}

100%|██████████| 21926/21926 [5:18:04<00:00,  1.15it/s]


In [95]:
# One row per SourceID with the raw result (dict or string sentinel) in a
# single 'dict_results' column. An object Series keeps each value intact;
# DataFrame.from_dict(orient='index') only produced this shape when the first
# result happened to be a string, and otherwise tried to expand every value
# as a row.
df_tmp = pd.Series(dict_results, dtype = object, name = 'dict_results').to_frame()

In [96]:
# Keep only real matches: drop all three string sentinels get_table can return.
# 'Failed' was previously left in, and a string value would break the
# dict-to-frame expansion a few cells below.
df_in_cosmos = df_tmp.query(
    'dict_results != "outwith_cosmos" and dict_results != "null" and dict_results != "Failed"'
)

In [97]:
# Back to a plain {SourceID: record} mapping for the surviving sources.
dict_cosmos = df_in_cosmos['dict_results'].to_dict()

In [98]:
# Expand each record dict into columns, expose the SourceID as a column, and
# drop every row with any missing value (this includes sources for which no
# secondary was found, since their *_2 fields are None).
exp_df = (
    pd.DataFrame.from_dict(dict_cosmos, orient = 'index')
    .reset_index()
    .rename(columns = {'index' : 'SourceID'})
    .dropna()
)

In [99]:
# List the columns of the expanded match table.
exp_df.columns

Index(['SourceID', 'id_1', 'ssfr_best_1', 'ssfr_med_max68_1',
       'ssfr_med_min68_1', 'ssfr_med_1', 'sfr_best_1', 'sfr_med_max68_1',
       'sfr_med_min68_1', 'sfr_med_1', 'mass_best_1', 'mass_med_max68_1',
       'mass_med_min68_1', 'mass_med_1', 'age_1', 'type_1', 'photoz_1',
       'flag_deep_1', 'flag_shallow_1', 'l_r_1', 'l_k_1', 'l_nu_1', 'dist_1',
       'id_2', 'ssfr_best_2', 'ssfr_med_max68_2', 'ssfr_med_min68_2',
       'ssfr_med_2', 'sfr_best_2', 'sfr_med_max68_2', 'sfr_med_min68_2',
       'sfr_med_2', 'mass_best_2', 'mass_med_max68_2', 'mass_med_min68_2',
       'mass_med_2', 'age_2', 'type_2', 'photoz_2', 'flag_deep_2',
       'flag_shallow_2', 'l_r_2', 'l_k_2', 'l_nu_2', 'dist_2'],
      dtype='object')

In [100]:
# Compare the primary and secondary photometric redshifts side by side.
exp_df.loc[:, ['SourceID', 'photoz_1', 'photoz_2']]

Unnamed: 0,SourceID,photoz_1,photoz_2
0,4000705532455,0.536,0.366
1,4000705532984,0.230,0.532
2,4000705533312,0.310,0.336
3,4000705533383,0.621,0.756
5,4000705539529,0.698,0.834
...,...,...,...
3814,6000536185304,0.538,0.480
3815,6000536185363,0.350,0.000
3816,6000536185496,0.868,0.992
3817,6000536185585,0.636,0.579


In [101]:
# Attach the original catalogue columns to every matched source.
full_df = pd.merge(exp_df, df, how = 'left', on = 'SourceID')

In [102]:
# Where several sources matched the same primary, keep the first occurrence.
full_df_dedup = full_df.drop_duplicates(subset = ['id_1'], keep = 'first')

In [105]:
# Write the matched catalogue next to the input data, reusing `folder` rather
# than repeating the absolute path.
full_df_dedup.to_csv(f'{folder}/cosmos-matched-df.csv')