In [1]:
import pandas as pd
import numpy as np
from IPython.display import clear_output

from astropy.io import fits
from astropy.table import Table
from astropy.cosmology import FlatLambdaCDM
from astropy.coordinates import SkyCoord
import astropy.units as u

import sys
import os
import glob
from tqdm import tqdm

In [2]:
folder = 'C:/Users/oryan/Documents/mergers-in-cosmos'
data_folder = f'{folder}/data'
results = f'{folder}/results'

drive_folder = 'E:/cosmos-data'

In [3]:
with fits.open(f'{drive_folder}/COSMOS2020_CLASSIC_R1_v2.1_p3.fits.gz') as hdul:
    header = hdul[1].header
    data = hdul[1].data

In [4]:
df = pd.read_csv(f'{data_folder}/catalogue-matched-cosmos-2020.csv', index_col = 0)

In [5]:
data_red = Table(data)
data_red = data_red[data_red['lp_type'] == 0]
del data

In [6]:
data = data_red[['ID', 'ez_z_phot', 'lp_mass_best']]
data_coords = data_red[['ID', 'ALPHA_J2000', 'DELTA_J2000']]

In [7]:
del data_red

In [8]:
data_tmp = data[(~np.isnan(data['ez_z_phot']))]

In [9]:
data = data_tmp[(~np.isnan(data_tmp['lp_mass_best']))]

In [10]:
del data_tmp

In [11]:
df

Unnamed: 0,SourceID,ID_1,ALPHA_J2000_1,DELTA_J2000_1,X_IMAGE_1,Y_IMAGE_1,ERRX2_IMAGE_1,ERRY2_IMAGE_1,ERRXY_IMAGE_1,FLUX_RADIUS_1,...,ez_ssfr_p025_2,ez_ssfr_p160_2,ez_ssfr_p500_2,ez_ssfr_p840_2,ez_ssfr_p975_2,ez_Av_p025_2,ez_Av_p160_2,ez_Av_p500_2,ez_Av_p840_2,ez_Av_p975_2
0,4000705532984,857121.0,150.673667,2.226291,9348.870117,22451.160156,0.000002,0.000004,1.265998e-08,8.876858,...,-8.385987,-8.323476,-8.259419,-8.171948,-8.081048,0.595371,0.783795,0.924471,1.022045,1.076083
1,4000705533312,873195.0,150.668102,2.242849,9482.499023,22848.505859,0.000007,0.000002,1.829277e-06,5.542504,...,-8.962106,-8.711401,-8.478580,-8.303174,-8.135300,0.082679,0.231435,0.436111,0.622873,0.829844
2,4000705533383,861738.0,150.645118,2.237538,10033.689453,22720.843750,0.000039,0.000078,1.332813e-05,5.169795,...,-9.779914,-9.521317,-9.042374,-8.946216,-8.919963,0.570974,0.686736,0.964232,1.396826,1.587413
3,4000705539435,1280765.0,149.702469,2.636086,32637.894531,32285.564453,0.000561,0.000900,1.408110e-05,4.764572,...,-10.821019,-10.378546,-10.191748,-10.048404,-9.860973,0.251250,0.657133,1.055286,1.348915,1.713512
4,4000705539529,1284864.0,149.686223,2.637412,33027.406250,32317.517578,0.000031,0.000030,1.244353e-06,6.412690,...,-8.389942,-8.324630,-8.253101,-8.180449,-8.090549,0.360573,0.404528,0.450688,0.494164,0.550538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3735,6000536185363,1299189.0,150.498992,2.652105,13540.829102,32669.794922,0.000034,0.000021,-3.025893e-06,6.936677,...,-9.307705,-9.179163,-9.043135,-8.935645,-8.897905,0.939483,0.982316,1.048312,1.108769,1.154593
3736,6000536185496,1308675.0,150.501162,2.663109,13488.861328,32933.941406,0.000085,0.000062,1.369181e-05,5.260444,...,-9.680077,-9.645065,-9.610534,-9.580824,-9.555204,0.058711,0.189657,0.304278,0.435281,0.542515
3737,6000536185585,1317966.0,150.487598,2.671303,13814.117188,33130.523438,0.000014,0.000013,-3.564242e-06,5.304535,...,-8.546148,-8.497480,-8.455174,-8.411234,-8.386809,0.948853,0.995218,1.045390,1.088478,1.127819
3738,6000536185603,1320733.0,150.508423,2.673144,13314.836914,33174.851562,0.000021,0.000046,-4.340081e-06,4.414364,...,-9.697964,-9.584126,-9.493108,-9.432188,-9.395185,0.026873,0.026873,0.120597,0.258239,0.321697


In [12]:
df_red = df[['ID_1', 'ID_2', 'lp_mass_best_1', 'lp_mass_best_2', 'ez_z_phot_1','ez_z_phot_2']].dropna()

In [13]:
df_prim = df_red[['ID_1', 'lp_mass_best_1', 'ez_z_phot_1']].rename(columns = {'ID_1':'ID','lp_mass_best_1' : 'lp_mass_best', 'ez_z_phot_1' : 'ez_z_phot'})
df_sec = df_red[['ID_2', 'lp_mass_best_2', 'ez_z_phot_2']].rename(columns = {'ID_2':'ID','lp_mass_best_2' : 'lp_mass_best', 'ez_z_phot_2' : 'ez_z_phot'})

In [14]:
df_comb = pd.concat([df_prim, df_sec]).drop_duplicates('ID', keep='first')

In [15]:
df_comb

Unnamed: 0,ID,lp_mass_best,ez_z_phot
0,857121.0,10.49288,0.219858
1,873195.0,10.18078,0.346519
2,861738.0,9.36051,0.619803
3,1280765.0,9.04580,1.641619
4,1284864.0,10.17034,0.705821
...,...,...,...
3734,1294513.0,8.18020,0.487480
3735,1290594.0,7.76831,0.372096
3736,1301552.0,8.89540,0.914561
3737,1315336.0,8.49561,0.678499


In [16]:
sample_dict = df_comb.set_index('ID').to_dict(orient = 'index')

In [17]:
def matching_control_sample(sourceid, mass, z_phot, data):
    data_z_red= data[(data['ez_z_phot'] >= z_phot - 0.005) & (data['ez_z_phot'] <= z_phot + 0.005)]
    
    data_m_red = data_z_red[(data_z_red['lp_mass_best'] >= mass - 0.01) & (data_z_red['lp_mass_best'] <= mass + 0.01)]
    
    data_id_red = data_m_red[data_m_red['ID'] != sourceid]
    
    if len(data_id_red) < 0.5:
        return []
    
    ids_list = list(data_id_red['ID'])
    
    return ids_list

In [18]:
control_dict = {}
for sourceid, values in tqdm(list(sample_dict.items())):
    control_dict[sourceid] = matching_control_sample(sourceid, values['lp_mass_best'], values['ez_z_phot'], data)

100%|██████████| 6842/6842 [00:54<00:00, 126.32it/s]


In [19]:
exist_list = []
control_dict_dedup = {}
for i in tqdm(list(control_dict.keys())):
    control_list = control_dict[i]
    dedup_list = []
    for j in control_list:
        if j not in exist_list:
            exist_list.append(j)
            dedup_list.append(j)
    control_dict_dedup[i] = dedup_list

100%|██████████| 6842/6842 [09:58<00:00, 11.43it/s] 


In [35]:
del exist_list, dedup_list

NameError: name 'exist_list' is not defined

In [41]:
control_df_tmp = pd.DataFrame(list(control_dict_dedup.keys())).rename(columns = {0 : 'ID'})

In [44]:
control_df = control_df_tmp.assign(matched_ids = control_df_tmp.ID.apply(lambda x: control_dict_dedup[x]))

In [45]:
control_df.to_csv(f'{results}/all-matched-control-ids.csv')

In [52]:
def rand_entry(ids):
    index = np.random.randint(len(ids))
    return ids[index]

In [58]:
reduced_control = (
    control_df
    .assign(control_ID = control_df.matched_ids.apply(lambda x: rand_entry(x) if len(x) > 0 else np.nan))
    .drop(columns = 'matched_ids')
    .dropna()
)

In [62]:
reduced_control.to_csv(f'{results}/control_manifest.csv')