In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import ast

import sys
from tqdm import tqdm
import glob
import os
import time
from IPython.display import clear_output

from astropy.io import fits
from astropy.table import Table
from astropy.coordinates import SkyCoord
import astropy.units as u
from astropy.wcs import WCS
from astropy.wcs.utils import skycoord_to_pixel

In [2]:
# Machine-specific absolute paths; edit these before running elsewhere.
# data_folder: holds secondaries-identified.csv (read below).
data_folder = 'C:/Users/oryan/Documents/mergers-in-cosmos/back-to-basics-data'
# folder, student_folder and results_folder are not referenced in the cells visible here.
folder = 'C:/Users/oryan/Documents/mergers-in-cosmos'
student_folder = 'C:/Users/oryan/Documents/mergers-in-cosmos/followup-data'
results_folder = 'C:/Users/oryan/Documents/mergers-in-cosmos/back-to-basics-results'
# combine_folder: holds cosmos2020-categorised-interactions.csv and dor-restaging.csv.
combine_folder = 'C:/Users/oryan/Documents/mergers-in-cosmos/cats-to-combine'
# cosmos_folder: holds the COSMOS2020 catalogue FITS file.
cosmos_folder = 'E:/cosmos-data'
# fits_folder: holds one '<ORyan23_ID>.fits' image per source (opened in the re-staging loop).
fits_folder = 'E:/cosmos-fits'

### Importing Data

In [3]:
# Read the data of extension 1 of the COSMOS2020 catalogue file into the
# module-level `data`, which getting_secondary() later filters by position.
with fits.open(f'{cosmos_folder}/COSMOS2020_CLASSIC_R1_v2.1_p3.fits.gz') as hdul:
    data = hdul[1].data

In [4]:
# Categorised interaction catalogue: one row per ORyan23_ID with its `category`
# (stage1..stage4 in the output below) plus the matched catalogue columns.
# The first CSV column is used as the row index.
df_cat = pd.read_csv(f'{combine_folder}/cosmos2020-categorised-interactions.csv', index_col = 0)
df_cat

Unnamed: 0,ORyan23_ID,ID,category,ALPHA_J2000,DELTA_J2000,X_IMAGE,Y_IMAGE,ERRX2_IMAGE,ERRY2_IMAGE,ERRXY_IMAGE,...,ez_ssfr_p025,ez_ssfr_p160,ez_ssfr_p500,ez_ssfr_p840,ez_ssfr_p975,ez_Av_p025,ez_Av_p160,ez_Av_p500,ez_Av_p840,ez_Av_p975
0,4000705532455,816891,stage3,150.679846,2.196543,9200.399414,21737.232422,0.000052,0.000059,-1.043282e-05,...,-8.931825,-8.911355,-8.899668,-8.883307,-8.872990,0.529107,0.560142,0.599624,0.630920,0.652330
1,4000705533383,861738,stage1,150.645118,2.237538,10033.689453,22720.843750,0.000039,0.000078,1.332813e-05,...,-9.574272,-9.548649,-9.503962,-9.439343,-9.384205,0.240042,0.281559,0.330791,0.384371,0.436491
2,4000705539529,1284864,stage1,149.686223,2.637412,33027.406250,32317.517578,0.000031,0.000030,1.244353e-06,...,-9.669377,-9.614943,-9.570104,-9.533121,-9.476167,0.094187,0.114436,0.140990,0.159430,0.183608
3,4000705539941,1301188,stage2,149.702462,2.654895,32637.949219,32737.013672,0.000064,0.000104,-2.881107e-06,...,-9.794128,-9.784252,-9.766013,-9.743796,-9.727879,0.018737,0.018737,0.018737,0.018737,0.018737
4,4000705540034,1304538,stage1,149.726828,2.658172,32053.712891,32815.468750,0.000071,0.000086,-6.036952e-06,...,-9.480532,-9.443615,-9.402433,-9.352081,-9.325659,0.302629,0.376027,0.440394,0.508153,0.567727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3791,6000535066150,1157587,stage4,150.422510,2.514655,15373.875977,29370.304688,0.000024,0.000027,8.697323e-06,...,-13.220035,-13.215519,-10.469687,-9.954029,-9.691111,0.173505,0.240862,0.328586,0.439079,0.546386
3792,6000535121452,1035987,stage1,149.694761,2.397231,32824.328125,26552.759766,0.000012,0.000015,-7.174042e-07,...,-10.287872,-10.060802,-9.853634,-9.456022,-9.315375,0.343488,0.463897,0.646257,1.016819,1.285051
3793,6000535865556,1138628,stage1,150.490889,2.496660,13734.171875,28938.828125,0.001929,0.001333,1.953999e-04,...,-9.875473,-9.817194,-9.580817,-9.265499,-9.099182,0.026873,0.026873,0.164250,0.549086,0.751945
3794,6000536013605,1203157,stage1,150.202813,2.558504,20641.730469,30421.910156,0.000051,0.000067,4.840965e-07,...,-9.804204,-9.683063,-9.470870,-9.189605,-8.982660,0.539500,0.595082,0.778112,1.062422,1.232495


In [5]:
# Secondary-identification labels. The CSV's first column is read as the index,
# then moved back into a regular column by reset_index() (which names it 'index');
# the rename gives it the join key name 'ORyan23_ID' and renames the value
# column '0' to 'sec_clsf'.
df_clsf = pd.read_csv(f'{data_folder}/secondaries-identified.csv', index_col = 0).reset_index().rename(columns = {'index' : 'ORyan23_ID', '0' : 'sec_clsf'})
df_clsf

Unnamed: 0,ORyan23_ID,sec_clsf
0,4000705532455,n
1,4000705532984,nm
2,4000705533312,y
3,4000705533383,n
4,4000705539435,l
...,...,...
3781,6000536185363,nm
3782,6000536185496,n
3783,6000536185585,b
3784,6000536185603,nm


In [6]:
# Attach the secondary-identification label to every catalogue row.
# A left join keeps all rows of df_cat; sources without a label get NaN in sec_clsf.
df_merge = pd.merge(
    df_cat,
    df_clsf,
    how = 'left',
    on = 'ORyan23_ID',
)

In [7]:
# Spot-check the join: sec_clsf should now be the last column.
df_merge.head()

Unnamed: 0,ORyan23_ID,ID,category,ALPHA_J2000,DELTA_J2000,X_IMAGE,Y_IMAGE,ERRX2_IMAGE,ERRY2_IMAGE,ERRXY_IMAGE,...,ez_ssfr_p160,ez_ssfr_p500,ez_ssfr_p840,ez_ssfr_p975,ez_Av_p025,ez_Av_p160,ez_Av_p500,ez_Av_p840,ez_Av_p975,sec_clsf
0,4000705532455,816891,stage3,150.679846,2.196543,9200.399414,21737.232422,5.2e-05,5.9e-05,-1e-05,...,-8.911355,-8.899668,-8.883307,-8.87299,0.529107,0.560142,0.599624,0.63092,0.65233,n
1,4000705533383,861738,stage1,150.645118,2.237538,10033.689453,22720.84375,3.9e-05,7.8e-05,1.3e-05,...,-9.548649,-9.503962,-9.439343,-9.384205,0.240042,0.281559,0.330791,0.384371,0.436491,n
2,4000705539529,1284864,stage1,149.686223,2.637412,33027.40625,32317.517578,3.1e-05,3e-05,1e-06,...,-9.614943,-9.570104,-9.533121,-9.476167,0.094187,0.114436,0.14099,0.15943,0.183608,n
3,4000705539941,1301188,stage2,149.702462,2.654895,32637.949219,32737.013672,6.4e-05,0.000104,-3e-06,...,-9.784252,-9.766013,-9.743796,-9.727879,0.018737,0.018737,0.018737,0.018737,0.018737,n
4,4000705540034,1304538,stage1,149.726828,2.658172,32053.712891,32815.46875,7.1e-05,8.6e-05,-6e-06,...,-9.443615,-9.402433,-9.352081,-9.325659,0.302629,0.376027,0.440394,0.508153,0.567727,n


In [8]:
# Number of sources per original stage category.
df_merge.category.value_counts()

stage4    1165
stage2    1063
stage1     793
stage3     775
Name: category, dtype: int64

In [9]:
# Total number of rows after the join.
len(df_merge)

3796

### Checking the classifications by eye

In [10]:
def getting_secondary(ra, dec, z, prim_id, half_width_arcsec = 15):
    """Return catalogue sources in a square box around (ra, dec), excluding the primary.

    The box is cut directly on the catalogue's ALPHA_J2000 / DELTA_J2000 columns
    (both compared in degrees); no cos(dec) factor is applied to the RA half-width.

    Parameters
    ----------
    ra, dec : float
        Centre of the search box, in the same units as ALPHA_J2000 / DELTA_J2000.
    z : float
        Unused. Kept so existing callers that pass a redshift keep working.
    prim_id : int
        Catalogue ``ID`` of the primary; that row is removed from the result.
    half_width_arcsec : float, optional
        Half the side length of the box in arcseconds (default 15, the value
        that was previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``ALPHA_J2000``, ``DELTA_J2000`` and ``redshift``.
        Rows with a missing value in any of those columns are dropped.

    Notes
    -----
    Reads the module-level catalogue ``data``.
    """
    half_width_deg = half_width_arcsec / (60*60)

    # Fetch each coordinate column once instead of once per comparison.
    cat_ra = data['ALPHA_J2000']
    cat_dec = data['DELTA_J2000']
    in_box = (
        (cat_ra > ra - half_width_deg) & (cat_ra < ra + half_width_deg)
        & (cat_dec > dec - half_width_deg) & (cat_dec < dec + half_width_deg)
    )

    df = Table(data[in_box]).to_pandas()[['ID', 'ALPHA_J2000', 'DELTA_J2000', 'ez_z_phot', 'lp_zBEST']]

    df_z = (
        df
        # np.fmin is the element-wise form of picking_z: the smaller of the two
        # redshifts, the non-NaN one if only one is present, NaN if both are NaN.
        # Unlike a row-wise apply it also works when the box is empty.
        .assign(redshift = np.fmin(df.ez_z_phot, df.lp_zBEST))
        .drop(columns = ['ez_z_phot', 'lp_zBEST'])
        .dropna()
        .query('ID != @prim_id')
    )

    return df_z

In [11]:
def picking_z(ez_z, lp_z):
    """Pick a single redshift from the two photometric estimates.

    Parameters
    ----------
    ez_z : float
        First redshift estimate (the callers pass the ``ez_z_phot`` column).
    lp_z : float
        Second redshift estimate (the callers pass the ``lp_zBEST`` column).

    Returns
    -------
    float
        The smaller of the two values when both are present, the one that is
        present when the other is NaN, and NaN when both are NaN.
    """
    ez_missing = np.isnan(ez_z)
    lp_missing = np.isnan(lp_z)

    if ez_missing and lp_missing:
        return np.nan
    if ez_missing:
        return lp_z
    if lp_missing:
        return ez_z

    # Both values are real numbers here, so one of the two comparisons always
    # holds; the old fall-through branch returning a string could never run.
    return lp_z if lp_z < ez_z else ez_z

In [12]:
# Load the re-staging results saved so far (this file is rewritten further down),
# so the interactive loop can skip sources that are already classified.
# NOTE(review): the file must already exist for this cell to run - confirm for a first run.
df_dor_clsf = pd.read_csv(f'{combine_folder}/dor-restaging.csv', index_col = 0)

In [13]:
# Dict keyed by the CSV's index value (the source ID), each value a dict of that row's columns.
dor_clsf = df_dor_clsf.to_dict(orient = 'index')

In [14]:
# Interactive re-staging loop.
# For every source not yet present in dor_clsf: show its image with the primary
# (black) and neighbouring catalogue sources (red, labelled "<redshift>, <number>"),
# then ask for the stage, the primary, the secondary and any further interactors.
# Answers are stored in dor_clsf[sourceid]; nothing is written to disk here.
known_stages = ('stage1', 'stage2', 'stage3', 'stage4')
max_redshift = 1.2  # sources above this redshift are recorded as unclassified without inspection

for i in tqdm(range(len(df_merge))):
    sourceid = df_merge.ORyan23_ID.iloc[i]

    # Dict membership is O(1); no need to build a list of keys every iteration.
    if sourceid in dor_clsf:
        continue

    # Row i is the row being classified, so read its fields by position
    # rather than searching the whole frame for the ID again.
    ra = df_merge['ALPHA_J2000'].iloc[i]
    dec = df_merge['DELTA_J2000'].iloc[i]
    z_ez = df_merge['ez_z_phot'].iloc[i]
    z_lp = df_merge['lp_zBEST'].iloc[i]
    id_str = df_merge['ID'].iloc[i]

    redshift = picking_z(z_ez, z_lp)
    if redshift > max_redshift:
        dor_clsf[sourceid] = {'Stage' : np.nan, 'prim_id' : np.nan, 'sec_id' : np.nan, 'other_ints' : np.nan}
        continue

    prim_coord = SkyCoord(ra = ra * u.deg, dec = dec * u.deg, frame = 'fk5')

    df_secs = getting_secondary(ra, dec, z_ez, id_str)

    filename = f'{fits_folder}/{sourceid}.fits'

    with fits.open(filename) as hdul:
        im_data = hdul[0].data
        header = hdul[0].header

    w = WCS(header)

    # The position of a neighbour in these lists is the number shown next to it
    # in the plot and the number the user types in the prompts below.
    z_list = list(df_secs.redshift)
    ids_list = list(df_secs.ID)

    if len(df_secs) > 0:
        # Convert all neighbours to pixel coordinates in one call.
        sec_coords = SkyCoord(
            ra = df_secs.ALPHA_J2000.values * u.deg,
            dec = df_secs.DELTA_J2000.values * u.deg,
            frame = 'fk5'
        )
        pixs_arr = np.column_stack(skycoord_to_pixel(sec_coords, w, origin = 0))
    else:
        # Keep a (0, 2) shape so the column slicing below works with no neighbours.
        pixs_arr = np.empty((0, 2))

    prim_pix = np.array(skycoord_to_pixel(prim_coord, w, origin = 0))

    fig, ax = plt.subplots(ncols = 2, figsize = (16,8))
    ax[0].imshow(np.log10(im_data), origin = 'lower')
    ax[0].scatter(prim_pix[0], prim_pix[1], s = 10, color='black')
    ax[0].annotate(np.round(redshift, 3), (prim_pix[0], prim_pix[1]), color = 'black')
    ax[0].scatter(pixs_arr[:,0], pixs_arr[:,1], s = 10, color = 'red')
    for j, txt in enumerate(z_list):
        ax[0].annotate(f'{np.round(txt, 3)}, {j}', (pixs_arr[j,0], pixs_arr[j,1]), color = 'red')
    # Zoomed view of pixels 350-650 on both axes.
    # NOTE(review): presumably the image centre - confirm the cutout size.
    ax[1].imshow(np.log10(im_data[350:650,350:650]), origin = 'lower')
    plt.show()

    # Ask for the stage. Anything other than a known stage or 'nm' must be
    # confirmed, so a typo is not silently stored as a new category.
    while True:
        stage_ans = input('What stage is the object?')
        if stage_ans == 'nm' or stage_ans in known_stages:
            break
        keep_ans = input(f'"{stage_ans}" is not one of {known_stages} or nm. Keep it anyway? (y/n)')
        if keep_ans == 'y':
            break

    if stage_ans == 'nm':
        dor_clsf[sourceid] = {'Stage' : np.nan, 'prim_id' : np.nan, 'sec_id' : np.nan, 'other_ints' : np.nan}
        clear_output(wait=True)
        plt.close(fig)
        continue

    # Primary: either the catalogue primary or one of the numbered neighbours.
    while True:
        prim_bool = input('Is the primary correct?')
        if prim_bool == 'y':
            prim_ID = id_str
            break
        elif prim_bool == 'n':
            try:
                prim_ID = ids_list[int(input('What number is the correct primary?'))]
            except (ValueError, IndexError):
                # Bad number: ask the whole question again instead of crashing.
                print('That is not one of the numbers shown in the image.')
                continue
            break
        else:
            print('Please input y or n')

    # Secondary: one of the numbered neighbours, or NaN if it is not in the image.
    while True:
        sec_bool = input('Is the secondary in the image?')
        if sec_bool == 'y':
            try:
                sec_ID = ids_list[int(input('What number is the correct secondary?'))]
            except (ValueError, IndexError):
                print('That is not one of the numbers shown in the image.')
                continue
            break
        elif sec_bool == 'n':
            sec_ID = np.nan
            break
        else:
            print('Please input y or n')

    # Further interactors: a Python-style list of neighbour numbers, e.g. [0, 3].
    while True:
        more_bool = input('Is there more interacting galaxies in the image?')
        if more_bool == 'y':
            tmp = input('Please input a list of numbers of interactors.')
            try:
                more_ints = [ids_list[k] for k in ast.literal_eval(tmp)]
            except (ValueError, SyntaxError, TypeError, IndexError):
                print('Could not read that as a list of numbers shown in the image, e.g. [0, 3].')
                continue
            break
        elif more_bool == 'n':
            more_ints = []
            break
        else:
            print('Please input y or n.')

    dor_clsf[sourceid] = {'Stage' : stage_ans, 'prim_id' : prim_ID, 'sec_id' : sec_ID, 'other_ints' : more_ints}

    clear_output(wait = True)
    plt.close(fig)

100%|██████████| 3796/3796 [00:04<00:00, 779.84it/s] 


In [15]:
# Rebuild the results table from the dict: one row per source ID (the dict key),
# with columns Stage, prim_id, sec_id and other_ints.
df_dor_clsf = pd.DataFrame.from_dict(dor_clsf, orient = 'index')
df_dor_clsf

Unnamed: 0,Stage,prim_id,sec_id,other_ints
4000705532455,stage4,816891.0,,[]
4000705533383,stage3,860686.0,,[]
4000705539941,stage2,1301188.0,,[]
4000705540034,stage3,1304538.0,,"[1307416, 1304139]"
4000705540064,stage3,1310001.0,1311422.0,[]
...,...,...,...,...
4001111524196,stage2,991056.0,988878.0,[]
4001185627710,stage2,1294479.0,1295082.0,"[1292987, 1298256]"
6000508613553,stage2,1140076.0,,[1135646]
6000518662663,stage2,575198.0,588501.0,[]


In [16]:
# Count the re-staged labels, ignoring rows where every column is NaN
# (the entries recorded as unclassified in the loop above).
df_dor_clsf.dropna(how = 'all').Stage.value_counts()

stage3       933
stage2       758
stage4       607
stage1       278
staage3        2
supernova      1
Name: Stage, dtype: int64

In [17]:
# Save progress by overwriting the same file that was loaded before the loop.
# NOTE(review): other_ints holds Python lists, which a CSV stores as text; after
# reloading they are presumably strings rather than lists - confirm before reuse.
df_dor_clsf.to_csv(f'{combine_folder}/dor-restaging.csv')

In [18]:
# Put the original `category` next to each re-staged label.
# reset_index() exposes the source ID as the column 'index', which is matched
# against ORyan23_ID; the left join keeps every re-staged row.
df_clsf_merge = pd.merge(
    df_dor_clsf.reset_index(),
    df_cat[['ORyan23_ID', 'category']],
    how = 'left',
    left_on = 'index',
    right_on = 'ORyan23_ID',
)

In [19]:
# Flag each source 'y' when the re-staged label equals the original category, else 'n'.
# A missing Stage never equals a category, so unclassified sources count as 'n'.
df_ag = df_clsf_merge.assign(
    agree = [
        'y' if new_stage == old_stage else 'n'
        for new_stage, old_stage in zip(df_clsf_merge.Stage, df_clsf_merge.category)
    ]
)
df_ag.agree.value_counts()

n    2703
y    1093
Name: agree, dtype: int64