### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast

import sys
from tqdm import tqdm
tqdm.pandas()
import glob
import os
import time
from IPython.display import clear_output

from astropy.io import fits
from astropy.table import Table
from astropy.coordinates import SkyCoord
import astropy.units as u
from astropy.wcs import WCS
from astropy.wcs.utils import skycoord_to_pixel

In [2]:
# Project root on the local machine; every project sub-folder below is derived
# from it so the location only has to be changed in one place.
folder = 'C:/Users/oryan/Documents/mergers-in-cosmos'
data_folder = f'{folder}/back-to-basics-data'
student_folder = f'{folder}/followup-data'
results_folder = f'{folder}/back-to-basics-results'
combine_folder = f'{folder}/cats-to-combine'

# Large inputs are read from a separate drive.
cosmos_folder = 'E:/cosmos-data'
fits_folder = 'E:/cosmos-fits'

### Checking Classifications

In [3]:
df_clsf = pd.read_csv(f'{combine_folder}/dor-restaging.csv', index_col = 0)
df_clsf

Unnamed: 0,Stage,prim_id,sec_id,other_ints
4000705532455,stage4,816891.0,,[]
4000705533383,stage3,860686.0,,[]
4000705539941,stage2,1301188.0,,[]
4000705540034,stage3,1304538.0,,"[1307416, 1304139]"
4000705540064,stage3,1310001.0,1311422.0,[]
...,...,...,...,...
4001111524196,stage2,991056.0,988878.0,[]
4001185627710,stage2,1294479.0,1295082.0,"[1292987, 1298256]"
6000508613553,stage2,1140076.0,,[1135646]
6000518662663,stage2,575198.0,588501.0,[]


In [4]:
# Primaries: keep the first row seen for each prim_id, then drop any row that is
# missing either Stage or prim_id.
df_prim = df_clsf[['Stage', 'prim_id']].drop_duplicates(subset = 'prim_id').dropna()
# Secondaries: same treatment, keyed on sec_id.
df_sec = df_clsf[['Stage', 'sec_id']].drop_duplicates(subset = 'sec_id').dropna()
# Only the 'other_ints' column (still indexed like df_clsf); its cells are
# parsed later with ast.literal_eval.
df_extra = df_clsf[['other_ints']]

In [5]:
prim_ids = list(df_prim.dropna().prim_id)

In [6]:
sec_dict = df_sec.dropna().to_dict(orient = 'index')

In [8]:
len(sec_dict)

831

### Sorting Out Primaries and Secondaries

In [9]:
# Keep only the secondaries whose ID is not already present among the primaries.
# A set gives constant-time membership tests instead of scanning the prim_ids
# list once per secondary.
prim_id_lookup = set(prim_ids)
red_sec_dict = {}
for i in tqdm(list(sec_dict.keys())):
    sec_id = sec_dict[i]['sec_id']
    if sec_id not in prim_id_lookup:
        red_sec_dict[i] = sec_dict[i]

100%|██████████| 831/831 [00:00<00:00, 36834.91it/s]


In [10]:
len(red_sec_dict)

781

In [11]:
red_sec_df = pd.DataFrame.from_dict(red_sec_dict, orient = 'index')
red_sec_df

Unnamed: 0,Stage,sec_id
4000705540519,stage1,1324784.0
4000705547180,stage1,1266054.0
4000706649921,stage3,840582.0
4000707046655,stage1,214433.0
4000707047768,stage1,246358.0
...,...,...
4001024667582,stage3,353054.0
4001051778929,stage1,1293175.0
4001111524196,stage2,988878.0
4001185627710,stage2,1295082.0


In [12]:
df_prims_secs = (
    pd.concat([
        df_prim.reset_index().rename(columns = {'index' : 'ORyanID_23', 'prim_id' : 'ID'}), 
        red_sec_df.reset_index().rename(columns = {'index' : 'ORyanID_23', 'sec_id' : 'ID'})
    ])
    .astype({'ID' : 'int'})
)

In [13]:
df_prims_secs

Unnamed: 0,ORyanID_23,Stage,ID
0,4000705532455,stage4,816891
1,4000705533383,stage3,860686
2,4000705539941,stage2,1301188
3,4000705540034,stage3,1304538
4,4000705540064,stage3,1310001
...,...,...,...
776,4001024667582,stage3,353054
777,4001051778929,stage1,1293175
778,4001111524196,stage2,988878
779,4001185627710,stage2,1295082


### Looking at Extras

In [14]:
all_ids = list(df_prims_secs.ID)

In [15]:
extra_dict = df_extra.to_dict(orient = 'index')

In [16]:
# For every source, collect the 'other_ints' IDs that are neither a known
# primary/secondary (all_ids) nor already recorded for an earlier source.
#   extra_ids      : {source id -> [new IDs first seen in that source]}
#   extra_ids_list : the same new IDs, flattened in discovery order
extra_ids = {}
extra_ids_list = []
# One set holds everything seen so far, so each membership test is O(1)
# instead of scanning two lists.
known_ids = set(all_ids)
for i in tqdm(list(extra_dict.keys())):

    other_ints = extra_dict[i]['other_ints']

    if not isinstance(other_ints, str):
        # A non-string cell is only tolerated when it is NaN (missing value).
        if np.isnan(other_ints):
            continue
        else:
            print('Something else going on.')
            sys.exit()

    # The cell holds a stringified Python list, e.g. "[1307416, 1304139]".
    ids_list = ast.literal_eval(other_ints)

    for j in ids_list:
        if j in known_ids:
            continue

        known_ids.add(j)
        extra_ids.setdefault(i, []).append(j)
        extra_ids_list.append(j)
    

100%|██████████| 3043/3043 [00:00<00:00, 22248.18it/s]


### Time to Classify Now...

In [17]:
# Load the table stored in HDU 1 of the COSMOS2020 CLASSIC catalogue file into
# the global `data`; getting_secondary and the staging loop below read it.
with fits.open(f'{cosmos_folder}/COSMOS2020_CLASSIC_R1_v2.1_p3.fits.gz') as hdul:
    data = hdul[1].data

FileNotFoundError: [Errno 2] No such file or directory: 'E:/cosmos-data/COSMOS2020_CLASSIC_R1_v2.1_p3.fits.gz'

In [18]:
def getting_secondary(ra, dec, z, prim_id, radius_arcsec = 15):
    """Return the catalogue objects inside a square box centred on (ra, dec).

    Reads the module-level catalogue `data` and uses `picking_z` to pick one
    redshift per object.

    Parameters
    ----------
    ra, dec : float
        Box centre, in the units of the ALPHA_J2000 / DELTA_J2000 columns
        (the caller treats them as degrees).
    z : float
        Accepted for call compatibility; not used by the selection.
    prim_id : int
        Catalogue ID of the central object; it is excluded from the result.
    radius_arcsec : float, optional
        Half-width of the box in arcseconds (default 15, the value that was
        previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns ID, ALPHA_J2000, DELTA_J2000 and redshift. Rows with any
        missing value are dropped. May be empty.

    Notes
    -----
    The RA half-width is not scaled by cos(dec).
    NOTE(review): presumably fine for a near-equatorial field - confirm.
    """
    half_width = radius_arcsec / (60*60)  # arcsec -> degrees

    in_box = (
        (data['ALPHA_J2000'] > ra - half_width)
        & (data['ALPHA_J2000'] < ra + half_width)
        & (data['DELTA_J2000'] > dec - half_width)
        & (data['DELTA_J2000'] < dec + half_width)
    )

    df = Table(data[in_box]).to_pandas()[['ID', 'ALPHA_J2000', 'DELTA_J2000', 'ez_z_phot', 'lp_zBEST']]

    # Build the redshift column explicitly rather than with a row-wise apply:
    # on an empty frame the apply hands back a DataFrame, not a Series, which
    # cannot be assigned to a single column.
    redshift = pd.Series(
        [picking_z(ez_z, lp_z) for ez_z, lp_z in zip(df.ez_z_phot, df.lp_zBEST)],
        index = df.index,
        dtype = float,
    )

    df_z = (
        df
        .assign(redshift = redshift)
        .drop(columns = ['ez_z_phot', 'lp_zBEST'])
        .dropna()
        .query('ID != @prim_id')
    )

    return df_z

In [19]:
def picking_z(ez_z, lp_z):
    """Choose a single redshift from the two catalogue estimates.

    Parameters
    ----------
    ez_z, lp_z : float
        The two redshift estimates; either may be NaN.

    Returns
    -------
    float
        The estimate that is not NaN when only one is available, the smaller
        of the two when both are available (ez_z on a tie), and NaN when
        neither is.
    """
    ez_missing = np.isnan(ez_z)
    lp_missing = np.isnan(lp_z)

    if ez_missing and lp_missing:
        return np.nan
    if ez_missing:
        return lp_z
    if lp_missing:
        return ez_z

    # Both are real numbers here, so one of the two comparisons always holds;
    # the old fall-through that returned the string 'what' could never run.
    return lp_z if lp_z < ez_z else ez_z

In [20]:
def get_correct_pix(cen_pix, half_width = 150, max_pix = 1000):
    """Return integer bounds of a square window around a pixel position.

    Parameters
    ----------
    cen_pix : sequence of two numbers
        Centre of the window; element 0 and element 1 are handled
        independently (the caller passes an (x, y) pixel position).
    half_width : number, optional
        Half the side length of the window (default 150).
    max_pix : number, optional
        Largest allowed bound (default 1000).
        NOTE(review): presumably the cutout side length - confirm.

    Returns
    -------
    numpy.ndarray of int
        [lo_0, hi_0, lo_1, hi_1], each clamped to the range [0, max_pix].
    """
    first, second = cen_pix[0], cen_pix[1]
    cen_range = np.array([
        first - half_width, first + half_width,
        second - half_width, second + half_width,
    ])

    # Clamp before truncating to int so out-of-frame bounds land on the edge.
    return np.clip(cen_range, 0, max_pix).astype(int)

In [21]:
dor_extra_clsf_df = pd.read_csv(f'{combine_folder}/dor-extra-staged.csv', index_col = 0).set_index('ID')

In [22]:
dor_extra_clsf = dor_extra_clsf_df.to_dict(orient = 'index')

In [None]:
# Interactive staging of the extra objects. For each one, plot the source
# cutout with the object (black) and its catalogue neighbours (red, labelled
# "redshift, index"), plus a zoom on the object, then ask for a stage.
# Answers go into dor_extra_clsf; objects already in it are skipped, so the
# loop can be interrupted and resumed.
for i in tqdm(list(extra_ids.keys())):
    if not extra_ids[i]:
        print('Empty list?')
        sys.exit()

    # Direct dict membership, rather than rebuilding a list of keys per object.
    pending_ids = [j for j in extra_ids[i] if j not in dor_extra_clsf]
    if not pending_ids:
        # Nothing left to stage for this source: skip the file read.
        continue

    filename = f'{fits_folder}/{i}.fits'

    with fits.open(filename) as hdul:
        im_data = hdul[0].data
        header = hdul[0].header

    w = WCS(header)
    # The log-scaled image is the same for every object in this source.
    log_im = np.log10(im_data)

    for j in pending_ids:
        row = data[data['ID'] == j]

        ra = row['ALPHA_J2000'][0]
        dec = row['DELTA_J2000'][0]
        z_ez = row['ez_z_phot'][0]
        z_lp = row['lp_zBEST'][0]

        redshift = picking_z(z_ez, z_lp)

        prim_coord = SkyCoord(ra = ra * u.deg, dec = dec * u.deg, frame = 'fk5')
        prim_pix = np.array(skycoord_to_pixel(prim_coord, w, origin = 0))

        df_secs = getting_secondary(ra, dec, z_ez, j)
        z_list = list(df_secs.redshift)

        # Convert all neighbours in one call. With no neighbours, keep an
        # empty (0, 2) array so the column indexing below stays valid.
        if len(df_secs) > 0:
            sec_coords = SkyCoord(
                ra = df_secs.ALPHA_J2000.to_numpy() * u.deg,
                dec = df_secs.DELTA_J2000.to_numpy() * u.deg,
                frame = 'fk5',
            )
            sec_x, sec_y = skycoord_to_pixel(sec_coords, w, origin = 0)
            pixs_arr = np.column_stack([sec_x, sec_y])
        else:
            pixs_arr = np.empty((0, 2))

        fig, ax = plt.subplots(ncols = 2, figsize = (16,8))
        ax[0].imshow(log_im, origin = 'lower')
        ax[0].scatter(prim_pix[0], prim_pix[1], s = 10, color='black')
        ax[0].annotate(np.round(redshift, 3), (prim_pix[0], prim_pix[1]), color = 'black')
        ax[0].scatter(pixs_arr[:,0], pixs_arr[:,1], s = 10, color = 'red')
        for k, txt in enumerate(z_list):
            ax[0].annotate(f'{np.round(txt, 3)}, {k}', (pixs_arr[k,0], pixs_arr[k,1]), color = 'red')

        # Zoom panel: rows are indexed by elements 2:3, columns by elements 0:1.
        correct_cen_pix = get_correct_pix(prim_pix)
        ax[1].imshow(log_im[correct_cen_pix[2]:correct_cen_pix[3], correct_cen_pix[0]:correct_cen_pix[1]], origin = 'lower')
        plt.show()

        stage_ans = input('What stage is the object?')

        # 'nm' is stored as NaN so the later dropna() removes the object.
        dor_extra_clsf[j] = {'stage' : np.nan if stage_ans == 'nm' else stage_ans}

        clear_output(wait = True)
        # Close on every path; the 'nm' branch used to skip this and leak figures.
        plt.close(fig)

In [23]:
list(dor_extra_clsf.keys())[-1-1]

1524909

In [26]:
df_dor_df = pd.DataFrame.from_dict(dor_extra_clsf, orient = 'index').reset_index().rename(columns = {'index' : 'ID', 0 : 'stage'})

In [25]:
# df_dor_df.to_csv(f'{combine_folder}/dor-extra-staged.csv')

In [27]:
df_dor_red = df_dor_df.dropna()

### Linking SourceID to ID

In [30]:
df_dor_red.head()

Unnamed: 0,ID,stage
1,191454,stage3
2,192955,stage3
3,193620,stage4
5,198213,stage3
6,200082,stage3


In [31]:
df_extra.head()

Unnamed: 0,other_ints
4000705532455,[]
4000705533383,[]
4000705539941,[]
4000705540034,"[1307416, 1304139]"
4000705540064,[]


In [32]:
dict_extra = df_extra.dropna().to_dict(orient = 'index')

In [33]:
def get_sourceid(cos_id, dict_extra):
    """Find the source whose 'other_ints' list contains `cos_id`.

    `dict_extra` maps a source id to a dict whose 'other_ints' entry is a
    stringified list of IDs. Sources are checked in dictionary order and the
    first match wins. The result is the source id as text with '_extra'
    appended. If no source lists `cos_id`, a message is printed and the
    interpreter is asked to exit.
    """
    for source_id, entry in dict_extra.items():
        listed_ids = ast.literal_eval(entry['other_ints'])
        if cos_id not in listed_ids:
            continue
        return str(source_id) + '_extra'

    print('Something wrong...')
    sys.exit()

In [34]:
# Label every staged extra with the source it was listed under (get_sourceid
# returns '<source id>_extra') and rename 'stage' to 'Stage', the column name
# used by df_prims_secs.
df_dor_red_src = (
    df_dor_red
    .assign(ORyanID_23 = df_dor_red.ID.progress_apply(lambda x: get_sourceid(x, dict_extra)))
    .rename(columns = {'stage':'Stage'})
)

100%|██████████| 841/841 [00:06<00:00, 128.42it/s]


In [38]:
# NOTE: a leftover cell used to build `df_comb_clean` here from df_dor_red_src.
# It called `cleanup` before that function is defined (see "Cleaning Up"), so a
# fresh Restart & Run All stopped here with a NameError. Its result was also
# overwritten before any use by the `df_comb_clean` built from the combined
# table `df_comb` further down, so nothing is computed at this point.

### Combining Data

In [35]:
df_prims_secs

Unnamed: 0,ORyanID_23,Stage,ID
0,4000705532455,stage4,816891
1,4000705533383,stage3,860686
2,4000705539941,stage2,1301188
3,4000705540034,stage3,1304538
4,4000705540064,stage3,1310001
...,...,...,...
776,4001024667582,stage3,353054
777,4001051778929,stage1,1293175
778,4001111524196,stage2,988878
779,4001185627710,stage2,1295082


In [35]:
df_comb = pd.concat([df_prims_secs, df_dor_red_src])
len(df_comb)

4188

In [36]:
df_comb.Stage.value_counts()

stage3       1522
stage2       1043
stage1        858
stage4        755
staage3         3
n               3
m,              2
supernova       1
bn              1
Name: Stage, dtype: int64

### Cleaning Up

In [37]:
def cleanup(stage):
    """Normalise a hand-typed stage label.

    'stage1' to 'stage4' are returned unchanged, the typo 'staage3' is mapped
    to 'stage3', and every other value becomes NaN.
    """
    recognised = ('stage1', 'stage2', 'stage3', 'stage4')
    if stage in recognised:
        return stage

    # The one misspelling that gets corrected rather than discarded.
    return 'stage3' if stage == 'staage3' else np.nan

In [38]:
# Add a 'category' column: the normalised stage label, NaN where the typed
# label is not recognised by cleanup.
stage_category = df_comb.Stage.apply(cleanup)
df_comb_clean = df_comb.assign(category = stage_category)

In [39]:
df_comb_clean.dropna().category.value_counts()

stage3    1525
stage2    1043
stage1     858
stage4     755
Name: category, dtype: int64

In [40]:
# Final classification table: drop every row with a missing value in any
# column (this includes rows whose label cleanup turned into NaN), then drop
# the raw 'Stage' column so only the cleaned 'category' remains.
df_dropna = (
    df_comb_clean
    .dropna()
    .drop(columns = 'Stage')
)

In [42]:
df_dropna.to_csv(f'{combine_folder}/dor-all-restaged.csv')

In [43]:
data_tab = Table(data)

In [44]:
# Drop the name `data` now that the catalogue is held in data_tab.
# NOTE: getting_secondary and the staging loop read the global `data`, so
# those cells cannot be re-run after this one without reloading the catalogue.
del data

In [45]:
data_df = data_tab.to_pandas()

In [46]:
df_data_merged = df_dropna.merge(data_df, on = 'ID', how = 'left')

In [47]:
df_data_merged

Unnamed: 0,ORyanID_23,ID,category,ALPHA_J2000,DELTA_J2000,X_IMAGE,Y_IMAGE,ERRX2_IMAGE,ERRY2_IMAGE,ERRXY_IMAGE,...,ez_ssfr_p025,ez_ssfr_p160,ez_ssfr_p500,ez_ssfr_p840,ez_ssfr_p975,ez_Av_p025,ez_Av_p160,ez_Av_p500,ez_Av_p840,ez_Av_p975
0,4000705532455,816891,stage4,150.679846,2.196543,9200.399414,21737.232422,0.000052,0.000059,-1.043282e-05,...,-8.931825,-8.911355,-8.899668,-8.883307,-8.872990,0.529107,0.560142,0.599624,0.630920,0.652330
1,4000705533383,860686,stage3,150.644522,2.237226,10047.978516,22713.351562,0.000051,0.000046,1.571297e-06,...,-9.314209,-9.106611,-8.941274,-8.713107,-8.541205,0.341270,0.545498,0.732327,0.906665,1.020098
2,4000705539941,1301188,stage2,149.702462,2.654895,32637.949219,32737.013672,0.000064,0.000104,-2.881107e-06,...,-9.794128,-9.784252,-9.766013,-9.743796,-9.727879,0.018737,0.018737,0.018737,0.018737,0.018737
3,4000705540034,1304538,stage3,149.726828,2.658172,32053.712891,32815.468750,0.000071,0.000086,-6.036952e-06,...,-9.480532,-9.443615,-9.402433,-9.352081,-9.325659,0.302629,0.376027,0.440394,0.508153,0.567727
4,4000705540064,1310001,stage3,149.701230,2.660343,32667.441406,32867.789062,0.000026,0.000020,-1.023530e-05,...,-9.882174,-9.855921,-9.816085,-9.697225,-9.573506,0.455258,0.482555,0.522239,0.600383,0.660279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4176,4001204758614_extra,1520733,stage4,149.891374,2.866981,28107.876953,37826.437500,0.000086,0.000104,-9.313976e-07,...,-9.441206,-9.279022,-9.095134,-9.003501,-8.955496,0.120441,0.213251,0.306832,0.556012,0.731876
4177,4546055995_extra,1520749,stage2,150.383110,2.865367,16320.103516,37787.917969,0.000131,0.000236,7.123858e-05,...,-9.857880,-9.658215,-9.485625,-9.235816,-8.971795,0.162147,0.238640,0.432004,0.592193,0.687256
4178,4000927318133_extra,1523385,stage3,149.789910,2.865132,30540.179688,37782.648438,0.000045,0.000059,1.600747e-05,...,-9.525900,-9.472779,-9.353454,-9.273221,-9.213521,0.262676,0.355466,0.410356,0.489105,0.566003
4179,4000747939904_extra,1524909,stage3,150.450581,2.872371,14702.710938,37956.468750,0.000248,0.000292,7.378556e-06,...,-9.556987,-9.385500,-9.154130,-9.026810,-8.938077,0.153294,0.283249,0.436332,0.600615,0.693621


In [48]:
df_data_merged.to_csv(f'{combine_folder}/dor-all-restaged-full.csv')

In [49]:
df_data_merged.drop_duplicates(subset='ID')

Unnamed: 0,ORyanID_23,ID,category,ALPHA_J2000,DELTA_J2000,X_IMAGE,Y_IMAGE,ERRX2_IMAGE,ERRY2_IMAGE,ERRXY_IMAGE,...,ez_ssfr_p025,ez_ssfr_p160,ez_ssfr_p500,ez_ssfr_p840,ez_ssfr_p975,ez_Av_p025,ez_Av_p160,ez_Av_p500,ez_Av_p840,ez_Av_p975
0,4000705532455,816891,stage4,150.679846,2.196543,9200.399414,21737.232422,0.000052,0.000059,-1.043282e-05,...,-8.931825,-8.911355,-8.899668,-8.883307,-8.872990,0.529107,0.560142,0.599624,0.630920,0.652330
1,4000705533383,860686,stage3,150.644522,2.237226,10047.978516,22713.351562,0.000051,0.000046,1.571297e-06,...,-9.314209,-9.106611,-8.941274,-8.713107,-8.541205,0.341270,0.545498,0.732327,0.906665,1.020098
2,4000705539941,1301188,stage2,149.702462,2.654895,32637.949219,32737.013672,0.000064,0.000104,-2.881107e-06,...,-9.794128,-9.784252,-9.766013,-9.743796,-9.727879,0.018737,0.018737,0.018737,0.018737,0.018737
3,4000705540034,1304538,stage3,149.726828,2.658172,32053.712891,32815.468750,0.000071,0.000086,-6.036952e-06,...,-9.480532,-9.443615,-9.402433,-9.352081,-9.325659,0.302629,0.376027,0.440394,0.508153,0.567727
4,4000705540064,1310001,stage3,149.701230,2.660343,32667.441406,32867.789062,0.000026,0.000020,-1.023530e-05,...,-9.882174,-9.855921,-9.816085,-9.697225,-9.573506,0.455258,0.482555,0.522239,0.600383,0.660279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4176,4001204758614_extra,1520733,stage4,149.891374,2.866981,28107.876953,37826.437500,0.000086,0.000104,-9.313976e-07,...,-9.441206,-9.279022,-9.095134,-9.003501,-8.955496,0.120441,0.213251,0.306832,0.556012,0.731876
4177,4546055995_extra,1520749,stage2,150.383110,2.865367,16320.103516,37787.917969,0.000131,0.000236,7.123858e-05,...,-9.857880,-9.658215,-9.485625,-9.235816,-8.971795,0.162147,0.238640,0.432004,0.592193,0.687256
4178,4000927318133_extra,1523385,stage3,149.789910,2.865132,30540.179688,37782.648438,0.000045,0.000059,1.600747e-05,...,-9.525900,-9.472779,-9.353454,-9.273221,-9.213521,0.262676,0.355466,0.410356,0.489105,0.566003
4179,4000747939904_extra,1524909,stage3,150.450581,2.872371,14702.710938,37956.468750,0.000248,0.000292,7.378556e-06,...,-9.556987,-9.385500,-9.154130,-9.026810,-8.938077,0.153294,0.283249,0.436332,0.600615,0.693621
