# Redownloading ALL Cosmos Data
I realised that having previously downloaded only parts of the data limits what I can do with it. I am therefore re-downloading the data and saving every column, while still using the previously matched catalogue for speed.

In [1]:
import pandas as pd
import sys
import time
from tqdm import tqdm
tqdm.pandas()

from IPython.display import clear_output

from astroquery.irsa import Irsa
from astropy.coordinates import SkyCoord
import astropy.units as u

  if __name__ == "__main__":


In [2]:
# Local data directory; the matched-catalogue CSV is read from here.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
folder = 'C:/Users/oryan/Documents/mergers-in-cosmos/data'

In [3]:
# Load the previously matched catalogue; the first CSV column becomes the index.
my_cosmo_cat = pd.read_csv(f'{folder}/cosmos-matched-df.csv', index_col = 0)

In [4]:
# Preview the first rows of the matched catalogue.
my_cosmo_cat.head()

Unnamed: 0,SourceID,id_1,ssfr_best_1,ssfr_med_max68_1,ssfr_med_min68_1,ssfr_med_1,sfr_best_1,sfr_med_max68_1,sfr_med_min68_1,sfr_med_1,...,flag_shallow_2,l_r_2,l_k_2,l_nu_2,dist_2,RA,Dec,int_prediction,references,status
0,4000705532455,590539.0,-8.847,-8.463,-8.865,-8.712,0.508,0.835,0.477,0.655,...,1.0,27.26,27.124,27.074,9.361438,150.679664,2.196615,0.970096,['2007ApJS..172...99C'],Referenced
1,4000705532984,610283.0,-10.04,-9.922,-10.065,-9.99,0.536,0.486,0.335,0.411,...,1.0,27.868,27.811,27.848,7.778731,150.673546,2.226438,0.96116,['2007ApJS..172...99C'],Referenced
2,4000705533312,621053.0,-10.258,-10.232,-10.368,-10.3,-0.058,-0.032,-0.168,-0.1,...,1.0,29.62,29.874,28.661,2.627196,150.667975,2.242945,0.995066,"['2007ApJS..172...99C', '2007ApJS..172...99C',...",Referenced
3,4000705533383,617834.0,-9.445,-9.322,-9.47,-9.398,0.132,0.179,0.034,0.107,...,1.0,29.232,29.068,29.135,2.241379,150.645125,2.23765,0.985773,"['2007ApJS..172...99C', '2007ApJS..172...99C',...",Referenced
4,4000705539529,882817.0,-8.783,-8.753,-9.122,-8.868,1.319,1.346,1.025,1.232,...,0.0,28.572,28.466,28.323,3.301196,149.686626,2.637161,0.992917,"['2009ApJS..184..218L', '2007ApJS..172...99C']",Referenced


### Downloading all Data

In [5]:
# Bounding box used to reject targets that lie outside the COSMOS field, stored
# as [RA min, RA max, Dec min, Dec max] in degrees: a +/- 2 degree window
# around the centre (RA, Dec) = (150.11916667, 2.20583333).
# The upper Dec limit used to add the centre Dec to itself
# (2.20583333 + 2.20583333) rather than the 2 degree half-width, which made
# the box asymmetric.
limits_cosmos = [150.11916667 - 2, 150.11916667 + 2, 2.20583333 - 2, 2.20583333 + 2]

I get everything from the Catalogue! The columns I want are:

    1. id - Running Object Number
    2. SSFR_BEST - The Best Fit log Specific SFR Using BC03 Templates. Taken at the minimum Chi Squared
    3. SSFR_MED_MAX68 - Upper limit on the 68% Confidence Interval
    4. SSFR_MED_MIN68 - Lower limit on the 68% confidence interval.
    5. SSFR_MED - log sSFR from BC03 best-fit template. median of the PDF
    6. SFR_BEST - log SFR from BC03 best-fit template. Taken at the minimum chi2
    7. SFR_MED_MAX68 -  upper limit, 68% confidence level
    8. SFR_MED_MIN68 -  lower limit, 68% confidence level
    9. SFR_MED - log SFR from BC03 best-fit template. median of the PDF
    10. MASS_BEST - 	 log Stellar mass from BC03 best-fit template
    11. MASS_MED_MAX68 - 	 upper limit, 68% confidence level
    12. MASS_MED_MIN68 -  lower limit, 68% confidence level
    13. MASS_MED - log Stellar mass from BC03 best-fit template. median of the PDF
    14. AGE -  BC03 age
    15. TYPE - 	 Type: 0 = galaxy, 1 = star, 2 = X-ray source
    16. ZPDF - 	 photo-z measured using the galaxy templates
    17. FLAG_DEEP - 1: Ultra-deep stripes, 0: deep stripes
    18. FLAG_SHALLOW - Shallow Flag
    19. l_r - 	 log(dust corr lum in erg/s/Hz) in r filter
    20. l_k - log(dust corr lum in erg/s/Hz) in K filter
    21. l_nu -  log(dust corr lum in erg/s/Hz) in NUV filter
    22. dist - I have assumed this is the separation from my coordinates to the catalogue ones.

In [6]:
def convert_results(table, empty_dict):
    """Flatten one cone-search result into a primary/secondary record.

    Only rows with ``type == 0`` (galaxies) are considered. The primary is the
    galaxy with the smallest ``dist`` (presumably the separation from the
    queried position). The secondary is the remaining galaxy whose ``photoz``
    is closest to the primary's.

    Parameters
    ----------
    table : pandas.DataFrame
        Cone-search result; must contain the columns ``type``, ``dist``,
        ``photoz`` and ``id``.
    empty_dict : dict
        Template whose keys are ``<column>_1`` / ``<column>_2``. It is copied,
        never mutated.

    Returns
    -------
    dict
        Copy of ``empty_dict`` with the ``_1`` keys filled from the primary
        and the ``_2`` keys filled from the secondary. If the table holds no
        galaxies the untouched template copy is returned; if there is no
        second galaxy the ``_2`` entries keep their template values.
    """
    galaxies = table[table['type'] == 0.0]

    # Copy so the shared template passed to every call is never modified.
    export_dict = empty_dict.copy()

    # A search may return only non-galaxy rows; without this guard the
    # ``iloc[0]`` below raises IndexError and aborts the whole download loop.
    if galaxies.empty:
        return export_dict

    primary = galaxies.sort_values('dist', ascending = True).iloc[0]
    for column, value in primary.items():
        export_dict[f'{column}_1'] = value

    target_z = primary['photoz']
    candidates = galaxies[galaxies['id'] != primary['id']]
    if candidates.empty:
        return export_dict

    # Vectorised |photoz - target_z|; the helper column is dropped again so it
    # never leaks into the exported keys.
    secondary = (
        candidates
        .assign(abs_z_diff = (candidates['photoz'] - target_z).abs())
        .sort_values('abs_z_diff', ascending = True)
        .drop(columns = ['abs_z_diff'])
        .iloc[0]
    )
    for column, value in secondary.items():
        export_dict[f'{column}_2'] = value

    return export_dict

In [11]:
def get_table(ra, dec, reg_limits, empty_dict, max_attempts = 5, retry_wait = 5):
    """Cone-search the ``cosmos2015`` catalogue around one position.

    Parameters
    ----------
    ra, dec : float
        Target coordinates in degrees.
    reg_limits : sequence
        ``[ra_min, ra_max, dec_min, dec_max]`` in degrees; targets outside
        this box are not queried.
    empty_dict : dict
        Template dictionary forwarded to ``convert_results``.
    max_attempts : int, optional
        Number of times the query is tried before giving up.
    retry_wait : float, optional
        Seconds to wait between failed attempts.

    Returns
    -------
    dict or str
        The dictionary produced by ``convert_results`` on success, otherwise
        one of the status strings ``'outwith_cosmos'`` (target outside
        ``reg_limits``), ``'Failed'`` (every attempt raised) or ``'null'``
        (the query returned no rows).
    """
    # Use the limits passed in by the caller rather than a module-level global.
    if ra < reg_limits[0] or ra > reg_limits[1] or dec < reg_limits[2] or dec > reg_limits[3]:
        return 'outwith_cosmos'

    coord = SkyCoord(ra = ra * u.deg, dec = dec * u.deg)
    for attempt in range(max_attempts):
        try:
            table = Irsa.query_region(coord, catalog = 'cosmos2015', radius = 10 * u.arcsec)
            break
        except Exception:
            # ``Exception`` (not a bare except) so Ctrl-C can still stop the run.
            if attempt < max_attempts - 1:
                time.sleep(retry_wait)
    else:
        # Reached only when no attempt hit ``break``; a success on the final
        # attempt is therefore no longer misreported as a failure.
        return 'Failed'

    if len(table) == 0:
        return 'null'

    table_df = table.to_pandas()
    table_dict = convert_results(table_df, empty_dict)

    time.sleep(0.01)
    clear_output(wait = True)

    return table_dict

In [8]:
# Run a single cone search on the first catalogue row purely to learn which
# columns the service returns, then build a template dictionary holding a
# `<column>_1` and a `<column>_2` key for every column, all set to None.
coord = SkyCoord(ra = my_cosmo_cat.RA.iloc[0] * u.deg, dec = my_cosmo_cat.Dec.iloc[0] * u.deg)
table = Irsa.query_region(coord, catalog = 'cosmos2015', radius = 10 * u.arcsec)
table_df = table.to_pandas()

init_dict = {
    f'{column}_{slot}': None
    for slot in (1, 2)
    for column in list(table_df.columns)
}

In [9]:
# Reduce the catalogue to a lookup of the form {SourceID: {'RA': ..., 'Dec': ...}}.
dict_red = (
    my_cosmo_cat[['SourceID', 'RA', 'Dec']]
    .set_index('SourceID')
    .to_dict(orient='index')
)

In [12]:
# One cone search per catalogue source, keyed by SourceID. This is the slow
# step: the recorded run below took about 4.4 hours for 2919 sources.
dict_results = {
    source_id: get_table(position['RA'], position['Dec'], limits_cosmos, init_dict)
    for source_id, position in tqdm(list(dict_red.items()))
}


100%|██████████| 2919/2919 [4:23:06<00:00,  5.41s/it][A


In [15]:
# Build a frame with one row per key of dict_results (the SourceIDs); a column
# labelled 0, if one is produced, is renamed to 'dict_results'.
# NOTE(review): get_table can return the status strings 'outwith_cosmos',
# 'Failed' or 'null' instead of a dict — confirm from_dict copes with that mix.
df_in_cosmos = (
    pd.DataFrame
    .from_dict(dict_results, orient = 'index')
    .rename(columns = {0 : 'dict_results'})
)

 12%|█▏        | 344/2919 [4:51:00<36:18:17, 50.76s/it]


In [20]:
# Checkpoint the raw download, with the index exposed as a 'SourceID' column.
# The target directory comes from `folder`, like the read_csv call, instead of
# repeating the absolute path.
# NOTE(review): the last cell of the notebook writes to this same file name and
# will overwrite this checkpoint — confirm that is intended.
df_in_cosmos.reset_index().rename(columns = {'index' : 'SourceID'}).to_csv(f'{folder}/cosmos-matched-all-df.csv')

In [16]:
# Export as {SourceID: {column: value}}. The default orientation is keyed by
# column ({column: {SourceID: value}}), so the following
# `from_dict(..., orient='index')` would rebuild a transposed frame whose
# 'SourceID' column held column names and which had no 'photoz_1' / 'photoz_2'.
dict_cosmos = df_in_cosmos.to_dict(orient = 'index')

KeyError: 'dict_results'

In [None]:
# Rebuild a frame from dict_cosmos with its outer keys as rows, expose those
# keys as a 'SourceID' column, and drop every row that contains a missing value.
exp_df = (
    pd.DataFrame
    .from_dict(dict_cosmos, orient = 'index')
    .reset_index()
    .rename(columns = {'index' : 'SourceID'})
    .dropna()
)

In [None]:
# Inspect which columns survived the export.
exp_df.columns

In [None]:
# Show the photometric redshifts of the primary (_1) and secondary (_2) match per source.
exp_df[['SourceID', 'photoz_1', 'photoz_2']]

In [101]:
# Left-join the downloaded catalogue columns onto `df` by SourceID, keeping every row of exp_df.
# NOTE(review): `df` is not defined anywhere in the visible notebook, so this cell
# depends on leftover kernel state — confirm which frame it should be.
full_df = exp_df.merge(df, on = 'SourceID', how = 'left')

In [102]:
# Keep only the first row for each primary catalogue id, so that no two sources
# share the same 'id_1' match.
full_df_dedup = full_df.drop_duplicates(subset = ['id_1'], keep = 'first')

In [105]:
# Write the final, de-duplicated catalogue. The target directory comes from
# `folder`, like the read_csv call, instead of repeating the absolute path.
full_df_dedup.to_csv(f'{folder}/cosmos-matched-all-df.csv')