## SkyMapper Data Release 2 (DR2)

Sherry Yang - 2022-11-30

Information on the columns that each of the tables contains can be found here:\
[Browse Table Metadata | DR2](https://skymapper.anu.edu.au/table-browser/dr2)\
**References**:\
 https://dx.doi.org/10.25914/5ce60d31ce759

In [4]:
import pyvo as vo
import pandas as pd
import numpy as np
import os
from astropy.time import Time
from astropy import units as u
from tqdm import tqdm, trange
from math import ceil
from astropy.time import TimeDelta
import time
import threading
import multiprocessing

In [None]:
def querySkymapper(start, end, newStart):
    """Download SkyMapper DR2 detections for one date range and append them to HDF5.

    The function first asks the TAP service for the range of ``image_id``
    values whose ``"date"`` lies in ``[start, end)``, then fetches the joined
    images / photometry / master rows for that id range, reformats them and
    appends them to ``dr2_observations_<date>.h5`` (``<date>`` is the ISO
    calendar date of ``start``).

    When the result has exactly ``max_rows`` rows it is treated as truncated
    by the service: the rows of the last (possibly incomplete) exposure are
    dropped and the function calls itself again starting at that exposure.

    Args:
        start: Lower MJD bound (inclusive) applied to ``dr2.images."date"``.
        end: Upper MJD bound (exclusive) applied to ``dr2.images."date"``.
        newStart: Lowest ``image_id`` to fetch. Used to resume after a
            truncated result; pass ``-1`` to start from the first image of
            the date range.

    Returns:
        None in every case; the data is written to disk as a side effect.

    Raises:
        RuntimeError: If a truncated result contains only one exposure, so
            resuming from it could never make progress.
    """
    # Row count at which a result is considered truncated (the original
    # author's note calls this the SkyMapper maximum query limit).
    max_rows = 1000000

    tap_service = vo.dal.TAPService("https://api.skymapper.nci.org.au/public/tap/")

    # Find the range of image_id (exposure id) values in this date range.
    regionQuery = f"""
    SELECT COUNT(image_id) AS num, MAX(image_id) AS max_id,MIN(image_id) AS min_id FROM dr2.images WHERE "date" >= {start} AND "date" < {end}
    """
    region = pd.DataFrame(tap_service.search(regionQuery))

    if region['num'][0] == 0:
        return None

    # ISO calendar date of `start`, used only for the output file name.
    date = Time(start, format='mjd').to_value('iso', subfmt='date')

    # Define the image_id range to fetch; resume from newStart if it is
    # past the first image of the date range.
    minID = region['min_id'][0]
    maxID = region['max_id'][0]
    if newStart > minID:
        minID = newStart

    # The main query. ORDER BY makes the truncation handling below valid:
    # without it nothing guarantees that the last returned row belongs to
    # the highest (and therefore possibly incomplete) exposure.
    query = f"""SELECT 
    a.object_id as obj_id,
    b.image_id as exposure_id,
    b.ccd as ccd,
    c."date" as mjd_utc,
    b.ra_img as ra,
    b.decl_img as dec,
    a.e_raj2000 as ra_sigma,
    a.e_dej2000 as dec_sigma,
    b.filter as filter,
    b.mag_psf as mag,
    b.e_mag_psf as mag_sigma,
    'Q55'as obscode,
    c.exp_time as exposure_time

    FROM
    (Select image_id, "date", exp_time, filter, object from dr2.images) as c 
    INNER JOIN 
    (select object_id, ccd, class_star,image_id,filter,ra_img,decl_img, mag_psf, e_mag_psf from dr2.photometry)as b
    ON b.image_id = c.image_id
    INNER JOIN 
    (select object_id, raj2000,dej2000,e_raj2000,e_dej2000, ngood from dr2.master) as a
    on a.object_id = b.object_id
    
    WHERE c.image_id >= {minID}
    AND  c.image_id <= {maxID}
    ORDER BY c.image_id"""

    tap_results = tap_service.search(query, timeout = 600)

    if len(tap_results) == 0:
        return None

    df = pd.DataFrame(tap_results)

    # If the row limit was reached, the last exposure in the (ordered)
    # result may be incomplete: drop it here and re-fetch it in the next call.
    over = df.shape[0] == max_rows
    resume_id = None
    if over:
        # Positional access: does not depend on the frame's index labels.
        resume_id = df['exposure_id'].iloc[-1]
        if resume_id <= minID:
            # Every returned row belongs to the first exposure; resuming
            # from it would repeat the same query forever.
            raise RuntimeError(
                f"exposure {resume_id} alone reaches the {max_rows}-row query limit; "
                "cannot page through it by exposure id"
            )
        # .copy() so the column assignments below act on an independent frame.
        df = df[df['exposure_id'] < resume_id].copy()

    # One second expressed in days, used to shift timestamps by half an exposure.
    to_mjd = TimeDelta(1, format='sec').to_value('jd')

    # Format transformation.
    # NOTE(review): obs_id concatenates the three ids without a separator,
    # kept as-is so existing files stay consistent.
    df["obs_id"] = [
        f"{o}{c}{e}"
        for o, c, e in zip(df["obj_id"].values, df["ccd"].values, df["exposure_id"].values)
    ]
    df['obscode'] = "Q55"
    # The positional uncertainties are scaled by the mas -> deg factor.
    df['dec_sigma'] = df['dec_sigma']*u.mas.to(u.deg)
    df['ra_sigma'] = df['ra_sigma']*u.mas.to(u.deg)
    # Move the timestamp by half the exposure time (mid-point of the exposure).
    df['mjd_utc'] = df['exposure_time']*to_mjd/2 + df['mjd_utc']
    df = df.rename(columns={"obscode": "observatory_code"})
    df = df.astype(
        {"exposure_id": str, "observatory_code": str, "obs_id": str},
        errors='raise',
    )
    df = df.drop(columns=['exposure_time', 'ccd'])

    # Output
    DATA_DIR = "/epyc/projects/adam_datasets/skyMapper_dr2/data"
    file_name = os.path.join(DATA_DIR, f"dr2_observations_{date}.h5")
    df.to_hdf(path_or_buf= file_name,index=False,append=True,key='data',format = 'table',min_itemsize={ 'obs_id' : 30 })

    # If the row limit was reached, continue from the dropped exposure.
    if over:
        querySkymapper(start, end, resume_id)
    

In [None]:
def groupQuery(start, end):
    """Run querySkymapper once per integer day in ``[start, end)``.

    Each day ``d`` is requested as the range ``d`` to ``d + 1`` with a
    resume id of ``-1`` (i.e. starting from the first image of that day).
    """
    for day_start in range(start, end):
        day_end = day_start + 1
        querySkymapper(day_start, day_end, -1)

In [None]:
# Split the download across 20 worker processes; worker i handles the
# 100-day window starting at MJD 56764 + i * 100.
jobs = [
    multiprocessing.Process(
        target=groupQuery,
        args=(56764 + i * 100, 56764 + i * 100 + 100),
    )
    for i in range(20)
]

# Start every worker before waiting on any of them so they run concurrently.
for j in jobs:
    j.start()

for j in jobs:
    j.join()

### Handling NULL object_id values in dr2.photometry
Some observations don't have an object_id, and there are no attributes that distinguish those NULL objects within the same exposure_id and the same ccd. So here we make a query for all NULL objects and index them by an index number + ccd + exposure_id. We also clean up the duplicate index values caused by the recursion.

In [8]:
def index_null(start,end):
    """Give rows without an object id a unique obs_id, one daily file at a time.

    For every integer MJD in ``[start, end)`` the matching
    ``dr2_observations_<date>.h5`` file is read from the ``data_test``
    directory (days without a file are skipped). Rows whose ``obj_id`` is 0
    get their ``obs_id`` replaced by ``"<row index>#<original obs_id>"``, a
    ``date,total rows,null rows`` line is appended to ``final_stat.txt``, and
    the table is appended to the same-named file in the ``data`` directory.

    Args:
        start: First MJD day to process (inclusive).
        end: Last MJD day to process (exclusive).

    Returns:
        None; results are written to disk.
    """
    source_dir = "/epyc/projects/adam_datasets/skyMapper_dr2/data_test"
    output_dir = "/epyc/projects/adam_datasets/skyMapper_dr2/data"

    for index in range(start, end):
        # ISO calendar date string (e.g. 2014-03-20) used in the file names.
        date = Time(index, format='mjd').to_value('iso', subfmt='date')
        source_file = os.path.join(source_dir, f"dr2_observations_{date}.h5")
        if not os.path.isfile(source_file):
            continue

        store = pd.read_hdf(source_file, "data", mode='r')
        # Rebuild a unique 0..n-1 index; the prefixes below rely on it.
        store = store.reset_index(drop=True)

        is_null = store['obj_id'] == 0
        nullobject = store.loc[is_null]
        # Pair each NULL row's index with that same row's obs_id. Zipping
        # against the whole table's obs_id column would attach the obs_id of
        # unrelated leading rows instead.
        store.loc[is_null, 'obs_id'] = [
            f"{i}#{o}" for i, o in zip(nullobject.index, nullobject['obs_id'])
        ]

        # Append per-day statistics; `with` closes the file even on error.
        with open("final_stat.txt", "a") as file:
            file.write(f"{date},{store.shape[0]},{nullobject.shape[0]}\n")

        output_file = os.path.join(output_dir, f"dr2_observations_{date}.h5")
        store.to_hdf(path_or_buf= output_file,index=False,append=True,key='data',format = 'table',min_itemsize={ 'obs_id' : 35 })

In [10]:
# MJD range processed: 56730 to 58192, split into 20 equal slices
# (rounded up), one per worker process.
Slice = ceil((58192 - 56730) / 20)
jobs = [
    multiprocessing.Process(
        target=index_null,
        args=(56730 + Slice * i, 56730 + Slice * (i + 1)),
    )
    for i in range(20)
]

# Start every worker before waiting on any of them so they run concurrently.
for j in jobs:
    j.start()

for j in jobs:
    j.join()

### Indexing Code:
python index_observations.py data dataset SKYMAPPER_DR2 --nside 32 --dataset_name "SkyMapper Southern Sky Survey (DR2)" --reference_doi https://doi.org/10.25914/5ce60d31ce759 --documentation_url https://skymapper.anu.edu.au/data-release/dr2/ --sia_url https://api.skymapper.nci.org.au/public/siap/dr2/query?