# WALLABY public data release notebook

This notebook supports exporting the WALLABY source and kinematic data tables, and their associated products, so that they can be ingested into a public archive (CADC and CASDA). The intended user of this notebook is a member of the WALLABY project team or of the WALLABY TWG7 group.

---

In [None]:
import os
import shutil
import getpass
import requests
import getpass
import pyvo as vo
from pyvo.auth import authsession, securitymethods
import numpy as np
from astropy.io.votable import from_table, parse_single_table
from astropy.table import vstack

### Authenticate

<span style="font-weight: bold; color: #FF0000;">⚠ Update the cell below with your username and enter your password</span>

In [None]:
# WALLABY account credentials. The password is prompted for interactively so
# that it is never stored in the notebook.

username = 'wallaby_user'
password = getpass.getpass(prompt='Enter your password')

In [None]:
# Open an authenticated session against the WALLABY TAP service

URL = "https://wallaby.aussrc.org/tap"

# Register basic authentication for the service URL and attach the
# credentials entered above.
auth = authsession.AuthSession()
auth.add_security_method_for_url(URL, securitymethods.BASIC)
auth.credentials.set_password(username, password)

tap = vo.dal.TAPService(URL, session=auth)

---

# 1. Decide Release

Determine which internal releases you would like to bundle in this public data release. You will also need to set a name for this public data release.

In [None]:
# List every row of the wallaby.tag table so the tags to release can be chosen

query = "SELECT * FROM wallaby.tag"
table = tap.search(query).to_table()
print(table)

<span style="font-weight: bold; color: #FF0000;">⚠ Update the cell below. Add tags to the list for release, and update `release_name` variable</span>

In [None]:
# Internal release tags to bundle into this public data release
tags = ['Norma DR1', 'NGC 5044 DR2']

# Public release name. Spaces are turned into underscores because the name is
# used to build directory and file names further down.
release_name = '_'.join("WALLABY Test PDR".split(' '))

---

# 2. Sources

## Catalog

In [None]:
# Retrieve catalog as Astropy table
#
# ADQL template for the source catalogue. It selects every detection column
# plus two aggregated strings per detection: the "name: description" of its
# tags and its comments. '$TAG_NAME' is a placeholder that is replaced with a
# release tag before the query is submitted.
# NOTE(review): the filter also matches the fixed tag 'Internal Data Release',
# so detections carrying that tag are presumably returned for every release
# tag that is substituted in -- confirm this is intended.
# NOTE(review): tags and comments are joined in the same query, so a detection
# with several of each may yield repeated entries in the aggregated strings --
# verify against the service output.

query = """SELECT d.*, ivo_string_agg(t.name || ': ' || t.description, '; ') AS tags, ivo_string_agg(c.comment, '; ') AS comments
        FROM wallaby.detection d
        LEFT JOIN wallaby.tag_detection td ON d.id = td.detection_id 
        LEFT JOIN wallaby.tag t ON t.id = td.tag_id
        LEFT JOIN wallaby.comment c ON d.id = c.detection_id
        WHERE t.name IN ('Internal Data Release', '$TAG_NAME')
        GROUP BY d.id"""

In [None]:
# Run the catalogue query once per release tag and stack the results into a
# single table.

# Fail early with a clear message instead of leaving `table` as None, which
# would only surface later as an unrelated error.
if not tags:
    raise ValueError('No tags selected for release: add at least one tag to `tags`')

tag_tables = []
for tag_name in tags:
    # Double any single quote so the tag name cannot terminate the ADQL string
    # literal it is substituted into.
    escaped_tag = tag_name.replace("'", "''")
    result = tap.search(query.replace('$TAG_NAME', escaped_tag))
    tag_tables.append(result.to_table())

# Stack once at the end rather than re-copying the growing table on every
# iteration.
table = vstack(tag_tables)
table

## Modifying the catalog

There are some additional columns and calculated properties that are required for the release. The column metadata (e.g. UCDs, units and descriptions, as required to conform with VO standards) also needs to be included for these additional columns. These include:

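| Column | Description |
| --- | --- |
| `qflag` | Quality flag (copied from the `flag` column) |
| `kflag` | Flag indicating whether or not there is a kinematic model associated with the detection |
| `team_release` | Internal team release identifier |
| `f_sum_corr` | Integrated flux within the 3D source mask, statistically corrected to match single dish observations |
| `err_f_sum_corr` | Statistical uncertainty of the single dish corrected integrated flux |
| `dist_h` | Local Hubble distance derived from the barycentric source frequency |
| `log_m_hi_corr` | Estimated log10 mass calculated from `f_sum_corr` and `dist_h`; `dist_h` is itself derived from the calculated property `v_est` |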

In [None]:
# Table corrections
#
# Adds the renamed and derived columns required for the public release.

# Rest frequency of the HI 21-cm line in Hz.
rest_freq = 1.42040575179E+09
# Speed of light in m/s.
# NOTE(review): the defined value is 2.99792458e8; the last digit is missing
# here. Left unchanged because correcting it shifts the derived values
# slightly -- confirm before changing.
c = 2.9979245e8
# Hubble constant; presumably in km/s/Mpc, since v_est is converted to km/s
# below and dist_h is published in Mpc.
H0 = 70.0

n_rows = len(table)

# Renamed / placeholder columns
table['name'] = table['source_name']
table['qflag'] = table['flag']
table['kflag'] = np.zeros(n_rows)
# One empty string per row. The previous expression `['' * n]` built a
# one-element list and only filled the column through length-1 broadcasting.
table['team_release'] = np.full(n_rows, '', dtype=str)

# Flux correction: f_sum is divided by 10 ** P(log10(f_sum)), where P is a
# cubic polynomial. log10(f_sum) is evaluated once and reused.
log_f_sum = np.log10(table['f_sum'])
correction_exponent = (
    0.0285 * log_f_sum**3.0
    - 0.439 * log_f_sum**2.0
    + 2.294 * log_f_sum
    - 4.097
)
table['f_sum_corr'] = table['f_sum'] / 10.0 ** correction_exponent
# The corrected flux keeps the same relative uncertainty as the measured flux.
table['err_f_sum_corr'] = table['err_f_sum'] / table['f_sum'] * table['f_sum_corr']

# Velocity from the frequency shift (divided by 1000 to go from m/s to km/s)
# and the Hubble distance that follows from it.
table['v_est'] = ((rest_freq - table['freq']) / table['freq'] * c / 1000.0)
table['dist_h'] = table['v_est'] / H0

# log10 mass estimates from the measured and the corrected integrated flux.
table['log_m_hi'] = np.log10(49.7 * table['dist_h']**2.0 * table['f_sum'])
table['log_m_hi_corr'] = np.log10(49.7 * table['dist_h']**2.0 * table['f_sum_corr'])

In [None]:
# Remove certain columns from the astropy table

# Work on a copy. `write_table = table` would only create an alias, so removing
# columns would also strip 'source_name' and 'access_url' from `table`, and the
# product download step further down still reads both of them.
write_table = table.copy()
write_table.remove_columns(['id', 'run_id', 'instance_id', 'access_url', 'access_format', 'source_name', 'flag', 'v_est', 'l', 'b', 'v_rad', 'v_opt', 'v_app', 'tags'])

# Convert the trimmed table to a VOTable so field metadata can be edited.
votable = from_table(write_table)

In [None]:
# Update derived quantity columns of votable
#
# VO metadata for the added/renamed columns, keyed by field ID. Fields are
# processed in the order listed and, within a field, attributes are assigned
# in the order listed.
field_metadata = {
    'f_sum_corr': {
        'ucd': "phot.flux;meta.main",
        'unit': "Jy*Hz",
        'description': "The integrated flux within 3D source mask statistically corrected to match single dish observations",
    },
    'err_f_sum_corr': {
        'ucd': "stat.error;phot.flux",
        'unit': "Jy*Hz",
        'description': "Statistical uncertainty of the single dish corrected integrated flux",
    },
    'dist_h': {
        'ucd': "pos.distance",
        'unit': "Mpc",
        'description': "Local Hubble distance derived from the barycentric source frequency",
    },
    'log_m_hi': {
        'ucd': "phys.mass",
        'unit': "log10(Msol)",
        'description': "The estimated log10 mass of the cube using f_sum and freq",
    },
    'log_m_hi_corr': {
        'ucd': "phys.mass",
        'unit': "log10(Msol)",
        'description': "The estimated log10 mass of the cube using f_sum_corr and freq",
    },
    'qflag': {
        'datatype': "double",
        'ucd': "meta.code.qual",
        'description': "Quality flag",
    },
    'kflag': {
        'datatype': "double",
        'ucd': "meta.code",
        'description': "Kinematic model flag",
    },
    'comments': {
        'datatype': "char",
        'ucd': "meta.note",
        'description': "Comments on individual sources",
    },
    'team_release': {
        'datatype': "char",
        'ucd': "meta.dataset;meta.main",
        'description': "Internal team release identifier",
    },
}

for field_id, attributes in field_metadata.items():
    field = votable.get_field_by_id(field_id)
    for attribute, value in attributes.items():
        setattr(field, attribute, value)

In [None]:
# Show the columns that will be written, followed by how many there are
output_columns = write_table.columns
print(output_columns)
print(len(output_columns))

In [None]:
# Write the catalogue to an XML file named after the release

votable.version = '1.3'
votable_filename = release_name + '_SourceCatalogue.xml'
votable.to_xml(votable_filename)

## Products

In [None]:
# useful function for downloading table products (requires authentication)

def download_products(row, products_filename, chunk_size=8192):
    """Download the SoFiA-2 detection products archive for one detection.

    The row's ``access_url`` is parsed as a VOTable listing the products of
    the detection. The entry whose description is
    'SoFiA-2 Detection Products' is streamed to ``products_filename``, using
    the notebook-level ``username`` and ``password`` as credentials.

    Parameters
    ----------
    row : table row
        Detection entry providing the ``source_name`` and ``access_url``
        columns.
    products_filename : str
        Destination path of the downloaded archive.
    chunk_size : int, optional
        Number of bytes requested per streamed chunk.

    Raises
    ------
    IndexError
        If the product list has no 'SoFiA-2 Detection Products' entry.
    requests.HTTPError
        If the server responds with an error status.
    """
    name = row['source_name']
    access_url = row['access_url']
    product_table = parse_single_table(access_url).to_table()

    # Pick the products archive; report a readable error when it is missing
    # (same exception type as indexing an empty selection).
    matches = product_table[product_table['description'] == 'SoFiA-2 Detection Products']
    if len(matches) == 0:
        raise IndexError(f'No SoFiA-2 Detection Products entry found for {name}')
    url = matches[0]['access_url']

    # Stream into a temporary file and rename it only once the download has
    # finished, so an interrupted transfer never leaves a truncated archive
    # under the final name.
    partial_filename = f'{products_filename}.part'
    try:
        with requests.get(url, auth=(username, password), stream=True) as r:
            r.raise_for_status()
            with open(partial_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        os.replace(partial_filename, products_filename)
    finally:
        # Only present here if the download or rename failed.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
    print(f'Downloaded completed for {name}')
    return

def download_table_products(table, directory, chunk_size=8192):
    """Download the products archive of every detection in a table.

    Each row is passed to ``download_products`` and saved as
    ``<directory>/<source_name>.tar``.

    Parameters
    ----------
    table : iterable of table rows
        Rows of the ADQL-queried detection table; each row must provide the
        ``source_name`` column (and whatever ``download_products`` reads).
    directory : str
        Output directory. It is created, including missing parent
        directories, when it does not exist yet.
    chunk_size : int, optional
        Chunk size in bytes forwarded to ``download_products``.
    """
    # makedirs handles nested paths and an already existing directory without
    # a separate existence check.
    os.makedirs(directory, exist_ok=True)
    print(f'Saving products to {directory}')
    for row in table:
        name = row['source_name']
        products_filename = os.path.join(directory, f'{name}.tar')
        download_products(row, products_filename, chunk_size)
    print('Downloads complete')
    return

In [None]:
# Download the products of the first three rows of the table into a
# directory named after the release

download_table_products(table[:3], release_name)

In [None]:
# Update product files

import tarfile
import glob
from astropy.io import fits

In [None]:
# Update all product files
#
# Extracts every downloaded archive and stamps the release information into
# the primary header of each extracted FITS file.

product_tarfiles = glob.glob(os.path.join(release_name, '*.tar'))
# Drop only the trailing '.tar' extension; str.replace would also remove a
# '.tar' occurring elsewhere in the path.
product_files = [os.path.splitext(f)[0] for f in product_tarfiles]

# Extract each archive into a folder named after it
for tar_path, product_dir in zip(product_tarfiles, product_files):
    with tarfile.open(tar_path) as tf:
        # NOTE(review): extractall trusts the member paths stored in the
        # archive. The archives come from the project's own service; consider
        # passing filter='data' if the Python version in use supports it.
        tf.extractall(path=product_dir)
    # os.remove(tar_path)

# Update fits files
for idx_pf, pf in enumerate(product_files):
    print(f'Folder {pf} [{idx_pf + 1}/{len(product_files)}]')
    # The folder is named after the source. basename does not depend on the
    # path separator or on how many directory levels precede the folder.
    source_name = os.path.basename(pf)
    fits_files = glob.glob(os.path.join(pf, '*.fits'))
    for idx_ff, ff in enumerate(fits_files):
        print(f'[{idx_ff + 1}/{len(fits_files)}] {ff}')
        with fits.open(ff, mode='update') as hdul:
            header = hdul[0].header
            # NOTE: DATE card?
            header['SRCVERS'] = header['ORIGIN']  # Get SoFiA version from ORIGIN header
            header['SRCTR'] = release_name
            header['OBJECT'] = source_name
            hdul.flush()

## CASDA release

Move product files to required directory structure for CASDA public data releases:

- catalogue (VOTable version 1.3)
- cubelets (mask and cube files)
- moment_maps (all moment maps, including a mom0 .png file if you want a preview)
- spectra (.spec file in fits format)

File names have the format: `f'{WALLABY_name}_{release_version}_<ext>.fits'`

In [None]:
# copy files over for CASDA release
#
# Builds CASDA/<release_name>/{catalogue,cubelets,moment_maps,spectra} and
# copies the catalogue and the renamed product files into it.

# Create directory structure
basedir = os.path.join('CASDA', release_name)
for subdir in ('catalogue', 'cubelets', 'moment_maps', 'spectra'):
    os.makedirs(os.path.join(basedir, subdir), exist_ok=True)

# Copy catalogue xml
shutil.copy(votable_filename, os.path.join(basedir, 'catalogue', votable_filename))

# Destination sub-directory per product type, checked in this order. A file
# belongs to a type when its suffix contains one of the listed tokens.
product_destinations = (
    ('moment_maps', ('mom0', 'mom1', 'mom2', 'chan')),
    ('cubelets', ('cube', 'mask')),
    ('spectra', ('spec',)),
)

# Copy product files
for idx_pf, pf in enumerate(product_files):
    # The product folder is named after the source. basename is independent
    # of the path separator and of the depth of the release directory.
    source_name = os.path.basename(pf).replace(' ', '_')
    print(f'Source {source_name} [{idx_pf + 1}/{len(product_files)}]')
    p_files = glob.glob(os.path.join(pf, '*'))
    for f in p_files:
        # Take the suffix from the file name only. Splitting the full path
        # would pick up underscores from directory names such as the release
        # name when the file name itself has none.
        _, separator, suffix = os.path.basename(f).rpartition('_')
        if not separator or not os.path.isfile(f):
            # No '<name>_<suffix>' pattern, or not a regular file
            print(f'Skipping file {f}')
            continue
        new_filename = f'{source_name}_{release_name}_{suffix}'

        destination = next(
            (
                subdir
                for subdir, tokens in product_destinations
                if any(t in suffix for t in tokens)
            ),
            None,
        )
        if destination is None:
            print(f'Skipping file {f}')
        else:
            shutil.copy(f, os.path.join(basedir, destination, new_filename))

## CADC Release

Copy product files for CADC public data release required file structure.

- Each detection has its own folder, named with the WALLABY name and the release version, which contains the product files
- Each product file name has the format: `f'{WALLABY_name}_{release_version}_<ext>.fits'`

In [None]:
# copy files over for CADC release
#
# Builds CADC/<release_name>/ with the catalogue and one folder per source
# holding that source's renamed product files.

# Create directory structure
basedir = os.path.join('CADC', release_name)
os.makedirs(basedir, exist_ok=True)

# Copy catalogue xml
shutil.copy(votable_filename, os.path.join(basedir, votable_filename))

# A file is released when its suffix contains one of these tokens
released_products = ('mom0', 'mom1', 'mom2', 'chan', 'cube', 'mask', 'spec')

# Copy product files
for idx_pf, pf in enumerate(product_files):
    # The product folder is named after the source. basename is independent
    # of the path separator and of the depth of the release directory.
    source_name = os.path.basename(pf).replace(' ', '_')
    print(f'Source {source_name} [{idx_pf + 1}/{len(product_files)}]')
    source_dir = os.path.join(basedir, f'{source_name}_{release_name}')
    os.makedirs(source_dir, exist_ok=True)
    p_files = glob.glob(os.path.join(pf, '*'))
    for f in p_files:
        # Take the suffix from the file name only. Splitting the full path
        # would pick up underscores from directory names such as the release
        # name when the file name itself has none.
        _, separator, suffix = os.path.basename(f).rpartition('_')
        is_product = (
            bool(separator)
            and os.path.isfile(f)
            and any(t in suffix for t in released_products)
        )
        if is_product:
            new_filename = f'{source_name}_{release_name}_{suffix}'
            shutil.copy(f, os.path.join(source_dir, new_filename))
        else:
            print(f'Skipping file {f}')

# 3. Kinematic models

---

In [None]:
# Export kinematic models
# TODO: not implemented yet -- this cell is a placeholder and does nothing.

pass