In [2]:
import numpy as np
import random

import h5py
import astropy.io.fits as fits
from astropy.table import Table
from time import gmtime, strftime


import sys
sys.path.append('../')
from file_path import *

import os
from tqdm import tqdm


# Define functions

In [3]:
def merge(files, output, class_id=None, z_vi=None):
    """Merge several spectrum HDF5 files into one, optionally filtering rows.

    The per-object datasets (see ``columns`` and ``spectra`` below) are read
    from every input file and concatenated along their first axis, in the
    order given by ``files``.  ``wavelength`` and ``wavelength_log`` are taken
    from the first file only.  The result is written to ``output`` together
    with a fresh ``index`` dataset (0..n-1) and a set of file attributes.

    Parameters
    ----------
    files : iterable of str
        Paths of the HDF5 files to merge.  Must contain at least one path.
    output : str
        Path of the HDF5 file to create; it is opened in ``'w'`` mode, so an
        existing file is overwritten.
    class_id : int, optional
        Keep only rows whose ``class_person`` equals this value
        (0: Not Inspected / 1: Star / 3: Quasar / 30: BAL Quasar /
        4: Galaxy / 50: Blazar).  ``None`` or ``False`` disables the filter.
    z_vi : bool, optional
        If truthy, keep only rows with ``z_vi > 0``.

    Raises
    ------
    ValueError
        If ``files`` is empty (raised before ``output`` is touched).
    """
    files = list(files)
    if not files:
        raise ValueError('merge() needs at least one input file')

    str_dtype = h5py.string_dtype()
    # (dataset name, dtype used when writing), in the order the datasets are
    # created in the output file.
    columns = (
        ('sdss_name', str_dtype),
        ('thing_id', 'i8'),
        ('plate', 'i8'),
        ('mjd', 'i8'),
        ('fiberid', 'i8'),
        ('class_person', 'i8'),
        ('is_qso_dr12q', 'i8'),
        ('is_qso_final', 'i8'),
        ('is_qso_qn', 'i8'),
        ('autoclass_pqn', str_dtype),
        ('autoclass_dr14q', str_dtype),
        ('sn_median_all', np.float32),
        ('bal_prob', np.float32),
        ('z_vi', np.float32),
        ('z_conf', np.float32),
        ('z_dr12q', np.float32),
        ('z', np.float32),
        ('z_pipe', np.float32),
        ('z_pca', np.float32),
        ('z_qn', np.float32),
        ('zwarn_pca', 'i8'),
        ('deltachi2_pca', np.float32),
        ('url', str_dtype),
    )
    # Per-object spectra; written after the wavelength grids.
    spectra = (
        ('flux', np.float32),
        ('flux_norm', np.float32),
    )
    row_names = [name for name, _ in columns + spectra]

    # Collect the pieces of every column and concatenate once at the end:
    # concatenating inside the loop re-copies everything read so far for each
    # additional file.
    pieces = {name: [] for name in row_names}
    wavelengths = None
    wavelengths_log = None
    for file in tqdm(files):
        # The context manager closes the file even if a dataset is missing.
        with h5py.File(file, 'r') as f:
            if wavelengths is None:
                # Wavelength grids are only retrieved from the first file.
                # NOTE(review): assumes every input shares the same grid; this
                # is not checked.
                wavelengths = f['wavelength'][:]
                wavelengths_log = f['wavelength_log'][:]
            for name in row_names:
                pieces[name].append(f[name][:])

    merged = {}
    for name in row_names:
        # pop() releases the per-file pieces as soon as they are merged.
        merged[name] = np.concatenate(pieces.pop(name))

    # Build one row mask from the requested filters.
    keep = None
    # `class_id` is compared against None/False explicitly so that class 0
    # ("Not Inspected") can be selected as well.
    if class_id is not None and class_id is not False:
        keep = merged['class_person'] == class_id
    # Keep only objects with a positive visual-inspection redshift.
    if z_vi:
        has_z_vi = merged['z_vi'] > 0
        keep = has_z_vi if keep is None else keep & has_z_vi
    if keep is not None:
        merged = {name: values[keep] for name, values in merged.items()}

    # write to file
    print('Writing to file:', output)
    with h5py.File(output, 'w') as f:
        indexs = np.arange(len(merged['sdss_name']))
        f.create_dataset(name="index", data=indexs, dtype='i8')

        for name, dtype in columns:
            f.create_dataset(name=name, data=merged[name], dtype=dtype)

        dset_wavelength = f.create_dataset(name="wavelength", data=wavelengths, dtype=np.float32)
        f.create_dataset(name="wavelength_log", data=wavelengths_log, dtype=np.float32)
        dset_flux = f.create_dataset(name="flux", data=merged['flux'], dtype=np.float32)
        f.create_dataset(name="flux_norm", data=merged['flux_norm'], dtype=np.float32)

        f.attrs['CLASS'] = '0: Not Inspected / 1: Star / 3: Quasar / 30: BAL Quasar / 4: Galaxy / 50: Blazar'
        # Creation time is a fixed stamp; only the modification time is current.
        f.attrs['TIMECREA'] = '2020-07-30 11:54:19'
        f.attrs['TIMEMODI'] = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        f.attrs['REFERENCE'] = 'https://data.sdss.org/datamodel/files/BOSS_QSO/DR16Q/DR16Q_v4.html'
        f.attrs['COMMENTS'] = 'SDSS Spectrum'
        f.attrs['CREATOR'] = 'yu.wang@uniroma1.it'
        f.attrs['MODIFIER'] = 'yu.wang@uniroma1.it'

        dset_wavelength.attrs['unit'] = 'A'
        dset_flux.attrs['unit'] = '1e-17 erg/cm2/s/A'

    print('Saved')

# Merge files

In [4]:
# Earlier hand-picked subset, kept for reference.
# NOTE(review): later cells still call merge(files, ...), but `files` is only
# assigned in this commented-out list and in the last cell of the notebook --
# confirm which inputs those runs are meant to use.
#files = [feature_path+'dr16q-spectrum-3600-10350-n3000.h5', 
#         feature_path+'dr16q-spectrum-3600-10350-n4000.h5',
#         feature_path+'dr16q-spectrum-3600-10350-n5000.h5',
#         feature_path+'dr16q-spectrum-3600-10350-n6000.h5',
#         feature_path+'dr16q-spectrum-3600-10350-n7000.h5',
#         feature_path+'dr16q-spectrum-3600-10350-n7500.h5']

# os.listdir() returns entries in arbitrary order; sort them so the row order
# (and therefore the "index" dataset) of the merged file is reproducible.
filenames = sorted(feature_path+x for x in os.listdir(feature_path) if x.endswith(".h5"))

In [5]:
# Sanity check: number of .h5 files picked up from feature_path.
len(filenames)

74

In [None]:
output = feature_path+'dr16q-full-spectrum.h5'

# `filenames` lists every .h5 file in feature_path and `output` is written to
# the same directory, so on a re-run the previous result would be merged into
# itself.  Drop it from the inputs.
# NOTE(review): other derived files stored in feature_path (e.g. the
# *-zvi-*.h5 files written by the cells below) are also picked up by the
# listing -- confirm that is intended.
inputs = [path for path in filenames
          if os.path.abspath(path) != os.path.abspath(output)]

# No class / z_vi filtering: merge every row of every input file.
merge(inputs, output, class_id=False, z_vi=False)

 80%|███████▉  | 59/74 [05:42<02:38, 10.59s/it]

In [4]:
# BAL quasars only (class_person == 30), restricted to rows with z_vi > 0.
# NOTE(review): `files` is not assigned by any cell above this one -- confirm
# which input list this run is meant to use.
output = feature_path + 'dr16q-spectrum-3600-10350-n3456-zvi-bal.h5'

merge(files=files, output=output, class_id=30, z_vi=True)

100%|██████████| 4/4 [00:13<00:00,  3.46s/it]


Writing to file: ../feature/dr16q-spectrum-3600-10350-n3456-zvi-bal.h5
Saved


In [5]:
# Galaxies only (class_person == 4), restricted to rows with z_vi > 0.
# NOTE(review): relies on `files` from an earlier kernel state -- confirm.
output = feature_path + 'dr16q-spectrum-3600-10350-n3456-zvi-galaxy.h5'

merge(files=files, output=output, class_id=4, z_vi=True)

100%|██████████| 4/4 [00:13<00:00,  3.46s/it]


Writing to file: ../feature/dr16q-spectrum-3600-10350-n3456-zvi-galaxy.h5
Saved


In [6]:
# Stars only (class_person == 1), restricted to rows with z_vi > 0.
# NOTE(review): relies on `files` from an earlier kernel state -- confirm.
output = feature_path + 'dr16q-spectrum-3600-10350-n3456-zvi-star.h5'

merge(files=files, output=output, class_id=1, z_vi=True)

100%|██████████| 4/4 [00:13<00:00,  3.44s/it]


Writing to file: ../feature/dr16q-spectrum-3600-10350-n3456-zvi-star.h5
Saved


In [7]:
# Blazars only (class_person == 50), restricted to rows with z_vi > 0.
# The output file name keeps its existing "blazer" spelling.
# NOTE(review): relies on `files` from an earlier kernel state -- confirm.
output = feature_path + 'dr16q-spectrum-3600-10350-n3456-zvi-blazer.h5'

merge(files=files, output=output, class_id=50, z_vi=True)

100%|██████████| 4/4 [00:13<00:00,  3.46s/it]

Writing to file: ../feature/dr16q-spectrum-3600-10350-n3456-zvi-blazer.h5
Saved





In [12]:
# Single-file run: keep every class, but only rows with z_vi > 0.
files = [feature_path + 'dr16q-spectrum-3600-10350-n9000.h5']
output = feature_path + 'dr16q-spectrum-3600-10350-n9000-zvi.h5'

merge(files=files, output=output, class_id=None, z_vi=True)

100%|██████████| 1/1 [00:00<00:00,  1.12it/s]


Writing to file: ../feature/dr16q-spectrum-3600-10350-n9000-zvi.h5
Saved
