# Data Inspection and Preparation of Dataset Creation

In [3]:
#import fitsio
import os
import os.path
import matplotlib.pyplot as plt
import numpy as np
import tqdm
import tensorflow as tf
from astropy.io import fits
import astropy
import json
from astropy.table import Table,vstack 


Sloan data at NERSC is stored in this directory: /global/project/projectdirs/cosmo/data/sdss/dr16/sdss/spectro/redux/plates-dr16

to build dataset, run: tfds build --data_dir=/global/cscratch1/sd/vboehm/Datasets/SDSS_BOSS_all --register_checksums

Data is in subdirectories.

Spectra are organied by plate.

For the moment work with "combined" spectra.  A single object may be observed several times and the spectrograph has blue and red channels.  These data are available but the simple thing is to work where these data are combined to make one spectrum per object.

The combined spectra for the plate are in fits files with the name "spPlate*.fits

see https://data.sdss.org/datamodel/files/BOSS_SPECTRO_REDUX/RUN2D/PLATE4/spPlate.html

In [4]:
local_path = '/global/cscratch1/sd/vboehm/Datasets/SDSS_BOSS_data/'

### Get list of good plates

In [5]:
hdulist = fits.open("/global/project/projectdirs/cosmo/data/sdss/dr16/sdss/spectro/redux/plates-dr16.fits")
plateid = hdulist[1].data['PLATEID']
plates  = hdulist[1].data['PLATE']
mjds    = hdulist[1].data['MJD']
platequality = hdulist[1].data['PLATEQUALITY']
print(len(plates))
endings = np.unique(hdulist[1].data['RUN2D'])
print(endings)
print(mjds[1], plates[1], platequality[1], endings[1])

6826
['103' '104' '26' 'v5_13_0']
51630 266 good 104


In [6]:
plates[np.where(mjds==51608)]

array([267, 279], dtype=int32)

In [7]:
51608,267

(51608, 267)

In [8]:
good_plates = np.where(platequality=='good')
good_data = {}

In [9]:
plates = plates[good_plates]
mjds   = mjds[good_plates]
folder = hdulist[1].data['RUN2D'][good_plates]

count = 0 
for tag in np.unique(folder):
    good_data[tag]={}
    good_data[tag]['plates']=plates[folder==tag]
    good_data[tag]['mjds']=mjds[folder==tag]
    count+=len(mjds[folder==tag])
print(count)

6062


In [10]:
np.unique(folder)

chararray(['103', '104', '26', 'v5_13_0'], dtype='<U7')

In [11]:
# save the names of the data and redshift files
char1 = '-'
char2 = '.'
datafiles_wz=[]
datafiles_nz=[]
datafiles_ending=[]
z_files=[]
for ending in endings:
    sdss_dir = os.path.join("/global/project/projectdirs/cosmo/data/sdss/dr16/sdss/spectro/redux/", ending)
    for dirpath, dirnames, filenames in os.walk(sdss_dir):
        for filename in [f for f in filenames if f.startswith("spPlate")]:
            try:
                this_plate = np.int(filename[filename.find(char1)+1 : filename.find(char2)-6])
                this_mjd   = np.int(filename[-10:-5])
            except:
                print(filename)
            if this_plate in good_data[ending]['plates']:
                ind = np.where(good_data[ending]['plates']==this_plate)[0]
                if this_mjd in good_data[ending]['mjds'][ind]:
                    data_file   = os.path.join(dirpath, filename)
                    file_ending = filename[7::]
                    if ending=='v5_13_0':
                        z_file      = os.path.join(dirpath,ending,'spZbest'+file_ending)
                    else:
                        z_file      = os.path.join(dirpath,'spZbest'+file_ending)
                    if os.path.exists(z_file):
                        z_files.append(z_file)
                        datafiles_wz.append(os.path.join(dirpath, data_file))
                        datafiles_ending.append(ending)
                    else:
                        datafiles_nz.append(os.path.join(dirpath, data_file))
                        print(z_file)

/global/project/projectdirs/cosmo/data/sdss/dr16/sdss/spectro/redux/103/2887/pass2/spZbest-2887-54521.fits
/global/project/projectdirs/cosmo/data/sdss/dr16/sdss/spectro/redux/103/2887/pass1/spZbest-2887-54521.fits
spPlate-1962-53321.fits.save
spPlate-1963-54331.fits.save
spPlate-2247-53857.fits.save
spPlate-2667-54142.fits.save
spPlate-2912-54499.fits.save
spPlate-2174-53521.fits.save
spPlate-2255-53565.fits.save
spPlate-2800-54326.fits.save
spPlate-2256-53613.fits.save
spPlate-2256-53859.fits.save
spPlate-2377-53991.fits.save
spPlate-2078-53378.fits.save
spPlate-2338-53679.fits.save
spPlate-1960-53289.fits.save
spPlate-2079-53379.fits.save
spPlate-2185-53532.fits.save
spPlate-1961-53299.fits.save
spPlate-2671-54141.fits.save
spPlate-2333-53676.fits.save
spPlate-2821-54393.fits.save
spPlate-2887-54521.fits.save
spPlate-2475-53845.fits.save
spPlate-2476-53826.fits.save


In [12]:
## file numbers match, pass folders are something different
print(len(datafiles_wz))
print(len(z_files)+len(datafiles_nz))

6062
6064


In [13]:
with open(os.path.join(local_path,'endings_good.txt'), 'w') as file:
    json.dump(datafiles_ending, file)

In [14]:
with open(os.path.join(local_path,'datafiles_good.txt'), 'w') as file:
    json.dump(datafiles_wz, file)

In [15]:
with open(os.path.join(local_path,'z_files_good.txt'), 'w') as file:
    json.dump(z_files, file)

### target selection

In [27]:
test   = '/global/project/projectdirs/cosmo/data/sdss/dr16/sdss/spectro/redux/specObj-dr16.fits'
table  = Table.read(test, format='fits',hdu=1, memmap=True)
target = table['LEGACY_TARGET1']
MJD    = table['MJD']
plate  = table['PLATE']
fiber  = table['FIBERID']
objid  = table['SPECOBJID']

In [28]:
selection = np.where(target%(64+2+4)!=0)[0]
print(len(selection), len(np.unique( objid[selection].astype(int))), len(np.unique(MJD[selection])), len(np.unique(plate[selection])), len(np.unique(fiber[selection])))
np.save(os.path.join(local_path,'target_selection_QSO_GAL.npy'),[target[selection],MJD[selection],plate[selection],fiber[selection], objid[selection].astype(int)])

1137770 1137770 765 1806 640
