In [7]:
import pandas as pd
import numpy as np
import os
pd.set_option("display.max_columns", None)
import glob

from astroquery.sdss import SDSS, Conf
from astropy.config import set_temp_cache
from astropy.utils.data import clear_download_cache

In [2]:
# Root directory of the SDSS DR9 dataset: the per-RA-slice CSV files are written
# directly into it and the time-window HDF5 files go into its "hdf5" subdirectory.
DATA_DIR = '/astro/store/epyc3/data3/adam_datasets/sdss_dr9'

In [3]:
# NOTE: dead code kept for reference only. The whole cell is a single bare
# string literal, so nothing inside it is executed. It sketches the abandoned
# approach of querying PhotoObj one (field, camcol) pair at a time, which the
# author dropped because the result grew too large.
'''
old method - filter by field and camcol, however grew too large
try:
    df = pd.read_csv('/epyc/data3/adam_datasets/sdss_dr9/sdss_dr9_photoObj.csv')
except FileNotFoundError:
    #cols = ['objID', 'fieldID', 'ra', 'dec', 'raErr', 'decErr', 'i', 'err_i', 'TAI_i', 'mjd', 'field', 'camcol']
    df = pd.DataFrame()
    #with set_temp_cache('/data/epyc/projects/adam_datasets/sdss_dr9/cache'):
        for i in range(11, 1002, 1): #field
            for j in range(1, 7, 1): #camcol
                #print(f'Field: {i}, Camcol: {j}')
                #with set_temp_cache('/data/epyc/projects/adam_datasets/sdss_dr9/cache'):
                print(f'Field: {i}, Camcol: {j}')
                query = f"""
                        SELECT objID, fieldID, field, ra, dec, raErr, decErr, i, err_i, TAI_i, mjd, field, camcol, CLEAN
                        FROM PhotoObj
                        WHERE camcol = {j} AND field = {i}
                        """
                camcol = SDSS.query_sql(query, timeout=5400, data_release=9).to_pandas()
                df = pd.concat([df, camcol], axis=0)
                
                df.to_csv('/epyc/projects/adam_datasets/sdss_dr9/sdss_dr9_photoObj_field:{i}_camcol{}.csv')
'''

'\nold method - filter by field and camcol, however grew too large\ntry:\n    df = pd.read_csv(\'/epyc/data3/adam_datasets/sdss_dr9/sdss_dr9_photoObj.csv\')\nexcept FileNotFoundError:\n    #cols = [\'objID\', \'fieldID\', \'ra\', \'dec\', \'raErr\', \'decErr\', \'i\', \'err_i\', \'TAI_i\', \'mjd\', \'field\', \'camcol\']\n    df = pd.DataFrame()\n    #with set_temp_cache(\'/data/epyc/projects/adam_datasets/sdss_dr9/cache\'):\n        for i in range(11, 1002, 1): #field\n            for j in range(1, 7, 1): #camcol\n                #print(f\'Field: {i}, Camcol: {j}\')\n                #with set_temp_cache(\'/data/epyc/projects/adam_datasets/sdss_dr9/cache\'):\n                print(f\'Field: {i}, Camcol: {j}\')\n                query = f"""\n                        SELECT objID, fieldID, field, ra, dec, raErr, decErr, i, err_i, TAI_i, mjd, field, camcol, CLEAN\n                        FROM PhotoObj\n                        WHERE camcol = {j} AND field = {i}\n                        """\

In [4]:
# RA bin edges covering the full 0-360 degree circle in steps of a tenth of a
# degree: 3600 bins, hence 3601 edges with both endpoints included.
ras = np.linspace(start=0, stop=360, num=10 * 360 + 1)

def query_by_ra(ra_start, ra_stop, check_download_integrity=True):
    """
    Download the SDSS DR9 PhotoObj rows of one RA slice into a CSV file.

    The slice is half-open, ``ra_start <= ra < ra_stop``, and rows whose ``i``
    value equals -9999 are excluded. The result is written to
    ``DATA_DIR/sdss_dr9_observations_<ra_start>_<ra_stop>.csv``. If that file
    already exists the download is skipped, so the function can be re-run to
    resume an interrupted bulk download.

    Parameters
    ----------
    ra_start : float
        Inclusive lower RA bound of the slice, in the units of PhotoObj.ra.
    ra_stop : float
        Exclusive upper RA bound of the slice.
    check_download_integrity : bool, optional
        When True (default), re-read the CSV file and compare its row count with
        a COUNT(*) query over the same slice.

    Raises
    ------
    ValueError
        If the integrity check finds that the number of rows in the CSV file
        differs from the number of rows the server reports for the slice.

    Notes
    -----
    The bounds are written into the file name with two decimals, so slices
    narrower than 0.01 would map onto the same file name.
    """
    file_name = os.path.join(DATA_DIR, f"sdss_dr9_observations_{ra_start:06.2f}_{ra_stop:06.2f}.csv")

    # Shared by the download and the row-count query so that the two can never
    # disagree about which rows belong to the slice.
    # NOTE(review): -9999 is presumably the "no measurement" sentinel for the
    # i-band magnitude -- confirm against the SDSS schema.
    where_clause = f"WHERE (ra >= {ra_start}) AND (ra < {ra_stop}) AND i != -9999"

    if not os.path.exists(file_name):
        query = f"""
                SELECT objID, fieldID, field, ra, dec, raErr, decErr, i, err_i, TAI_i
                FROM PhotoObj
                {where_clause}
                """
        raw_data = SDSS.query_sql(query, timeout=3600, data_release=9).to_pandas()
        formatted_data = {
            'obs_id': raw_data['objID'],
            'exposure_id': raw_data['fieldID'],
            # Seconds -> days.
            # NOTE(review): TAI_i looks like a TAI timestamp, so this value may
            # be an MJD on the TAI scale rather than UTC despite the column
            # name -- confirm whether the offset matters downstream.
            'mjd_utc': raw_data['TAI_i'] / (24 * 3600),
            'ra': raw_data['ra'],
            'dec': raw_data['dec'],
            'ra_sigma': raw_data['raErr'],
            'dec_sigma': raw_data['decErr'],
            'filter': 'i',
            'mag': raw_data['i'],
            'mag_sigma': raw_data['err_i'],
            'observatory_code': '645'
        }
        results = pd.DataFrame(formatted_data)
        results.to_csv(file_name, index=False)
        # Drop astroquery's cached responses after each slice has been saved.
        clear_download_cache(pkg='astroquery')

    if check_download_integrity:
        downloaded_results = pd.read_csv(file_name, index_col=False)

        query = f"""
                SELECT COUNT(*) AS COUNT
                FROM PhotoObj
                {where_clause}
                """
        results = SDSS.query_sql(query, timeout=3600, data_release=9)

        n_results = results["COUNT"][0]
        n_downloaded_results = len(downloaded_results)

        if n_results != n_downloaded_results:
            # The file holds n_downloaded_results rows; the server reports n_results.
            err = (
                f"Downloaded file ({file_name}) contains {n_downloaded_results} rows "
                f"while query expected {n_results} rows."
            )
            raise ValueError(err)

In [5]:
# Smoke test: fetch the first 0.1-degree slice (with the integrity check on),
# then load the CSV it produced so the cell displays the result.
ra_start, ra_stop = 0, .1
query_by_ra(ra_start, ra_stop)

first_slice_csv = os.path.join(DATA_DIR, f"sdss_dr9_observations_{ra_start:06.2f}_{ra_stop:06.2f}.csv")
pd.read_csv(first_slice_csv)

Unnamed: 0,obs_id,exposure_id,mjd_utc,ra,dec,ra_sigma,dec_sigma,filter,mag,mag_sigma,observatory_code
0,1237645876871561542,1237645876871561216,51075.299190,0.044252,-0.937640,0.146648,0.136783,i,20.067837,0.081154,645
1,1237645876871561543,1237645876871561216,51075.299190,0.045429,-0.872684,0.065231,0.057989,i,20.505260,0.063039,645
2,1237645876871561544,1237645876871561216,51075.299190,0.045675,-0.949524,0.156135,0.097594,i,21.049526,0.125765,645
3,1237645876871561715,1237645876871561216,51075.299190,0.044419,-0.998018,0.096438,0.063111,i,20.161160,0.067760,645
4,1237645876871561884,1237645876871561216,51075.299190,0.045444,-0.978974,0.300121,0.218308,i,21.821657,0.324532,645
...,...,...,...,...,...,...,...,...,...,...,...
250919,1237680508800270408,1237680508800270336,55152.263657,0.063027,34.902034,0.073591,0.110997,i,20.367010,0.073811,645
250920,1237680508800270947,1237680508800270336,55152.263657,0.063049,35.086754,0.017250,0.017817,i,18.948711,0.018842,645
250921,1237680512016188289,1237680512016187392,55152.295602,0.063027,34.160764,0.135740,0.143180,i,21.614065,0.168043,645
250922,1237680513089994832,1237680513089994752,55152.295718,0.063044,34.902051,0.083852,0.119518,i,20.352047,0.058995,645


In [16]:
# Download every 0.1-degree RA slice. Consecutive bin edges form the half-open
# [start, stop) interval of each slice; the previous version passed the stale
# globals ra_start/ra_stop on every iteration and so only ever requested the
# first slice. Slices whose CSV file already exists are skipped by query_by_ra.
ras = np.linspace(0, 360, 360 * 10 + 1)
for slice_start, slice_stop in zip(ras[:-1], ras[1:]):
    query_by_ra(slice_start, slice_stop, check_download_integrity=False)

In [19]:
#times taken from casjobs query
TAI_i_min = 4412900195.78825
TAI_i_max = 4765233246.05795

mjd_min = TAI_i_min/(24*3600)
mjd_max = TAI_i_max/(24*3600)

mjd_min, mjd_max

(51075.23374754919, 55153.162570115164)

In [20]:
import glob
import multiprocessing as mp
from astropy.time import Time

# Width of each time window in days (roughly one month).
window_size = 31
# Start MJD of every window; the last window starts before ceil(mjd_max) and
# extends window_size days past its start, so the full range is covered.
window_starts = np.arange(
    np.floor(mjd_min), 
    np.ceil(mjd_max), 
    window_size
)

# Only pick up the per-RA-slice CSV files. A bare '*' would match every entry in
# DATA_DIR, including the "hdf5" output directory, which pd.read_csv cannot
# read. Sorting makes the processing order deterministic.
observation_files = sorted(glob.glob(os.path.join(DATA_DIR, 'sdss_dr9_observations_*.csv')))

In [23]:
def processWindow(window_file_name, observations):
    """
    Append one time window's observations to that window's HDF5 file.

    Parameters
    ----------
    window_file_name : str
        Path of the HDF5 file to append to.
    observations : pandas.DataFrame
        Rows that fall inside the window; an empty frame is ignored.

    Returns
    -------
    None
    """
    # An empty window has nothing to store, so no file is touched.
    if len(observations) == 0:
        return

    # Minimum item sizes requested for these columns of the stored table.
    column_min_sizes = {'obs_id': 40, 'exposure_id': 40, 'filter' : 1, 'observatory_code': 1}
    observations.to_hdf(
        window_file_name,
        key="data",
        mode="a",
        append=True,
        min_itemsize=column_min_sizes,
    )

In [None]:
# Joachim's code: split the rows of every per-RA CSV file into window_size-day
# MJD windows and append each window's rows to that window's HDF5 file.
# NOTE: processWindow appends (mode="a", append=True), so running this cell a
# second time over the same CSV files duplicates rows in the HDF5 files.

# Lower the priority of this process (and of the workers started below).
os.nice(10)

# Make sure the output directory exists before any worker writes into it.
hdf5_dir = os.path.join(DATA_DIR, "hdf5")
os.makedirs(hdf5_dir, exist_ok=True)

# Window bounds and file names are identical for every CSV file, so they are
# computed once here instead of once per file.
window_bounds = []
window_file_names = []
for window_start in window_starts:
    window_end = window_start + window_size
    start_isot = Time(window_start, scale="utc", format="mjd").isot.split("T")[0]
    end_isot = Time(window_end, scale="utc", format="mjd").isot.split("T")[0]

    window_bounds.append((window_start, window_end))
    window_file_names.append(
        os.path.join(hdf5_dir, f"sdss_dr9_observations_{start_isot}_{end_isot}.h5")
    )

observation_files_completed = []
pool = mp.Pool(10)
try:
    for i, observation_file in enumerate(observation_files):
        observations = pd.read_csv(observation_file, index_col=False)
        mjds = observations["mjd_utc"]

        # One (possibly empty) slice of the file per window, half-open in time.
        windows = [
            observations[(mjds >= window_start) & (mjds < window_end)]
            for window_start, window_end in window_bounds
        ]

        # Every window goes to a different file, so the workers never write to
        # the same HDF5 file at once; starmap blocks until all are done.
        pool.starmap(
            processWindow,
            zip(window_file_names, windows)
        )

        # Rewrite the progress log after every file so an interrupted run
        # leaves a record of which files were fully processed.
        observation_files_completed.append(observation_file)
        np.savetxt("files_processed.txt", np.array(observation_files_completed), delimiter="\n", fmt="%s")

        if (i + 1) % 20 == 0:
            print(f"Processed {i + 1} observations files.")
finally:
    # Always release the worker processes, even if a file fails to process.
    pool.close()
    pool.join()

In [None]:
#run index_observations py script (slide 7 of precovery)