Prerequisites:
* astroquery
* vaex
* numpy
* pandas
* signal (optional)
* time (optional)
* datetime (optional)

In [36]:
from astroquery.utils.tap.core import Tap
import vaex
import numpy as np
from datetime import datetime
from time import time
from astroquery.gaia import Gaia
from os.path import join, abspath
from os import pardir, curdir, mkdir
from glob import glob

In [34]:
root_data_dir = abspath(join(pardir, "Data"))
data_dir = join(root_data_dir, "Gaia-2MASS")
try:
  mkdir(data_dir)
  print("Creating Gaia-2MASS dir in Data dir")
except FileExistsError:
  print("Directory already exist. Good to go!")


Creating Gaia-2MASS dir in Data dir


### utility functions

In [3]:
# progress bar
def progress(percent=0, width=50):
    left = int((width * percent) // 100)
    right = width - left
    
    tags = "#" * left
    spaces = " " * right
    percents = f"{percent:.0f}%"
    
    print("\r[", tags, spaces, "]", percents, sep="", end="", flush=True)

In [5]:
# add timeout, such that sending request again after some period of time
def timeout(func, args=(), kwargs={}, timeout_duration=1, default=None):
    import signal
    from time import time
    from requests import HTTPError
    from time import sleep

    class TimeoutError(Exception):
        pass

    def handler(signum, frame):
        raise TimeoutError()

    # set the timeout handler
    t0 = time()
    signal.signal(signal.SIGALRM, handler) 
    signal.alarm(timeout_duration)
    try:
        result = func(*args, **kwargs)
    except TimeoutError as exc:
        result = default
        t1 = time()
        print("too long, requesting again...")
        print(f"time = {round(t1-t0,2)}s")
    except HTTPError:
        result = default
        t1 = time()
        # a litte hacky, need some fixes
        if(t1-t0 < 1):
            print("service unavailable, sleep for 300s")
            print(f"time = {round(t1-t0,2)}s")
            sleep(300)
            print("continue")
        else:
            print("server not responding, try again")
            print(f"time = {round(t1-t0,2)}s")
    except KeyboardInterrupt:
        raise KeyboardInterrupt
    except Exception:
        result = default
        t1 = time()
        print("some error")
        print(Exception)
        print(f"time = {round(t1-t0,2)}s")
    finally:
        signal.alarm(0)
    
    return result

### defining columns

In [6]:
column_gaia = ["source_id", "pm", "pmra", "pmra_error AS e_pmra", "pmdec", 
           "pmdec_error AS e_pmdec", "parallax", "parallax_error AS e_parallax", "phot_g_mean_mag AS Gmag",	"phot_bp_mean_mag AS BPmag", 
           "phot_rp_mean_mag AS RPmag", "radial_velocity AS rv_gaia",	"radial_velocity_error AS e_rv_gaia",
           "bp_rp", "l AS GLON", "b AS GLAT", "teff_gspphot", "teff_gspphot_lower", "teff_gspphot_upper",
           "logg_gspphot", "logg_gspphot_lower", "logg_gspphot_upper"]

column_astrophysical = ["mh_gspphot", "mh_gspphot_lower", "mh_gspphot_upper", "distance_gspphot", "distance_gspphot_lower", 
                         "distance_gspphot_upper", "ag_gspphot", "ag_gspphot_lower", "ag_gspphot_upper",
                         "mh_gspspec", "mh_gspspec_lower", "mh_gspspec_upper", "alphafe_gspspec", "alphafe_gspspec_lower", 
                         "alphafe_gspspec_upper", "fem_gspspec", "fem_gspspec_lower", "fem_gspspec_upper" ,"spectraltype_esphs"]

column_xmatch_tmass = ["original_ext_source_id AS tmass"]

def appendName(element, name):
    string = element.split(" AS ")
    if(len(string) == 1):
        return f"{name}.\"{element}\""
    else:
        return f"{name}.\"{string[0]}\" AS {string[1]}"

column_gaia = list(map(lambda x: appendName(x, "gdr3"), column_gaia))
column_astrophysical = list(map(lambda x: appendName(x, "astrophysical"), column_astrophysical))
column_xmatch_tmass = list(map(lambda x: appendName(x, "tmass"), column_xmatch_tmass))

columns = column_gaia + column_astrophysical  + column_xmatch_tmass
columns

['gdr3."source_id"',
 'gdr3."pm"',
 'gdr3."pmra"',
 'gdr3."pmra_error" AS e_pmra',
 'gdr3."pmdec"',
 'gdr3."pmdec_error" AS e_pmdec',
 'gdr3."parallax"',
 'gdr3."parallax_error" AS e_parallax',
 'gdr3."phot_g_mean_mag" AS Gmag',
 'gdr3."phot_bp_mean_mag" AS BPmag',
 'gdr3."phot_rp_mean_mag" AS RPmag',
 'gdr3."radial_velocity" AS rv_gaia',
 'gdr3."radial_velocity_error" AS e_rv_gaia',
 'gdr3."bp_rp"',
 'gdr3."l" AS GLON',
 'gdr3."b" AS GLAT',
 'gdr3."teff_gspphot"',
 'gdr3."teff_gspphot_lower"',
 'gdr3."teff_gspphot_upper"',
 'gdr3."logg_gspphot"',
 'gdr3."logg_gspphot_lower"',
 'gdr3."logg_gspphot_upper"',
 'astrophysical."mh_gspphot"',
 'astrophysical."mh_gspphot_lower"',
 'astrophysical."mh_gspphot_upper"',
 'astrophysical."distance_gspphot"',
 'astrophysical."distance_gspphot_lower"',
 'astrophysical."distance_gspphot_upper"',
 'astrophysical."ag_gspphot"',
 'astrophysical."ag_gspphot_lower"',
 'astrophysical."ag_gspphot_upper"',
 'astrophysical."mh_gspspec"',
 'astrophysical."mh_gs

In [7]:
# 2MASS tap endpoint
tap_tmass = Tap(url="https://irsa.ipac.caltech.edu/TAP/sync")

columns_tmass = ["ra", "dec","j_m", "h_m", "k_m", "designation"]
columns_tmass_names = ["ra", "dec", "Jmag", "Hmag", "Kmag", "designation"]

### Divide into multiple RAs
this strategy is to reduce the size of response

In [8]:
# divide into 360 RAs, depend on preference
ras = np.arange(0,361, 1).astype(int)
ras

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

#### Divide further in DECs direction

In [9]:
decs = np.linspace(-90,90,19).astype(int)
decs

array([-90, -80, -70, -60, -50, -40, -30, -20, -10,   0,  10,  20,  30,
        40,  50,  60,  70,  80,  90])

run in loop

In [11]:
TOP = 50_000 # cap maximum rows for each response, so that the response is not exploding

# ra0 for lower boundry and ra1 for upper boundary
# same with dec0 and dec1
for i, (ra0, ra1) in enumerate(zip(ras[:-1], ras[1:])):
    df_com = [] #initial table
    time0 = time()
    progress(0)
    j = 0
    while j < len(decs) -1:
        dec0 = decs[j]
        dec1 = decs[j+1]
        # query gaia data
        # taking wider ra and dec constrains than 2MASS, because of different epoch
        # the contrains are based on https://doi.org/10.1093/mnras/stab3671
        query_gaia = f"""
        SELECT TOP {TOP} {', '.join(columns)}
        FROM gaiadr3.gaia_source AS gdr3
        LEFT JOIN gaiadr3.astrophysical_parameters AS astrophysical ON astrophysical.source_id = gdr3.source_id
        RIGHT JOIN gaiadr3.tmass_psc_xsc_best_neighbour AS tmass ON tmass.source_id = gdr3.source_id
        WHERE gdr3.ra BETWEEN {ra0-1} AND {ra1+1}
        AND gdr3.dec BETWEEN {dec0-1} AND {dec1+1}
        AND parallax > 0
        AND parallax_error/parallax < 0.15
        AND bp_rp BETWEEN -3 AND 6
        AND gdr3.phot_g_mean_mag BETWEEN 3 AND 21
        AND phot_bp_mean_flux_error/phot_bp_mean_flux < 0.15
        AND phot_rp_mean_flux_error/phot_rp_mean_flux < 0.15
        AND ruwe < 1.4	
        AND phot_bp_rp_excess_factor > 1 + 0.015*gdr3.bp_rp*gdr3.bp_rp
        AND phot_bp_rp_excess_factor < 1.3 + 0.06*gdr3.bp_rp*gdr3.bp_rp
        """
        job_gaia = timeout(Gaia.launch_job, args=(query_gaia,), timeout_duration=120)
        if job_gaia == None: #if failed, try again
            print("fail to fetch gaia")
            print("length = ", len(df_com))
            continue
        result_gaia = job_gaia.get_results()
        df_gaia = vaex.from_pandas(result_gaia.to_pandas())
        # query 2MASS data
        query_tmass = f"""
        SELECT TOP {TOP} {", ".join(columns_tmass)} 
        FROM fp_psc
        WHERE ra BETWEEN {ra0} AND {ra1}
        AND dec BETWEEN {dec0} AND {dec1} 
        AND ph_qual = 'AAA'
        """
        job_tmass = timeout(tap_tmass.launch_job, args=(query_tmass,), timeout_duration=120)
        if job_tmass == None: 
            print("fail to fetch tmass")
            print("length = ", len(df_com))
            continue
        result_tmass = job_tmass.get_results()
        df_tmass = result_tmass.to_pandas()
        df_tmass.columns = columns_tmass_names
        # join
        df_tmass = vaex.from_pandas(df_tmass)
        join = df_tmass.join(df_gaia, left_on="designation", right_on="tmass", how="left", allow_duplication=True)
        join.drop(["designation", "tmass"], inplace=True)
        progress((j+1)/(len(decs)-1)*100)
        if(len(df_com) == 0):
            df_com = join
        else:
            df_com = df_com.concat(join)
        j += 1
        t1 = time()
    time1 = time()  
    df_com.rename("sdss13", "SDSS13")
    df_com.rename("rave6", "RAVE6")
    df_com.export(join(data_dir, f"gaia-{ra0:03d}-{ra1:03d}.hdf5"), progress=True)
    print(f"{len(df_com)} || {round((time1-time0)/60, 2)}m")
    print(f"{i} saved {ra0}-{ra1} || {datetime.now()}")

[##                                                ]6%

Approximately 95 mil rows (23 GB)

taking ~ 48 hours

### Preview

In [39]:
files = glob(join(data_dir, "*.hdf5"))
files[:5]

['/home2/s20321005/Thesis-Project/Data/Gaia-2MASS/gaia-219-220.hdf5',
 '/home2/s20321005/Thesis-Project/Data/Gaia-2MASS/gaia-079-080.hdf5',
 '/home2/s20321005/Thesis-Project/Data/Gaia-2MASS/gaia-218-219.hdf5',
 '/home2/s20321005/Thesis-Project/Data/Gaia-2MASS/gaia-182-183.hdf5',
 '/home2/s20321005/Thesis-Project/Data/Gaia-2MASS/gaia-045-046.hdf5']

In [40]:
gaia = vaex.open_many(files)
gaia

#,source_id,ra,dec,pm,pmra,e_pmra,pmdec,e_pmdec,parallax,e_parallax,Gmag,BPmag,RPmag,rv_gaia,e_rv_gaia,bp_rp,GLON,GLAT,teff_gspphot,teff_gspphot_lower,teff_gspphot_upper,logg_gspphot,logg_gspphot_lower,logg_gspphot_upper,mh_gspphot,mh_gspphot_lower,mh_gspphot_upper,distance_gspphot,distance_gspphot_lower,distance_gspphot_upper,ag_gspphot,ag_gspphot_lower,ag_gspphot_upper,RAVE6,SDSS13,Jmag,Hmag,Kmag,mh_gspspec,mh_gspspec_lower,mh_gspspec_upper,alphafe_gspspec,alphafe_gspspec_lower,alphafe_gspspec_upper,fem_gspspec,fem_gspspec_lower,fem_gspspec_upper,spectraltype_esphs
0,5764635079544432128,219.77805455719,-89.56922832436,7.511,1.522,0.021,-7.355,0.020949371,2.8775,0.0184,10.847249,11.118501,10.412085,-5.29,0.73,0.706416,303.15029042357,-26.74401558194,6141.6,6130.1,6152.1,4.043,4.0398,4.0469,-0.4365,-0.4457,-0.4269,344.0585,341.9788,346.0511,0.0799,0.0745,0.0847,,--,9.953,9.698,9.683,-0.26,-0.32,-0.22,0.33,0.26,0.4,,,,F
1,5764635393079767296,219.15195997564,-89.5318109641,11.203,-8.352,0.012,-7.467,0.012214055,0.4305,0.011,13.174702,13.789571,12.425439,67.09,0.91,1.364132,303.16401990137,-26.70832039314,4718.7,4713.3,4724.8,2.5833,2.5655,2.6014,-0.1946,-0.1952,-0.1941,2070.6995,2031.6134,2112.0928,0.3619,0.3574,0.3668,,--,11.418,10.819,10.704,,,,,,,,,,K
2,5764635496158988544,219.83101507284,-89.50979783241,2.65,-2.095,0.022,1.623,0.024982337,0.2155,0.0218,14.993793,15.584641,14.263248,,,1.321393,303.18075914642,-26.69113646047,4881.6,4875.7,4888.8,3.0708,3.0509,3.0862,-0.1415,-0.15,-0.1356,3446.1995,3376.546,3517.8865,0.4145,0.4093,0.4207,,--,13.306,12.725,12.634,,,,,,,,,,K
3,5764638554175708672,219.24632619569,-89.48870252979,11.672,-10.811,0.017,-4.399,0.01951931,1.9675,0.0166,14.446186,14.915824,13.815229,13.17,3.9,1.100595,303.18615009602,-26.66999019517,5226.6,5217.9,5235.5,4.5456,4.5418,4.5518,-0.1967,-0.2063,-0.1864,490.8246,487.5747,493.967,0.2623,0.2565,0.2679,,--,13.026,12.587,12.512,,,,,,,,,,K
4,5764639241370488064,219.24010479539,-89.44765029195,14.783,-10.759,0.113,-10.139,0.12553297,1.2643,0.1066,17.817894,18.871086,16.80843,,,2.062656,303.2064053892,-26.63317348591,4119.2,4089.7,4135.9,4.6477,4.6411,4.6547,0.3467,0.2681,0.3855,822.1085,799.1945,831.1044,0.7264,0.6964,0.7513,,--,15.434,14.768,14.468,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94994937,576406227696244864,33.65804765403,89.33197663578,4.686,-4.139,0.072,-2.197,0.064973824,0.6416,0.0561,16.92992,17.625141,16.097548,,,1.527594,123.19695964994,26.50349463792,4337.4,4217.8,4422.2,4.3911,4.2846,4.4237,-0.3812,-0.4888,-0.3243,1117.2708,1065.5874,1205.549,0.305,0.1938,0.3925,,--,14.993,14.263,14.201,,,,,,,,,,K
94994938,576431585183142528,33.61268607589,89.48206639541,1.894,-1.763,0.02,0.693,0.017867079,0.6989,0.0159,14.399571,15.067607,13.604876,-14.88,3.59,1.462731,123.13723815775,26.64377331058,5178.8,5133.0,5367.7,3.5746,3.4492,3.8413,-0.4349,-0.4827,-0.291,1165.2085,893.3535,1341.377,0.9013,0.8756,0.9909,,--,12.616,12.051,11.924,,,,,,,,,,K
94994939,576445638316125184,33.60957894134,89.57301292445,4.728,-2.691,0.018,3.887,0.015934618,0.7782,0.015,14.292352,14.78063,13.633098,-36.36,7.38,1.147533,123.10127650839,26.72887005884,5855.7,5835.1,5878.4,4.0048,3.9971,4.0145,-0.583,-0.6066,-0.555,1217.041,1196.2493,1234.8632,0.7542,0.7434,0.7665,,--,12.857,12.457,12.404,,,,,,,,,,G
94994940,576447459382243200,33.57768719537,89.70251709954,17.162,10.764,0.021,13.367,0.021056626,0.9942,0.0197,14.856944,15.333351,14.206361,-6.87,13.13,1.12699,123.04990412536,26.85002805556,5420.3,5395.1,5501.3,4.2236,4.2103,4.2407,-1.28,-1.323,-1.1485,946.7403,929.4686,962.1341,0.4847,0.4702,0.5309,,--,13.478,13.002,12.953,,,,,,,,,,G
