Prerequisites:
* astroquery
* vaex
* numpy
* pandas
* signal (optional)
* time (optional)
* datetime (optional)

In [11]:
import sys
from datetime import datetime
from glob import glob
from os import makedirs, mkdir, pardir
from os.path import abspath, basename, join
from time import time

import numpy as np
import vaex
from astroquery.gaia import Gaia
from astroquery.utils.tap.core import Tap

In [1]:
# ras = np.linspace(0, 360, 13).astype(int)
# for ra0, ra1 in zip(ras[:-1], ras[1:]):
#   print(f"python gaia.py {ra0} {ra1} -90 90 log{ra0:03d}-{ra1:03d}.txt")

In [5]:
# Put the parent of the current working directory at the front of sys.path
# so that the project-local `utils` module can be imported from this notebook.
# NOTE(review): this depends on the directory the kernel was started in.
util_dir = abspath(pardir)
sys.path.insert(0, util_dir)

# Helpers used by the download cells below: `timeout` wraps the TAP calls,
# `progressbar` reports progress, `appendName` prefixes a column with a table alias.
from utils import timeout, progressbar, appendName

In [6]:
# Absolute path of the "Data" directory located one level above the current working directory
root_data_dir = join(abspath(pardir), "Data")
root_data_dir

'/home2/s20321005/Thesis-Project/Data'

In [7]:
# Output directory for the joined Gaia x 2MASS files written by the download loop.
name = "Gaia-2MASS-3"
gaia_data_dir = join(root_data_dir, name)
try:
    # makedirs (unlike mkdir) also creates a missing parent "Data" directory
    # instead of failing with FileNotFoundError on a fresh checkout.
    makedirs(gaia_data_dir)
    print(f"Creating {gaia_data_dir} dir in Data dir")
except FileExistsError:
    print(f"Directory {gaia_data_dir} already exist. Good to go!")


Directory /home2/s20321005/Thesis-Project/Data/Gaia-2MASS-3 already exist. Good to go!


In [8]:
# Output directory for the raw 2MASS files written by the download loop.
name = "TWOMASS"
tmass_data_dir = join(root_data_dir, name)
try:
    # makedirs (unlike mkdir) also creates a missing parent "Data" directory
    # instead of failing with FileNotFoundError on a fresh checkout.
    makedirs(tmass_data_dir)
    print(f"Creating {tmass_data_dir} dir in Data dir")
except FileExistsError:
    print(f"Directory {tmass_data_dir} already exist. Good to go!")

Creating /home2/s20321005/Thesis-Project/Data/TWOMASS dir in Data dir


### defining columns

In [6]:
# Columns selected under the `gdr3` alias of the Gaia query ("<column> AS <alias>" renames a column).
column_gaia = [
    "source_id",
    "pm",
    "pmra",
    "pmra_error AS e_pmra",
    "pmdec",
    "pmdec_error AS e_pmdec",
    "parallax",
    "parallax_error AS e_parallax",
    "phot_g_mean_mag AS Gmag",
    "phot_bp_mean_mag AS BPmag",
    "phot_rp_mean_mag AS RPmag",
    "phot_bp_mean_flux_over_error AS Fb_over_err",
    "phot_rp_mean_flux_over_error AS Fr_over_err",
    "ruwe",
    "phot_bp_rp_excess_factor AS excess_factor",
    "radial_velocity AS rv_gaia",
    "radial_velocity_error AS e_rv_gaia",
    "l AS GLON",
    "b AS GLAT",
    "teff_gspphot",
    "teff_gspphot_lower",
    "teff_gspphot_upper",
    "logg_gspphot",
    "logg_gspphot_lower",
    "logg_gspphot_upper",
]

# Columns selected under the `astrophysical` alias.
column_astrophysical = [
    "mh_gspphot",
    "mh_gspphot_lower",
    "mh_gspphot_upper",
    "distance_gspphot",
    "distance_gspphot_lower",
    "distance_gspphot_upper",
    "ag_gspphot",
    "ag_gspphot_lower",
    "ag_gspphot_upper",
    "mh_gspspec",
    "mh_gspspec_lower",
    "mh_gspspec_upper",
    "alphafe_gspspec",
    "alphafe_gspspec_lower",
    "alphafe_gspspec_upper",
    "fem_gspspec",
    "fem_gspspec_lower",
    "fem_gspspec_upper",
    "spectraltype_esphs",
]

# Column selected under the `join_table` alias: the 2MASS identifier used to match both catalogues.
column_join = ["original_psc_source_id AS tmass"]

# Prefix every column with the table alias it is selected from.
column_gaia = [appendName(field, "gdr3") for field in column_gaia]
column_astrophysical = [appendName(field, "astrophysical") for field in column_astrophysical]
column_join = [appendName(field, "join_table") for field in column_join]

# Full SELECT list of the Gaia query, in table order.
columns = [*column_gaia, *column_astrophysical, *column_join]
columns

['gdr3."source_id"',
 'gdr3."pm"',
 'gdr3."pmra"',
 'gdr3."pmra_error" AS e_pmra',
 'gdr3."pmdec"',
 'gdr3."pmdec_error" AS e_pmdec',
 'gdr3."parallax"',
 'gdr3."parallax_error" AS e_parallax',
 'gdr3."phot_g_mean_mag" AS Gmag',
 'gdr3."phot_bp_mean_mag" AS BPmag',
 'gdr3."phot_rp_mean_mag" AS RPmag',
 'gdr3."phot_bp_mean_flux_over_error" AS Fb_over_err',
 'gdr3."phot_rp_mean_flux_over_error" AS Fr_over_err',
 'gdr3."ruwe"',
 'gdr3."phot_bp_rp_excess_factor" AS excess_factor',
 'gdr3."radial_velocity" AS rv_gaia',
 'gdr3."radial_velocity_error" AS e_rv_gaia',
 'gdr3."l" AS GLON',
 'gdr3."b" AS GLAT',
 'gdr3."teff_gspphot"',
 'gdr3."teff_gspphot_lower"',
 'gdr3."teff_gspphot_upper"',
 'gdr3."logg_gspphot"',
 'gdr3."logg_gspphot_lower"',
 'gdr3."logg_gspphot_upper"',
 'astrophysical."mh_gspphot"',
 'astrophysical."mh_gspphot_lower"',
 'astrophysical."mh_gspphot_upper"',
 'astrophysical."distance_gspphot"',
 'astrophysical."distance_gspphot_lower"',
 'astrophysical."distance_gspphot_upper

In [9]:
# TAP client used for the 2MASS queries
tap_tmass = Tap(url="https://irsa.ipac.caltech.edu/TAP/sync")

# (column requested from the service, column name used in the local tables)
_tmass_column_pairs = [
    ("ra", "ra"),
    ("dec", "dec"),
    ("j_m", "Jmag"),
    ("k_m", "Kmag"),
    ("designation", "designation"),
    ("ph_qual", "ph_qual"),
]
columns_tmass = [remote for remote, _ in _tmass_column_pairs]
columns_tmass_names = [local for _, local in _tmass_column_pairs]

### Divide into multiple RAs
This strategy reduces the size of each response.

In [10]:
# RA strip edges in whole degrees (end point inclusive); widen the range as preferred, up to the full 0-360
ras = np.arange(241, 243 + 1, dtype=int)
# width of one RA strip
dra = ras[1] - ras[0]
ras

array([241, 242, 243])

#### Divide further in DECs direction

In [13]:
# Dec strip edges in whole degrees, from -30 to 0 inclusive
decs = np.arange(-30, 0 + 1, dtype=int)
# height of one Dec strip
ddec = decs[1] - decs[0]
decs

array([-30, -29, -28, -27, -26, -25, -24, -23, -22, -21, -20, -19, -18,
       -17, -16, -15, -14, -13, -12, -11, -10,  -9,  -8,  -7,  -6,  -5,
        -4,  -3,  -2,  -1,   0])

Run the download in a loop.
It is recommended to run this in a dedicated `.py` script.

In [11]:
# Download the quality-filtered Gaia sample and the matching 2MASS sources strip by strip.
# ra0/ra1 are the lower/upper RA boundaries of the current strip,
# dec0/dec1 the lower/upper Dec boundaries of the current sub-strip.
for i, (ra0, ra1) in enumerate(zip(ras[:-1], ras[1:])):
    TOP = 5_000_000 # cap maximum rows for each response, so that the response is not exploding
    df_com = [] # joined table accumulated over the Dec sub-strips (a plain list until the first result arrives)
    df_com_tmass = [] # 2MASS-only table accumulated over the Dec sub-strips
    time0 = time()
    progressbar(0, info=f"{ra0}-{ra1}")
    j = 0 # index of the Dec sub-strip being fetched
    skip = False # True while the current sub-strip is being retried
    trying = 0 # consecutive failed fetches of the current sub-strip
    while j < len(decs) -1:
        if trying > 10:
            print("too many tries, raise error")
            raise Exception("too many tries")
        # Start the timer on the first attempt only, so the reported time includes retries.
        # The previous `if ~skip:` was always true: `~` is bitwise NOT and both
        # ~False (-1) and ~True (-2) are truthy.
        if not skip:
            t0 = time()
        dec0 = decs[j]
        dec1 = decs[j+1]
        # query gaia data
        # taking wider ra and dec constraints than 2MASS, because of different epoch
        # the constraints are based on https://doi.org/10.1093/mnras/stab3671
        query_gaia = f"""
        SELECT TOP {TOP} {', '.join(columns)}
        FROM gaiadr3.gaia_source AS gdr3
        LEFT JOIN gaiadr3.astrophysical_parameters AS astrophysical ON astrophysical.source_id = gdr3.source_id
        RIGHT JOIN gaiadr3.tmass_psc_xsc_best_neighbour AS tmass ON tmass.source_id = gdr3.source_id
        RIGHT JOIN gaiadr3.tmass_psc_xsc_join as join_table ON join_table.clean_tmass_psc_xsc_oid = tmass.clean_tmass_psc_xsc_oid
        WHERE gdr3.ra BETWEEN {ra0-dra*0.5} AND {ra1+dra*0.5}
        AND gdr3.dec BETWEEN {dec0-ddec*0.5} AND {dec1+ddec*0.5} 
        AND parallax > 0
        AND parallax_error/parallax < 0.15
        AND bp_rp BETWEEN -3 AND 6
        AND gdr3.phot_g_mean_mag BETWEEN 3 AND 21
        AND phot_bp_mean_flux_error/phot_bp_mean_flux < 0.15
        AND phot_rp_mean_flux_error/phot_rp_mean_flux < 0.15
        AND ruwe < 1.4	
        AND phot_bp_rp_excess_factor > 1 + 0.015*gdr3.bp_rp*gdr3.bp_rp
        AND phot_bp_rp_excess_factor < 1.3 + 0.06*gdr3.bp_rp*gdr3.bp_rp
        """
        job_gaia = timeout(Gaia.launch_job, args=(query_gaia,), timeout_duration=120)
        if job_gaia is None: # a None result is treated as a failed fetch: retry the same sub-strip
            print("fail to fetch gaia")
            print("length = ", len(df_com))
            skip = True
            trying += 1
            continue
        result_gaia = job_gaia.get_results()
        if(len(result_gaia) == TOP):
            # the response filled the cap, so rows may be missing: double the cap and refetch
            print(f"gaia data is capped, increase TOP | {TOP}")
            TOP *= 2
            skip = True
            continue
        df_pandas = result_gaia.to_pandas()
        # keep a single Gaia row per 2MASS identifier
        df_pandas = df_pandas.drop_duplicates(subset=['tmass'], keep="first")
        df_pandas.rename(columns={'glon': 'GLON', 'glat': 'GLAT'}, inplace=True)
        df_gaia = vaex.from_pandas(df_pandas)
        # query 2MASS data
        query_tmass = f"""
        SELECT TOP {TOP} {", ".join(columns_tmass)} 
        FROM fp_psc
        WHERE ra BETWEEN {ra0} AND {ra1}
        AND dec BETWEEN {dec0} AND {dec1} 
        AND j_m < 13.5
        AND ph_qual LIKE 'A__'
        AND ph_qual LIKE '__A'
        """
        job_tmass = timeout(tap_tmass.launch_job, args=(query_tmass,), timeout_duration=120)
        if job_tmass is None: # same retry policy as for the Gaia fetch
            print("fail to fetch tmass")
            print("length = ", len(df_com))
            skip = True
            trying += 1
            continue
        result_tmass = job_tmass.get_results()
        if(len(result_tmass) == TOP):
            print(f"tmass data is capped, increase TOP | {TOP}")
            TOP *= 2
            skip = True
            continue
        df_tmass = result_tmass.to_pandas()
        df_tmass.columns = columns_tmass_names
        # join: every 2MASS row is kept, Gaia columns are attached where the identifiers match
        df_tmass = vaex.from_pandas(df_tmass)
        join_table = df_tmass.join(df_gaia, left_on="designation", right_on="tmass", how="left")
        join_table.drop(["designation", "tmass"], inplace=True)
        if(len(df_com) == 0):
            df_com = join_table
            df_com_tmass = df_tmass
        else:
            df_com = df_com.concat(join_table)
            df_com_tmass = df_com_tmass.concat(df_tmass)
        # sub-strip done: advance and reset the retry state
        j += 1
        t1 = time()
        skip = False
        trying = 0
        # next cap: twice the larger of the two responses just received, but at least 50 000
        TOP = np.max([int(len(df_tmass) * 2), 50_000, int(len(df_gaia) * 2)])
        progressbar((j)/(len(decs)-1)*100, info=f"{ra0}-{ra1} | [{dec0}]-[{dec1}] | {round(t1-t0,2)} s | join = {len(join_table)} | tmass = {len(df_tmass)} | gaia = {len(df_gaia)}| TOP = {TOP}")
    time1 = time()  
    df_com.export(join(gaia_data_dir, f"gaia-{ra0:03d}-{ra1:03d}.hdf5"), progress=True)
    df_com_tmass.export(join(tmass_data_dir, f"tmass-{ra0:03d}-{ra1:03d}.hdf5"), progress=True)
    print(f"{len(df_com)} || {round((time1-time0)/60, 2)}m")
    print(f"{i} saved {ra0}-{ra1} || {datetime.now()}")
    # only the first RA strip is processed in this cell
    break


[                                                  ]0% 0-1
[##                                                ]6% 0-1 | [-90]-[-80] | 15.9 s | join = 388 | tmass = 388 | TOP = 50000
[#####                                             ]11% 0-1 | [-80]-[-70] | 22.37 s | join = 900 | tmass = 900 | TOP = 50000
[########                                          ]17% 0-1 | [-70]-[-60] | 27.57 s | join = 1182 | tmass = 1182 | TOP = 50000
[###########                                       ]22% 0-1 | [-60]-[-50] | 25.0 s | join = 1266 | tmass = 1266 | TOP = 50000
[#############                                     ]28% 0-1 | [-50]-[-40] | 28.14 s | join = 1355 | tmass = 1355 | TOP = 50000
[################                                  ]33% 0-1 | [-40]-[-30] | 28.72 s | join = 1527 | tmass = 1527 | TOP = 50000
[###################                               ]39% 0-1 | [-30]-[-20] | 21.54 s | join = 1590 | tmass = 1590 | TOP = 50000
[######################                            ]44% 0-1

In [14]:
# Same download as the previous cell but without the Gaia quality cuts and the 2MASS
# photometric-quality cuts, with wider Gaia margins, a larger row cap and a longer Gaia timeout.
# ra0/ra1 are the lower/upper RA boundaries of the current strip,
# dec0/dec1 the lower/upper Dec boundaries of the current sub-strip.
ORI_TOP = 50_000_000
for i, (ra0, ra1) in enumerate(zip(ras[:-1], ras[1:])):
    TOP = ORI_TOP # cap maximum rows for each response, so that the response is not exploding
    df_com = [] # joined table accumulated over the Dec sub-strips (a plain list until the first result arrives)
    df_com_tmass = [] # 2MASS-only table accumulated over the Dec sub-strips
    time0 = time()
    progressbar(0, info=f"{ra0}-{ra1}")
    j = 0 # index of the Dec sub-strip being fetched
    skip = False # True while the current sub-strip is being retried
    trying = 0 # consecutive failed fetches of the current sub-strip
    while j < len(decs) -1:
        if trying > 15:
            print("too many tries, raise error")
            raise Exception("too many tries")
        # Start the timer on the first attempt only, so the reported time includes retries.
        # The previous `if ~skip:` was always true: `~` is bitwise NOT and both
        # ~False (-1) and ~True (-2) are truthy.
        if not skip:
            t0 = time()
        dec0 = decs[j]
        dec1 = decs[j+1]
        # query gaia data
        # taking wider ra and dec constraints than 2MASS, because of different epoch
        query_gaia = f"""
        SELECT TOP {TOP} {', '.join(columns)}
        FROM gaiadr3.gaia_source AS gdr3
        LEFT JOIN gaiadr3.astrophysical_parameters AS astrophysical ON astrophysical.source_id = gdr3.source_id
        RIGHT JOIN gaiadr3.tmass_psc_xsc_best_neighbour AS tmass ON tmass.source_id = gdr3.source_id
        RIGHT JOIN gaiadr3.tmass_psc_xsc_join as join_table ON join_table.clean_tmass_psc_xsc_oid = tmass.clean_tmass_psc_xsc_oid
        WHERE gdr3.ra BETWEEN {ra0-dra*1} AND {ra1+dra*1}
        AND gdr3.dec BETWEEN {dec0-ddec*1} AND {dec1+ddec*1} 
        """
        job_gaia = timeout(Gaia.launch_job, args=(query_gaia,), timeout_duration=600)
        if job_gaia is None: # a None result is treated as a failed fetch: retry the same sub-strip
            print("fail to fetch gaia")
            print("length = ", len(df_com))
            skip = True
            trying += 1
            continue
        result_gaia = job_gaia.get_results()
        if(len(result_gaia) == TOP):
            # the response filled the cap, so rows may be missing: double the cap and refetch
            print(f"gaia data is capped, increase TOP | {TOP}")
            TOP *= 2
            skip = True
            continue
        df_pandas = result_gaia.to_pandas()
        # keep a single Gaia row per 2MASS identifier
        df_pandas = df_pandas.drop_duplicates(subset=['tmass'], keep="first")
        df_pandas.rename(columns={'glon': 'GLON', 'glat': 'GLAT'}, inplace=True)
        df_gaia = vaex.from_pandas(df_pandas)
        # query 2MASS data
        query_tmass = f"""
        SELECT TOP {TOP} {", ".join(columns_tmass)} 
        FROM fp_psc
        WHERE ra BETWEEN {ra0} AND {ra1}
        AND dec BETWEEN {dec0} AND {dec1} 
        AND j_m < 13.5
        """
        job_tmass = timeout(tap_tmass.launch_job, args=(query_tmass,), timeout_duration=120)
        if job_tmass is None: # same retry policy as for the Gaia fetch
            print("fail to fetch tmass")
            print("length = ", len(df_com))
            skip = True
            trying += 1
            continue
        result_tmass = job_tmass.get_results()
        if(len(result_tmass) == TOP):
            print(f"tmass data is capped, increase TOP | {TOP}")
            TOP *= 2
            skip = True
            continue
        df_tmass = result_tmass.to_pandas()
        df_tmass.columns = columns_tmass_names
        # join: every 2MASS row is kept, Gaia columns are attached where the identifiers match
        df_tmass = vaex.from_pandas(df_tmass)
        join_table = df_tmass.join(df_gaia, left_on="designation", right_on="tmass", how="left")
        join_table.drop(["designation", "tmass"], inplace=True)
        if(len(df_com) == 0):
            df_com = join_table
            df_com_tmass = df_tmass
        else:
            df_com = df_com.concat(join_table)
            df_com_tmass = df_com_tmass.concat(df_tmass)
        # sub-strip done: advance and reset the retry state
        j += 1
        t1 = time()
        skip = False
        trying = 0
        # next cap: twice the larger of the two responses just received, but at least ORI_TOP
        TOP = np.max([int(len(df_tmass) * 2), ORI_TOP, int(len(df_gaia) * 2)])
        progressbar((j)/(len(decs)-1)*100, info=f"{ra0}-{ra1} | [{dec0}]-[{dec1}] | {round(t1-t0,2)} s | join = {len(join_table)} | tmass = {len(df_tmass)} | gaia = {len(df_gaia)}| TOP = {TOP}")
    time1 = time()  
    df_com.export(join(gaia_data_dir, f"gaia-{ra0:03d}-{ra1:03d}.hdf5"), progress=True)
    df_com_tmass.export(join(tmass_data_dir, f"tmass-{ra0:03d}-{ra1:03d}.hdf5"), progress=True)
    print(f"{len(df_com)} || {round((time1-time0)/60, 2)}m")
    print(f"{i} saved {ra0}-{ra1} || {datetime.now()}")
    # only the first RA strip is processed in this cell
    break


[                                                  ] 0% 255-256
UWU
[#                                                 ] 3% 255-256 | [-30]-[-29] | 129.7 s | join = 7963 | tmass = 7963 | gaia = 202932| TOP = 50000000


KeyboardInterrupt: 

In [15]:
# Preview the joined table accumulated by the download loop
df_com

#,ra,dec,Jmag,Kmag,ph_qual,source_id,pm,pmra,e_pmra,pmdec,e_pmdec,parallax,e_parallax,gmag,bpmag,rpmag,fb_over_err,fr_over_err,ruwe,excess_factor,rv_gaia,e_rv_gaia,GLON,GLAT,teff_gspphot,teff_gspphot_lower,teff_gspphot_upper,logg_gspphot,logg_gspphot_lower,logg_gspphot_upper,mh_gspphot,mh_gspphot_lower,mh_gspphot_upper,distance_gspphot,distance_gspphot_lower,distance_gspphot_upper,ag_gspphot,ag_gspphot_lower,ag_gspphot_upper,mh_gspspec,mh_gspspec_lower,mh_gspspec_upper,alphafe_gspspec,alphafe_gspspec_lower,alphafe_gspspec_upper,fem_gspspec,fem_gspspec_lower,fem_gspspec_upper,spectraltype_esphs
0,0.000464,-84.272873,12.869,12.374,AAA,4617686126396602368,3.6212091,-2.123925825256161,0.018093891,2.9329329266790864,0.015576468,1.2414398898080128,0.014017576,14.117249,14.546528,13.5242605,1097.9094,1719.5553,1.011804,1.2151662,20.115417,3.325204,304.44429715164756,-32.70355447144384,5658.605,5653.9224,5668.203,4.2226,4.2043,4.2496,-0.0402,-0.0457,-0.0316,770.2898,746.2748,786.9589,0.2956,0.2926,0.2995,,,,,,,,,,G
1,0.001247,-82.041222,12.825,12.156,AAA,4630918680117199104,1.7595602,-0.8643814806435555,0.022085961,-1.532611183039537,0.02249431,0.05787829500347088,0.018220302,14.542501,15.125132,13.804353,748.2229,1359.1959,1.0981786,1.254802,243.7743,7.210859,305.0842797396042,-34.87090952692804,5361.0396,5358.2354,5364.41,2.2003,2.2,2.2009,-1.3006,-1.3027,-1.299,11844.438,11838.314,11851.41,0.799,0.7968,0.8014,,,,,,,,,,G
2,0.001998,-84.804314,12.523,11.93,AAA,4617598371624860928,17.00417,16.266358206309686,0.015283653,4.954528511167747,0.016042866,0.686194890423576,0.013194535,14.117252,14.645076,13.433549,974.97577,2022.8025,1.0050431,1.2361251,-23.821648,3.2447915,304.29624940743884,-32.18708166475209,,,,,,,,,,,,,,,,,,,,,,,,,K
3,0.002857,-84.718956,11.746,10.877,AAA,4617602769671367936,5.9807105,5.927853781783437,0.014620288,-0.7933771944304462,0.01440706,0.17823265910718195,0.011743693,13.724135,14.456079,12.896074,962.67413,2366.5325,1.1000682,1.272043,131.92238,1.2741442,304.31980747322217,-32.26998931504778,,,,,,,,,,,,,,,,,,,,,,,,,K
4,0.004052,-82.750267,13.333,12.872,AAA,4630074530064792448,11.209091,10.524865569146224,0.019602386,3.856285092898922,0.02012978,1.027615140072866,0.01769279,14.624002,15.035072,14.034426,890.7594,1179.6777,0.9856547,1.2211474,-43.84897,5.5962825,304.8768564308389,-34.1827911363649,6107.482,6101.212,6117.21,4.2386,4.2239,4.2526,-0.8342,-0.8443,-0.8225,911.0151,895.5346,927.2803,0.619,0.6154,0.624,,,,,,,,,,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2499,0.995764,-61.164822,13.447,12.679,AAA,4905628808458200832,35.228916,32.904049421000536,0.024540035,-12.58571385047603,0.024466725,2.6240438140165567,0.023590863,15.270068,15.984568,14.45243,456.00812,1121.7408,1.036435,1.2694062,,,312.89517546174005,-55.03594452086884,4456.8164,4449.856,4470.296,4.6667,4.6624,4.6713,0.3071,0.2708,0.3548,401.7614,399.8343,404.555,0.2997,0.2939,0.3081,,,,,,,,,,K
2500,0.995932,-60.366158,11.679,10.749,AAA,4905913753768390272,237.948,131.28845455960143,0.03185758,-198.45048189872955,0.02766468,28.49055973997624,0.02807787,14.892837,16.820797,13.593732,189.0663,556.4315,1.2046098,1.5155118,28.436165,2.244312,313.3474057627129,-55.79141635510841,3030.2434,3029.2935,3032.0671,5.0505,5.0407,5.0525,-0.1082,-0.1147,-0.077,35.2146,35.0966,36.1397,0.0002,0.0,0.0026,,,,,,,,,,M
2501,0.99779,-63.871353,13.314,12.743,AAA,4901335078112927872,9.282132,3.1022623093525334,0.019708283,8.748368312069108,0.020035382,2.045443061782102,0.019035911,14.784108,15.29691,14.116489,712.97504,1327.6842,0.9908932,1.2307202,20.504078,4.5867405,311.4760884991196,-52.46271075234208,4805.785,4795.141,4815.1313,4.5813,4.5751,4.5861,-0.0132,-0.0263,-0.0011,491.9373,489.5939,495.7933,0.0543,0.0457,0.0621,,,,,,,,,,K
2502,0.997923,-65.695496,12.933,12.619,AAA,4900000683313411712,7.618164,-7.155225363672131,0.012795682,2.6151818149044477,0.014741806,1.2284931126430936,0.014049208,13.847409,14.128094,13.409544,1651.2437,1994.4031,1.0021201,1.1900352,9.937208,4.361201,310.61109286234984,-50.71941234580597,5959.57,5956.904,5963.808,4.3601,4.3534,4.3683,-0.4475,-0.4558,-0.4384,787.2617,776.929,796.3514,0.0016,0.0003,0.004,,,,,,,,,,G


Approximately 95 mil rows (23 GB)

taking ~ 48 hours

### Preview and Cleaning

In [6]:
# Every exported Gaia strip file, in sorted (name) order
files = sorted(glob(join(gaia_data_dir, "*.hdf5")))
len(files)

360

In [16]:
# Open all strip files as one dataframe, then keep only the rows whose `pm` is not NaN
gaia_all = vaex.open_many(files)
has_pm = gaia_all.pm.notnan()
gaia = gaia_all[has_pm].extract()
gaia

#,ra,dec,Jmag,Kmag,source_id,pm,pmra,e_pmra,pmdec,e_pmdec,parallax,e_parallax,gmag,bpmag,rpmag,rv_gaia,e_rv_gaia,GLON,GLAT,teff_gspphot,teff_gspphot_lower,teff_gspphot_upper,logg_gspphot,logg_gspphot_lower,logg_gspphot_upper,mh_gspphot,mh_gspphot_lower,mh_gspphot_upper,distance_gspphot,distance_gspphot_lower,distance_gspphot_upper,ag_gspphot,ag_gspphot_lower,ag_gspphot_upper,mh_gspspec,mh_gspspec_lower,mh_gspspec_upper,alphafe_gspspec,alphafe_gspspec_lower,alphafe_gspspec_upper,fem_gspspec,fem_gspspec_lower,fem_gspspec_upper,spectraltype_esphs
0,0.000464,-84.272873,12.869,12.374,4617686126396602368,3.6212091,-2.123925825256161,0.018093891,2.9329329266790864,0.015576468,1.2414398898080128,0.014017576,14.117249,14.546528,13.5242605,20.115417,3.325204,304.44429715164756,-32.70355447144384,5658.605,5653.9224,5668.203,4.2226,4.2043,4.2496,-0.0402,-0.0457,-0.0316,770.2898,746.2748,786.9589,0.2956,0.2926,0.2995,,,,,,,,,,G
1,0.001998,-84.804314,12.523,11.93,4617598371624860928,17.00417,16.266358206309686,0.015283653,4.954528511167747,0.016042866,0.686194890423576,0.013194535,14.117252,14.645076,13.433549,-23.821648,3.2447915,304.29624940743884,-32.18708166475209,,,,,,,,,,,,,,,,,,,,,,,,,K
2,0.002857,-84.718956,11.746,10.877,4617602769671367936,5.9807105,5.927853781783437,0.014620288,-0.7933771944304462,0.01440706,0.17823265910718195,0.011743693,13.724135,14.456079,12.896074,131.92238,1.2741442,304.31980747322217,-32.26998931504778,,,,,,,,,,,,,,,,,,,,,,,,,K
3,0.004052,-82.750267,13.333,12.872,4630074530064792448,11.209091,10.524865569146224,0.019602386,3.856285092898922,0.02012978,1.027615140072866,0.01769279,14.624002,15.035072,14.034426,-43.84897,5.5962825,304.8768564308389,-34.1827911363649,6107.482,6101.212,6117.21,4.2386,4.2239,4.2526,-0.8342,-0.8443,-0.8225,911.0151,895.5346,927.2803,0.619,0.6154,0.624,,,,,,,,,,G
4,0.007844,-82.031616,12.605,12.209,4630918748836674432,18.801056,18.300275763425613,0.016882967,-4.310403819421376,0.018285993,1.1269417803187838,0.013919161,13.798758,14.187072,13.238532,-27.08683,4.7260613,305.0858976745071,-34.88045519689782,5503.932,5470.177,5528.491,4.1392,4.1308,4.148,-0.7153,-0.7508,-0.6893,856.8555,845.9437,867.5156,0.1912,0.1723,0.2052,,,,,,,,,,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27001405,359.991162,81.723877,11.903,11.228,2286403269975358080,5.2602525,5.127952329131903,0.016783616,-1.172331463561274,0.016823703,0.45434838996692956,0.014370571,13.8628235,14.56874,13.046526,-14.39366,2.213932,120.98842984113523,19.046996592015013,,,,,,,,,,,,,,,,,,,,,,,,,K
27001406,359.992311,86.652588,13.491,13.03,2301267842548323584,9.127414,8.573259109347845,0.023041524,3.1319166932358895,0.024572177,1.3792355955819116,0.021768766,14.949071,15.4501915,14.290141,,,122.11727494824203,23.862584460651433,5192.0254,5167.729,5216.4297,4.4538,4.4444,4.4631,-0.2157,-0.2348,-0.1937,711.6166,700.939,723.2191,0.2872,0.2746,0.2996,,,,,,,,,,K
27001407,359.992988,81.013268,9.471,8.67,2283307320109693312,4.6164536,-4.616426557011018,0.019822646,-0.015870456188812835,0.020180052,1.0655880137082043,0.016312053,11.427629,12.153831,10.600687,-54.240005,0.26263505,120.83167313285159,18.352152471500148,,,,,,,,,,,,,,,,-0.05,-0.07,-0.01,-0.07,-0.08,-0.06,-0.09,-0.19,-0.02,K
27001408,359.99454,86.22448,13.406,13.128,2301053231623032064,5.500912,-5.448609975674909,0.018498827,-0.7567577462820062,0.016916338,0.7946582215573009,0.014681197,14.476669,14.788514,13.9952135,-44.64216,8.348065,122.01626514394927,23.444591962057483,6267.614,6218.716,6299.795,4.3056,4.2676,4.351,-0.5391,-0.5769,-0.5064,1108.1714,1040.2518,1159.0392,0.2963,0.2748,0.3101,,,,,,,,,,F


In [18]:
# Rewrite every strip file, keeping only the rows whose `pm` is not NaN.
for file in files:
    df = vaex.open(file)
    df_ = df[df.pm.notnan()].extract()
    # basename() replaces the hand-rolled file.split("/")[-1], which only handles "/" separators
    name = basename(file)
    # NOTE(review): the target is the same file `df` was opened from; confirm that vaex
    # tolerates exporting over its own source, or export to a separate directory instead.
    df_.export(join(gaia_data_dir, name), progress=True)
    print(name)

export(hdf5) [########################################] 100.00% elapsed time  :     0.28s =  0.0m =  0.0h
 gaia-000-001.hdf5
export(hdf5) [########################################] 100.00% elapsed time  :     0.32s =  0.0m =  0.0h
 gaia-001-002.hdf5
export(hdf5) [########################################] 100.00% elapsed time  :     0.36s =  0.0m =  0.0h 
 gaia-002-003.hdf5
export(hdf5) [########################################] 100.00% elapsed time  :     0.32s =  0.0m =  0.0h
 gaia-003-004.hdf5
export(hdf5) [########################################] 100.00% elapsed time  :     0.32s =  0.0m =  0.0h
 gaia-004-005.hdf5
export(hdf5) [########################################] 100.00% elapsed time  :     0.31s =  0.0m =  0.0h
 gaia-005-006.hdf5
export(hdf5) [########################################] 100.00% elapsed time  :     0.33s =  0.0m =  0.0h 
 gaia-006-007.hdf5
export(hdf5) [########################################] 100.00% elapsed time  :     0.32s =  0.0m =  0.0h
 gaia-007-008.hdf

In [21]:
# Re-list the Gaia strip files, in sorted (name) order
files = sorted(glob(join(gaia_data_dir, "*.hdf5")))
len(files)

360

In [22]:
# Open all Gaia strip files as a single dataframe and display it
gaia = vaex.open_many(files)
gaia

#,ra,dec,Jmag,Kmag,source_id,pm,pmra,e_pmra,pmdec,e_pmdec,parallax,e_parallax,gmag,bpmag,rpmag,rv_gaia,e_rv_gaia,GLON,GLAT,teff_gspphot,teff_gspphot_lower,teff_gspphot_upper,logg_gspphot,logg_gspphot_lower,logg_gspphot_upper,mh_gspphot,mh_gspphot_lower,mh_gspphot_upper,distance_gspphot,distance_gspphot_lower,distance_gspphot_upper,ag_gspphot,ag_gspphot_lower,ag_gspphot_upper,mh_gspspec,mh_gspspec_lower,mh_gspspec_upper,alphafe_gspspec,alphafe_gspspec_lower,alphafe_gspspec_upper,fem_gspspec,fem_gspspec_lower,fem_gspspec_upper,spectraltype_esphs
0,0.000464,-84.272873,12.869,12.374,4617686126396602368,3.6212091,-2.123925825256161,0.018093891,2.9329329266790864,0.015576468,1.2414398898080128,0.014017576,14.117249,14.546528,13.5242605,20.115417,3.325204,304.44429715164756,-32.70355447144384,5658.605,5653.9224,5668.203,4.2226,4.2043,4.2496,-0.0402,-0.0457,-0.0316,770.2898,746.2748,786.9589,0.2956,0.2926,0.2995,,,,,,,,,,G
1,0.001998,-84.804314,12.523,11.93,4617598371624860928,17.00417,16.266358206309686,0.015283653,4.954528511167747,0.016042866,0.686194890423576,0.013194535,14.117252,14.645076,13.433549,-23.821648,3.2447915,304.29624940743884,-32.18708166475209,,,,,,,,,,,,,,,,,,,,,,,,,K
2,0.002857,-84.718956,11.746,10.877,4617602769671367936,5.9807105,5.927853781783437,0.014620288,-0.7933771944304462,0.01440706,0.17823265910718195,0.011743693,13.724135,14.456079,12.896074,131.92238,1.2741442,304.31980747322217,-32.26998931504778,,,,,,,,,,,,,,,,,,,,,,,,,K
3,0.004052,-82.750267,13.333,12.872,4630074530064792448,11.209091,10.524865569146224,0.019602386,3.856285092898922,0.02012978,1.027615140072866,0.01769279,14.624002,15.035072,14.034426,-43.84897,5.5962825,304.8768564308389,-34.1827911363649,6107.482,6101.212,6117.21,4.2386,4.2239,4.2526,-0.8342,-0.8443,-0.8225,911.0151,895.5346,927.2803,0.619,0.6154,0.624,,,,,,,,,,G
4,0.007844,-82.031616,12.605,12.209,4630918748836674432,18.801056,18.300275763425613,0.016882967,-4.310403819421376,0.018285993,1.1269417803187838,0.013919161,13.798758,14.187072,13.238532,-27.08683,4.7260613,305.0858976745071,-34.88045519689782,5503.932,5470.177,5528.491,4.1392,4.1308,4.148,-0.7153,-0.7508,-0.6893,856.8555,845.9437,867.5156,0.1912,0.1723,0.2052,,,,,,,,,,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27001405,359.991162,81.723877,11.903,11.228,2286403269975358080,5.2602525,5.127952329131903,0.016783616,-1.172331463561274,0.016823703,0.45434838996692956,0.014370571,13.8628235,14.56874,13.046526,-14.39366,2.213932,120.98842984113523,19.046996592015013,,,,,,,,,,,,,,,,,,,,,,,,,K
27001406,359.992311,86.652588,13.491,13.03,2301267842548323584,9.127414,8.573259109347845,0.023041524,3.1319166932358895,0.024572177,1.3792355955819116,0.021768766,14.949071,15.4501915,14.290141,,,122.11727494824203,23.862584460651433,5192.0254,5167.729,5216.4297,4.4538,4.4444,4.4631,-0.2157,-0.2348,-0.1937,711.6166,700.939,723.2191,0.2872,0.2746,0.2996,,,,,,,,,,K
27001407,359.992988,81.013268,9.471,8.67,2283307320109693312,4.6164536,-4.616426557011018,0.019822646,-0.015870456188812835,0.020180052,1.0655880137082043,0.016312053,11.427629,12.153831,10.600687,-54.240005,0.26263505,120.83167313285159,18.352152471500148,,,,,,,,,,,,,,,,-0.05,-0.07,-0.01,-0.07,-0.08,-0.06,-0.09,-0.19,-0.02,K
27001408,359.99454,86.22448,13.406,13.128,2301053231623032064,5.500912,-5.448609975674909,0.018498827,-0.7567577462820062,0.016916338,0.7946582215573009,0.014681197,14.476669,14.788514,13.9952135,-44.64216,8.348065,122.01626514394927,23.444591962057483,6267.614,6218.716,6299.795,4.3056,4.2676,4.351,-0.5391,-0.5769,-0.5064,1108.1714,1040.2518,1159.0392,0.2963,0.2748,0.3101,,,,,,,,,,F


# 2MASS

In [23]:
# Every exported 2MASS strip file, in sorted (name) order
files = sorted(glob(join(tmass_data_dir, "*.hdf5")))
len(files)

360

In [24]:
# Open all 2MASS strip files as a single dataframe and display it
tmass = vaex.open_many(files)
tmass

#,ra,dec,Jmag,Kmag,designation
0,0.000464,-84.272873,12.869,12.374,00000011-8416223
1,0.001247,-82.041222,12.825,12.156,00000029-8202283
2,0.001998,-84.804314,12.523,11.93,00000047-8448155
3,0.002857,-84.718956,11.746,10.877,00000068-8443082
4,0.004052,-82.750267,13.333,12.872,00000097-8245009
...,...,...,...,...,...
50234210,359.992232,81.593384,12.276,11.887,23595813+8135361
50234211,359.992311,86.652588,13.491,13.03,23595815+8639093
50234212,359.992988,81.013268,9.471,8.67,23595831+8100477
50234213,359.99454,86.22448,13.406,13.128,23595868+8613281
