In [1]:
from glob import glob
from os import pardir, mkdir
from os.path import join, abspath, basename

import numpy as np
import vaex
from matplotlib import pyplot as plt

In [2]:
# "Data" directory one level above the current working directory, as an absolute path.
data_rel_path = join(pardir, "Data")
root_data_dir = abspath(data_rel_path)

In [3]:
# Output directory for all combined catalogues.
name = "Combine"
data_dir = join(root_data_dir, name)
try:
    mkdir(data_dir)
except FileExistsError:
    # Nothing to do: the directory is left as it is.
    print("Directory already exist. Good to go!")
else:
    print(f"Creating {name} dir in Data dir")
data_dir

Directory already exist. Good to go!


'/home2/s20321005/Thesis-Project/Data/Combine'

In [4]:
# load Gaia-2MASS
name = "Gaia-2MASS"
gaia_dir = join(root_data_dir, name)
# Collect every HDF5 chunk in ascending path order.
gaia_files = sorted(glob(join(gaia_dir, "*.hdf5")))
gaia_files[:5]

['/home2/s20321005/Thesis-Project/Data/Gaia-2MASS/gaia-000-001.hdf5',
 '/home2/s20321005/Thesis-Project/Data/Gaia-2MASS/gaia-001-002.hdf5',
 '/home2/s20321005/Thesis-Project/Data/Gaia-2MASS/gaia-002-003.hdf5',
 '/home2/s20321005/Thesis-Project/Data/Gaia-2MASS/gaia-003-004.hdf5',
 '/home2/s20321005/Thesis-Project/Data/Gaia-2MASS/gaia-004-005.hdf5']

In [5]:
# Open all Gaia-2MASS chunks listed in `gaia_files` as a single vaex DataFrame.
# The trailing underscore marks the unfiltered table; `gaia` below is derived from it.
gaia_ = vaex.open_many(gaia_files)
gaia_

In [None]:
# Quality cuts for the Gaia-2MASS sample. Each expression is applied with
# DataFrame.filter, one after the other, in the order listed.
quality_cuts = [
    "parallax > 0",
    "e_parallax/parallax < 0.15",
    "bpmag - rpmag > -3",
    "bpmag - rpmag < 6",
    "gmag > 3",
    "gmag < 21",
    "fb_over_err > 10",
    "fr_over_err > 10",
    "ruwe < 1.4",
    "1.0+0.015*(bpmag - rpmag)**2 < excess_factor",
    "1.3+0.06*(bpmag - rpmag)**2 > excess_factor",
    "Jmag < 13.5",
    "Jmag > 0",
]
gaia = gaia_
for cut in quality_cuts:
    gaia = gaia.filter(cut)
gaia

# RAVE6

In [None]:
# Output directory for the Gaia + RAVE join.
name = "rave"
data_dir_rave = join(data_dir, name)
try:
    mkdir(data_dir_rave)
except FileExistsError:
    # Nothing to do: the directory is left as it is.
    print("Directory already exist. Good to go!")
else:
    print(f"Creating {name} dir in Data/Combine dir")
data_dir_rave

In [None]:
# Open the RAVE catalogue stored as a single HDF5 file.
name = "RAVE6"
rave_dir = join(root_data_dir, name)
rave_path = join(rave_dir, "rave6.hdf5")
rave = vaex.open(rave_path)
rave

In [None]:
# Join
# For every Gaia-2MASS chunk: apply the quality cuts, keep the rows whose
# `ph_qual` string starts and ends with 'A', left-join with RAVE on
# `source_id`, and export the result under the same file name in `data_dir_rave`.
cols = rave.column_names
for file in gaia_files:
    gaia = vaex.open(file)
    gaia_clean = (
        gaia.filter("parallax > 0")
            .filter("e_parallax/parallax < 0.15")
            .filter("bpmag - rpmag > -3")
            .filter("bpmag - rpmag < 6")
            .filter("gmag > 3")
            .filter("gmag < 21")
            .filter("fb_over_err > 10")
            .filter("fr_over_err > 10")
            .filter("ruwe < 1.4")
            .filter("1.0+0.015*(bpmag - rpmag)**2 < excess_factor")
            .filter("1.3+0.06*(bpmag - rpmag)**2 > excess_factor")
            .filter("Jmag < 13.5")
            .filter("Jmag > 0")
    )
    ph_qual_filter = gaia_clean['ph_qual'].str.contains('^A.+A$')
    gaia_clean = gaia_clean[ph_qual_filter]
    gaia_clean = gaia_clean.extract()
    # basename() keeps only the file name whatever the path separator is.
    # Splitting on "/" would keep the whole path on platforms that use another
    # separator, and join() below would then point back at the input file.
    name = basename(file)
    n_raw, n_clean = len(gaia), len(gaia_clean)
    # Guard against an empty chunk so the progress line cannot divide by zero.
    pct = np.round(n_clean/n_raw*100, 2) if n_raw else 0.0
    print(f"{name}, clean: {n_clean}, raw: {n_raw}, {pct}%")
    join_rave = gaia_clean.join(rave, on="source_id", how="left")
    for col in cols:
        if col == "source_id":
            continue  # the join key is left untouched
        # Materialise the joined column through pandas into a NumPy array and
        # store it back.
        # NOTE(review): presumably this normalises the missing values of
        # unmatched rows to NaN (later cells select matches with .notnan()) - confirm.
        join_rave[col] = join_rave[col].to_pandas_series().to_numpy()
    join_rave.export(join(data_dir_rave, name), progress=True) # save the result if necessary
# Display the last joined chunk.
join_rave

In [None]:
#quick look
# Sort the paths: glob() order depends on the filesystem, and the files are
# opened in the order given. Every other file list in this notebook is sorted too.
quick = vaex.open_many(sorted(glob(join(data_dir_rave, "*.hdf5"))))
quick

In [None]:
# Keep the rows with 1000/parallax below 200 and materialise the selection.
# NOTE(review): presumably a 200 pc distance cut (parallax in mas) - confirm units.
gaia_filt = quick.filter("1000/parallax < 200").extract()
gaia_filt

In [None]:
# Rescale the galactic coordinates for the Aitoff plot below: both columns are
# multiplied by pi/180, and GLON is additionally shifted by -pi.
# NOTE(review): presumably the inputs are in degrees with GLON in [0, 360) - confirm.
# NOTE(review): each column is overwritten with a function of its previous
# value, so running this cell twice converts twice. Re-create `gaia_filt`
# (previous cell) before re-running.
gaia_filt["GLON"] = gaia_filt["GLON"]*np.pi/180 - np.pi
gaia_filt["GLAT"] = gaia_filt["GLAT"]*np.pi/180
gaia_filt

In [None]:
# Sky map of the selected stars on an Aitoff projection.
fig = plt.figure(figsize=(14, 7))
# add_subplot makes the new axes current, so the scatter below draws on it.
ax = fig.add_subplot(111, projection="aitoff")
gaia_filt.viz.scatter("GLON", "GLAT", length_check=False, s=0.1, alpha=1)
ax.grid(True)
plt.show()

# GALAH

Prerequisite: run the RAVE section above first. This section reads the files it writes to `Combine/rave`.

In [None]:
# Output directory for the Gaia + RAVE + GALAH join.
name = "rave-galah"
data_dir_galah = join(data_dir, name)
try:
    mkdir(data_dir_galah)
except FileExistsError:
    # Nothing to do: the directory is left as it is.
    print("Directory already exist. Good to go!")
else:
    print(f"Creating {name} dir in Data/Combine dir")
data_dir_galah

In [None]:
# Open every GALAH HDF5 chunk, in ascending path order, as one DataFrame.
name = "GALAH"
galah_dir = join(root_data_dir, name)
galah_files = sorted(glob(join(galah_dir, "*.hdf5")))
galah = vaex.open_many(galah_files)
galah

In [None]:
# Paths of the Gaia + RAVE chunks found in `data_dir_rave`, in ascending order.
files_join_rave = sorted(glob(join(data_dir_rave, "*.hdf5")))
files_join_rave[:5]

In [None]:
# Join with galah
# Left-join every Gaia + RAVE chunk with the GALAH table on `source_id` and
# export the result under the same file name in `data_dir_galah`.
cols = galah.column_names
for file in files_join_rave:
    gaia = vaex.open(file)
    join_galah = gaia.join(galah, on="source_id", how="left")
    for col in cols:
        if col == "source_id":
            continue  # the join key is left untouched
        # Materialise the joined column through pandas into a NumPy array and
        # store it back.
        # NOTE(review): presumably this normalises the missing values of
        # unmatched rows to NaN (later cells select matches with .notnan()) - confirm.
        join_galah[col] = join_galah[col].to_pandas_series().to_numpy()
    # basename() keeps only the file name whatever the path separator is.
    # Splitting on '/' would keep the whole path on platforms that use another
    # separator, and join() below would then point back at the input file.
    name = basename(file)
    join_galah.export(join(data_dir_galah, name), progress=True) # save if necessary
    print(f"saved {name}")

# sneak peek
join_galah

In [None]:
#quick look
# Sort the paths: glob() order depends on the filesystem, and the files are
# opened in the order given. Every other file list in this notebook is sorted too.
quick = vaex.open_many(sorted(glob(join(data_dir_galah, "*.hdf5"))))
quick

# LAMOST

Prerequisite: run the RAVE and GALAH sections above first. This section reads the files written to `Combine/rave-galah`.

In [6]:
# Output directory for the Gaia + RAVE + GALAH + LAMOST join.
name = "rave-galah-lamost"
data_dir_lamost = join(data_dir, name)
try:
    mkdir(data_dir_lamost)
except FileExistsError:
    # Nothing to do: the directory is left as it is.
    print("Directory already exist. Good to go!")
else:
    print(f"Creating {name} dir in Data/Combine dir")
data_dir_lamost

Directory already exist. Good to go!


'/home2/s20321005/Thesis-Project/Data/Combine/rave-galah-lamost'

In [None]:
# Open every LAMOST HDF5 chunk, in ascending path order, as one DataFrame.
name = "LAMOST"
lamost_dir = join(root_data_dir, name)
lamost_files = sorted(glob(join(lamost_dir, "*.hdf5")))
lamost = vaex.open_many(lamost_files)
lamost

In [None]:
# Paths of the Gaia + RAVE + GALAH chunks found in `data_dir_galah`, in ascending order.
files_join_galah = sorted(glob(join(data_dir_galah, "*.hdf5")))
files_join_galah[:5]

In [None]:
# Join with lamost
# Left-join every Gaia + RAVE + GALAH chunk with the LAMOST table on
# `source_id` and export the result under the same file name in `data_dir_lamost`.
cols = lamost.column_names
for file in files_join_galah:
    gaia = vaex.open(file)
    join_lamost = gaia.join(lamost, on="source_id", how="left")
    for col in cols:
        if col == "source_id":
            continue  # the join key is left untouched
        # Materialise the joined column through pandas into a NumPy array and
        # store it back.
        # NOTE(review): presumably this normalises the missing values of
        # unmatched rows to NaN (later cells select matches with .notnan()) - confirm.
        join_lamost[col] = join_lamost[col].to_pandas_series().to_numpy()
    # basename() keeps only the file name whatever the path separator is.
    # Splitting on '/' would keep the whole path on platforms that use another
    # separator, and join() below would then point back at the input file.
    name = basename(file)
    join_lamost.export(join(data_dir_lamost, name), progress=True) # save if necessary
    print(f"saved {name}")
# sneak peek
join_lamost

In [None]:
#quick look
# Sort the paths: glob() order depends on the filesystem, and the files are
# opened in the order given. Every other file list in this notebook is sorted too.
quick = vaex.open_many(sorted(glob(join(data_dir_lamost, "*.hdf5"))))
quick

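# APOGEE
Prerequisite: run the RAVE, GALAH and LAMOST sections above first. This section reads the files written to `Combine/rave-galah-lamost`.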

In [7]:
# Output directory for the Gaia + RAVE + GALAH + LAMOST + APOGEE join.
name = "rave-galah-lamost-apogee"
data_dir_apogee = join(data_dir, name)
try:
    mkdir(data_dir_apogee)
except FileExistsError:
    # Nothing to do: the directory is left as it is.
    print("Directory already exist. Good to go!")
else:
    print(f"Creating {name} dir in Data/Combine dir")
data_dir_apogee

Directory already exist. Good to go!


'/home2/s20321005/Thesis-Project/Data/Combine/rave-galah-lamost-apogee'

In [None]:
# Open every APOGEE HDF5 chunk, in ascending path order, as one DataFrame.
name = "APOGEE-2"
apogee_dir = join(root_data_dir, name)
apogee_files = sorted(glob(join(apogee_dir, "*.hdf5")))
apogee = vaex.open_many(apogee_files)
apogee

In [None]:
# Paths of the Gaia + RAVE + GALAH + LAMOST chunks found in `data_dir_lamost`, in ascending order.
files_join_lamost = sorted(glob(join(data_dir_lamost, "*.hdf5")))
files_join_lamost[:5]

In [None]:
# Join with apogee
# Left-join every Gaia + RAVE + GALAH + LAMOST chunk with the APOGEE table on
# `source_id` and export the result under the same file name in `data_dir_apogee`.
cols = apogee.column_names
for file in files_join_lamost:
    gaia = vaex.open(file)
    join_apogee = gaia.join(apogee, on="source_id", how="left")
    for col in cols:
        if col == "source_id":
            continue  # the join key is left untouched
        # Materialise the joined column through pandas into a NumPy array and
        # store it back.
        # NOTE(review): presumably this normalises the missing values of
        # unmatched rows to NaN (later cells select matches with .notnan()) - confirm.
        join_apogee[col] = join_apogee[col].to_pandas_series().to_numpy()
    # basename() keeps only the file name whatever the path separator is.
    # Splitting on '/' would keep the whole path on platforms that use another
    # separator, and join() below would then point back at the input file.
    name = basename(file)
    join_apogee.export(join(data_dir_apogee, name), progress=True) # save if necessary
    print(f"saved {name}")

In [8]:
# sneak peek
# Open the fully joined chunks as one DataFrame. The paths are sorted because
# glob() order depends on the filesystem and the files are opened in the
# order given. Every other file list in this notebook is sorted too.
df_all = vaex.open_many(sorted(glob(join(data_dir_apogee, "*.hdf5"))))
df_all

#,ra,dec,Jmag,Kmag,ph_qual,source_id,pm,pmra,e_pmra,pmdec,e_pmdec,parallax,e_parallax,gmag,bpmag,rpmag,fb_over_err,fr_over_err,ruwe,excess_factor,rv_gaia,e_rv_gaia,GLON,GLAT,teff_gspphot,teff_gspphot_lower,teff_gspphot_upper,logg_gspphot,logg_gspphot_lower,logg_gspphot_upper,mh_gspphot,mh_gspphot_lower,mh_gspphot_upper,distance_gspphot,distance_gspphot_lower,distance_gspphot_upper,ag_gspphot,ag_gspphot_lower,ag_gspphot_upper,mh_gspspec,mh_gspspec_lower,mh_gspspec_upper,alphafe_gspspec,alphafe_gspspec_lower,alphafe_gspspec_upper,fem_gspspec,fem_gspspec_lower,fem_gspspec_upper,spectraltype_esphs,rv_rave,e_rv_rave,teff_rave,logg_rave,mh_rave,alphafe_rave,rv_galah,e_rv_galah,feh_galah,alphafe_galah,teff_galah,e_teff_galah,logg_galah,e_logg_galah,subclass_lamost,teff_lamost,e_teff_lamost,logg_lamost,e_logg_lamost,feh_lamost,e_feh_lamost,rv_lamost,e_rv_lamost,alpham_lamost,e_alpham_lamost,rv_apogee,e_rv_apogee,teff_apogee,e_teff_apogee,logg_apogee,e_logg_apogee,mh_apogee,e_mh_apogee,alpham_apogee,e_alpham_apogee,feh_apogee,e_feh_apogee
0,219.15519,-89.531815,11.418,10.704,AAA,5764635393079767296,11.202992,-8.352078761712255,0.012033119,-7.466580849038771,0.012214055,0.43047135125671315,0.011004682,13.174702,13.789571,12.425439,1228.2557,2930.1003,0.99114627,1.250944,67.09427,0.9067575,303.1640199013711,-26.708320393143175,4718.7,4713.292,4724.818,2.5833,2.5655,2.6014,-0.1946,-0.1952,-0.1941,2070.6995,2031.6134,2112.0928,0.3619,0.3574,0.3668,,,,,,,,,,K,,,,,,,,,,,,,,,--,,,,,,,,,,,,,,,,,,,,,,
1,219.158806,-89.413895,11.663,11.294,AAA,5764640100363957504,17.092428,-15.001155516411039,0.012871818,-8.19246106070145,0.015000838,1.1756548246142724,0.013411725,12.782229,13.126454,12.259919,2108.8503,2770.0098,1.1272085,1.2091599,-44.15717,4.249649,303.2222502636439,-26.60253359940945,5956.387,5928.9277,5977.1416,3.7695,3.7597,3.7775,-0.5645,-0.585,-0.5521,867.1803,861.9395,874.5553,0.2911,0.2763,0.3022,,,,,,,,,,F,,,,,,,,,,,,,,,--,,,,,,,,,,,,,,,,,,,,,,
2,219.247148,-89.488724,13.026,12.512,AAA,5764638554175708672,11.671728,-10.811040433849826,0.017244274,-4.3989353543881675,0.01951931,1.9674662903618854,0.01660323,14.446186,14.915824,13.815229,914.79407,1357.1978,0.9770324,1.2232119,13.169717,3.8980281,303.1861500960215,-26.669990195169934,5226.577,5217.9365,5235.52,4.5456,4.5418,4.5518,-0.1967,-0.2063,-0.1864,490.8246,487.5747,493.967,0.2623,0.2565,0.2679,,,,,,,,,,K,,,,,,,,,,,,,,,--,,,,,,,,,,,,,,,,,,,,,,
3,219.781174,-89.569244,9.953,9.683,AAA,5764635079544432128,7.510926,1.5220681400491376,0.021005264,-7.355087608746858,0.020949371,2.877481398679552,0.018422326,10.847249,11.118501,10.412085,2177.0447,4133.433,1.3881594,1.1933558,-5.286206,0.73470575,303.150290423574,-26.744015581943707,6141.639,6130.1123,6152.07,4.043,4.0398,4.0469,-0.4365,-0.4457,-0.4269,344.0585,341.9788,346.0511,0.0799,0.0745,0.0847,-0.26,-0.32,-0.22,0.33,0.26,0.4,,,,F,,,,,,,,,,,,,,,--,,,,,,,,,,,,,,,,,,,,,,
4,219.801943,-89.006889,12.509,11.912,AAA,5764731359829174400,11.346082,-10.54234747031648,0.016789988,-4.1943390093826975,0.015902767,0.2086716186891874,0.014617706,14.17896,14.733065,13.476229,735.02185,2048.8267,1.051371,1.2394285,42.34929,3.1309774,303.43352796578637,-26.242058683093564,5244.1147,5202.4478,5301.4165,3.1862,3.0736,3.2597,-0.1073,-0.1138,-0.1047,2680.891,2446.6587,3078.62,0.5634,0.538,0.5992,,,,,,,,,,K,,,,,,,,,,,,,,,--,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29127326,33.273628,89.207031,11.324,10.496,AAA,576401108095248256,1.9213098,0.526383419144043,0.013179262,-1.8477965023751626,0.012728541,0.4409312794625807,0.011253249,13.301876,14.039884,12.468293,817.5752,1747.642,1.0414628,1.2745866,-15.732807,0.7937079,123.2407090213187,26.38475107004395,,,,,,,,,,,,,,,,,,,,,,,,,K,,,,,,,,,,,,,,,--,,,,,,,,,,,,,,,,,,,,,,
29127327,33.576328,89.702515,13.478,12.953,AAA,576447459382243200,17.162077,10.764419310289625,0.021213133,13.366531632012926,0.021056626,0.9941574578584899,0.019672494,14.856944,15.333351,14.206361,540.5943,905.125,1.115906,1.2340161,-6.8745933,13.125175,123.04990412535467,26.85002805556056,5420.282,5395.14,5501.2827,4.2236,4.2103,4.2407,-1.28,-1.323,-1.1485,946.7403,929.4686,962.1341,0.4847,0.4702,0.5309,,,,,,,,,,G,,,,,,,,,,,,,,,--,,,,,,,,,,,,,,,,,,,,,,
29127328,33.610021,89.572998,12.857,12.404,AAA,576445638316125184,4.728188,-2.6913600182617476,0.017589992,3.887459494000998,0.015934618,0.7781937360453499,0.014969293,14.292352,14.78063,13.633098,1025.585,1091.8604,1.0927142,1.235076,-36.36254,7.384203,123.1012765083853,26.728870058838822,5855.668,5835.1274,5878.3657,4.0048,3.9971,4.0145,-0.583,-0.6066,-0.555,1217.041,1196.2493,1234.8632,0.7542,0.7434,0.7665,,,,,,,,,,G,,,,,,,,,,,,,,,--,,,,,,,,,,,,,,,,,,,,,,
29127329,33.612083,89.48204,12.616,11.924,AAA,576431585183142528,1.8941993,-1.7630446880574788,0.020475639,0.6925779534480914,0.017867079,0.6988763424739932,0.015851708,14.399571,15.067607,13.604876,722.4221,1057.6432,0.99978435,1.267137,-14.877426,3.5944436,123.13723815775352,26.643773310583594,5178.78,5133.0166,5367.6597,3.5746,3.4492,3.8413,-0.4349,-0.4827,-0.291,1165.2085,893.3535,1341.377,0.9013,0.8756,0.9909,,,,,,,,,,K,,,,,,,,,,,,,,,--,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Rows whose `rv_rave` value is not NaN.
has_rv_rave = df_all.rv_rave.notnan()
df_rave = df_all[has_rv_rave]
df_rave

In [None]:
# Rows whose `rv_lamost` value is not NaN.
has_rv_lamost = df_all.rv_lamost.notnan()
df_lamost = df_all[has_rv_lamost]
df_lamost

In [None]:
# Rows whose `rv_apogee` value is not NaN.
has_rv_apogee = df_all.rv_apogee.notnan()
df_apogee = df_all[has_rv_apogee]
df_apogee

In [None]:
# Rows whose `rv_galah` value is not NaN.
has_rv_galah = df_all.rv_galah.notnan()
df_galah = df_all[has_rv_galah]
df_galah

In [None]:
# Rows whose `rv_gaia` value is not NaN.
has_rv_gaia = df_all.rv_gaia.notnan()
df_gaia = df_all[has_rv_gaia]
df_gaia