In [None]:
import vaex
import pandas as pd
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
from time import time

In [None]:
# Path of the input HDF5 catalogue that holds the per-survey radial-velocity columns.
file = "1rv.hdf5"
# Open the catalogue with vaex; `data` is sliced by right ascension further down.
data = vaex.open(file)

In [None]:
# Right-ascension bin edges in whole degrees (0, 1, ..., 360). The catalogue is
# processed one bin at a time rather than all at once.
ras = np.arange(0, 361)

In [None]:
# Pick the single most trustworthy radial velocity among several measurements
# of the same star.

def selectRV(RVs, e_RVs, RV_cats, index, debug=False):
    """Choose one radial velocity out of several measurements.

    Selection rules, applied in order:

    1. If exactly one measurement is given it is returned as is, whatever
       its uncertainty.
    2. Measurements whose uncertainty is not strictly positive (zero,
       negative or NaN) are discarded.
    3. With one or two measurements left, the one with the smallest
       uncertainty is returned.
    4. With three or more left, a weighted mean (weights ``1 / e_rv``) is
       computed and only measurements whose interval ``rv ± e_rv`` strictly
       contains that mean stay candidates; the candidate with the smallest
       uncertainty is returned. If no interval contains the mean, the
       smallest uncertainty among all remaining measurements wins.

    Parameters
    ----------
    RVs : sequence of float
        Radial-velocity values.
    e_RVs : sequence of float
        Uncertainty of each value, same length as ``RVs``.
    RV_cats : sequence
        Label of each measurement (the caller passes the source column
        name), same length as ``RVs``.
    index : object
        Not used by the function; kept so existing callers keep working.
    debug : bool, optional
        When true, print the three input arrays before selecting.

    Returns
    -------
    list
        ``[rv, e_rv, rv_cat]`` of the selected measurement.

    Raises
    ------
    TypeError
        If the three inputs do not all have the same length.
    ValueError
        If the input is empty, or several measurements were given and none
        of them has a strictly positive uncertainty.
    """
    rvs = np.array(RVs)
    e_rvs = np.array(e_RVs)
    rv_cats = np.array(RV_cats)
    # `a != b != c` only compares neighbours (a != b and b != c) and therefore
    # misses e.g. lengths (2, 2, 3); test that all three are equal instead.
    if not (len(rvs) == len(e_rvs) == len(rv_cats)):
        raise TypeError('rvs, e_rvs, or rv_cat has different dimension')
    if debug:
        print(rvs, e_rvs, rv_cats)
    # A lone measurement is accepted without looking at its uncertainty.
    if len(rvs) == 1:
        return [rvs[0], e_rvs[0], rv_cats[0]]

    # Keep only measurements with a usable uncertainty; NaN compares False.
    valid = e_rvs > 0
    rvs, e_rvs, rv_cats = rvs[valid], e_rvs[valid], rv_cats[valid]
    if len(rvs) == 0:
        raise ValueError('rvs is empty')

    if len(rvs) > 2:
        # NOTE(review): the weights are 1/e_rv, not the inverse variance
        # 1/e_rv**2 - confirm that this is intended.
        avg = np.average(rvs, weights=1/e_rvs)
        # Measurements that agree with the weighted mean within their own error bar.
        consistent = (rvs + e_rvs > avg) & (rvs - e_rvs < avg)
        if consistent.any():
            rvs, e_rvs, rv_cats = rvs[consistent], e_rvs[consistent], rv_cats[consistent]

    # Among the remaining candidates the smallest uncertainty wins
    # (first occurrence on ties).
    i = np.argmin(e_rvs)
    return [rvs[i], e_rvs[i], rv_cats[i]]

In [None]:
# Columns that are not needed for this project; they are dropped from every
# RA slice before the slices are combined.
removed_columns = [
    # Gaia GSP-Phot
    'teff_gspphot', 'teff_gspphot_lower', 'teff_gspphot_upper',
    'logg_gspphot', 'logg_gspphot_lower', 'logg_gspphot_upper',
    'mh_gspphot', 'mh_gspphot_lower', 'mh_gspphot_upper',
    'ag_gspphot', 'ag_gspphot_lower', 'ag_gspphot_upper',
    # Gaia GSP-Spec
    'mh_gspspec', 'mh_gspspec_lower', 'mh_gspspec_upper',
    'alphafe_gspspec', 'alphafe_gspspec_lower', 'alphafe_gspspec_upper',
    'fem_gspspec', 'fem_gspspec_lower', 'fem_gspspec_upper',
    # RAVE
    'teff_rave', 'logg_rave', 'mh_rave', 'alphafe_rave',
    # GALAH
    'feh_galah', 'alphafe_galah',
    'teff_galah', 'e_teff_galah',
    'logg_galah', 'e_logg_galah',
    # LAMOST
    'teff_lamost', 'e_teff_lamost',
    'logg_lamost', 'e_logg_lamost',
    'feh_lamost', 'e_feh_lamost',
    'alpham_lamost', 'e_alpham_lamost',
    # APOGEE
    'teff_apogee', 'e_teff_apogee',
    'logg_apogee', 'e_logg_apogee',
    'mh_apogee', 'e_mh_apogee',
    'feh_apogee', 'e_feh_apogee',
    'alpham_apogee', 'e_alpham_apogee',
]


In [None]:
# Build the combined catalogue: for every star keep one radial velocity
# (chosen by selectRV) and drop the per-survey and unneeded columns.
# The catalogue is processed one RA slice at a time so that only a single
# slice is held as a pandas DataFrame at any moment.

rv_columns = ["rv_gaia", "rv_rave", "rv_lamost", "rv_apogee", "rv_galah"]
e_rv_columns = ["e_rv_gaia", "e_rv_rave", "e_rv_lamost", "e_rv_apogee", "e_rv_galah"]
frames = []  # one vaex DataFrame per non-empty RA slice
for ra0, ra1 in zip(ras[:-1], ras[1:]):
    # Slices are (ra0, ra1]; the very first slice also includes its lower
    # edge so that rows with ra exactly equal to ras[0] are not dropped.
    lower_edge = (data.ra >= ra0) if ra0 == ras[0] else (data.ra > ra0)
    data_filtered = data.filter(lower_edge).filter(data.ra <= ra1)
    df = data_filtered.to_pandas_df()
    t0 = time()
    RVs = []
    e_RVs = []
    RV_cats = []
    # Iterate only over the columns that are actually read; building a row
    # Series for the full, wide frame is needlessly slow.
    for index, row in df[rv_columns + e_rv_columns].iterrows():
        rvs = []
        e_rvs = []
        rv_cats = []
        for rv_col, e_rv_col in zip(rv_columns, e_rv_columns):
            # Skip surveys that have no measurement for this star.
            if pd.notna(row[rv_col]):
                rvs.append(row[rv_col])
                e_rvs.append(row[e_rv_col])
                rv_cats.append(rv_col)
        if len(rvs) == 0:
            raise ValueError('no radial velocity found')
        rv, e_rv, rv_cat = selectRV(rvs, e_rvs, rv_cats, index)
        RVs.append(rv)
        e_RVs.append(e_rv)
        RV_cats.append(rv_cat)
    df['rv'] = RVs
    df['e_rv'] = e_RVs
    df['rv_cat'] = RV_cats  # name of the column the chosen value came from
    df = df.drop(columns=rv_columns + e_rv_columns + removed_columns)
    # An empty slice contributes no rows, so it is not converted or combined.
    if len(df) > 0:
        frames.append(vaex.from_pandas(df))
    t1 = time()
    print(f"saved {ra0:03d}-{ra1:03d} | {round(t1-t0, 2)}s")

if not frames:
    raise ValueError('no rows found in any RA slice')
# Concatenate all slices in one go instead of nesting one concat per slice.
df_com = vaex.concat(frames)

In [None]:
# Optional: bar chart of how many stars took their radial velocity from each catalogue.
df_com['rv_cat'].value_counts().plot(kind='bar')

In [None]:
# Export the combined catalogue to an HDF5 file, reporting progress while writing.
# NOTE(review): presumably the "rv/" directory has to exist beforehand - confirm.
df_com.export("rv/combined.hdf5", progress=True)