In [1]:
import numpy as np
import pandas as pd
import glob
import astropy.coordinates as coord
import astropy.units as un
from astroquery.simbad import Simbad
import time

# Collate flare star catalogues

Here we read and collate catalogues of flare type objects. The catalogues we use include flare stars, white dwarfs, RS CVns, unidentified transient flares and more. The catalogues we use here are:

- Lepine: bright M-dwarfs - https://ui.adsabs.harvard.edu/abs/2011AJ....142..138L/abstract
- JGagne: ultra-cool dwarfs - jgagneastro.wordpress.com/list-of-ultracool-dwarfs
- sdss_mwds: magnetic white dwarfs from SDSS: https://ui.adsabs.harvard.edu/abs/2013MNRAS.429.2934K/abstract
- Catalina: Transients in the Catalina Surveys Data Release 2 - https://ui.adsabs.harvard.edu/abs/2009ApJ...696..870D/abstract
- simbad_fstars: flare stars (F*) from the SIMBAD catalogue (2000A&AS..143....9W)
- simbad_RSCVns: RS CVn variables (RS*) from the SIMBAD catalogue (2000A&AS..143....9W)
- whitedwarfs: white dwarfs from the Montreal White Dwarf Database - http://www.montrealwhitedwarfdatabase.org/references.html

We read in each catalogue using pandas, make columns for the proper motion if there are none and rename the RA and DEC columns. Then we combine the catalogues. If the source does not have an official SIMBAD name already, we check SIMBAD by radius to see if the source is known. Please note that this means that the SIMBAD names are *not* confirmed, but are a good starting point for looking into the source. Also, not all sources are identified in SIMBAD, particularly Catalina transients.

In [2]:
# Locate the catalogue files on disk

# Directory that holds every downloaded catalogue
path = '/raid/driessen/Catalogues/'

# One entry per catalogue listed in the introduction.
# The Catalina transient candidates are spread over
# several numbered files, so those are collected with
# a glob pattern (the '?' matches a single character).
lepine = path + 'Lepine_BrightMDwarfs.csv'
jgagne = path + 'List_of_UltraCool_Dwarfs.csv'
sdss_mwds = path + 'magWDs.tsv'
simbad_fstars = path + 'simbad_flarestars_pm.txt'
catalina = path + 'CRTS_all_transients.tsv'
catalinas = glob.glob(path + 'css_transientcandidates_?.tsv')
whitedwarfs = path + 'MWDD-export.csv'
simbad_RSCVns = path + 'SIMBAD_RSCVns.txt'

## Read the files

Read in the files using pandas. Each one has to be read in slightly differently because the catalogues are formatted very differently.

In [3]:
# Read the Lepine bright M-dwarf catalogue.
#
# Lepine uses a fixed-width file format that Pandas
# doesn't like, so it is read line by line and each
# field is cut out by its character range.
columns = ['lepine_name', 'cns3_name',
           'ra(deg)', 'dec(deg)',
           'pmra(arcsec/yr)', 'pmdec(arcsec/yr)']
# (start, stop) character offsets of each field,
# in the same order as `columns`
lepine_field_slices = [(0, 17), (38, 55),
                       (56, 66), (68, 78),
                       (86, 92), (93, 99)]
# Lines with index 0-40 are skipped before the data rows start
# (presumably the file header -- confirm against the file)
lepine_last_header_line = 40

lepine_rows = []
with open(lepine, 'r') as fn:
    for line_number, line in enumerate(fn):
        if line_number <= lepine_last_header_line:
            continue
        lepine_rows.append([line[lo:hi].strip()
                            for lo, hi in lepine_field_slices])

# Keep a 2D string array even when no rows were read, so
# that building the table below cannot fail on a 1D array
lepine_source_info = np.array(lepine_rows, dtype=str).reshape(-1, len(columns))

# Put everything into a Pandas table
lepine_table = pd.DataFrame(lepine_source_info, columns=columns).astype(object)

# Remove rows that don't have known coordinates.
# Missing fields come out of the slicing above as empty
# strings, which dropna() does not treat as missing, so
# they are converted to NaN first.
coordinate_columns = ['ra(deg)', 'dec(deg)']
lepine_table[coordinate_columns] = lepine_table[coordinate_columns].replace('', np.nan)
lepine_table = lepine_table.dropna(axis='rows', subset=coordinate_columns)

# Make a column stating which catalogue these
# sources are from
lepine_table['Catalogue'] = 'Lepine'
# Add an empty column for SIMBAD names
lepine_table['simbad_names'] = ''

In [4]:
# Read the J. Gagne ultra-cool dwarf list.
jgagne_raw = pd.read_csv(jgagne)

# Every catalogue must expose the same RA and Dec
# column names so that they can be combined later
jgagne_coordinate_names = {'R.A. (deg)': 'ra(deg)',
                           'Decl. (deg)': 'dec(deg)'}

# Columns this catalogue lacks: the catalogue label,
# empty proper motions and an empty SIMBAD name
jgagne_extra_columns = {'Catalogue': 'J.Gagne',
                        'pmra(arcsec/yr)': '',
                        'pmdec(arcsec/yr)': '',
                        'simbad_names': ''}

jgagne_table = (
    jgagne_raw
    .rename(columns=jgagne_coordinate_names)
    # Sources without coordinates cannot be matched
    .dropna(subset=['ra(deg)', 'dec(deg)'])
    .assign(**jgagne_extra_columns)
)

In [5]:
# Read the SDSS magnetic white dwarf table exported from Vizier.
# Two header rows are read (rows 71 and 72), giving
# two-level column labels; the first data row is discarded.
vizier_mwds_raw = pd.read_csv(sdss_mwds, sep='\t', header=[71, 72])
vizier_mwds_table = vizier_mwds_raw.iloc[1:]
# Flatten the two-level labels into 'upper_lower' strings
vizier_mwds_table.columns = ['_'.join(levels)
                             for levels in vizier_mwds_table.columns]

# Every catalogue must expose the same RA and Dec
# column names so that they can be combined later
vizier_coordinate_names = {'#_RAJ2000_#deg': 'ra(deg)',
                           '_DEJ2000_deg': 'dec(deg)'}

# Columns this catalogue lacks: the catalogue label,
# empty proper motions and an empty SIMBAD name
vizier_extra_columns = {'Catalogue': 'Vizier_MWDs',
                        'pmra(arcsec/yr)': '',
                        'pmdec(arcsec/yr)': '',
                        'simbad_names': ''}

vizier_mwds_table = (
    vizier_mwds_table
    .rename(columns=vizier_coordinate_names)
    # Sources without coordinates cannot be matched
    .dropna(subset=['ra(deg)', 'dec(deg)'])
    .assign(**vizier_extra_columns)
)

In [6]:
# Read the SIMBAD flare star (F*) export, a '|'-delimited text table.
simbad_fstars_table = pd.read_csv(simbad_fstars, header=[4], delimiter='|')
# Discard the first and last rows (presumably the separator line
# under the header and the footer -- confirm against the file).
# Take an explicit copy so that the column assignments below act
# on an independent table rather than on a slice.
simbad_fstars_table = simbad_fstars_table[1:-1].copy()
simbad_fstars_table.columns = simbad_fstars_table.columns.str.strip()

# This catalogue stores RA and Dec in a single string,
# so split it at the sign of the declination.
# Rows whose coordinate string cannot be split are dropped;
# otherwise they would silently inherit the coordinates of
# the previous row.
simbad_coords = []
has_coords = []
simbad_coords_orig = simbad_fstars_table['coord1 (ICRS,J2000/2000)']

for coo in simbad_coords_orig:
    coordi = None
    if isinstance(coo, str):
        for sign in ('+', '-'):
            ra_part, found, dec_part = coo.partition(' ' + sign)
            if found:
                coordi = [ra_part, sign + dec_part]
                break
    has_coords.append(coordi is not None)
    if coordi is not None:
        simbad_coords.append(coordi)

simbad_fstars_table = simbad_fstars_table[np.array(has_coords, dtype=bool)].copy()
simbad_coords = coord.SkyCoord(simbad_coords, unit=(un.hourangle, un.deg))
# Add the corrected RA and DEC columns
simbad_fstars_table['ra(deg)'] = np.array(simbad_coords.ra.deg)
simbad_fstars_table['dec(deg)'] = np.array(simbad_coords.dec.deg)
# Remove rows that don't have known
# coordinates
simbad_fstars_table = simbad_fstars_table.dropna(axis='rows', subset=['ra(deg)', 'dec(deg)'])

# Get the proper motions and put them into the correct format.
# An entry containing '~' is treated as missing, as is any entry
# that is not a string or does not hold two values.
# NOTE(review): SIMBAD normally lists proper motions in mas/yr --
# confirm the units of this export match the arcsec/yr label.
pmra = []
pmdec = []
for pm in simbad_fstars_table['pm']:
    pms = pm.split() if isinstance(pm, str) and '~' not in pm else []
    if len(pms) >= 2:
        pmra.append(pms[0])
        pmdec.append(pms[1])
    else:
        pmra.append(np.nan)
        pmdec.append(np.nan)
simbad_fstars_table['pmra(arcsec/yr)'] = pmra
simbad_fstars_table['pmdec(arcsec/yr)'] = pmdec

# Change the column name that has
# the SIMBAD names in it, for
# consistency
simbad_fstars_table = simbad_fstars_table.rename(columns={'identifier':
                                                          'simbad_names'})
# Make a column stating which catalogue these
# sources are from
simbad_fstars_table['Catalogue'] = 'SimbadFlareStars'

In [7]:
# Reading in the Catalina sources is a bit more
# complicated because the sources are in
# a set of files, rather than one file


def _read_crts_table(filename):
    """Read one tab-separated Catalina (CRTS) transient file.

    The first line is skipped, and only lines with at least ten
    tab-separated fields are kept. From each kept line the first
    three fields and the last field are stored.

    Parameters
    ----------
    filename : str
        Path to the tab-separated file.

    Returns
    -------
    pandas.DataFrame
        Columns 'CRTS_ID', 'ra(deg)', 'dec(deg)' and
        'classification', all as strings. Rows with an empty
        coordinate field are removed. The table is empty (but has
        the four columns) if the file holds no usable lines.
    """
    headers = ['CRTS_ID', 'ra(deg)', 'dec(deg)', 'classification']
    rows = []
    with open(filename, 'r') as content:
        for line_number, line in enumerate(content):
            if line_number == 0:
                continue
            fields = line.split('\t')
            if len(fields) >= 10:
                rows.append([fields[0].strip(), fields[1].strip(),
                             fields[2].strip(), fields[-1].strip()])
    table = pd.DataFrame(rows, columns=headers)

    # Remove rows that don't have known coordinates. Empty fields
    # are empty strings, which dropna() does not treat as missing,
    # so convert them to NaN first.
    coordinate_columns = ['ra(deg)', 'dec(deg)']
    table[coordinate_columns] = table[coordinate_columns].replace('', np.nan)
    return table.dropna(axis='rows', subset=coordinate_columns)


# Read the main file and every candidate file, then stack them
# once (DataFrame.append was removed in pandas 2.0)
catalina_table = pd.concat([_read_crts_table(filename)
                            for filename in [catalina] + catalinas],
                           ignore_index=True)

# Discard sources whose classification mentions any of these
# types (matched as plain substrings)
subs = ['SN', 'Ast', 'AGN', 'Nothing', 'Blazar']
is_excluded = np.zeros(len(catalina_table), dtype=bool)
for sub in subs:
    is_excluded |= (catalina_table['classification']
                    .str.contains(sub, regex=False)
                    .to_numpy(dtype=bool))
catalina_table = catalina_table[~is_excluded].copy()

# Make a column stating which catalogue these
# sources are from
catalina_table['Catalogue'] = 'CRTS'
# Add an empty column for SIMBAD names
# and proper motions
catalina_table['pmra(arcsec/yr)'] = ''
catalina_table['pmdec(arcsec/yr)'] = ''
catalina_table['simbad_names'] = ''

In [8]:
# Read the Montreal White Dwarf Database export.
wd_table = pd.read_csv(whitedwarfs)

# The coordinate columns are parsed as hour angle (RA)
# and degrees (Dec), then stored in decimal degrees
wd_coords = coord.SkyCoord(wd_table['icrsra'],
                           wd_table['icrsdec'],
                           unit=(un.hourangle, un.deg))

# Columns this catalogue lacks: the catalogue label
# and empty proper motions
wd_extra_columns = {'Catalogue': 'MWDD',
                    'pmra(arcsec/yr)': '',
                    'pmdec(arcsec/yr)': ''}

wd_table = (
    wd_table
    .assign(**{'ra(deg)': wd_coords.ra.deg,
               'dec(deg)': wd_coords.dec.deg})
    # The 'wdid' column is used as the name column,
    # for consistency with the other catalogues
    .rename(columns={'wdid': 'simbad_names'})
    .assign(**wd_extra_columns)
)

In [9]:
# Read the SIMBAD RS CVn (RS*) export, a '|'-delimited text table.
simbad_rscvns_table = pd.read_csv(simbad_RSCVns, header=[4], delimiter='|')
# Discard the first and last rows (presumably the separator line
# under the header and the footer -- confirm against the file).
# Take an explicit copy so that the column assignments below act
# on an independent table rather than on a slice.
simbad_rscvns_table = simbad_rscvns_table[1:-1].copy()
simbad_rscvns_table.columns = simbad_rscvns_table.columns.str.strip()

# This catalogue stores RA and Dec in a single string,
# so split it at the sign of the declination.
# Rows whose coordinate string cannot be split are dropped;
# otherwise they would silently inherit the coordinates of
# the previous row.
simbad_coords = []
has_coords = []
simbad_coords_orig = simbad_rscvns_table['coord1 (ICRS,J2000/2000)']

for coo in simbad_coords_orig:
    coordi = None
    if isinstance(coo, str):
        for sign in ('+', '-'):
            ra_part, found, dec_part = coo.partition(' ' + sign)
            if found:
                coordi = [ra_part, sign + dec_part]
                break
    has_coords.append(coordi is not None)
    if coordi is not None:
        simbad_coords.append(coordi)

simbad_rscvns_table = simbad_rscvns_table[np.array(has_coords, dtype=bool)].copy()
simbad_coords = coord.SkyCoord(simbad_coords, unit=(un.hourangle,
                                                    un.deg))
# Add the corrected RA and DEC columns
simbad_rscvns_table['ra(deg)'] = np.array(simbad_coords.ra.deg)
simbad_rscvns_table['dec(deg)'] = np.array(simbad_coords.dec.deg)
# Remove rows that don't have known
# coordinates
simbad_rscvns_table = simbad_rscvns_table.dropna(axis='rows',
                                                 subset=['ra(deg)',
                                                         'dec(deg)'])

# Get the proper motions and put them into the correct format.
# An entry containing '~' is treated as missing, as is any entry
# that is not a string or does not hold two values.
# NOTE(review): SIMBAD normally lists proper motions in mas/yr --
# confirm the units of this export match the arcsec/yr label.
pmra = []
pmdec = []
for pm in simbad_rscvns_table['pm']:
    pms = pm.split() if isinstance(pm, str) and '~' not in pm else []
    if len(pms) >= 2:
        pmra.append(pms[0])
        pmdec.append(pms[1])
    else:
        pmra.append(np.nan)
        pmdec.append(np.nan)
simbad_rscvns_table['pmra(arcsec/yr)'] = pmra
simbad_rscvns_table['pmdec(arcsec/yr)'] = pmdec
# Change the column name that has
# the SIMBAD names in it, for
# consistency
simbad_rscvns_table = simbad_rscvns_table.rename(columns={'identifier':
                                                          'simbad_names'})

# Make a column stating which catalogue these
# sources are from
simbad_rscvns_table['Catalogue'] = 'SimbadRSCVns'

## Combine the dataframes

Combine the pandas dataframes for each catalogue. Use "inner" so that only identical columns are combined.

In [10]:
# Stack the per-catalogue tables. join='inner' keeps only the
# columns that every table has in common.
dataframes = [lepine_table,
              jgagne_table,
              vizier_mwds_table,
              simbad_fstars_table,
              catalina_table,
              wd_table,
              simbad_rscvns_table]

combined_df = pd.concat(dataframes, join='inner', ignore_index=True)
combined_df = combined_df.drop_duplicates()

# The coordinates and proper motions were read as strings in
# several catalogues, so convert them all to numbers
# NOTE(review): downcast='float' may store the values as
# 32-bit floats -- confirm that precision is acceptable for the
# 2 arcsec positional matching done with these coordinates
numeric_columns = ['ra(deg)',
                   'dec(deg)',
                   'pmra(arcsec/yr)',
                   'pmdec(arcsec/yr)']
combined_df[numeric_columns] = combined_df[numeric_columns].apply(pd.to_numeric,
                                                                  downcast='float')

# Split the catalogue into sources that already
# have SIMBAD names, and sources that don't.
# Explicit copies are taken so that columns can later be
# assigned on either table without acting on a slice of
# combined_df (which triggers SettingWithCopyWarning).
is_unnamed = combined_df['simbad_names'] == ''
no_names = combined_df[is_unnamed].copy()
names = combined_df[~is_unnamed].copy()

## Find SIMBAD names for sources that don't have them

In [16]:
# Get the coordinates of the sources
# that don't have names yet
no_name_coords = coord.SkyCoord(np.array(no_names['ra(deg)'])*un.deg,
                                np.array(no_names['dec(deg)'])*un.deg,
                                pm_ra_cosdec=np.array(no_names['pmra(arcsec/yr)'])*un.arcsec/un.yr,
                                pm_dec=np.array(no_names['pmdec(arcsec/yr)'])*un.arcsec/un.yr)
no_name_names = []

# Number of sources sent to SIMBAD per query
simbad_chunk_size = 1000
# Search radius, also used as the maximum accepted separation
simbad_match_radius = 2.*un.arcsec
simbad_match_radius_deg = simbad_match_radius.to_value(un.deg)

# Divide the sources up into chunks,
# otherwise astroquery will chuck
# a hissy fit. The chunks are derived from the number of
# sources so that every source gets exactly one entry in
# no_name_names, however many sources there are.
for start in range(0, len(no_name_coords), simbad_chunk_size):
    end = start + simbad_chunk_size
    coords = no_name_coords[start:end]
    # Default: no match for any source in this chunk
    chunk_names = [''] * len(coords)

    # Use the coordinates to search for nearby
    # sources for each source
    result_table = Simbad.query_region(coords, radius=simbad_match_radius)
    # Match the results to the sources
    try:
        result_coords = coord.SkyCoord(list(result_table['RA']),
                                       list(result_table['DEC']),
                                       unit=(un.hourangle, un.deg))

        # Collect matches in a separate list so that a failure
        # part-way through cannot leave a partially filled chunk
        matched_names = []
        for coo in coords:
            seps_deg = coo.separation(result_coords).deg

            if np.nanmin(seps_deg) < simbad_match_radius_deg:
                main_id = result_table[np.nanargmin(seps_deg)]['MAIN_ID']
                # MAIN_ID may be bytes or str depending on the
                # astroquery version
                if isinstance(main_id, bytes):
                    main_id = main_id.decode('UTF-8')
                matched_names.append(str(main_id))
            else:
                matched_names.append('')
        chunk_names = matched_names
    except (TypeError, KeyError) as e:
        # Leave a blank name for every source in a chunk that
        # doesn't have any SIMBAD matches
        print('No SIMBAD matches within 2 asec: ', start, end)
    no_name_names.extend(chunk_names)
    # Take a break, because otherwise astroquery
    # and SIMBAD will have a different hissy fit
    time.sleep(10)

# Add the SIMBAD names you just found
# to the source without names. assign() returns a new table,
# so this never writes into a slice of another dataframe.
no_names = no_names.assign(simbad_names=no_name_names)

No SIMBAD matches within 2 asec:  5000 6000
No SIMBAD matches within 2 asec:  6000 7000
No SIMBAD matches within 2 asec:  7000 8000


In [13]:
# Combine the tables again
combined_df_names = pd.concat([names, no_names], join='inner', ignore_index=True)
# Split them into sources with and without names again
new_no_names = combined_df_names[combined_df_names['simbad_names'] == '']
new_names = combined_df_names[combined_df_names['simbad_names'] != '']
# Remove sources that have the same name
new_names = new_names.drop_duplicates(subset='simbad_names')
# Combine them again, and ta-da! You have your dataframe!
final_df = pd.concat([new_names, new_no_names], join='inner', ignore_index=True)

In [14]:
# Write the final table to disk so that the slow
# SIMBAD queries do not have to be repeated
output_csv = 'FlareTypeStars_Pandas_SimbadNames.csv'
final_df.to_csv(output_csv)