# Creating pathfinder subsample

June 21, 2022  
Gully & Ryan H.

The goal of this notebook is to make the pathfinder sample.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightkurve as lk
from tqdm import tqdm
import time
import astropy.units as u
import concurrent.futures


sns.set_context('notebook', font_scale=1.5)
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# Column labels applied positionally when the table is read with `names=names`.
# The sixth label was mojibake ('Î”Prot': the UTF-8 bytes of 'Δ' decoded as Latin-1);
# it is restored to the intended Greek delta.
names = ['EPIC', 'Campaign', 'Teff', 'log g', 'Prot', 'ΔProt', 'hpeak', 'Rvar', 'Kp', 'MG']

In [None]:
# The table is whitespace-delimited and read with explicit column names;
# '---' entries are treated as missing values.
# `sep=r'\s+'` replaces `delim_whitespace=True`, which pandas has deprecated.
df = pd.read_csv('../../data/Reinhold_Hekker2020/table2.dat',
                 sep=r'\s+', names=names, na_values='---')

Looks good!  We see the same trend we had in our proposal figure 2.

## Select a subsample of sources

First search for some high amplitude variable stars

In [None]:
# One boolean mask per selection cut; every bound is exclusive.
criterion1 = df['Prot'].gt(1) & df['Prot'].lt(10)
criterion2 = df['Rvar'].gt(0.5) & df['Rvar'].lt(20)
criterion3 = df['Teff'].gt(4000) & df['Teff'].lt(4500)
# A row is selected only when it passes all three cuts.
criteria = criterion1 & criterion2 & criterion3

In [None]:
criteria.sum()

In [None]:
# Teff-selected stars (faint) with the full three-cut selection drawn on top.
plt.plot(df.Prot[criterion3], df.Rvar[criterion3], '.', alpha=0.1)
plt.plot(df.Prot[criteria], df.Rvar[criteria], '.', alpha=0.5)
plt.xlim(1e0, 1e2)
plt.xscale('log')
plt.yscale('log')
# Raw strings keep the LaTeX backslashes from being parsed as (invalid) escape sequences.
plt.xlabel(r'$P_{\mathrm{rot}}$')
plt.ylabel(r'$\propto$ Amplitude (%)')
plt.title('Reinhold & Hekker 2020 Table 2');

In [None]:
df[criteria].head()

In [None]:
df_subset=df[criteria].reset_index(drop=True)

In [None]:
df_subset

## Make a subsub sample

### Prepopulate our columns

In [None]:
# Result columns filled in later by download() / add_data().
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_subset['N_EVEREST'] = np.nan
df_subset['N_TESS_SPOC'] = np.nan
# Periods and amplitudes are written as floats later, so seed these columns with
# float zeros; an integer column would have to be upcast on the first write.
df_subset['Period_TESS'] = 0.0
df_subset['Amplitude_TESS'] = 0.0
df_subset['Period_K2'] = 0.0
df_subset['Amplitude_K2'] = 0.0
df_subset['Sector'] = np.nan

In [None]:
# First 15 rows as a quick test sample. `.copy()` makes it an independent frame, so
# later `.loc` writes act on real data rather than on a slice of df_subset
# (which would trigger SettingWithCopyWarning).
df_tiny = df_subset.head(15).copy()
df_tiny

In [None]:
df_tiny.iloc[0].to_frame()

### Predownload so that it runs faster later

Let's find one of the sources that *also* has TESS data available

Delete the cell below if you want to run on the entire subset of 400+ sources...

df_subset = df_tiny

In [None]:
n_sources = len(df_subset)
n_sources

We want to have at least 1 EVEREST lightcurve and 1 SPOC lightcurve for all sources.

In [None]:
def add_data(data):
    """Measure one source's amplitude and period and store them in ``df_subset``.

    Parameters
    ----------
    data : sequence
        ``[mission, idx, sr]``: ``mission`` is the column suffix (the string
        compared against ``'TESS'`` below), ``idx`` is the row label written
        with ``df_subset.loc``, and ``sr`` is an indexable search result whose
        entries provide ``.download()``.

    Notes
    -----
    Best effort: the first search result is tried and, if anything goes
    wrong, the second one. When neither works (or ``sr`` is empty) the row is
    left untouched and a warning is issued instead of raising. Values are
    written only after every measurement for a light curve has succeeded, so
    a row never mixes numbers from a failed and a successful attempt.
    """
    import warnings

    mission, idx, sr = data

    def _measure(num):
        """Return ``(amplitude, period, sector)`` for search result ``num``."""
        # Drop NaN cadences and outliers. The flux is NOT normalized here, so the
        # amplitude is in the flux units of the downloaded light curve.
        lc = sr[num].download().remove_nans().remove_outliers()
        # Amplitude proxy: spread between the 5th and 95th flux percentiles.
        lo, hi = np.percentile(lc.flux.value, (5, 95))
        # Period of the highest peak in the light curve's periodogram.
        period = float(lc.to_periodogram().period_at_max_power.to_value())
        # Only TESS rows record a sector number.
        sector = lc.sector if mission == 'TESS' else None
        return hi - lo, period, sector

    # Try at most the first two search results, stopping at the first success.
    for num in range(min(len(sr), 2)):
        try:
            amplitude, period, sector = _measure(num)
        except Exception as err:  # one bad file must not stop the whole batch
            warnings.warn(f'{mission} row {idx}: search result {num} failed ({err!r})')
            continue
        df_subset.loc[idx, f'Amplitude_{mission}'] = amplitude
        df_subset.loc[idx, f'Period_{mission}'] = period
        if mission == 'TESS':
            df_subset.loc[idx, 'Sector'] = sector
        return

def download(data):
    """Search for one source's light curves and record how many were found.

    Parameters
    ----------
    data : sequence
        ``[name, index, mission]``: the target name passed to
        ``lk.search_lightcurve``, the row label in ``df_subset``, and the
        mission code (``0`` = search with ``mission='TESS'``, ``1`` = search
        with ``author='EVEREST'``).

    Returns
    -------
    tuple
        ``(index, sr)`` where ``sr`` is the search result.

    Raises
    ------
    ValueError
        If the mission code is neither 0 nor 1. Previously this surfaced as an
        ``UnboundLocalError`` on the return statement.
    """
    name, index, mission = data
    # Validate before any network call so a bad code fails fast and clearly.
    if mission not in (0, 1):
        raise ValueError(f'mission code must be 0 (TESS) or 1 (EVEREST), got {mission!r}')

    if mission == 0:
        # NOTE(review): this search has no author filter, yet the count is stored
        # in 'N_TESS_SPOC' -- confirm whether author='SPOC' was intended.
        sr = lk.search_lightcurve(name, mission='TESS')
        df_subset.loc[index, 'N_TESS_SPOC'] = len(sr)
    else:
        sr = lk.search_lightcurve(name, author='EVEREST')
        df_subset.loc[index, 'N_EVEREST'] = len(sr)
    return index, sr

In [None]:
def main():
    """Search both archives for every source, then measure periods and amplitudes.

    For each of the ``n_sources`` rows of ``df_subset`` a TESS and an EVEREST
    search are run through ``download`` on a thread pool, and the results are
    passed to ``add_data`` one source at a time.

    Returns
    -------
    float
        Wall-clock duration of the run in seconds (also printed).
    """
    start = time.time()

    # One work item per source and archive: [name, row, mission code].
    # The position ``i`` doubles as the row label used by download()/add_data().
    # NOTE(review): that assumes df_subset has a default 0..n-1 index -- confirm.
    TESS_download = []
    K2_download = []
    for i in range(n_sources):
        # int() also strips a trailing '.0' when the EPIC column is float-typed.
        name = f"EPIC {int(df_subset['EPIC'].iloc[i])}"
        TESS_download.append([name, i, 0])
        K2_download.append([name, i, 1])

    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        # Both maps are submitted before either is consumed, so the TESS and
        # EVEREST searches share the pool.
        TESS_sr = executor.map(download, TESS_download)
        K2_sr = executor.map(download, K2_download)

        TESS_data = [['TESS', index, sr] for index, sr in TESS_sr]
        K2_data = [['K2', index, sr] for index, sr in K2_sr]

    # The measurements run after the pool has shut down, one source at a time.
    for tess_item, k2_item in zip(TESS_data, K2_data):
        add_data(tess_item)
        add_data(k2_item)

    elapsed = time.time() - start
    print(f'Processed {n_sources} sources in {elapsed:.1f} seconds')
    return elapsed

In [None]:
# Run the search-and-measure pipeline when this code executes as the main module.
if __name__ == '__main__':
    main()

In [None]:
df_subset

## Computation times

**15 Stars:**  
Desktop:  
fresh download time ~ 42 seconds  
pre-downloaded time ~ 33.65 seconds  
cached time ~ 2.6 seconds  
  

Laptop:  
fresh download time ~ 46 seconds  
pre-downloaded time ~ 33.8 seconds  
cached time ~ 2.39 seconds  

------------------------------------------------------------  

**416 Stars:**  
Desktop:  
fresh download time ~ 1359.3 seconds ~ 22.6 minutes  
pre-downloaded time ~ 808 seconds ~ 13.5 minutes  
cached time ~ 100 seconds  
  

Laptop:  
fresh download time ~ 1747.5 seconds ~ 29.1 minutes  
pre-downloaded time ~ 936.8 seconds ~ 15.6 minutes  
cached time ~ 66.4 seconds  

In [None]:
df_subset.to_csv('pathfinder_sample.csv', index=False)

## Spot check one source...

In [None]:
name = 'EPIC 201245978'

In [None]:
lc_tess = lk.search_lightcurve(name, mission='TESS')#.download().remove_nans().remove_outliers()
len(lc_tess)
# lc_tess.plot()
# lc_tess

In [None]:
lc = lk.search_lightcurve(name, author="EVEREST")[0].download().remove_nans().remove_outliers()#.normalize().flatten()
lc.plot()#.to_periodogram().plot()

In [None]:
pg = lc.to_periodogram()
pg.plot()

In [None]:
pg.period_at_max_power

In [None]:
name = 'EPIC 220205464'

In [None]:
df_subset.iloc[236].EPIC

In [None]:
# download() takes one [name, row, mission code] item; code 0 is the TESS search.
download([name, 236, 0])

In [None]:
tpf = lk.search_targetpixelfile('EPIC 246979864', author='K2').download()
pld = lk.correctors.PLDCorrector(tpf)
# Keep the corrected light curve itself. The original bound `corrected_lc` to the
# period and then called .to_periodogram() on that value, which cannot work.
corrected_lc = pld.correct().remove_outliers()
corrected_lc.to_periodogram().period_at_max_power

## Plotting the data

In [None]:
plt.figure(figsize=(6,6))
plt.ylim(0.5, 10)
plt.xlim(0.5, 10)
# Raw strings keep the LaTeX backslashes from being parsed as (invalid) escape sequences.
plt.xlabel(r'$P_{\mathrm{Kepler}}$')
plt.ylabel(r'$P_{\mathrm{TESS}}$')
# The axes show periods, so the title says so (it previously said "amplitudes").
plt.title('Comparison between TESS and Kepler periods')

# One-to-one reference line: points on it have equal K2 and TESS periods.
x = [0.5, 10]
y = [0.5, 10]
plt.plot(x, y)

plt.plot(df_subset.Period_K2, df_subset.Period_TESS, 'r.')

plt.show()

## Recreate fig2.pdf plot from proposal

In [None]:
# K2 measurements in black, TESS measurements in red.
plt.plot(df_subset.Period_K2, df_subset.Amplitude_K2, '.', color='black')
plt.plot(df_subset.Period_TESS, df_subset.Amplitude_TESS, '.', color='red')

# plt.ylim(3e2, 2e5)
# plt.xlim(1e0, 1e2)

plt.xscale('log')
plt.yscale('log')

plt.axhline(1e3, linestyle='dotted', label='1%', color='purple')
plt.axvline(27, linestyle='dashed', label='27 days', color='purple')
# A single legend call; the earlier bare plt.legend() was immediately replaced by this one.
plt.legend(fontsize=12)

# Raw strings keep the LaTeX backslashes from being parsed as (invalid) escape sequences.
plt.xlabel(r'$P_{\mathrm{rot}}$')
plt.ylabel(r'$\propto$ Amplitude (%)')
plt.title(r'Predicted for 4000 < $T_{\mathrm{eff}}$ < 4500 in TESS');

### Example manipulations ...

In [None]:
# NOTE(review): in the visible notebook `sr` is first assigned at module level in a
# later cell, so on a fresh kernel this raises NameError unless that cell ran first.
sr.table.to_pandas()

In [None]:
# Count the search-result rows whose author is EVEREST.
# NOTE(review): relies on `sr` from a search cell that appears later in the visible
# notebook -- confirm the intended execution order.
mask = (sr.table['author'] == 'EVEREST').data
mask.sum()

In [None]:
sr = lk.search_lightcurve("EPIC 202059229", mission='TESS')
sr
lc = sr[0].download()

In [None]:
lc = lc.remove_nans().normalize()

In [None]:
vector = lc.flux.value
vector

In [None]:
lo, hi = np.percentile(vector, (5, 95))

In [None]:
lo, hi

In [None]:
ax = lc.normalize().plot()
ax.axhline(hi)
ax.axhline(lo)

In [None]:
peak_to_valley = hi-lo
peak_to_valley

In [None]:
type(sr)

In [None]:
df_sr = sr.table.to_pandas()

In [None]:
df_subset

In [None]:
len(sr)

In [None]:
sr = lk.search_lightcurve("EPIC 211071889", author="EVEREST", mission="K2")
sr

In [None]:
lc_K2 = sr.download()

In [None]:
sr = lk.search_lightcurve("EPIC 211071889", author="SPOC", mission="TESS")
sr[0]

In [None]:
lc_TESS = sr[0].download()

In [None]:
# Scale the light curve by its 98th-percentile flux. nanpercentile matches the TESS
# cell and keeps a single NaN cadence from turning the scale factor (and plot) into NaN.
scalar = np.nanpercentile(lc_K2.flux, 98)
lc_K2 = lc_K2/scalar
ax = lc_K2.plot()
ax.axhline(1.0, linestyle='dashed')
ax.axhline(0.93, linestyle='dotted', color='#d35400', label='7 % flux loss')
ax.set_title('K2 data')
ax.set_ylim(0.8, 1.1)
ax.legend()

In [None]:
lc_TESS = lc_TESS.remove_nans().bin(binsize=5)
scalar = np.nanpercentile(lc_TESS.flux, 98)
lc_TESS = lc_TESS/scalar

In [None]:
ax = lc_TESS.plot()
ax.axhline(1.0, linestyle='dashed')
ax.axhline(0.93, linestyle='dotted', color='#d35400', label='7 % flux loss')
ax.axhline(0.955, linestyle='solid', color='#2ecc71', label='4.5 % flux loss')
ax.set_title('TESS data')
ax.set_ylim(0.8, 1.1)
ax.legend(fontsize=12)

In [None]:
assert len(sr) == 1

In [None]:
lc=sr.download()

In [None]:
lc = lc.remove_outliers(sigma=4,sigma_upper=3).normalize()

In [None]:
pg = lc.to_periodogram(nterms=5)

In [None]:
ax = pg.plot(view='period', scale='log')
ax.axvline(pg.period_at_max_power.value, linestyle='dotted', label=f'{pg.period_at_max_power:0.5f}')
ax.axvline(6.70, linestyle='dashed', label='6.7 d (Reinhold & Hekker 2020)', color = 'red')
ax.legend(fontsize = 12)

In [None]:
pg.period_at_max_power

In [None]:
ax = lc.plot()
pg.model(lc.time).plot(ax=ax)
pg.model(lc.time, frequency=pg.frequency_at_max_power/2).plot(ax=ax)