# Batch process Equivalent Widths with MCMC

The goal of this notebook is to distill our analysis into a programmatic loop over many spectra and save the Equivalent Width (EW) and its uncertainty to a results table.  The table will be in the form of a pandas dataframe, which we'll then save as a csv file.

In [1]:
import numpy as np
import pandas as pd
import os
import glob
from astropy.io import fits
import emcee
from astropy.time import Time

In [2]:
import warnings

import pandas as pd
from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [3]:
goldilocks_files = glob.glob('../data/HPF/Helium-transit-data/**/Goldilocks*.fits', recursive=True)

In [4]:
filename_zero = goldilocks_files[0]
hdus = fits.open(filename_zero)

In [5]:
def get_goldilocks_dataframe(fn):
    """Return a pandas Dataframe given a Goldilocks FITS file name"""
    hdus = fits.open(fn)
    df_original = pd.DataFrame()
    header = hdus[0].header
    for j in range(28):
        df = pd.DataFrame()
        for i in range(1, 10):
            name = hdus[i].name
            df[name] = hdus[i].data[j, :]
        df['order'] = j
        df_original = df_original.append(df, ignore_index=True)
    keep_mask = df_original[df_original.columns[0:6]] != 0.0
    df_original = df_original[keep_mask.all(axis=1)].reset_index(drop=True)
    
    return df_original, header

In [6]:
def normalize_spectrum(df):
    """Normalizes spectrum to set to one"""
    for order in df.order.unique():
        mask = df.order == order
        norm_constant = df['Sci Flux'][mask].median() #mean takes outliers into account
        df['Sci Flux'][mask] = df['Sci Flux'][mask]/norm_constant
        df['Sci Error'][mask] = df['Sci Error'][mask]/norm_constant
        
    return df

Eventually we will loop over index.

In [7]:
order = 4
n_walkers = 32
n_params = 5
n_steps = 5000
labels = ["m", "b", "A", "mu", "w"]

In [8]:
df_results = pd.DataFrame()

In [14]:
for index in range(30, 50):

    fn = goldilocks_files[index]
    print(index, fn[-49:])
    df_orig, header = get_goldilocks_dataframe(fn)
    date_raw = header['DATE-OBS']
    date = date_raw[0:10]
    time = date_raw[11:19]
    obj = header['OBJECT']
    df = normalize_spectrum(df_orig)
    qidx = header['QIDX']
    j_date = date_raw
    t = Time(j_date, format='isot', scale='utc')
    jd = t.jd
    
    wavelength1 = calcium_line*0.999
    wavelength2 = calcium_line*1.001
    calcium_line = 8961 #8961 is Fe I
    
    sub_region = (df.order == order) & (df['Sci Wavl'] > wavelength1) & (df['Sci Wavl'] < wavelength2)
    wl = df['Sci Wavl'][sub_region].values
    flux = df['Sci Flux'][sub_region].values
    unc = df['Sci Error'][sub_region].values
    
    def generative_model(m, b, A, mu, logw, int_wl = calcium_line):
        """Generate the model given parameters"""
        continuum = m * (wl - int_wl) + b
        w = np.exp(logw)
        gaussian = A * np.exp(-0.5*(wl-mu)**2/w**2)
        return continuum - gaussian
    
    def log_likelihood(theta):
        m, b, A, mu, logw = theta
        model = generative_model(m, b, A, mu, logw, int_wl = calcium_line)
        residual = flux - model
        chi_squared = np.sum(residual** 2 / unc**2)
        return -0.5 * chi_squared
    
    m_guess, b_guess, A_guess, mu_guess, logw_guess = 0.01, 0.5, 3.1, calcium_line, np.log(0.2)
    theta_guess = np.array([m_guess, b_guess, A_guess, mu_guess, logw_guess])
    
    pos = theta_guess + 1e-4 * np.random.randn(n_walkers, n_params) #intial guess position
    
    sampler = emcee.EnsembleSampler(n_walkers, n_params, log_likelihood)
    sampler.run_mcmc(pos, n_steps, progress=True);
    
    flat_samples = sampler.get_chain(discard=1000, thin=15, flat=True)

    A_draws = flat_samples[:,2]
    b_draws = flat_samples[:,1]
    m_draws = flat_samples[:,0]
    mu_draws = flat_samples[:,3]
    w_draws = np.exp(flat_samples[:, 4])

    EW = ((2*np.pi)**.5)*(A_draws*w_draws)/(m_draws*(mu_draws-calcium_line)+b_draws)
    EW

    ew_mean = np.mean(EW)
    ew_std = np.std(EW)
    print(ew_mean)
    print(ew_std)
    temp = {'ew':ew_mean, 'ew_unc':ew_std, 'date':date, 'star_name':obj, 'time':time, 'int_wv':calcium_line, 'qidx':qidx, 'jd':jd}
    df_results = df_results.append(temp, ignore_index=True)

30 Goldilocks_20200331T052810_v1.0_0013.spectra.fits


  gaussian = A * np.exp(-0.5*(wl-mu)**2/w**2)
  w = np.exp(logw)
  return c[rint] - (c[rint] - s) * zz[:, None], factors
 45%|██████████████████████████████████▋                                          | 2249/5000 [00:06<00:07, 361.24it/s]


ValueError: At least one parameter value was infinite

In [10]:
df_results

Unnamed: 0,date,ew,ew_unc,int_wv,jd,qidx,star_name,time
0,2020-08-09,1.177059,0.038186,8542.0,2459071.0,4967.0,HAT-P-32_1_IN,08:23:13
1,2020-08-09,1.173182,0.030148,8542.0,2459071.0,4967.0,HAT-P-32_1_IN,08:37:26
2,2020-08-09,1.100334,0.025346,8542.0,2459071.0,4967.0,HAT-P-32_1_IN,08:51:38
3,2020-08-09,1.089765,0.022048,8542.0,2459071.0,4967.0,HAT-P-32_1_IN,09:05:51
4,2020-08-09,1.129137,0.022519,8542.0,2459071.0,4967.0,HAT-P-32_1_IN,09:20:04
5,2020-08-09,1.111387,0.025274,8542.0,2459071.0,4967.0,HAT-P-32_1_IN,09:34:17
6,2020-09-19,1.10434,0.02988,8542.0,2459112.0,6062.0,HAT-P-32_3_IN,05:42:56
7,2020-09-19,1.093289,0.025189,8542.0,2459112.0,6062.0,HAT-P-32_3_IN,05:57:09
8,2020-09-19,1.101152,0.022644,8542.0,2459112.0,6062.0,HAT-P-32_3_IN,06:11:22
9,2020-09-19,7.398148,5.315551,8542.0,2459112.0,6062.0,HAT-P-32_3_IN,06:25:35


Great! It works!  Let's save the results to a csv file.

In [11]:
df_results.to_csv('../data/preliminary_results.csv',index=False)