# Batch process Equivalent Widths with MCMC

The goal of this notebook is to distill our analysis into a programmatic loop over many spectra and save the Equivalent Width (EW) and its uncertainty to a results table.  The table will be in the form of a pandas dataframe, which we'll then save as a csv file.

In [27]:
import numpy as np
import pandas as pd
import os
import glob
from astropy.io import fits
import emcee
from astropy.time import Time

In [3]:
import warnings

import pandas as pd
from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [4]:
goldilocks_files = glob.glob('../data/HPF/Helium-transit-data/**/Goldilocks*.fits', recursive=True)

In [5]:
filename_zero = goldilocks_files[0]
hdus = fits.open(filename_zero)

In [6]:
def get_goldilocks_dataframe(fn):
    """Return a pandas Dataframe given a Goldilocks FITS file name"""
    hdus = fits.open(fn)
    df_original = pd.DataFrame()
    header = hdus[0].header
    for j in range(28):
        df = pd.DataFrame()
        for i in range(1, 10):
            name = hdus[i].name
            df[name] = hdus[i].data[j, :]
        df['order'] = j
        df_original = df_original.append(df, ignore_index=True)
    keep_mask = df_original[df_original.columns[0:6]] != 0.0
    df_original = df_original[keep_mask.all(axis=1)].reset_index(drop=True)
    
    return df_original, header

In [7]:
def normalize_spectrum(df):
    """Normalizes spectrum to set to one"""
    for order in df.order.unique():
        mask = df.order == order
        norm_constant = df['Sci Flux'][mask].median() #mean takes outliers into account
        df['Sci Flux'][mask] = df['Sci Flux'][mask]/norm_constant
        df['Sci Error'][mask] = df['Sci Error'][mask]/norm_constant
        
    return df

Eventually we will loop over index.

In [8]:
order = 4
n_walkers = 32
n_params = 5
n_steps = 5000
labels = ["m", "b", "A", "mu", "w"]

In [17]:
df_results = pd.DataFrame()

In [31]:
j_date = [date_raw]
t = Time(j_date, format='isot', scale='utc')
jd = t.jd

In [34]:
for index in range(125, 136):

    fn = goldilocks_files[index]
    print(index, fn[-49:])
    df_orig, header = get_goldilocks_dataframe(fn)
    date_raw = header['DATE-OBS']
    date = date_raw[0:10]
    time = date_raw[11:19]
    obj = header['OBJECT']
    df = normalize_spectrum(df_orig)
    qidx = header['QIDX']
    j_date = date_raw
    t = Time(j_date, format='isot', scale='utc')
    jd = t.jd
    
    wavelength1 = 8538
    wavelength2 = 8546
    calcium_line = 8542
    
    sub_region = (df.order == order) & (df['Sci Wavl'] > wavelength1) & (df['Sci Wavl'] < wavelength2)
    wl = df['Sci Wavl'][sub_region].values
    flux = df['Sci Flux'][sub_region].values
    unc = df['Sci Error'][sub_region].values
    
    def generative_model(m, b, A, mu, logw, int_wl = calcium_line):
        """Generate the model given parameters"""
        continuum = m * (wl - int_wl) + b
        w = np.exp(logw)
        gaussian = A * np.exp(-0.5*(wl-mu)**2/w**2)
        return continuum - gaussian
    
    def log_likelihood(theta):
        m, b, A, mu, logw = theta
        model = generative_model(m, b, A, mu, logw, int_wl = calcium_line)
        residual = flux - model
        chi_squared = np.sum(residual** 2 / unc**2)
        return -0.5 * chi_squared
    
    m_guess, b_guess, A_guess, mu_guess, logw_guess = 0.01, 0.3, 0.1, calcium_line, np.log(0.4)
    theta_guess = np.array([m_guess, b_guess, A_guess, mu_guess, logw_guess])
    
    pos = theta_guess + 1e-4 * np.random.randn(n_walkers, n_params) #intial guess position
    
    sampler = emcee.EnsembleSampler(n_walkers, n_params, log_likelihood)
    sampler.run_mcmc(pos, n_steps, progress=True);
    
    flat_samples = sampler.get_chain(discard=1000, thin=15, flat=True)

    A_draws = flat_samples[:,2]
    b_draws = flat_samples[:,1]
    m_draws = flat_samples[:,0]
    mu_draws = flat_samples[:,3]
    w_draws = np.exp(flat_samples[:, 4])

    EW = ((2*np.pi)**.5)*(A_draws*w_draws)/(m_draws*(mu_draws-calcium_line)+b_draws)
    EW

    ew_mean = np.mean(EW)
    ew_std = np.std(EW)
    print(ew_mean)
    print(ew_std)
    temp = {'ew':ew_mean, 'ew_unc':ew_std, 'date':date, 'star_name':obj, 'time':time, 'int_wv':calcium_line, 'qidx':qidx, 'jd':jd}
    df_results = df_results.append(temp, ignore_index=True)

125 Goldilocks_20200919T063924_v1.0_0024.spectra.fits


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:12<00:00, 398.47it/s]


1.0668622452807341
0.021014802856912058
126 Goldilocks_20200919T065336_v1.0_0025.spectra.fits


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:11<00:00, 433.26it/s]


1.1154406540317907
0.023310474332753062
127 Goldilocks_20200807T084257_v1.0_0035.spectra.fits


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:11<00:00, 417.32it/s]


1.185434562445211
0.032585850284225816
128 Goldilocks_20200807T085713_v1.0_0036.spectra.fits


  gaussian = A * np.exp(-0.5*(wl-mu)**2/w**2)
  gaussian = A * np.exp(-0.5*(wl-mu)**2/w**2)
  gaussian = A * np.exp(-0.5*(wl-mu)**2/w**2)
  w = np.exp(logw)
100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:11<00:00, 421.72it/s]


16363400.046177018
382071391.938955
129 Goldilocks_20200807T091125_v1.0_0037.spectra.fits


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:11<00:00, 443.28it/s]


1.1517557129901546
0.040449821690755554
130 Goldilocks_20200807T092540_v1.0_0038.spectra.fits


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:11<00:00, 448.63it/s]


1.1462974626306097
0.02874735509523954
131 Goldilocks_20200808T084204_v1.0_0024.spectra.fits


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:11<00:00, 427.71it/s]


1.1395014793046532
0.021550550058529103
132 Goldilocks_20200808T085617_v1.0_0025.spectra.fits


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:11<00:00, 425.60it/s]


1.175840960406922
0.021024607410307226
133 Goldilocks_20200808T091032_v1.0_0026.spectra.fits


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:11<00:00, 426.63it/s]


1.1356723830926534
0.01816124056264782
134 Goldilocks_20200808T092446_v1.0_0027.spectra.fits


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:11<00:00, 439.93it/s]


1.10468458095331
0.020770320616378436
135 Goldilocks_20200918T111606_v1.0_0029.spectra.fits


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:11<00:00, 428.01it/s]


1.0588396961225828
0.022541051516894173


In [26]:
header

SIMPLE  =                    T / conforms to FITS standard                      
BITPIX  =                    8 / array data type                                
NAXIS   =                    0 / number of array dimensions                     
EXTEND  =                    T                                                  
ORIGIN  = 'astropy.fits'       / FITS file originator                           
TIMSHXRG= 'MACIE20180925'      / TIMS HxRG Version                              
DATE    = '2020-09-19T11:33:12' / File Creation Date                            
OBJECT  = 'HAT-P-32_3_OUT_d'   / Object Observed                                
OBSERVAT= 'McDonald'           / Observatory                                    
TELESCOP= 'HET     '           / Telescope                                      
INSTRUME= 'HPF     '           / Instrument                                     
OBSERVER= 'CFB     '           / Observer                                       
OBSTYPE = 'Sci     '        

In [35]:
df_results

Unnamed: 0,date,ew,ew_unc,int_wv,qidx,star_name,time,jd
0,2020-09-19,1.066084,0.02158865,8542.0,6062.0,HAT-P-32_3_IN,06:39:47,
1,2020-09-19,1.115549,0.02340373,8542.0,6062.0,HAT-P-32_3_IN,06:54:00,
2,2020-08-07,1.184028,0.03215695,8542.0,4968.0,HAT-P-32_1_OUT_b2,08:43:26,
3,2020-08-07,-0.01617869,0.4505612,8542.0,4968.0,HAT-P-32_1_OUT_b2,08:57:39,
4,2020-08-07,1.152361,0.04073554,8542.0,4968.0,HAT-P-32_1_OUT_b2,09:11:52,
5,2020-08-07,1.146301,0.02850835,8542.0,4968.0,HAT-P-32_1_OUT_b2,09:26:05,
6,2020-08-08,1.139007,0.02170367,8542.0,4969.0,HAT-P-32_1_OUT_b1,08:42:34,
7,2020-08-08,1.175998,0.02079834,8542.0,4969.0,HAT-P-32_1_OUT_b1,08:56:47,
8,2020-08-08,1.135312,0.01866167,8542.0,4969.0,HAT-P-32_1_OUT_b1,09:11:00,
9,2020-08-08,1.103793,0.02119783,8542.0,4969.0,HAT-P-32_1_OUT_b1,09:25:12,


Great! It works!  Let's save the results to a csv file.

In [36]:
df_results.to_csv('../data/preliminary_results.csv',index=False)