# Light curve preprocessing

## Data download

In addition to the tabular data provided in the resources section, we considered it worthwhile to analyze the light curve data as well, since this format carries richer physical information that could help a neural network model achieve better results. To do so, we downloaded the data using the script at: [https://exoplanetarchive.ipac.caltech.edu/bulk_data_download/Kepler_KOI_wget.bat](https://exoplanetarchive.ipac.caltech.edu/bulk_data_download/Kepler_KOI_wget.bat)


In [2]:
import os

# Local directory whose entries are listed and read as light-curve files.
# NOTE: absolute, machine-specific path; adjust it when running elsewhere.
base_dir = "D:/ligthcurve/curvas"

In [3]:
# Names of all entries found directly inside base_dir (no recursion).
list_names = os.listdir(base_dir)

In [4]:
# Count how many files exist per target. The key is the number found between
# the "kplr" prefix and the first "-" of each file name, zero-padded to
# nine digits.
cache_list = {}
for file_name in list_names:
    kic_number = int(file_name.split("-")[0].replace("kplr", ""))
    padded_id = f"{kic_number:09}"
    cache_list[padded_id] = cache_list.get(padded_id, 0) + 1

print(f"elementos en archivos: {len(cache_list)}")
cache_list

elementos en archivos: 1374


{'000892772': 14,
 '001026032': 17,
 '001026957': 19,
 '001027438': 18,
 '008443265': 13,
 '008443527': 27,
 '008443812': 17,
 '008444552': 17,
 '008444868': 17,
 '008445775': 17,
 '008453191': 17,
 '008453211': 17,
 '008453214': 14,
 '008453324': 19,
 '008453851': 18,
 '008454250': 18,
 '008456679': 51,
 '008458207': 17,
 '008459354': 20,
 '008459663': 16,
 '008460600': 20,
 '008462258': 14,
 '008463346': 17,
 '008474892': 8,
 '008474898': 17,
 '008478994': 54,
 '008479107': 17,
 '008479386': 29,
 '008480285': 18,
 '008480582': 17,
 '008480642': 17,
 '008481129': 17,
 '008482059': 17,
 '008482513': 16,
 '008483021': 16,
 '008483241': 17,
 '008483258': 16,
 '008483366': 17,
 '008487645': 18,
 '008487748': 17,
 '008487777': 17,
 '008487805': 17,
 '008487838': 17,
 '008488876': 19,
 '008488878': 14,
 '008490993': 17,
 '008491277': 21,
 '008491745': 17,
 '008492026': 9,
 '008492101': 10,
 '008493354': 17,
 '008494142': 50,
 '008494263': 14,
 '008494410': 17,
 '008494542': 17,
 '008494617'

In [5]:
import pandas as pd

In [6]:
# Load the preprocessed tabular dataset and discard the "Unnamed: 0" column
# (the index that was written out when the CSV was saved).
df = pd.read_csv("preprocessed_data/kepler_tess_dataset.csv")
df = df.drop(columns="Unnamed: 0")
df.columns


Index(['search_id', 'num_planet', 'disposition', 'ror', 'stellar_mass',
       'ss_gravity', 'period', 'duration', 'transit_epoch'],
      dtype='object')

In [7]:
# Turn the zero-padded keys of cache_list into "KIC <number>" labels
# (padding removed) so they can be compared with the search_id column.
kic_numbers = pd.Series(cache_list.keys()).astype(int)
ids = ("KIC " + kic_numbers.astype(str)).to_numpy()

ids

array(['KIC 892772', 'KIC 1026032', 'KIC 1026957', ..., 'KIC 10395543',
       'KIC 10396708', 'KIC 10397751'], shape=(1374,), dtype=object)

**Note:** Due to time and connection limitations, only a portion of the entire dataset was downloaded. However, as will be shown next, this subset was sufficient to yield excellent results.


In [8]:

# Keep only the rows whose target has local files and whose disposition
# code is not 2 (presumably the unlabelled "candidate" class - confirm
# against the dataset's encoding).
has_local_files = df["search_id"].isin(ids)
not_disposition_2 = df["disposition"] != 2

# Sort by search_id (descending) and move the old row labels into an
# "index" column.
infits = (
    df[has_local_files & not_disposition_2]
    .sort_values("search_id", ascending=False)
    .reset_index()
)

infits


Unnamed: 0,index,search_id,num_planet,disposition,ror,stellar_mass,ss_gravity,period,duration,transit_epoch
0,2421,KIC 9838468,1,1,0.012628,0.954,4.309,54.409961,9.31400,2.455008e+06
1,6626,KIC 9838414,1,0,0.043932,0.748,4.551,1.332615,5.16100,2.454965e+06
2,4567,KIC 9838060,1,0,0.093998,0.915,4.572,23.815784,3.75910,2.454975e+06
3,2199,KIC 9837685,1,1,0.027248,0.923,4.562,13.712185,2.43700,2.454969e+06
4,2384,KIC 9837661,2,1,0.038292,0.513,4.744,2.226496,1.70730,2.454966e+06
...,...,...,...,...,...,...,...,...,...,...
1257,6739,KIC 10002266,1,0,0.012715,0.872,4.586,12.713689,1.87500,2.454970e+06
1258,625,KIC 10002261,1,0,0.182308,0.094,5.283,12.713794,2.44093,2.455008e+06
1259,2389,KIC 10001368,1,1,0.014616,1.069,4.374,4.768310,3.23630,2.454968e+06
1260,3629,KIC 10000941,1,1,0.008606,0.798,4.477,3.504723,1.94640,2.454967e+06


## Data preprocessing

Once the files were downloaded, we cleaned the data following a procedure similar to the one used by Shallue and Vanderburg (2017).

In [9]:
import lightkurve as lk
from preprocess_light_curves import LightCurvePreprocess
# import matplotlib.pyplot as plt
import json

# Rows to process, ordered by search_id so that rows of the same target
# are adjacent and its files only have to be loaded once.
data = infits.sort_values("search_id")

# One-entry cache: the id of the most recently loaded target and its curves.
last_search_id = None
last_download = None

counter = 0

# Output table: one row per processed entry, views stored as JSON strings.
light_curves = pd.DataFrame(
    columns=["kic_id", "global_view", "local_view"],
)

def get_file_names(text):
    """Return the entries of ``list_names`` that contain ``text``.

    The match is a plain substring test, and the result keeps the order
    of ``list_names``.
    """
    matches = []
    for file_name in list_names:
        if text in file_name:
            matches.append(file_name)
    return matches

def download_lightcurve_collection(kic_id):
    """Read every local light-curve file of one target into a collection.

    Despite its name, this function does not access the network: it reads
    the files already present in ``base_dir``.

    Parameters
    ----------
    kic_id : str
        Target label of the form ``"KIC <number>"``.

    Returns
    -------
    lightkurve.LightCurveCollection
        The light curves that could be read. Files that fail to load are
        reported and skipped, so the collection may be empty.

    Raises
    ------
    ValueError
        If the text after ``"KIC "`` is not an integer.
    """
    kic_number = kic_id.replace("KIC ", "")
    # File names carry the id zero-padded to nine digits after "kplr".
    kic_padded = f"{int(kic_number):09d}"
    file_names = get_file_names(f"kplr{kic_padded}")

    lcc = lk.LightCurveCollection([])
    for name in file_names:
        try:
            lcc.append(lk.read(f"{base_dir}/{name}"))
        except Exception as exc:
            # Best effort: one unreadable file must not discard the whole
            # target, but the failure should be visible. Catching Exception
            # (not everything) keeps Ctrl+C working during long runs.
            print(f"could not read {name}: {type(exc).__name__}: {exc}")

    return lcc

# Build the global and local views for every row of `data` and append one
# record per successfully processed row to `light_curves`.
for idx, row in enumerate(data.itertuples()):
    try:
        orbital_period = row.period
        transit_duration = row.duration
        # Shift the epoch by 2454833 days (presumably BJD -> BKJD, the time
        # system of Kepler light curves - confirm against what
        # LightCurvePreprocess expects).
        transit_epoch = row.transit_epoch - 2454833
        print(f"{idx:04d} : {row.search_id}")

        if row.search_id != last_search_id:
            # New target: load its curves. The cache key is updated only
            # after the load succeeded, so a failure can never leave the
            # previous target's curves cached under this id.
            last_download = download_lightcurve_collection(row.search_id)
            last_search_id = row.search_id

        lcs = last_download

        lkp = LightCurvePreprocess()

        g_y, l_y = lkp.preprocess_signal(
            lcs,
            orbital_period,
            transit_duration,
            transit_epoch,
            outlier_sigma=2.0,
            max_iter=3,
            n_global=1001,
            n_local=101,
            k_durations=4.0,
            delta_factor_global=1.0,
            delta_factor_local=1.6,
            bic_grid_days=None,
        )

        # The views are stored as JSON text so they survive the CSV export.
        serie = {
            "kic_id": row.search_id,
            "global_view": json.dumps(list(g_y)),
            "local_view": json.dumps(list(l_y)),
        }

        light_curves.loc[len(light_curves)] = serie
    except Exception as exc:
        # Best-effort batch: a row that cannot be processed is reported and
        # skipped. KeyboardInterrupt/SystemExit are deliberately not caught
        # so the run can still be stopped.
        print(
            f"{idx:04d} : {row.search_id} -> skipped "
            f"({type(exc).__name__}: {exc})"
        )

0000 : KIC 10000490
0001 : KIC 10000941
0002 : KIC 10001368
0003 : KIC 10002261
0004 : KIC 10002266
0005 : KIC 10002866
0006 : KIC 10002866
0007 : KIC 10002866
0008 : KIC 10004519
0009 : KIC 10004738
0010 : KIC 10004738
0011 : KIC 10004738
0012 : KIC 10004772
0013 : KIC 10005020
0014 : KIC 10005758
0015 : KIC 10005758
0016 : KIC 10005788
0017 : KIC 10006096
0018 : KIC 10006581
0019 : KIC 10006641
0020 : KIC 10007492
0021 : KIC 10010440
0022 : KIC 10014702
0023 : KIC 10015516
0024 : KIC 10015937
0025 : KIC 10018233
0026 : KIC 10019065
0027 : KIC 10019399
0028 : KIC 10019643
0029 : KIC 10019643
0030 : KIC 10019708
0031 : KIC 10019708
0032 : KIC 10019763
0033 : KIC 10020423
0034 : KIC 10022908
0035 : KIC 10023469
0036 : KIC 10024051
0037 : KIC 10024701
0038 : KIC 10026136
0039 : KIC 10026457
0040 : KIC 10026458
0041 : KIC 10026502
0042 : KIC 10027247
0043 : KIC 10027323
0044 : KIC 10027323
0045 : KIC 10028140
0046 : KIC 10028352
0047 : KIC 10028535
0048 : KIC 10028792
0049 : KIC 10028792




0321 : KIC 10397751
0322 : KIC 8443265
0323 : KIC 8444552
0324 : KIC 8445775
0325 : KIC 8453191
0326 : KIC 8453211
0327 : KIC 8453214
0328 : KIC 8453324
0329 : KIC 8453851
0330 : KIC 8454250
0331 : KIC 8456679
0332 : KIC 8456679
0333 : KIC 8458207
0334 : KIC 8459354
0335 : KIC 8459663
0336 : KIC 8460600
0337 : KIC 8462258
0338 : KIC 8463346
0339 : KIC 8463346
0340 : KIC 8474892
0341 : KIC 8474898
0342 : KIC 8478994
0343 : KIC 8478994
0344 : KIC 8478994
0345 : KIC 8479107
0346 : KIC 8479386
0347 : KIC 8480285
0348 : KIC 8480285
0349 : KIC 8480582
0350 : KIC 8480642
0351 : KIC 8481129
0352 : KIC 8482513
0353 : KIC 8483021
0354 : KIC 8483241
0355 : KIC 8483258
0356 : KIC 8483366
0357 : KIC 8487645
0358 : KIC 8487748
0359 : KIC 8487777
0360 : KIC 8487805
0361 : KIC 8487838
0362 : KIC 8488876
0363 : KIC 8488878
0364 : KIC 8490993
0365 : KIC 8490993
0366 : KIC 8491277
0367 : KIC 8491745
0368 : KIC 8492026
0369 : KIC 8492101
0370 : KIC 8494142
0371 : KIC 8494142
0372 : KIC 8494410
0373 : KIC 

In [10]:
import json
import numpy as np


# Sanity check: decode the JSON text of the global view stored under row
# label 0 back into an array and inspect its length.
a = np.array(json.loads(light_curves["global_view"][0]))

a.shape

(1001,)

In [11]:
# Save the extracted views; the row index is written as the first column.
light_curves.to_csv("final_data.csv")

In [13]:
# Work on a copy so that later column additions leave light_curves intact.
copy_light_curves = light_curves.copy()

In [16]:
# Expose kic_id under the column name used by infits so both frames share
# a merge key.
copy_light_curves["search_id"] = copy_light_curves["kic_id"]
# NOTE(review): search_id is not unique on either side (a target appears
# once per table row, e.g. several planets of one star), so this is a
# many-to-many merge: each infits row of a target is paired with every
# view computed for that target, including views built from another row's
# period/epoch. Consider storing a per-row key alongside the views and
# merging on it - TODO confirm and fix.
new_data = infits.merge(copy_light_curves, on="search_id")

# Remove the helper columns: the old row labels and the duplicated id.
new_data.drop(["index", "kic_id"], axis=1, inplace=True)

Finally, we build the dataset that combines the tabular features with the global and local flux vectors.

In [17]:
new_data.head()

Unnamed: 0,search_id,num_planet,disposition,ror,stellar_mass,ss_gravity,period,duration,transit_epoch,global_view,local_view
0,KIC 9838468,1,1,0.012628,0.954,4.309,54.409961,9.314,2455008.0,"[-0.11291305906538904, 0.10793178189212953, 0....","[0.2713626011471899, 0.27876775831018774, 0.40..."
1,KIC 9838414,1,0,0.043932,0.748,4.551,1.332615,5.161,2454965.0,"[-0.10636503616246262, 0.1578112142941182, -0....","[0.09742806203994805, 0.14907600709401858, 0.0..."
2,KIC 9838060,1,0,0.093998,0.915,4.572,23.815784,3.7591,2454975.0,"[0.000595742676004736, 0.0003238881800717846, ...","[0.02286672191496115, 0.03699818466571551, 0.0..."
3,KIC 9837685,1,1,0.027248,0.923,4.562,13.712185,2.437,2454969.0,"[0.0011361371758506302, 0.05481143227875999, 0...","[0.010731168333516785, -0.03394275005661493, -..."
4,KIC 9837661,2,1,0.038292,0.513,4.744,2.226496,1.7073,2454966.0,"[-0.11011312998955108, -0.30014839757803774, 0...","[0.4321008216256368, 0.2384604509332056, -0.34..."


In [18]:
# Save the merged dataset; the row index is written as the first column.
new_data.to_csv("dataset.csv")