In [1]:
import pandas as pd
df=pd.read_csv("kepler.csv",comment="#")
df

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_vet_stat,koi_vet_date,koi_pdisposition,koi_score,koi_fpflag_nt,...,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,Done,2018-08-16,CANDIDATE,1.000,0,...,0.200,0.160,0.200,0.170,0.080,0.130,0.310,0.170,0.320,0.160
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,Done,2018-08-16,CANDIDATE,0.969,0,...,0.000,0.480,0.390,0.360,0.490,0.340,0.120,0.730,0.500,0.450
2,3,10811496,K00753.01,,CANDIDATE,Done,2018-08-16,CANDIDATE,0.000,0,...,-0.034,0.070,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074
3,4,10848459,K00754.01,,FALSE POSITIVE,Done,2018-08-16,FALSE POSITIVE,0.000,0,...,0.147,0.078,0.289,0.079,-0.257,0.072,0.099,0.077,0.276,0.076
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,Done,2018-08-16,CANDIDATE,1.000,0,...,-0.090,0.180,0.100,0.140,0.070,0.180,0.020,0.160,0.070,0.200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,9560,10090151,K07985.01,,FALSE POSITIVE,Done,2018-08-16,FALSE POSITIVE,0.000,0,...,-1.757,0.068,2.763,0.074,2.344,0.072,-1.756,0.068,2.929,0.072
9560,9561,10128825,K07986.01,,CANDIDATE,Done,2018-08-16,CANDIDATE,0.497,0,...,-0.250,0.490,0.780,0.460,0.500,0.400,-0.180,0.470,0.530,0.470
9561,9562,10147276,K07987.01,,FALSE POSITIVE,Done,2018-08-16,FALSE POSITIVE,0.021,0,...,-3.650,0.260,5.000,0.220,3.380,0.160,-3.890,0.260,5.160,0.220
9562,9563,10155286,K07988.01,,CANDIDATE,Done,2018-08-16,CANDIDATE,0.092,0,...,1.320,0.670,1.690,0.530,1.450,0.110,1.370,0.660,2.000,0.460


In [2]:
import os
import time
import glob
import pandas as pd
import numpy as np
import lightkurve as lk
from sklearn.preprocessing import RobustScaler

# Garante pasta para armazenar os dados
DOWNLOAD_DIR = "kepler_data"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)


# ---------- PARTE 1: DOWNLOAD EM BATCHES ----------
def download_lightcurves(df, batch_size=200, sleep_time=2):
    """Baixa os FITS de todos os kepid em lotes."""
    all_ids = df["kepid"].unique()
    
    for i in range(0, len(all_ids), batch_size):
        batch = all_ids[i:i+batch_size]
        print(f"\n🔽 Batch {i//batch_size + 1} de {len(all_ids)//batch_size + 1}...")
        
        for kepid in batch:
            str_kepid = str(int(kepid)).zfill(9)
            files = glob.glob(f"{DOWNLOAD_DIR}/*{str_kepid}*.fits")

            # pula se já existe
            if files:
                continue

            try:
                search = lk.search_lightcurvefile(f"KIC {int(kepid)}", mission="Kepler")
                if len(search) > 0:
                    search.download_all(download_dir=DOWNLOAD_DIR)
                    print(f"✔ Baixado {kepid}")
                else:
                    print(f"✖ Nenhum dado para {kepid}")
            except Exception as e:
                print(f"⚠ Erro ao baixar {kepid}: {e}")
                continue

            time.sleep(sleep_time)  # evita sobrecarregar servidor


# ---------- PARTE 2: PROCESSAMENTO LOCAL ----------
def get_lightcurve_local(kepid, period, t0, n_points=200):
    try:
        str_kepid = str(int(kepid)).zfill(9)
        files = glob.glob(f"{DOWNLOAD_DIR}/*{str_kepid}*.fits")
        if not files:
            return None

        lc_collection = lk.LightCurveFileCollection(files)
        lc = lc_collection.stitch().remove_nans().remove_outliers()

        folded = lc.fold(period=period, epoch_time=t0)
        flux = folded.flux.value

        scaler = RobustScaler()
        flux = scaler.fit_transform(flux.reshape(-1, 1)).flatten()

        flux_interp = np.interp(np.linspace(0, len(flux), n_points),
                                np.arange(len(flux)), flux)
        return flux_interp
    except Exception as e:
        print(f"Erro em {kepid}: {e}")
        return None


def process_lightcurves(df, n_points=200):
    """Gera DataFrame consolidado com fluxos já processados"""
    results = []
    for _, row in df.iterrows():
        flux = get_lightcurve_local(row.kepid, row.koi_period, row.koi_time0bk, n_points=n_points)
        if flux is not None:
            results.append({
                "kepid": row.kepid,
                "koi_period": row.koi_period,
                "koi_time0bk": row.koi_time0bk,
                "flux": flux
            })
    return pd.DataFrame(results)


# ---------- EXEMPLO DE USO ----------
# 1) Baixa em lotes de 200
# download_lightcurves(df, batch_size=200, sleep_time=2)

# 2) Processa todos os já baixados
# df_lightcurves = process_lightcurves(df, n_points=200)

# 3) Salva resultado parcial
# df_lightcurves.to_parquet("lightcurves.parquet")


In [None]:
download_lightcurves(df, batch_size=200, sleep_time=2)



🔽 Batch 1 de 42...


        Use search_lightcurve() instead.
  search = lk.search_lightcurvefile(f"KIC {int(kepid)}", mission="Kepler")
