In [1]:
import pandas as pd

In [2]:
# Read 'toi_raw.csv' into a DataFrame and print its column summary
# (column names, non-null counts and dtypes).
toi_raw = pd.read_csv('toi_raw.csv')
toi_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7703 entries, 0 to 7702
Data columns (total 87 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rowid              7703 non-null   int64  
 1   toi                7703 non-null   float64
 2   toipfx             7703 non-null   int64  
 3   tid                7703 non-null   int64  
 4   ctoi_alias         7703 non-null   float64
 5   pl_pnum            7703 non-null   int64  
 6   tfopwg_disp        7703 non-null   object 
 7   rastr              7703 non-null   object 
 8   ra                 7703 non-null   float64
 9   raerr1             0 non-null      float64
 10  raerr2             0 non-null      float64
 11  decstr             7703 non-null   object 
 12  dec                7703 non-null   float64
 13  decerr1            0 non-null      float64
 14  decerr2            0 non-null      float64
 15  st_pmra            7569 non-null   float64
 16  st_pmraerr1        7569 

In [3]:
# Read 'k2pandc_raw.csv' into a DataFrame and print its column summary.
k2_raw = pd.read_csv('k2pandc_raw.csv')
k2_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4004 entries, 0 to 4003
Columns: 295 entries, rowid to pl_ndispec
dtypes: float64(236), int64(26), object(33)
memory usage: 9.0+ MB


In [4]:

# Columns to take from the K2 table. Position i here corresponds to position i
# in joint_out_names, which gives the name the column is renamed to.
joint_in_names = [
    "pl_name",
    "hostname",
    "pl_tranmid",
    "pl_trandur",
    "pl_orbper",
    "pl_trandep",
    "pl_rade",
    "pl_insol",
    "pl_eqt",
    "st_teff",
    "st_logg",
    "st_rad",
    "ra",
    "dec",
    "sy_pmra",
    "sy_pmdec",
]

# Output names, paired by position with joint_in_names.
joint_out_names = [
    "toi",
    "toipfx",
    "pl_tranmid",  # check units
    "pl_trandurh",
    "pl_orbper",
    "pl_trandep",  # check units
    "pl_rade",
    "pl_insol",
    "pl_eqt",
    "st_teff",
    "st_logg",
    "st_rad",
    "ra",
    "dec",
    "st_pmra",
    "st_pmdec",
]


def filter_k2(k2, trandep_scale=1000):
    """Select the joint columns from a K2 frame and rename them.

    Rows whose ``tran_flag`` equals 0 are dropped. Every column listed in
    ``joint_in_names`` is kept, together with its ``<name>err1`` and
    ``<name>err2`` companions when ``k2`` has them, and all kept columns are
    renamed using the positional pairing with ``joint_out_names``.
    ``pl_trandep`` and its error columns are multiplied by ``trandep_scale``.

    Parameters
    ----------
    k2 : pandas.DataFrame
        Must contain ``tran_flag`` and every column in ``joint_in_names``;
        a missing column raises ``KeyError``.
    trandep_scale : float, default 1000
        Factor applied to the transit-depth column and its uncertainties.
        NOTE(review): the unit conversion is flagged as unchecked in the
        column lists ("check units"). If the K2 depth is in percent and the
        target unit is ppm the factor would be 1e4 -- confirm against the
        column definitions of both tables.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original row index preserved; ``k2`` and the
        module-level name lists are left unmodified.
    """
    # Work on copies: appending to the module-level lists would make a second
    # call see the error columns as base columns and emit duplicated columns.
    in_names = list(joint_in_names)
    out_names = list(joint_out_names)

    for in_col, out_col in zip(joint_in_names, joint_out_names):
        for suffix in ("err1", "err2"):
            if in_col + suffix in k2.columns:
                in_names.append(in_col + suffix)
                out_names.append(out_col + suffix)

    rename_table = dict(zip(in_names, out_names))

    k2_filt = k2.loc[k2["tran_flag"] != 0, in_names].rename(columns=rename_table)

    # Scale the depth and its uncertainties together so they stay in the same
    # unit; the error columns are only present when the input provided them.
    depth_cols = [
        col
        for col in ("pl_trandep", "pl_trandeperr1", "pl_trandeperr2")
        if col in k2_filt.columns
    ]
    k2_filt[depth_cols] = k2_filt[depth_cols] * trandep_scale

    return k2_filt

In [5]:
# Apply filter_k2 to the raw K2 frame, then print the column summary of the
# selected and renamed columns.
k2_filt = filter_k2(k2_raw)
k2_filt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3980 entries, 0 to 4003
Data columns (total 40 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   toi              3980 non-null   object 
 1   toipfx           3980 non-null   object 
 2   pl_tranmid       3925 non-null   float64
 3   pl_trandurh      2783 non-null   float64
 4   pl_orbper        3936 non-null   float64
 5   pl_trandep       2099 non-null   float64
 6   pl_rade          3173 non-null   float64
 7   pl_insol         626 non-null    float64
 8   pl_eqt           846 non-null    float64
 9   st_teff          2871 non-null   float64
 10  st_logg          2342 non-null   float64
 11  st_rad           3852 non-null   float64
 12  ra               3980 non-null   float64
 13  dec              3980 non-null   float64
 14  st_pmra          3941 non-null   float64
 15  st_pmdec         3941 non-null   float64
 16  pl_tranmiderr1   3024 non-null   float64
 17  pl_tranmiderr2   30

In [6]:
# Single threshold used by both the NaN report and the column filter below.
n = 2000

# Diagnostic: columns whose NaN count exceeds n.
cols_with_many_nans = k2_filt.columns[k2_filt.isna().sum() > n].tolist()
print("Columns with more than", n, "NaNs:", cols_with_many_nans)

# Keep only the columns with more than n non-null values. This is a different
# criterion from the NaN report above (the two agree only when the frame has
# exactly 2 * n rows), so the columns actually removed are printed separately.
non_null_counts = k2_filt.notna().sum()
keep_mask = non_null_counts > n
dropped_cols = k2_filt.columns[~keep_mask].tolist()
print("Columns dropped (at most", n, "non-null values):", dropped_cols)

k2_filt = k2_filt.loc[:, keep_mask]

print("Shape of new DataFrame:", k2_filt.shape)

Columns with more than 2000 NaNs: ['pl_insol', 'pl_eqt', 'pl_trandurherr1', 'pl_trandurherr2', 'pl_trandeperr1', 'pl_trandeperr2', 'pl_insolerr1', 'pl_insolerr2', 'pl_eqterr1', 'pl_eqterr2']
Shape of new DataFrame: (3980, 30)


In [7]:
# Print the column summary again to check which columns remain after the
# sparse-column filter.
k2_filt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3980 entries, 0 to 4003
Data columns (total 30 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   toi             3980 non-null   object 
 1   toipfx          3980 non-null   object 
 2   pl_tranmid      3925 non-null   float64
 3   pl_trandurh     2783 non-null   float64
 4   pl_orbper       3936 non-null   float64
 5   pl_trandep      2099 non-null   float64
 6   pl_rade         3173 non-null   float64
 7   st_teff         2871 non-null   float64
 8   st_logg         2342 non-null   float64
 9   st_rad          3852 non-null   float64
 10  ra              3980 non-null   float64
 11  dec             3980 non-null   float64
 12  st_pmra         3941 non-null   float64
 13  st_pmdec        3941 non-null   float64
 14  pl_tranmiderr1  3024 non-null   float64
 15  pl_tranmiderr2  3024 non-null   float64
 16  pl_orbpererr1   3047 non-null   float64
 17  pl_orbpererr2   3047 non-null   float6

In [8]:
# Write the filtered frame to 'k2pandc_filtered.csv'.
# NOTE(review): to_csv writes the row index by default, so the file gets an
# unnamed first column holding the original row labels. Pass index=False if
# downstream readers do not expect it -- confirm before changing.
k2_filt.to_csv('k2pandc_filtered.csv')