In [2]:
# For use when training on Google Colab

#from google.colab import drive

#drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Data From Drive

In [3]:
#!unzip "/content/drive/My Drive/Lightcurves/confirmed_lightcurve_processed.zip"

In [4]:
#!unzip "/content/drive/My Drive/Lightcurves/false_lightcurve_processed.zip"

In [5]:
#!ls

confirmed_lightcurves  drive  false_lightcurves  sample_data


In [None]:
# Colab Paths
#confirmed_lc_path = "/content/confirmed_lightcurves"
#false_lc_path = "/content/false_lightcurves"

In [1]:
# Local paths: directories that load_data() scans for lightcurve CSV files,
# one directory per class (used instead of the Colab paths when running locally).
confirmed_lc_path = "data/confirmed_lightcurves"
false_lc_path = "data/false_lightcurves"

# Extract, Transform, and Load for Random Forest

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib # for saving model
from sklearn.metrics import accuracy_score # for evaluating model
import os

In [4]:
def load_data(dir):
  """Load every lightcurve CSV in a directory into one combined DataFrame.

  Each ``*.csv`` file is read, cut down to its first 13 columns, stripped of
  the ``QUALITY`` and ``ORBITID`` columns, and has its ``KSPSAP_*`` flux
  columns renamed to ``DET_*``. Files are read in sorted filename order so
  the resulting row order does not depend on the arbitrary order returned by
  ``os.listdir``.

  Args:
    dir: Path of the directory to scan (not recursive). The name shadows the
      builtin ``dir`` but is kept so existing keyword callers keep working.

  Returns:
    One DataFrame holding the rows of all files, with a fresh 0..n-1 index.

  Raises:
    ValueError: If the directory contains no ``.csv`` files.
  """
  csv_names = sorted(name for name in os.listdir(dir) if name.endswith(".csv"))
  if not csv_names:
    # pd.concat([]) would also raise ValueError, but without naming the folder.
    raise ValueError(f"No .csv files found in directory: {dir!r}")

  column_renames = {
      "KSPSAP_FLUX": "DET_FLUX",
      "KSPSAP_FLUX_ERR": "DET_FLUX_ERR",
      "KSPSAP_FLUX_SML": "DET_FLUX_SML",
      "KSPSAP_FLUX_LAG": "DET_FLUX_LAG",
  }

  frames = []
  for name in csv_names:
    frame = pd.read_csv(os.path.join(dir, name))
    frame = (
        frame.iloc[:, :13]  # only the first 13 columns are used
        .drop(["QUALITY", "ORBITID"], axis=1)
        .rename(columns=column_renames)
    )
    frames.append(frame)
  return pd.concat(frames, ignore_index=True)

In [5]:
# Read and combine every CSV found under confirmed_lc_path into one frame.
confirmed_lc = load_data(confirmed_lc_path)

In [6]:
# Sanity-check the loaded frame: its dimensions, its first rows, and every
# row that has a NaN in at least one column.
print(confirmed_lc.shape)
print(confirmed_lc.head())
print(confirmed_lc[confirmed_lc.isna().any(axis=1)])

(9852146, 11)
          TIME  CADENCENO  SAP_FLUX  DET_FLUX  DET_FLUX_ERR      SAP_X  \
0  3126.642265     820445  1.019994  1.020734      0.005286  1800.3823   
1  3126.644580     820446  1.013390  0.992595      0.005286  1800.3783   
2  3126.646895     820447  1.015191  0.996209      0.005286  1800.3739   
3  3126.649210     820448  1.013493  0.993270      0.005286  1800.3779   
4  3126.651525     820449  1.020052  1.002043      0.005286  1800.3721   

       SAP_Y  SAP_BKG  SAP_BKG_ERR  DET_FLUX_SML  DET_FLUX_LAG  
0  291.42758    64.16        92.99      1.018177      1.020855  
1  291.41550   130.18       133.81      0.996789      0.990436  
2  291.42462    41.90       155.22      0.996953      0.993630  
3  291.41837   175.15       106.80      0.995014      0.993927  
4  291.42120   113.13        91.91      1.001834      1.002362  
                TIME  CADENCENO  SAP_FLUX  DET_FLUX  DET_FLUX_ERR       SAP_X  \
12589    2207.777683     141165       NaN       NaN      0.006598  143

In [7]:
# Discard every row that has a NaN in any column.
confirmed_lc = confirmed_lc.dropna()

In [8]:
# Verify the cleanup: no row should contain a NaN any more, so this is
# expected to print an empty DataFrame.
print(confirmed_lc[confirmed_lc.isna().any(axis=1)])

Empty DataFrame
Columns: [TIME, CADENCENO, SAP_FLUX, DET_FLUX, DET_FLUX_ERR, SAP_X, SAP_Y, SAP_BKG, SAP_BKG_ERR, DET_FLUX_SML, DET_FLUX_LAG]
Index: []


In [9]:
# Read and combine every CSV found under false_lc_path into one frame.
false_lc = load_data(false_lc_path)

In [10]:
# Sanity-check the loaded frame: its dimensions, its first rows, and every
# row that has a NaN in at least one column.
print(false_lc.shape)
print(false_lc.head())
print(false_lc[false_lc.isna().any(axis=1)])

(9692892, 11)
          TIME  CADENCENO  SAP_FLUX  DET_FLUX  DET_FLUX_ERR      SAP_X  \
0  2637.477790     203042  0.988770  0.999323      0.004794  1233.0536   
1  2637.484734     203043  0.998916  1.009667      0.004794  1233.0540   
2  2637.491679     203044  0.994662  1.005454      0.004794  1233.0538   
3  2637.498623     203045  0.997867  1.008777      0.004794  1233.0538   
4  2637.505567     203046  0.987574  0.998449      0.004794  1233.0515   

       SAP_Y    SAP_BKG  SAP_BKG_ERR  DET_FLUX_SML  DET_FLUX_LAG  
0  1392.1917  110683.02       763.46      1.001365      0.996690  
1  1392.1908  112790.16      1013.83      1.005987      1.008757  
2  1392.1892  116500.82       902.16      1.001765      1.009290  
3  1392.1866  119515.37       888.66      1.007896      1.011716  
4  1392.1857  108032.88       920.01      1.001798      0.999099  
                TIME  CADENCENO  SAP_FLUX  DET_FLUX  DET_FLUX_ERR       SAP_X  \
940      2644.005525     203982       NaN       NaN      0

In [11]:
# Discard every row that has a NaN in any column.
false_lc = false_lc.dropna()

In [12]:
# Verify the cleanup: no row should contain a NaN any more, so this is
# expected to print an empty DataFrame.
print(false_lc[false_lc.isna().any(axis=1)])

Empty DataFrame
Columns: [TIME, CADENCENO, SAP_FLUX, DET_FLUX, DET_FLUX_ERR, SAP_X, SAP_Y, SAP_BKG, SAP_BKG_ERR, DET_FLUX_SML, DET_FLUX_LAG]
Index: []


In [13]:
# Per-column maxima and minima for both classes. This exposes out-of-range
# values such as inf, which dropna() above did not remove.
print(confirmed_lc.max())
print(confirmed_lc.min())
print(false_lc.max())
print(false_lc.min())

TIME            3.339571e+03
CADENCENO       9.124310e+05
SAP_FLUX        2.149493e+26
DET_FLUX                 inf
DET_FLUX_ERR    4.809033e-02
SAP_X           2.091944e+03
SAP_Y           2.046795e+03
SAP_BKG         7.474109e+07
SAP_BKG_ERR     2.503320e+07
DET_FLUX_SML             inf
DET_FLUX_LAG             inf
dtype: float64
TIME            1.325329e+03
CADENCENO       4.697000e+03
SAP_FLUX        1.179115e-04
DET_FLUX        0.000000e+00
DET_FLUX_ERR    1.802014e-04
SAP_X           4.417363e+01
SAP_Y           1.009482e+00
SAP_BKG        -6.204268e+07
SAP_BKG_ERR     1.800000e+00
DET_FLUX_SML    0.000000e+00
DET_FLUX_LAG    0.000000e+00
dtype: float64
TIME            3.339571e+03
CADENCENO       9.124310e+05
SAP_FLUX        4.747897e+18
DET_FLUX                 inf
DET_FLUX_ERR    5.470158e-02
SAP_X           2.092161e+03
SAP_Y           2.047969e+03
SAP_BKG         7.392538e+07
SAP_BKG_ERR     2.948293e+07
DET_FLUX_SML             inf
DET_FLUX_LAG             inf
dtype: float6

In [14]:
# Count the rows that hold an infinite value in any column. Both signs are
# matched: isin([np.inf]) alone only finds +inf and would miss -inf.
inf_count_confirmed = confirmed_lc.isin([np.inf, -np.inf]).any(axis=1).sum()
print(inf_count_confirmed)

828


In [15]:
# Count the rows that hold an infinite value in any column. Both signs are
# matched: isin([np.inf]) alone only finds +inf and would miss -inf.
inf_count_false = false_lc.isin([np.inf, -np.inf]).any(axis=1).sum()
print(inf_count_false)

932


In [16]:
# Shapes after the NaN cleanup and before the inf rows are removed, for
# comparison with the shapes printed after the filter.
print(confirmed_lc.shape)
print(false_lc.shape)

(9786721, 11)
(9631801, 11)


In [17]:
# Keep only the rows without an infinite value in any column. Both +inf and
# -inf are removed; matching np.inf alone would let -inf through, and the
# classifier rejects non-finite input.
confirmed_lc = confirmed_lc[~confirmed_lc.isin([np.inf, -np.inf]).any(axis=1)]
false_lc = false_lc[~false_lc.isin([np.inf, -np.inf]).any(axis=1)]

In [18]:
# Shapes after the inf rows were removed; the row counts should have dropped
# by the number of inf rows counted earlier.
print(confirmed_lc.shape)
print(false_lc.shape)

(9785893, 11)
(9630869, 11)


In [19]:
# Attach the target class to every row: 1 for rows from the confirmed set,
# 0 for rows from the false set.
confirmed_lc["LABEL"] = 1
false_lc["LABEL"] = 0

# Train Random Forest Classifier

In [20]:
# Stack both classes (confirmed rows first) into one frame with a fresh
# 0..n-1 index.
data = pd.concat([confirmed_lc, false_lc], ignore_index=True)

In [21]:
# Confirm the combined frame has the rows of both classes plus the LABEL column.
print(data.shape)
print(data.head())

(19416762, 12)
          TIME  CADENCENO  SAP_FLUX  DET_FLUX  DET_FLUX_ERR      SAP_X  \
0  3126.642265     820445  1.019994  1.020734      0.005286  1800.3823   
1  3126.644580     820446  1.013390  0.992595      0.005286  1800.3783   
2  3126.646895     820447  1.015191  0.996209      0.005286  1800.3739   
3  3126.649210     820448  1.013493  0.993270      0.005286  1800.3779   
4  3126.651525     820449  1.020052  1.002043      0.005286  1800.3721   

       SAP_Y  SAP_BKG  SAP_BKG_ERR  DET_FLUX_SML  DET_FLUX_LAG  LABEL  
0  291.42758    64.16        92.99      1.018177      1.020855      1  
1  291.41550   130.18       133.81      0.996789      0.990436      1  
2  291.42462    41.90       155.22      0.996953      0.993630      1  
3  291.41837   175.15       106.80      0.995014      0.993927      1  
4  291.42120   113.13        91.91      1.001834      1.002362      1  


In [22]:
# The per-class frames are no longer needed once `data` exists; drop the
# names so their memory can be reclaimed.
del false_lc
del confirmed_lc

In [23]:
import gc
# Run a garbage-collection pass right after deleting the per-class frames;
# the displayed value is the number of unreachable objects found.
gc.collect()

0

In [24]:
# Separate the feature columns (everything except LABEL) from the target.
y = data["LABEL"]
X = data.drop(columns=["LABEL"])

In [25]:
# Check that X holds only feature columns and y holds the labels.
print(X.head())
print(y.head())

          TIME  CADENCENO  SAP_FLUX  DET_FLUX  DET_FLUX_ERR      SAP_X  \
0  3126.642265     820445  1.019994  1.020734      0.005286  1800.3823   
1  3126.644580     820446  1.013390  0.992595      0.005286  1800.3783   
2  3126.646895     820447  1.015191  0.996209      0.005286  1800.3739   
3  3126.649210     820448  1.013493  0.993270      0.005286  1800.3779   
4  3126.651525     820449  1.020052  1.002043      0.005286  1800.3721   

       SAP_Y  SAP_BKG  SAP_BKG_ERR  DET_FLUX_SML  DET_FLUX_LAG  
0  291.42758    64.16        92.99      1.018177      1.020855  
1  291.41550   130.18       133.81      0.996789      0.990436  
2  291.42462    41.90       155.22      0.996953      0.993630  
3  291.41837   175.15       106.80      0.995014      0.993927  
4  291.42120   113.13        91.91      1.001834      1.002362  
0    1
1    1
2    1
3    1
4    1
Name: LABEL, dtype: int64


In [26]:
# X and y now carry everything needed; drop the combined frame's name.
del data

In [27]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is made over individual rows, so samples from one
# lightcurve file can end up in both the training and the test set. That may
# inflate the test score — consider splitting by source file instead.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

In [28]:
# The train/test pieces replace the full X and y; drop the original names.
del X
del y

In [29]:
# Small forest (10 trees) with a fixed seed, built on all available cores.
# oob_score=True also computes an out-of-bag accuracy estimate during fit();
# warm_start=True makes a later fit() call reuse the trees already built
# instead of starting from scratch.
rf = RandomForestClassifier(
    n_estimators=10,
    random_state=69,
    warm_start=True,
    oob_score=True,
    n_jobs=-1,
)

In [30]:
# Train the forest on the training split.
# NOTE(review): the recorded output shows a (truncated) warning from this call;
# presumably the out-of-bag warning about too few trees — confirm on re-run.
rf.fit(X_train, y_train)

  warn(


In [31]:
# Out-of-bag score from training, followed by the score on the held-out
# test rows.
print(rf.oob_score_)
print(rf.score(X_test, y_test))

0.9926868596584304
0.9978374358447455


In [32]:
# Class predictions for every row of the test split (1 = confirmed, 0 = false).
rf.predict(X_test)

array([0, 1, 0, ..., 1, 1, 1], dtype=int64)

# Test with Randomly Chosen Validation Lightcurves

In [33]:
def load_and_predict(filepath, model):
  """Load one lightcurve CSV, clean it like the training data, and classify it.

  The file is cut down to its first 13 columns, ``QUALITY`` and ``ORBITID``
  are removed, and the ``KSPSAP_*`` flux columns are renamed to ``DET_*``.
  Rows that still contain NaN, +inf or -inf are then discarded. The two
  bookkeeping columns are removed *before* the NaN filter, as in the training
  pipeline, so a missing ``QUALITY``/``ORBITID`` value does not discard an
  otherwise valid row.

  Args:
    filepath: Path of the CSV file to classify.
    model: Fitted estimator exposing ``predict``; it receives the cleaned
      feature DataFrame.

  Returns:
    Whatever ``model.predict`` returns for the cleaned rows.

  Raises:
    ValueError: If no row survives the NaN/inf cleanup.
  """
  column_renames = {
      "KSPSAP_FLUX": "DET_FLUX",
      "KSPSAP_FLUX_ERR": "DET_FLUX_ERR",
      "KSPSAP_FLUX_SML": "DET_FLUX_SML",
      "KSPSAP_FLUX_LAG": "DET_FLUX_LAG",
  }
  features = (
      pd.read_csv(filepath)
      .iloc[:, :13]
      .drop(["QUALITY", "ORBITID"], axis=1)
      .rename(columns=column_renames)
  )
  # Turn infinities of either sign into NaN so one dropna() removes every
  # non-finite row; matching np.inf alone would let -inf reach the model.
  features = features.replace([np.inf, -np.inf], np.nan).dropna()
  if features.empty:
    raise ValueError(
        f"No usable rows left in {filepath!r} after removing NaN/inf values"
    )
  return model.predict(features)

In [34]:
import random

def test_random_lightcurve(model, confirmed_path, false_path):
  """Classify one randomly chosen lightcurve file and print the outcome.

  A class directory is picked at random, then a random ``.csv`` file inside
  it. The model predicts a class for every row of the file; the file-level
  category printed here is the majority class over all rows (a tie counts as
  Confirmed), rather than the prediction for the first row only.

  Args:
    model: Fitted estimator passed on to ``load_and_predict``.
    confirmed_path: Directory holding CSV files of the confirmed class.
    false_path: Directory holding CSV files of the false class.

  Returns:
    None. The file path, actual category and predicted category are printed.
  """
  # Pick the class directly instead of comparing path strings afterwards, so
  # the ground truth stays right even if both arguments are the same path.
  is_confirmed = random.choice([True, False])
  chosen_path = confirmed_path if is_confirmed else false_path

  # Sorted so that a seeded run picks the same file regardless of the order
  # in which the OS lists the directory.
  files = sorted(f for f in os.listdir(chosen_path) if f.endswith(".csv"))
  random_file = random.choice(files)
  filepath = os.path.join(chosen_path, random_file)

  prediction = load_and_predict(filepath, model)
  # Labels are 0/1, so the mean is the share of rows predicted as confirmed.
  predicted_confirmed = np.mean(prediction) >= 0.5

  print(f"File: {filepath}")
  print(f"Actual Category: {'Confirmed' if is_confirmed else 'False'}")
  print(f"Predicted Category: {'Confirmed' if predicted_confirmed else 'False'}")


In [35]:
# Directories with lightcurve CSVs used for the spot check below, one per class.
# NOTE(review): presumably these files were not part of the training set —
# confirm.
validation_confirmed_lc_path = "data/validation_lightcurves/confirmed_validation"
validation_false_lc_path = "data/validation_lightcurves/false_validation"

In [36]:
# Seed the global RNG so that re-running this cell checks the same ten files
# and reproduces the recorded output (same seed value as the model and split).
random.seed(69)
for _ in range(10):
  test_random_lightcurve(rf, validation_confirmed_lc_path, validation_false_lc_path)

File: data/validation_lightcurves/confirmed_validation\1551345500_lightcurve.csv
Actual Category: Confirmed
Predicted Category: Confirmed
File: data/validation_lightcurves/false_validation\2041563029_lightcurve.csv
Actual Category: False
Predicted Category: False
File: data/validation_lightcurves/confirmed_validation\1551345500_lightcurve.csv
Actual Category: Confirmed
Predicted Category: Confirmed
File: data/validation_lightcurves/confirmed_validation\1884091865_lightcurve.csv
Actual Category: Confirmed
Predicted Category: Confirmed
File: data/validation_lightcurves/false_validation\1965256662_lightcurve.csv
Actual Category: False
Predicted Category: False
File: data/validation_lightcurves/confirmed_validation\1884091865_lightcurve.csv
Actual Category: Confirmed
Predicted Category: Confirmed
File: data/validation_lightcurves/confirmed_validation\1551345500_lightcurve.csv
Actual Category: Confirmed
Predicted Category: Confirmed
File: data/validation_lightcurves/false_validation\1717732

# Save the Model using joblib

In [38]:
# Create the output folder if needed (no error when it already exists), then
# serialize the fitted forest to disk with joblib.
os.makedirs("models", exist_ok=True)
joblib.dump(rf, "models/random_forest_lightcurve_classifier.pkl")

['models/random_forest_lightcurve_classifier.pkl']