In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import  RandomizedSearchCV
from sklearn.metrics import f1_score
import joblib



In [3]:
# Locate the project's data directory relative to the notebook's working
# directory, then load the Kepler cumulative table into `df`.
print(os.getcwd())
# __file__ is not defined in Jupyter notebooks; use os.getcwd() instead
dir_path = os.getcwd()

print("Current Directory:", dir_path)
# Build the 'src/kepler' fragment with os.path.join so the swap also matches on
# platforms whose separator is not a backslash (on Windows it is still
# 'src\\kepler'). If the fragment is absent, dir_path is left unchanged.
dir_path = dir_path.replace(os.path.join('src', 'kepler'), 'data')
print("Data Directory:", dir_path)
df = pd.read_csv(os.path.join(dir_path, 'cumulative_final_2025.10.03.csv'))


c:\Users\downe\RightGoodProgrammers\src\kepler
Current Directory: c:\Users\downe\RightGoodProgrammers\src\kepler
Data Directory: c:\Users\downe\RightGoodProgrammers\data


In [4]:
# Hold out 20% of the rows for testing. The three one-hot disposition columns
# are passed as the stratification labels and the random seed is fixed at 42.
disposition_cols = [
    'koi_disposition_CANDIDATE',
    'koi_disposition_CONFIRMED',
    'koi_disposition_FALSE POSITIVE',
]
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df[disposition_cols],
    random_state=42,
)

In [5]:
# Split each partition into a feature frame (X) and the one-hot target frame (y).
target_cols = [
    'koi_disposition_CANDIDATE',
    'koi_disposition_CONFIRMED',
    'koi_disposition_FALSE POSITIVE',
]
X_train, y_train = train_df.drop(columns=target_cols), train_df[target_cols]
X_test, y_test = test_df.drop(columns=target_cols), test_df[target_cols]

# Report the (rows, columns) of every piece as a sanity check on the split
print("Training shapes:", X_train.shape, y_train.shape)
print("Testing shapes:", X_test.shape, y_test.shape)

Training shapes: (7651, 32) (7651, 3)
Testing shapes: (1913, 32) (1913, 3)


In [12]:

# Reuse a previously saved scaler when one exists in the models directory;
# otherwise create a new StandardScaler and fit it on the training features.
models_dir = dir_path.replace('data', 'models')

# Start from None so that an empty models directory (or one without a scaler
# file) falls through to the "create and fit" branch instead of leaving
# `scaler` undefined.
scaler = None
for file in os.listdir(models_dir):
    print(file)
    if 'keplerscaler' in file:
        # joblib.load unpickles the file: only load artifacts you trust.
        scaler = joblib.load(os.path.join(models_dir, file))
        break

if scaler is None:
    # No saved scaler: learn mean and std from the training data.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
else:
    # A saved scaler is applied as-is. Refitting it here would overwrite the
    # persisted statistics and make loading it pointless.
    X_train_scaled = scaler.transform(X_train)

# Transform test data (using the scaler's training mean/std)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrames to keep column names
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

# Show before/after scaling for first feature
print("Before scaling:")
print(f"Mean: {X_train.iloc[:,0].mean():.2f}")
print(f"Std: {X_train.iloc[:,0].std():.2f}")

print("\nAfter scaling:")
print(f"Mean: {X_train_scaled.iloc[:,0].mean():.2f}")
print(f"Std: {X_train_scaled.iloc[:,0].std():.2f}")

keplerkepler_rf_best.joblib
keplerlabel_encoder.joblib
keplerscaler.joblib
Before scaling:
Mean: 0.48
Std: 0.48

After scaling:
Mean: -0.00
Std: 1.00


In [14]:
# Convert the one-hot target frame -> single string label -> integer codes.
# idxmax(axis=1) picks the column name holding the row maximum; the shared
# 'koi_disposition_' prefix is then removed to leave the bare class name.
y_train_single = y_train.idxmax(axis=1).str.replace('koi_disposition_', '').str.strip()
y_test_single  = y_test.idxmax(axis=1).str.replace('koi_disposition_', '').str.strip()

# Reuse a saved label encoder when one exists; otherwise fit a new one.
models_dir = dir_path.replace('data', 'models')

# Start from None so a models directory without an encoder file falls through
# to the "create and fit" branch instead of leaving `le` undefined.
le = None
for filename in os.listdir(models_dir):
    if 'label_encoder.joblib' in filename:
        # joblib.load unpickles the file: only load artifacts you trust.
        le = joblib.load(os.path.join(models_dir, filename))
        break

if le is None:
    # No saved encoder: learn the class list from the training labels.
    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train_single)
else:
    # Keep the persisted class-to-integer mapping rather than refitting it.
    y_train_enc = le.transform(y_train_single)
y_test_enc  = le.transform(y_test_single)

print("Classes:", le.classes_)
print("Training label distribution:\n", pd.Series(y_train_enc).value_counts())

Classes: ['CANDIDATE' 'CONFIRMED' 'FALSE POSITIVE']
Training label distribution:
 2    3871
1    2197
0    1583
Name: count, dtype: int64


In [16]:
# Load a previously saved random forest if one exists in the models directory;
# otherwise train one forest per candidate tree count and keep the one with
# the highest weighted F1 score.
number_of_trees = [100, 200, 300, 400, 500]
best_f1 = 0
best_rf = None

models_dir = dir_path.replace('data', 'models')

# Decide load-vs-train once, up front. Training inside the directory loop would
# repeat the whole grid for every unrelated file listed before the model file,
# and would never train at all when the directory is empty.
saved_model_file = next(
    (name for name in os.listdir(models_dir) if 'kepler_rf_best.joblib' in name),
    None,
)

if saved_model_file is not None:
    # joblib.load unpickles the file: only load artifacts you trust.
    best_rf = joblib.load(os.path.join(models_dir, saved_model_file))
    y_pred = best_rf.predict(X_test_scaled)
    best_f1 = f1_score(y_test_enc, y_pred, average='weighted')
    print(f"Loaded existing model with F1 score: {best_f1:.4f}")
else:
    # NOTE(review): candidates are ranked by their F1 on the test split, so the
    # reported best score is selected on the same data it is measured on.
    for n in number_of_trees:
        rf = RandomForestClassifier(n_estimators=n, random_state=42)
        rf.fit(X_train_scaled, y_train_enc)
        y_pred = rf.predict(X_test_scaled)
        f1 = f1_score(y_test_enc, y_pred, average='weighted')
        print(f"F1 score for {n} trees: {f1:.4f}")
        # `best_rf is None` guarantees a model is kept even if every F1 is 0.
        if best_rf is None or f1 > best_f1:
            best_f1 = f1
            best_rf = rf

print(f"Best F1 score: {best_f1:.4f}")


Loaded existing model with F1 score: 0.9246
Best F1 score: 0.9246


In [None]:
# Score the selected forest on the test features: print the per-class report
# first, then the confusion matrix.
y_pred = best_rf.predict(X_test_scaled)

report = classification_report(y_test_enc, y_pred, target_names=le.classes_)
print(report)

print("Confusion matrix:")
matrix = confusion_matrix(y_test_enc, y_pred)
print(matrix)

                precision    recall  f1-score   support

     CANDIDATE       0.85      0.79      0.82       396
     CONFIRMED       0.87      0.91      0.89       549
FALSE POSITIVE       0.98      0.99      0.99       968

      accuracy                           0.93      1913
     macro avg       0.90      0.90      0.90      1913
  weighted avg       0.92      0.93      0.92      1913

Confusion matrix:
[[313  71  12]
 [ 44 501   4]
 [ 10   2 956]]


In [None]:
# Persist the best model, the label encoder and the scaler with joblib.
current_dir = os.getcwd()
# Build both path fragments with os.path.join so the swap also matches on
# platforms whose separator is not a backslash (on Windows the result is the
# same as the literal 'src\\kepler' -> 'models\\kepler').
model_dir = current_dir.replace(os.path.join('src', 'kepler'), os.path.join('models', 'kepler'))
# model_dir has no trailing separator, so it acts as a filename prefix here:
# the files are written into the models directory as 'kepler<name>', e.g.
# 'keplerscaler.joblib'. The loader cells search the models directory for
# exactly these names, so the concatenation is kept as-is.
joblib.dump(best_rf, model_dir + 'kepler_rf_best.joblib')
joblib.dump(le, model_dir + 'label_encoder.joblib')
joblib.dump(scaler, model_dir + 'scaler.joblib')

['c:\\Users\\downe\\RightGoodProgrammers\\models\\keplerscaler.joblib']