In [93]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import  RandomizedSearchCV
from sklearn.metrics import f1_score
import joblib



In [94]:
# List files in the data directory to check the correct filename
print(os.getcwd())
# __file__ is not defined in Jupyter notebooks; use os.getcwd() instead
dir_path = os.getcwd()

print("Current Directory:", dir_path)
dir_path = dir_path.replace('src\\kepler', 'data')
print("Data Directory:", dir_path)
df = pd.read_csv(dir_path + '/processed_toi.csv')

c:\Users\pjroc\OneDrive\Documents\GitHub\RightGoodProgrammers\src\kepler
Current Directory: c:\Users\pjroc\OneDrive\Documents\GitHub\RightGoodProgrammers\src\kepler
Data Directory: c:\Users\pjroc\OneDrive\Documents\GitHub\RightGoodProgrammers\data


In [95]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df[['tfopwg_disp_APC', 'tfopwg_disp_CP', 'tfopwg_disp_FA','tfopwg_disp_FP', 'tfopwg_disp_KP', 'tfopwg_disp_PC']])

In [96]:
target_cols = ['tfopwg_disp_APC', 'tfopwg_disp_CP', 'tfopwg_disp_FA', 'tfopwg_disp_FP', 'tfopwg_disp_KP', 'tfopwg_disp_PC']
x_train = train_df.drop(columns=target_cols)
y_train = train_df[target_cols]
x_test = test_df.drop(columns=target_cols)
y_test = test_df[target_cols]

print("\nTraining set shape:", x_train.shape, y_train.shape)
print("Testing set shape:", x_test.shape, y_test.shape)


Training set shape: (6162, 37) (6162, 6)
Testing set shape: (1541, 37) (1541, 6)


In [97]:

# Initialize scaler if it doesnt exist
for file in os.listdir(dir_path.replace('data', 'models')):
    print(file)
    if 'toiscaler' in file:
        scaler = joblib.load(os.path.join(dir_path.replace('data', 'models'), file))
        break
    else:
        scaler = StandardScaler()

# Fit and transform training data (learns mean and std)
x_train_scaled = scaler.fit_transform(x_train)

# Transform test data (using training mean/std)
x_test_scaled = scaler.transform(x_test)

# Convert back to DataFrames to keep column names
x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns, index=x_train.index)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns, index=x_test.index)

# Show before/after scaling for first feature
print("Before scaling:")
print(f"Mean: {x_train.iloc[:,0].mean():.2f}")
print(f"Std: {x_train.iloc[:,0].std():.2f}")

print("\nAfter scaling:")
print(f"Mean: {x_train_scaled.iloc[:,0].mean():.2f}")
print(f"Std: {x_train_scaled.iloc[:,0].std():.2f}")

keplerkepler_rf_best.joblib
keplerlabel_encoder.joblib
keplerscaler.joblib
toiscaler.joblib
Before scaling:
Mean: 43163.25
Std: 24867.81

After scaling:
Mean: 0.00
Std: 1.00


In [98]:
y_train = y_train.idxmax(axis=1).str.replace('tfopwg_disp_', '').str.strip()
y_test = y_test.idxmax(axis=1).str.replace('tfopwg_disp_', '').str.strip()


for file in os.listdir(dir_path.replace('data', 'models')):
    if 'toilabelencoder' in file:
        label_encoder = joblib.load(os.path.join(dir_path.replace('data', 'models'), file))
        break
    else:
        label_encoder = LabelEncoder()

        y_train = label_encoder.fit_transform(y_train)
        y_test = label_encoder.transform(y_test)

        print("\nClasses:", label_encoder.classes_)
        print("Training Label Distribution:",pd.Series(y_train).value_counts())


Classes: ['APC' 'CP' 'FA' 'FP' 'KP' 'PC']
Training Label Distribution: 5    3743
3     958
1     547
4     466
0     370
2      78
Name: count, dtype: int64

Classes: [0 1 2 3 4 5]
Training Label Distribution: 5    3743
3     958
1     547
4     466
0     370
2      78
Name: count, dtype: int64

Classes: [0 1 2 3 4 5]
Training Label Distribution: 5    3743
3     958
1     547
4     466
0     370
2      78
Name: count, dtype: int64

Classes: [0 1 2 3 4 5]
Training Label Distribution: 5    3743
3     958
1     547
4     466
0     370
2      78
Name: count, dtype: int64


In [99]:
number_of_trees = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
best_f1 = 0
best_rf = None
for file in os.listdir(dir_path.replace('data', 'models')):
    if 'toirandomforest' in file:
        best_rf = joblib.load(os.path.join(dir_path.replace('data', 'models'), file))
        y_pred = best_rf.predict(x_test_scaled)
        best_f1 = f1_score(y_test, y_pred, average='weighted')
        print(f"Loaded existing model with F1 Score: {best_f1:.4f}")
        break
    
for n_trees in number_of_trees:
    print(f"\nTraining Random Forest with {n_trees} trees...")
    rf = RandomForestClassifier(n_estimators=n_trees, random_state=42, n_jobs=-1)
    rf.fit(x_train_scaled, y_train)
    
    y_pred = rf.predict(x_test_scaled)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"F1 Score: {f1:.4f}")
    
    if f1 > best_f1:
        best_f1 = f1
        best_rf = rf
        print("New best model found!")


Training Random Forest with 100 trees...
F1 Score: 0.6579
New best model found!

Training Random Forest with 200 trees...
F1 Score: 0.6579
New best model found!

Training Random Forest with 200 trees...
F1 Score: 0.6540

Training Random Forest with 300 trees...
F1 Score: 0.6540

Training Random Forest with 300 trees...
F1 Score: 0.6566

Training Random Forest with 400 trees...
F1 Score: 0.6566

Training Random Forest with 400 trees...
F1 Score: 0.6539

Training Random Forest with 500 trees...
F1 Score: 0.6539

Training Random Forest with 500 trees...
F1 Score: 0.6548

Training Random Forest with 600 trees...
F1 Score: 0.6548

Training Random Forest with 600 trees...
F1 Score: 0.6590
New best model found!

Training Random Forest with 700 trees...
F1 Score: 0.6590
New best model found!

Training Random Forest with 700 trees...
F1 Score: 0.6580

Training Random Forest with 800 trees...
F1 Score: 0.6580

Training Random Forest with 800 trees...
F1 Score: 0.6554

Training Random Forest wit