In [1]:
# Update sklearn to prevent version mismatches, uncomment if update is required
#%pip install --upgrade scikit-learn

In [2]:
# install joblib. This will be used to save your model, uncomment if update is required
# Restart your kernel after installing 
#!pip install joblib

In [3]:
# Import dependencies
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the raw table, then strip out the columns that contain nothing but
# nulls and, after that, every row that still has a missing value
df = (
    pd.read_csv("../data/exoplanet_data.csv")
    .dropna(axis="columns", how="all")
    .dropna()
)

df

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
6988,CANDIDATE,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,...,-220,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


# Select your features (columns)

Use `koi_disposition` for the y values

In [5]:
# Build the feature table. This will also be used for the x values.

# The uncertainty columns (suffixes _err1 / _err2) are assumed to be poor
# features, so they are left out of the working dataframe
err1 = "_err1"
err2 = "_err2"

# Collect the name of every column that carries either suffix
uncertainty_columns = [name for name in df.columns if err1 in name or err2 in name]

# Dropping them in one call returns a new dataframe and leaves df untouched
new_df = df.drop(columns=uncertainty_columns)
new_df

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,...,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,162.513840,0.586,4.50700,874.8,...,443,9.11,25.8,2,5455,4.467,0.927,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,175.850252,0.969,1.78220,10829.0,...,638,39.30,76.3,1,5853,4.544,0.868,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,...,1395,891.96,505.6,1,5805,4.564,0.791,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,171.595550,0.701,1.65450,603.3,...,1406,926.16,40.9,1,6031,4.438,1.046,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,172.979370,0.762,3.14020,686.0,...,1160,427.65,40.2,2,6046,4.486,0.972,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,0,0,0,1,8.589871,132.016100,0.765,4.80600,87.7,...,929,176.40,8.4,1,5638,4.296,1.088,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,131.705093,1.252,3.22210,1579.2,...,2088,4500.53,453.3,1,5638,4.529,0.903,297.18875,47.093819,14.082
6988,CANDIDATE,0,0,0,0,1.739849,133.001270,0.043,3.11400,48.5,...,1608,1585.81,10.6,1,6119,4.444,1.031,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0,0,1,0,0.681402,132.181750,0.147,0.86500,103.6,...,2218,5713.41,12.3,1,6173,4.447,1.041,294.16489,47.176281,15.385


In [6]:
# Split the table: every column except the disposition is a feature, and the
# disposition itself is the target (kept here as a single-column 2-D array)
X = new_df.drop(columns=["koi_disposition"])
y = new_df["koi_disposition"].values.reshape(-1, 1)

# Both shapes should report the same number of rows
print(X.shape, y.shape)

(6991, 20) (6991, 1)


# Create a Train Test Split

In [7]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode the categorical target as integers. LabelEncoder numbers the classes
# in sorted label order, so for the three dispositions visible in the preview:
# CANDIDATE = 0, CONFIRMED = 1, FALSE POSITIVE = 2 (check label_encoder.classes_).
# The encoder is fed the 1-D column from new_df rather than y itself: a column
# vector triggers a DataConversionWarning, and not reading y keeps this cell
# safe to re-run (re-encoding already-encoded integers would refit the encoder).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(new_df["koi_disposition"])

# Create the training and testing data with a random state for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=250)

# Pre-processing and Training

Scale the data using the MinMaxScaler and perform some feature selection

In [8]:
# Import dependencies
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

# Fit a MinMaxScaler on the training data only, then apply that same scaling
# to both the training and the testing features
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train a random forest on the scaled training data and score it on both sets.
# The forest is seeded (same value as the train/test split) so the scores and
# the feature importances read from it are repeatable from run to run.
rf = RandomForestClassifier(n_estimators=200, random_state=250)
rf.fit(X_train_scaled, y_train)

print(f'Training Score: {rf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {rf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.8958810068649885


In [9]:
# Rank the features: pair each importance with its column name, largest first
sorted(zip(rf.feature_importances_, X_train.columns), reverse=True)

[(0.13161579832090747, 'koi_model_snr'),
 (0.1280068846390581, 'koi_fpflag_co'),
 (0.12695229031434335, 'koi_fpflag_nt'),
 (0.10829978395766783, 'koi_fpflag_ss'),
 (0.07061509256679084, 'koi_prad'),
 (0.051404787916938435, 'koi_depth'),
 (0.04406460341430325, 'koi_fpflag_ec'),
 (0.041409869262889615, 'koi_period'),
 (0.040246596862947515, 'koi_impact'),
 (0.036464802565089646, 'koi_teq'),
 (0.03174167794727144, 'koi_duration'),
 (0.0291006440195949, 'koi_time0bk'),
 (0.025681838705606377, 'koi_insol'),
 (0.022254200233893975, 'koi_steff'),
 (0.02183375819810256, 'ra'),
 (0.021146100633288346, 'koi_kepmag'),
 (0.0206474478946299, 'koi_srad'),
 (0.02058625896942758, 'koi_slogg'),
 (0.02054100575536139, 'dec'),
 (0.007386557821887476, 'koi_tce_plnt_num')]

In [10]:
# Comparison run: train on every column of df (uncertainty columns included)
# to see whether the feature importances resemble those of the reduced table
X_all = df.drop(columns=["koi_disposition"])

# Encode the disposition labels as integers
label_encoder = LabelEncoder()
y_all = label_encoder.fit_transform(df["koi_disposition"])

X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X_all, y_all, random_state=250
)

# Learn the scaling from the training split, then apply it to both splits
X_scaler_all = MinMaxScaler()
X_train_scaled_all = X_scaler_all.fit_transform(X_train_all)
X_test_scaled_all = X_scaler_all.transform(X_test_all)

# NOTE: this refits the existing rf object in place, so from here on
# rf.feature_importances_ describes the columns of X_train_all
rf.fit(X_train_scaled_all, y_train_all)

print(f'Training Score: {rf.score(X_train_scaled_all, y_train_all)}')
print(f'Testing Score: {rf.score(X_test_scaled_all, y_test_all)}')

Training Score: 1.0
Testing Score: 0.8964530892448512


In [11]:
# Rank all columns: pair each importance with its column name, largest first
sorted(zip(rf.feature_importances_, X_train_all.columns), reverse=True)

[(0.10997085006330476, 'koi_fpflag_co'),
 (0.10056226489555789, 'koi_fpflag_nt'),
 (0.07821605561519744, 'koi_fpflag_ss'),
 (0.05653037576135076, 'koi_model_snr'),
 (0.04334745202796319, 'koi_prad'),
 (0.03464175625924082, 'koi_duration_err2'),
 (0.034246889389689186, 'koi_prad_err1'),
 (0.0326689334527591, 'koi_duration_err1'),
 (0.03162544917990199, 'koi_fpflag_ec'),
 (0.031079489262795478, 'koi_prad_err2'),
 (0.02911908593741275, 'koi_steff_err1'),
 (0.028402200821341324, 'koi_steff_err2'),
 (0.023436837320198696, 'koi_duration'),
 (0.022805862170198067, 'koi_time0bk_err2'),
 (0.022623984903608695, 'koi_time0bk_err1'),
 (0.021448984886314793, 'koi_depth'),
 (0.02133429607559118, 'koi_period'),
 (0.0184707627653909, 'koi_insol_err1'),
 (0.018112917032699176, 'koi_teq'),
 (0.01679781372571654, 'koi_impact'),
 (0.016386561936601526, 'koi_period_err2'),
 (0.01605193563774088, 'koi_period_err1'),
 (0.013714964534352241, 'koi_insol'),
 (0.013567964304707083, 'koi_depth_err1'),
 (0.0134229

In [12]:
# Both runs put similar features at the top and score about the same on the
# test split, so retrain on a reduced set of six features (the four
# false-positive flags, koi_model_snr and koi_impact) to see how the score changes

# Set the new features for x
X = new_df[["koi_fpflag_nt","koi_fpflag_co", "koi_fpflag_ss", "koi_model_snr", "koi_fpflag_ec", "koi_impact"]].copy()

# Create the training and testing data with a random state for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=250)

# Fit a MinMaxScaler on the training data only, then apply that same scaling
# to both the training and the testing features
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train a fresh random forest on the reduced features and score both sets.
# The forest is seeded (same value as the split) so the reported scores, and
# the grid search that later reuses this estimator, are repeatable.
rf = RandomForestClassifier(n_estimators=200, random_state=250)
rf.fit(X_train_scaled, y_train)

print(f'Training Score: {rf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {rf.score(X_test_scaled, y_test)}')

Training Score: 0.9996185390043868
Testing Score: 0.8632723112128147


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [13]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
# Candidate values to try; every n_estimators / max_depth pair is evaluated
param_grid = {
    'n_estimators': [10, 50, 100, 250],
    'max_depth': [5, 10, 20, 50],
}

# Wrap the existing forest in a grid search (verbose=3 logs each fit)
grid = GridSearchCV(estimator=rf, param_grid=param_grid, verbose=3)

In [14]:
# Run the search on the scaled training data: the random forest is fitted
# once per fold for every parameter combination in the grid
grid.fit(X_train_scaled, y=y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] max_depth=5, n_estimators=10 ....................................
[CV] ........ max_depth=5, n_estimators=10, score=0.895, total=   0.1s
[CV] max_depth=5, n_estimators=10 ....................................
[CV] ........ max_depth=5, n_estimators=10, score=0.855, total=   0.1s
[CV] max_depth=5, n_estimators=10 ....................................
[CV] ........ max_depth=5, n_estimators=10, score=0.880, total=   0.0s
[CV] max_depth=5, n_estimators=10 ....................................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[CV] ........ max_depth=5, n_estimators=10, score=0.898, total=   0.0s
[CV] max_depth=5, n_estimators=10 ....................................
[CV] ........ max_depth=5, n_estimators=10, score=0.889, to

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=200, n_jobs=None,
                                              oob_score=False,
                                              ra

In [15]:
# Report the winning parameter combination, then its cross-validated score
print(grid.best_params_, grid.best_score_, sep="\n")

{'max_depth': 10, 'n_estimators': 100}
0.887852662295607


In [16]:
# Check the training and testing scores of the tuned model.
# grid.score runs its own prediction on the data it is given, so no separate
# predict call is needed to obtain these numbers.
print(f'Training Score: {grid.score(X_train_scaled, y_train)}')
print(f'Testing Score: {grid.score(X_test_scaled, y_test)}')

Training Score: 0.9038718291054739
Testing Score: 0.8861556064073226


# Save the Model

In [17]:
import joblib
# Persist the tuned model. GridSearchCV fits clones of rf, so rf itself still
# carries its pre-search settings; the estimator refit with the best
# parameters is grid.best_estimator_.
# NOTE: the model was trained on MinMax-scaled features and X_scaler is not
# saved here, so inputs must be scaled the same way before predicting.
filename = 'Autonomousse_RandomForest.sav'
joblib.dump(grid.best_estimator_, filename)

['Autonomousse_RandomForest.sav']