In [2]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [3]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip install joblib

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [3]:
# Set features. This will also be used as your x values.
selected_features = df[
    ['koi_period', 
     'koi_time0bk', 
     #'koi_impact', 
     'koi_duration', 
     'koi_depth',
     'koi_prad',
     'koi_teq',
     'koi_insol',
     #'koi_model_snr',
     #'koi_steff',
     #'koi_slogg',
     #'koi_srad',
     'ra',
     'dec',
     'koi_kepmag'
    ]
]

In [4]:
y = df.koi_disposition.values.reshape(-1,1)
print(y)

[['CONFIRMED']
 ['FALSE POSITIVE']
 ['FALSE POSITIVE']
 ...
 ['CANDIDATE']
 ['FALSE POSITIVE']
 ['FALSE POSITIVE']]


# Create a Train Test Split

Use `koi_disposition` for the y values

In [5]:
from sklearn.model_selection import train_test_split
X = selected_features.copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
X_train.head()

Unnamed: 0,koi_period,koi_time0bk,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,ra,dec,koi_kepmag
6122,6.768901,133.07724,3.616,123.1,1.24,1017,253.3,294.40472,39.351681,14.725
6370,0.733726,132.02005,2.309,114.6,0.86,1867,2891.64,284.50391,42.46386,15.77
2879,7.652707,134.46038,79.8969,641.1,3.21,989,226.81,295.50211,38.98354,13.099
107,7.953547,174.66224,2.6312,875.4,2.25,696,55.37,291.15878,40.750271,15.66
29,4.959319,172.258529,2.22739,9802.0,12.21,1103,349.4,292.16705,48.727589,15.263


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [24]:
# Scale your data
from sklearn.preprocessing import StandardScaler, MinMaxScaler
X_scaler = MinMaxScaler(feature_range=(0,1))
# y_train_scaler = MinMaxScaler(y_train)

X_train_scaled =  X_scaler.fit_transform(X_train)
print(X_train_scaled)
X_test_scaled = X_scaler.fit_transform(X_test)

[[8.04827937e-03 9.29087782e-03 2.49253828e-02 ... 6.65745671e-01
  1.76049579e-01 6.41292669e-01]
 [5.85971231e-04 8.50893625e-03 1.54798985e-02 ... 2.12684670e-01
  3.73540046e-01 7.27663443e-01]
 [9.14107928e-03 1.03139055e-02 5.76195501e-01 ... 7.15962230e-01
  1.52688346e-01 5.06901397e-01]
 ...
 [7.28333836e-03 1.02861285e-02 1.10009178e-02 ... 7.92823003e-01
  5.97200429e-01 3.02173733e-01]
 [3.81261085e-03 1.05984604e-02 1.96013673e-01 ... 7.78217328e-01
  2.90002258e-01 5.21530705e-01]
 [2.48788464e-03 1.05640079e-02 3.76952151e-02 ... 6.80822216e-01
  2.71853535e-01 6.19307381e-01]]


In [25]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200)
model.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# Train the Model



In [26]:
# print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
# print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.683066361556064


In [1]:
# scores = []
# This was for my own enjoyment
# for i in range(100,2000,100):
#     model = RandomForestClassifier(n_estimators=i)
#     model.fit(X_train, y_train)
#     print(f'Number of estimators: {i}')
#     print(f"Training Data Score: {model.score(X_train, y_train)}")
# #     model.score(X_train, y_train)
#     print(f"Testing Data Score: {model.score(X_test, y_test)}")
#     print('\n------------')
# #     score = model.score(X_test, y_test)

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [29]:
X_train_scaled

array([[8.04827937e-03, 9.29087782e-03, 2.49253828e-02, ...,
        6.65745671e-01, 1.76049579e-01, 6.41292669e-01],
       [5.85971231e-04, 8.50893625e-03, 1.54798985e-02, ...,
        2.12684670e-01, 3.73540046e-01, 7.27663443e-01],
       [9.14107928e-03, 1.03139055e-02, 5.76195501e-01, ...,
        7.15962230e-01, 1.52688346e-01, 5.06901397e-01],
       ...,
       [7.28333836e-03, 1.02861285e-02, 1.10009178e-02, ...,
        7.92823003e-01, 5.97200429e-01, 3.02173733e-01],
       [3.81261085e-03, 1.05984604e-02, 1.96013673e-01, ...,
        7.78217328e-01, 2.90002258e-01, 5.21530705e-01],
       [2.48788464e-03, 1.05640079e-02, 3.76952151e-02, ...,
        6.80822216e-01, 2.71853535e-01, 6.19307381e-01]])

In [27]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
import numpy as np
param_grid = {'n_estimators': np.arange(100,300,10),
              'max_features': ['auto', 'sqrt', 'log2']}
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=3)

In [31]:
# Train the model with GridSearch
grid.fit(X_train_scaled,y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV] max_features=auto, n_estimators=100 .............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] . max_features=auto, n_estimators=100, score=0.751, total=   1.9s
[CV] max_features=auto, n_estimators=100 .............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s


[CV] . max_features=auto, n_estimators=100, score=0.737, total=   1.9s
[CV] max_features=auto, n_estimators=100 .............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.6s remaining:    0.0s


[CV] . max_features=auto, n_estimators=100, score=0.721, total=   1.6s
[CV] max_features=auto, n_estimators=100 .............................
[CV] . max_features=auto, n_estimators=100, score=0.703, total=   1.8s
[CV] max_features=auto, n_estimators=100 .............................
[CV] . max_features=auto, n_estimators=100, score=0.739, total=   1.9s
[CV] max_features=auto, n_estimators=110 .............................
[CV] . max_features=auto, n_estimators=110, score=0.740, total=   1.7s
[CV] max_features=auto, n_estimators=110 .............................
[CV] . max_features=auto, n_estimators=110, score=0.730, total=   3.6s
[CV] max_features=auto, n_estimators=110 .............................
[CV] . max_features=auto, n_estimators=110, score=0.714, total=   2.3s
[CV] max_features=auto, n_estimators=110 .............................
[CV] . max_features=auto, n_estimators=110, score=0.718, total=   1.9s
[CV] max_features=auto, n_estimators=110 .............................
[CV] .

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed: 13.9min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=200, n_jobs=None,
                                              oob_score=False,
                                              rando

In [32]:
print(grid.best_params_)
print(grid.best_score_)

{'max_features': 'sqrt', 'n_estimators': 240}
0.7354539765243525


In [33]:
model_final = RandomForestClassifier(n_estimators=240, max_features='sqrt')
model_final.fit(X_train_scaled, y_train)

print(f'''
Training Score: {model_final.score(X_train_scaled, y_train)}
Testing Score: {model_final.score(X_test_scaled, y_test)}
''')


Training Score: 1.0
Testing Score: 0.6933638443935927



# Save the Model

In [34]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'random_forest_model_exoplanets.h5'
joblib.dump(model_final, filename)

['random_forest_model_exoplanets.h5']