In [None]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip install joblib

In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [5]:
# Set features. This will also be used as your x values.
selected_features = df[
    ['koi_period', 
     'koi_time0bk', 
     'koi_impact', 
     'koi_duration', 
     'koi_depth',
     'koi_prad',
     'koi_teq',
     'koi_insol',
     'koi_model_snr',
     'koi_steff',
     'koi_slogg',
     'koi_srad',
     'ra',
     'dec',
     'koi_kepmag'
    ]
]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
y = df.koi_disposition.values.reshape(-1,1)
print(y)

[['CONFIRMED']
 ['FALSE POSITIVE']
 ['FALSE POSITIVE']
 ...
 ['CANDIDATE']
 ['FALSE POSITIVE']
 ['FALSE POSITIVE']]


In [7]:
from sklearn.model_selection import train_test_split
X = selected_features.copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
X_train.head()

Unnamed: 0,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
6122,6.768901,133.07724,0.15,3.616,123.1,1.24,1017,253.3,10.8,5737,4.327,1.125,294.40472,39.351681,14.725
6370,0.733726,132.02005,0.291,2.309,114.6,0.86,1867,2891.64,13.8,5855,4.578,0.797,284.50391,42.46386,15.77
2879,7.652707,134.46038,0.97,79.8969,641.1,3.21,989,226.81,254.3,6328,4.481,0.963,295.50211,38.98354,13.099
107,7.953547,174.66224,0.3,2.6312,875.4,2.25,696,55.37,38.4,4768,4.536,0.779,291.15878,40.750271,15.66
29,4.959319,172.258529,0.831,2.22739,9802.0,12.21,1103,349.4,696.5,5712,4.359,1.082,292.16705,48.727589,15.263


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [17]:
# Scale your data
from sklearn.preprocessing import StandardScaler, MinMaxScaler
X_scaler = MinMaxScaler(feature_range=(0,1))

X_train_scaled =  X_scaler.fit_transform(X_train)
print(X_train_scaled)
X_test_scaled = X_scaler.fit_transform(X_test)
print(X_test_scaled)

[[8.04827937e-03 9.29087782e-03 1.48800667e-03 ... 6.65745671e-01
  1.76049579e-01 6.41292669e-01]
 [5.85971231e-04 8.50893625e-03 2.88673293e-03 ... 2.12684670e-01
  3.73540046e-01 7.27663443e-01]
 [9.14107928e-03 1.03139055e-02 9.62244311e-03 ... 7.15962230e-01
  1.52688346e-01 5.06901397e-01]
 ...
 [7.28333836e-03 1.02861285e-02 1.25984564e-02 ... 7.92823003e-01
  5.97200429e-01 3.02173733e-01]
 [3.81261085e-03 1.05984604e-02 1.20032538e-02 ... 7.78217328e-01
  2.90002258e-01 5.21530705e-01]
 [2.48788464e-03 1.05640079e-02 7.13251195e-03 ... 6.80822216e-01
  2.71853535e-01 6.19307381e-01]]
[[8.23120328e-02 8.64429858e-02 9.13069604e-03 ... 7.13709623e-01
  7.83395611e-01 8.21170587e-01]
 [9.53573333e-02 5.68303988e-02 9.60540811e-03 ... 8.10999904e-01
  6.56069862e-01 7.56436377e-01]
 [7.48704819e-04 1.40853673e-02 6.32949424e-03 ... 5.16736968e-01
  9.37138909e-01 8.33209854e-01]
 ...
 [3.77803586e-02 4.01113314e-02 3.63545318e-03 ... 6.91630514e-01
  2.40628422e-01 6.55306538e-01]

# Train the Model



In [18]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [19]:
model = DecisionTreeClassifier(max_depth=2)

In [20]:
model.fit(X_train_scaled, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [22]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.6080488270074385
Testing Data Score: 0.6252860411899314


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [23]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
import numpy as np
param_grid = {'n_estimators': np.arange(100,110,10),
              'max_features': ['auto', 'sqrt', 'log2']}
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=3)

In [25]:
print(grid)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=2,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_features': ['auto', 'sqrt', 'log2'],
              

In [24]:
# Train the model with GridSearch
grid.fit(X_train,y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV] max_features=auto, n_estimators=100 .............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ValueError: Invalid parameter n_estimators for estimator DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best'). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
print(grid.best_params_)
print(grid.best_score_)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)