In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\10pai\anaconda3\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import matplotlib.pyplot as plt
from sklearn import tree
import pandas as pd
import os

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("../Data/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [5]:
# Set features. This will also be used as your x values.
X = df[['koi_period', 'koi_impact', 'koi_duration', 'koi_prad', 'koi_teq', 'koi_steff', 'koi_slogg']]
X_names = X.columns
X_names

Index(['koi_period', 'koi_impact', 'koi_duration', 'koi_prad', 'koi_teq',
       'koi_steff', 'koi_slogg'],
      dtype='object')

# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
#Select column for dependent variable (y)
y = df["koi_disposition"]
y_names = df.koi_disposition.unique()
y_names.sort()
print(y_names)

['CANDIDATE' 'CONFIRMED' 'FALSE POSITIVE']


In [7]:
#Split the data into train/test data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X.shape, y.shape)

(6991, 7) (6991,)


In [8]:
X_train.head()

Unnamed: 0,koi_period,koi_impact,koi_duration,koi_prad,koi_teq,koi_steff,koi_slogg
540,0.567856,0.324,1.0258,1.55,1848,5073,4.483
2586,122.385868,0.101,7.123,1.72,269,4926,4.653
3623,421.428482,0.356,8.60447,54.16,335,6235,4.034
5850,17.663798,0.695,6.194,2.98,1178,5734,3.591
1894,1.716828,0.7,2.068,0.74,1455,5808,4.522


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [9]:
# Scale your data
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

In [10]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [11]:
# Create a Decision Tree Classifier
clf = tree.DecisionTreeClassifier()

In [12]:
# Fit the classifier to the data
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.6172768878718535

In [13]:
# Create, fit, and score a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.7191075514874142

In [14]:
sorted(zip(rf.feature_importances_, X_names), reverse=True)

[(0.23976178440337406, 'koi_prad'),
 (0.15612719880680564, 'koi_period'),
 (0.13500232821059144, 'koi_duration'),
 (0.13188101447262107, 'koi_teq'),
 (0.1285483971796949, 'koi_impact'),
 (0.10507037737327186, 'koi_slogg'),
 (0.10360889955364097, 'koi_steff')]

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [15]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'min_samples_leaf': [3, 4, 5],
              'min_samples_split': [8, 10, 12],
              'n_estimators': [100, 200, 300, 1000]}
grid = GridSearchCV(rf, param_grid, verbose=3)
print(rf.get_params().keys())

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])


In [16]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] min_samples_leaf=3, min_samples_split=8, n_estimators=100 .......


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  min_samples_leaf=3, min_samples_split=8, n_estimators=100, score=0.721, total=   0.6s
[CV] min_samples_leaf=3, min_samples_split=8, n_estimators=100 .......


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  min_samples_leaf=3, min_samples_split=8, n_estimators=100, score=0.732, total=   0.5s
[CV] min_samples_leaf=3, min_samples_split=8, n_estimators=100 .......


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s


[CV]  min_samples_leaf=3, min_samples_split=8, n_estimators=100, score=0.715, total=   0.5s
[CV] min_samples_leaf=3, min_samples_split=8, n_estimators=100 .......
[CV]  min_samples_leaf=3, min_samples_split=8, n_estimators=100, score=0.718, total=   0.5s
[CV] min_samples_leaf=3, min_samples_split=8, n_estimators=100 .......
[CV]  min_samples_leaf=3, min_samples_split=8, n_estimators=100, score=0.698, total=   0.6s
[CV] min_samples_leaf=3, min_samples_split=8, n_estimators=200 .......
[CV]  min_samples_leaf=3, min_samples_split=8, n_estimators=200, score=0.716, total=   1.0s
[CV] min_samples_leaf=3, min_samples_split=8, n_estimators=200 .......
[CV]  min_samples_leaf=3, min_samples_split=8, n_estimators=200, score=0.722, total=   1.0s
[CV] min_samples_leaf=3, min_samples_split=8, n_estimators=200 .......
[CV]  min_samples_leaf=3, min_samples_split=8, n_estimators=200, score=0.724, total=   1.0s
[CV] min_samples_leaf=3, min_samples_split=8, n_estimators=200 .......
[CV]  min_samples_leaf

[CV]  min_samples_leaf=3, min_samples_split=12, n_estimators=300, score=0.723, total=   1.6s
[CV] min_samples_leaf=3, min_samples_split=12, n_estimators=300 ......
[CV]  min_samples_leaf=3, min_samples_split=12, n_estimators=300, score=0.691, total=   1.7s
[CV] min_samples_leaf=3, min_samples_split=12, n_estimators=1000 .....
[CV]  min_samples_leaf=3, min_samples_split=12, n_estimators=1000, score=0.714, total=   5.4s
[CV] min_samples_leaf=3, min_samples_split=12, n_estimators=1000 .....
[CV]  min_samples_leaf=3, min_samples_split=12, n_estimators=1000, score=0.733, total=   5.2s
[CV] min_samples_leaf=3, min_samples_split=12, n_estimators=1000 .....
[CV]  min_samples_leaf=3, min_samples_split=12, n_estimators=1000, score=0.725, total=   5.2s
[CV] min_samples_leaf=3, min_samples_split=12, n_estimators=1000 .....
[CV]  min_samples_leaf=3, min_samples_split=12, n_estimators=1000, score=0.720, total=   5.6s
[CV] min_samples_leaf=3, min_samples_split=12, n_estimators=1000 .....
[CV]  min_sa

[CV]  min_samples_leaf=4, min_samples_split=12, n_estimators=100, score=0.715, total=   0.5s
[CV] min_samples_leaf=4, min_samples_split=12, n_estimators=100 ......
[CV]  min_samples_leaf=4, min_samples_split=12, n_estimators=100, score=0.696, total=   0.5s
[CV] min_samples_leaf=4, min_samples_split=12, n_estimators=200 ......
[CV]  min_samples_leaf=4, min_samples_split=12, n_estimators=200, score=0.716, total=   1.0s
[CV] min_samples_leaf=4, min_samples_split=12, n_estimators=200 ......
[CV]  min_samples_leaf=4, min_samples_split=12, n_estimators=200, score=0.725, total=   1.0s
[CV] min_samples_leaf=4, min_samples_split=12, n_estimators=200 ......
[CV]  min_samples_leaf=4, min_samples_split=12, n_estimators=200, score=0.720, total=   1.1s
[CV] min_samples_leaf=4, min_samples_split=12, n_estimators=200 ......
[CV]  min_samples_leaf=4, min_samples_split=12, n_estimators=200, score=0.726, total=   1.0s
[CV] min_samples_leaf=4, min_samples_split=12, n_estimators=200 ......
[CV]  min_sample

[CV]  min_samples_leaf=5, min_samples_split=10, n_estimators=300, score=0.688, total=   1.7s
[CV] min_samples_leaf=5, min_samples_split=10, n_estimators=1000 .....
[CV]  min_samples_leaf=5, min_samples_split=10, n_estimators=1000, score=0.723, total=   6.0s
[CV] min_samples_leaf=5, min_samples_split=10, n_estimators=1000 .....
[CV]  min_samples_leaf=5, min_samples_split=10, n_estimators=1000, score=0.734, total=   6.0s
[CV] min_samples_leaf=5, min_samples_split=10, n_estimators=1000 .....
[CV]  min_samples_leaf=5, min_samples_split=10, n_estimators=1000, score=0.723, total=   5.7s
[CV] min_samples_leaf=5, min_samples_split=10, n_estimators=1000 .....
[CV]  min_samples_leaf=5, min_samples_split=10, n_estimators=1000, score=0.720, total=   5.4s
[CV] min_samples_leaf=5, min_samples_split=10, n_estimators=1000 .....
[CV]  min_samples_leaf=5, min_samples_split=10, n_estimators=1000, score=0.690, total=   5.3s
[CV] min_samples_leaf=5, min_samples_split=12, n_estimators=100 ......
[CV]  min_s

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:  6.5min finished


GridSearchCV(estimator=RandomForestClassifier(n_estimators=200),
             param_grid={'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             verbose=3)

In [17]:
#Print best params and their score
print(grid.best_params_)
print(grid.best_score_)

{'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 200}
0.7194334480675889


In [18]:
#Make predictions
predictions = rf.predict(X_test_scaled)
predictions

array(['FALSE POSITIVE', 'FALSE POSITIVE', 'FALSE POSITIVE', ...,
       'FALSE POSITIVE', 'FALSE POSITIVE', 'FALSE POSITIVE'], dtype=object)

In [19]:
#Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions, target_names = y_names))

                precision    recall  f1-score   support

     CANDIDATE       0.00      0.00      0.00       419
     CONFIRMED       0.00      0.00      0.00       444
FALSE POSITIVE       0.51      1.00      0.67       885

      accuracy                           0.51      1748
     macro avg       0.17      0.33      0.22      1748
  weighted avg       0.26      0.51      0.34      1748



  _warn_prf(average, modifier, msg_start, len(result))


# Save the Model

In [None]:
# # save your model by updating "your_name" with your name
# # and "your_model" with your model variable
# # be sure to turn this in to BCS
# # if joblib fails to import, try running the command to install in terminal/git-bash
# import joblib
# filename = 'AJP.sav'
# joblib.dump(SVC_model, filename)