In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\10pai\anaconda3\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [35]:
import matplotlib.pyplot as plt
from sklearn import tree
import pandas as pd
import os

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("../Data/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [40]:
# Set features. This will also be used as your x values.
X = df[['koi_period', 'koi_impact', 'koi_duration', 'koi_prad', 'koi_teq', 'koi_steff', 'koi_slogg']]
X_names = X.columns
X_names

Index(['koi_period', 'koi_impact', 'koi_duration', 'koi_prad', 'koi_teq',
       'koi_steff', 'koi_slogg'],
      dtype='object')

# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
#Select column for dependent variable (y)
y = df["koi_disposition"]
y_names = df.koi_disposition.unique()
y_names.sort()
print(y_names)

['CANDIDATE' 'CONFIRMED' 'FALSE POSITIVE']


In [7]:
#Split the data into train/test data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X.shape, y.shape)

(6991, 7) (6991,)


In [8]:
X_train.head()

Unnamed: 0,koi_period,koi_impact,koi_duration,koi_prad,koi_teq,koi_steff,koi_slogg
4214,302.388678,1.134,3.1229,40.73,238,5328,4.499
5712,13.22982,0.435,3.566,1.5,856,6049,4.353
2970,1.227986,0.477,1.551,1.17,1673,5746,4.435
3408,21.086786,0.702,0.978,2.08,651,6084,4.542
3312,5.474552,0.931,3.71,1.4,1155,5611,4.19


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [9]:
# Scale your data
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

In [10]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [36]:
# Create a Decision Tree Classifier
clf = tree.DecisionTreeClassifier()

In [37]:
# Fit the classifier to the data
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.6533180778032036

In [38]:
# Create, fit, and score a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.7505720823798627

In [41]:
sorted(zip(rf.feature_importances_, X_names), reverse=True)

[(0.23174626945087104, 'koi_prad'),
 (0.15476111366527656, 'koi_period'),
 (0.14080468010292035, 'koi_duration'),
 (0.13212191755831787, 'koi_teq'),
 (0.13187577461908828, 'koi_impact'),
 (0.10497917882287583, 'koi_steff'),
 (0.10371106578064998, 'koi_slogg')]

In [27]:
# Choose the best k from above and re-fit the KNN Classifier using that k value.
# print the score for the test data
# YOUR CODE HERE
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train_scaled, y_train)
print('k=9 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

k=9 Test Acc: 0.618


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [30]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': [3, 5, 7, 9],
             'weights': ['uniform', 'distance'],
             'metric': ['euclidean', 'manhattan']}
grid = GridSearchCV(knn, param_grid, verbose=3)
print(knn.get_params().keys())

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])


In [31]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] metric=euclidean, n_neighbors=3, weights=uniform ................
[CV]  metric=euclidean, n_neighbors=3, weights=uniform, score=0.561, total=   0.0s
[CV] metric=euclidean, n_neighbors=3, weights=uniform ................
[CV]  metric=euclidean, n_neighbors=3, weights=uniform, score=0.578, total=   0.0s
[CV] metric=euclidean, n_neighbors=3, weights=uniform ................
[CV]  metric=euclidean, n_neighbors=3, weights=uniform, score=0.554, total=   0.0s
[CV] metric=euclidean, n_neighbors=3, weights=uniform ................
[CV]  metric=euclidean, n_neighbors=3, weights=uniform, score=0.562, total=   0.0s
[CV] metric=euclidean, n_neighbors=3, weights=uniform ................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  metric=euclidean, n_neighbors=3, weights=uniform, score=0.558, total=   0.0s
[CV] metric=euclidean, n_neighbors=3, weights=distance ...............
[CV]  metric=euclidean, n_neighbors=3, weights=distance, score=0.553, total=   0.0s
[CV] metric=euclidean, n_neighbors=3, weights=distance ...............
[CV]  metric=euclidean, n_neighbors=3, weights=distance, score=0.583, total=   0.0s
[CV] metric=euclidean, n_neighbors=3, weights=distance ...............
[CV]  metric=euclidean, n_neighbors=3, weights=distance, score=0.559, total=   0.0s
[CV] metric=euclidean, n_neighbors=3, weights=distance ...............
[CV]  metric=euclidean, n_neighbors=3, weights=distance, score=0.576, total=   0.0s
[CV] metric=euclidean, n_neighbors=3, weights=distance ...............
[CV]  metric=euclidean, n_neighbors=3, weights=distance, score=0.576, total=   0.0s
[CV] metric=euclidean, n_neighbors=5, weights=uniform ................
[CV]  metric=euclidean, n_neighbors=5, weights=uniform, score=0.572, to

[CV]  metric=manhattan, n_neighbors=7, weights=uniform, score=0.572, total=   0.0s
[CV] metric=manhattan, n_neighbors=7, weights=uniform ................
[CV]  metric=manhattan, n_neighbors=7, weights=uniform, score=0.584, total=   0.0s
[CV] metric=manhattan, n_neighbors=7, weights=distance ...............
[CV]  metric=manhattan, n_neighbors=7, weights=distance, score=0.586, total=   0.0s
[CV] metric=manhattan, n_neighbors=7, weights=distance ...............
[CV]  metric=manhattan, n_neighbors=7, weights=distance, score=0.599, total=   0.0s
[CV] metric=manhattan, n_neighbors=7, weights=distance ...............
[CV]  metric=manhattan, n_neighbors=7, weights=distance, score=0.577, total=   0.0s
[CV] metric=manhattan, n_neighbors=7, weights=distance ...............
[CV]  metric=manhattan, n_neighbors=7, weights=distance, score=0.591, total=   0.0s
[CV] metric=manhattan, n_neighbors=7, weights=distance ...............
[CV]  metric=manhattan, n_neighbors=7, weights=distance, score=0.603, to

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    2.7s finished


GridSearchCV(estimator=KNeighborsClassifier(n_neighbors=9),
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [3, 5, 7, 9],
                         'weights': ['uniform', 'distance']},
             verbose=3)

In [32]:
#Print best params and their score
print(grid.best_params_)
print(grid.best_score_)

{'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
0.6042335848754539


In [33]:
#Make predictions
predictions = knn.predict(X_test_scaled)
predictions

array(['FALSE POSITIVE', 'CANDIDATE', 'CONFIRMED', ..., 'FALSE POSITIVE',
       'CONFIRMED', 'CANDIDATE'], dtype=object)

In [34]:
#Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions, target_names = y_names))

                precision    recall  f1-score   support

     CANDIDATE       0.39      0.30      0.34       410
     CONFIRMED       0.54      0.61      0.58       474
FALSE POSITIVE       0.75      0.77      0.76       864

      accuracy                           0.62      1748
     macro avg       0.56      0.56      0.56      1748
  weighted avg       0.61      0.62      0.61      1748



# Save the Model

In [None]:
# # save your model by updating "your_name" with your name
# # and "your_model" with your model variable
# # be sure to turn this in to BCS
# # if joblib fails to import, try running the command to install in terminal/git-bash
# import joblib
# filename = 'AJP.sav'
# joblib.dump(SVC_model, filename)