In [None]:
# Resources 
# Data https://www.kaggle.com/nasa/kepler-exoplanet-search-results
# Scikit-Learn tutorial pt 1 https://www.youtube.com/watch?v=4PXAztQtoTg
# Scikit-Learn tutorial pt 2 https://www.youtube.com/watch?v=gK43gtGh49o&t=5858s
# Grid Search https://scikit-learn.org/stable/modules/grid_search.html

In [1]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /anaconda3/lib/python3.7/site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
#!pip install joblib

Collecting joblib
[?25l  Downloading https://files.pythonhosted.org/packages/8f/42/155696f85f344c066e17af287359c9786b436b1bf86029bb3411283274f3/joblib-0.14.0-py2.py3-none-any.whl (294kB)
[K    100% |████████████████████████████████| 296kB 4.4MB/s ta 0:00:01
[?25hInstalling collected packages: joblib
Successfully installed joblib-0.14.0


In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [3]:
df = pd.read_csv("exoplanet_data.csv")
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Select your features (columns)

In [5]:
# Set features. This will also be used as your x values.
#selected_features = df[['names', 'of', 'selected', 'features', 'here']]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
from sklearn.model_selection import train_test_split
y = df["koi_disposition"]
X = df.drop(columns=["koi_disposition"])
X_train, X_test, y_train, y_test = train_test_split(X , y, random_state=42, stratify=y)

In [5]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3206,0,1,1,0,31.805143,4.2e-05,-4.2e-05,186.38905,0.00105,-0.00105,...,-161.0,4.545,0.044,-0.176,0.863,0.217,-0.072,298.04453,40.086361,14.517
3954,0,0,0,0,24.560711,0.000375,-0.000375,146.7938,0.0161,-0.0161,...,-135.0,4.192,0.137,-0.125,1.499,0.272,-0.245,289.92145,46.74456,12.805
1410,0,0,0,0,7.560522,2.6e-05,-2.6e-05,134.47889,0.0027,-0.0027,...,-211.0,4.503,0.052,-0.208,0.94,0.282,-0.094,283.84515,44.609089,15.986
5865,0,0,0,0,4.644901,3.8e-05,-3.8e-05,133.67436,0.00786,-0.00786,...,-85.0,4.54,0.052,-0.017,0.77,0.027,-0.046,282.34305,48.340778,14.48
340,0,0,1,1,2.037441,1e-05,-1e-05,133.59962,0.0039,-0.0039,...,-207.0,4.415,0.087,-0.203,1.015,0.312,-0.134,295.79526,47.66396,14.187


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [6]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  return self.partial_fit(X, y)


# Train the Model



In [7]:
from sklearn.linear_model import LogisticRegression
# train on unscaled data 
model = LogisticRegression()
model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [8]:
print(f"Training Data Score:{model.score(X_train, y_train)}")
print(f"Testing Data Score:{model.score(X_test, y_test)}")

Training Data Score:0.6684964928331808
Testing Data Score:0.6765782250686185


In [9]:
model2= LogisticRegression()
model2.fit(X_train_scaled, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [10]:
print(f"Scaled Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Scaled Testing Data Score: {model2.score(X_test_scaled, y_test)}")

Scaled Training Data Score: 0.845684659957304
Scaled Testing Data Score: 0.8371454711802379


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [11]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'penalty':['l1', 'l2']}
grid = GridSearchCV(model, param_grid, verbose=3)

In [12]:
# Train the unscaled model with GridSearch
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.2s remaining:    0.0s


[CV] ........ C=1, penalty=l1, score=0.8747142203932328, total=   4.2s
[CV] C=1, penalty=l1 .................................................
[CV] ........ C=1, penalty=l1, score=0.8778591033851784, total=   1.9s
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.1s remaining:    0.0s


[CV] ........ C=1, penalty=l1, score=0.8732265446224257, total=   3.8s
[CV] C=1, penalty=l2 .................................................




[CV] ........ C=1, penalty=l2, score=0.6799268404206675, total=   1.0s
[CV] C=1, penalty=l2 .................................................




[CV] ......... C=1, penalty=l2, score=0.661024702653248, total=   1.4s
[CV] C=1, penalty=l2 .................................................




[CV] ........ C=1, penalty=l2, score=0.6718535469107552, total=   1.6s
[CV] C=5, penalty=l1 .................................................




[CV] ......... C=5, penalty=l1, score=0.877914951989026, total=  24.3s
[CV] C=5, penalty=l1 .................................................




[CV] ......... C=5, penalty=l1, score=0.879231473010064, total=   2.1s
[CV] C=5, penalty=l1 .................................................




[CV] ........ C=5, penalty=l1, score=0.8778032036613272, total=   4.0s
[CV] C=5, penalty=l2 .................................................
[CV] ........ C=5, penalty=l2, score=0.6812985825331505, total=   0.9s
[CV] C=5, penalty=l2 .................................................




[CV] ........ C=5, penalty=l2, score=0.6646843549862763, total=   1.3s
[CV] C=5, penalty=l2 .................................................




[CV] ........ C=5, penalty=l2, score=0.6745995423340961, total=   1.3s
[CV] C=10, penalty=l1 ................................................




[CV] ........ C=10, penalty=l1, score=0.879286694101509, total=  21.1s
[CV] C=10, penalty=l1 ................................................




[CV] ....... C=10, penalty=l1, score=0.8796889295516926, total=   1.5s
[CV] C=10, penalty=l1 ................................................
[CV] ....... C=10, penalty=l1, score=0.8791762013729977, total=   5.1s
[CV] C=10, penalty=l2 ................................................




[CV] ....... C=10, penalty=l2, score=0.6776406035665294, total=   1.2s
[CV] C=10, penalty=l2 ................................................




[CV] ....... C=10, penalty=l2, score=0.6642268984446478, total=   1.3s
[CV] C=10, penalty=l2 ................................................


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  1.3min finished


[CV] ....... C=10, penalty=l2, score=0.6709382151029748, total=   1.3s


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [13]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'penalty': 'l1'}
0.8793839585239402


In [14]:
predictions=grid.predict(X_test)

In [15]:
predictions[:10]

array(['FALSE POSITIVE', 'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE',
       'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED',
       'CONFIRMED', 'CANDIDATE'], dtype=object)

In [16]:
y_test[:10]

1738    FALSE POSITIVE
411          CONFIRMED
8005    FALSE POSITIVE
5536    FALSE POSITIVE
967          CONFIRMED
5415    FALSE POSITIVE
5466    FALSE POSITIVE
1385         CONFIRMED
1705         CONFIRMED
2868         CANDIDATE
Name: koi_disposition, dtype: object

In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.83      0.67      0.74       529
     CONFIRMED       0.75      0.85      0.80       568
FALSE POSITIVE       0.98      1.00      0.99      1089

     micro avg       0.88      0.88      0.88      2186
     macro avg       0.85      0.84      0.84      2186
  weighted avg       0.88      0.88      0.88      2186



In [18]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
             'penalty': ['l1', "l2"]}
grid2 = GridSearchCV(model2, param_grid, verbose=3)

In [19]:
grid2.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=1, penalty=l1 .................................................
[CV] ........ C=1, penalty=l1, score=0.8628257887517147, total=   0.3s
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] ......... C=1, penalty=l1, score=0.868252516010979, total=   1.1s
[CV] C=1, penalty=l1 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.5s remaining:    0.0s


[CV] ........ C=1, penalty=l1, score=0.8700228832951945, total=   0.6s
[CV] C=1, penalty=l2 .................................................
[CV] ........ C=1, penalty=l2, score=0.8376771833561957, total=   0.1s
[CV] C=1, penalty=l2 .................................................




[CV] ........ C=1, penalty=l2, score=0.8348581884720951, total=   0.1s
[CV] C=1, penalty=l2 .................................................
[CV] ........ C=1, penalty=l2, score=0.8443935926773455, total=   0.1s
[CV] C=5, penalty=l1 .................................................




[CV] ........ C=5, penalty=l1, score=0.8765432098765432, total=   1.4s
[CV] C=5, penalty=l1 .................................................




[CV] ........ C=5, penalty=l1, score=0.8783165599268069, total=   3.5s
[CV] C=5, penalty=l1 .................................................




[CV] ........ C=5, penalty=l1, score=0.8768878718535469, total=   2.7s
[CV] C=5, penalty=l2 .................................................




[CV] .......... C=5, penalty=l2, score=0.85413808870599, total=   0.2s
[CV] C=5, penalty=l2 .................................................
[CV] ........ C=5, penalty=l2, score=0.8586459286367795, total=   0.2s
[CV] C=5, penalty=l2 .................................................




[CV] ........ C=5, penalty=l2, score=0.8613272311212815, total=   0.1s
[CV] C=10, penalty=l1 ................................................




[CV] ....... C=10, penalty=l1, score=0.8760859625057156, total=   1.5s
[CV] C=10, penalty=l1 ................................................




[CV] ........ C=10, penalty=l1, score=0.879231473010064, total=   5.7s
[CV] C=10, penalty=l1 ................................................




[CV] ....... C=10, penalty=l1, score=0.8768878718535469, total=   2.3s
[CV] C=10, penalty=l2 ................................................
[CV] ....... C=10, penalty=l2, score=0.8559670781893004, total=   0.1s
[CV] C=10, penalty=l2 ................................................




[CV] ....... C=10, penalty=l2, score=0.8659652333028363, total=   0.1s
[CV] C=10, penalty=l2 ................................................
[CV] ....... C=10, penalty=l2, score=0.8672768878718535, total=   0.1s


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   20.6s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [20]:
print(grid2.best_params_)
print(grid2.best_score_)

{'C': 10, 'penalty': 'l1'}
0.8774016468435498


# Save the Model

In [38]:
prediction2 = grid2.predict(X_test_scaled)
prediction2[:10]

array(['FALSE POSITIVE', 'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE',
       'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED',
       'CONFIRMED', 'CANDIDATE'], dtype=object)

In [39]:
y_test[:10]

1738    FALSE POSITIVE
411          CONFIRMED
8005    FALSE POSITIVE
5536    FALSE POSITIVE
967          CONFIRMED
5415    FALSE POSITIVE
5466    FALSE POSITIVE
1385         CONFIRMED
1705         CONFIRMED
2868         CANDIDATE
Name: koi_disposition, dtype: object

In [48]:
from sklearn.metrics import classification_report
print(classification_report(y_test, prediction2))

                precision    recall  f1-score   support

     CANDIDATE       0.83      0.66      0.74       529
     CONFIRMED       0.74      0.86      0.80       568
FALSE POSITIVE       0.98      1.00      0.99      1089

     micro avg       0.88      0.88      0.88      2186
     macro avg       0.85      0.84      0.84      2186
  weighted avg       0.88      0.88      0.88      2186



In [61]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'logistic_model.sav'
joblib.dump(predictions, filename)

['logistic_model.sav']

In [62]:
filename2 = 'logistic_model2.sav'
joblib.dump(prediction2, filename2)

['logistic_model2.sav']