In [2]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [3]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [4]:
import pandas as pd

import warnings
warnings.simplefilter('ignore')

# Read the CSV and Perform Basic Data Cleaning

In [5]:
df = pd.read_csv("../Resources/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [6]:
# Overview of the data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6991 entries, 0 to 6990
Data columns (total 41 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   koi_disposition    6991 non-null   object 
 1   koi_fpflag_nt      6991 non-null   int64  
 2   koi_fpflag_ss      6991 non-null   int64  
 3   koi_fpflag_co      6991 non-null   int64  
 4   koi_fpflag_ec      6991 non-null   int64  
 5   koi_period         6991 non-null   float64
 6   koi_period_err1    6991 non-null   float64
 7   koi_period_err2    6991 non-null   float64
 8   koi_time0bk        6991 non-null   float64
 9   koi_time0bk_err1   6991 non-null   float64
 10  koi_time0bk_err2   6991 non-null   float64
 11  koi_impact         6991 non-null   float64
 12  koi_impact_err1    6991 non-null   float64
 13  koi_impact_err2    6991 non-null   float64
 14  koi_duration       6991 non-null   float64
 15  koi_duration_err1  6991 non-null   float64
 16  koi_duration_err2  6991 

In [6]:
df["koi_disposition"].value_counts()

FALSE POSITIVE    3504
CONFIRMED         1800
CANDIDATE         1687
Name: koi_disposition, dtype: int64

# Select your features (columns)

In [8]:
# Set features. This will also be used as your x values.
selected_features = list(df.columns[1:])

# Create a Train Test Split

Use `koi_disposition` for the y values

In [9]:
y = df["koi_disposition"]

X = df[selected_features]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [12]:
y_train

3563         CANDIDATE
4099         CONFIRMED
5460         CANDIDATE
1091         CONFIRMED
5999         CANDIDATE
             ...      
905          CONFIRMED
5192    FALSE POSITIVE
3980    FALSE POSITIVE
235          CONFIRMED
5157         CONFIRMED
Name: koi_disposition, Length: 5243, dtype: object

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [13]:
# Scale your data
# Scale your data
from sklearn.preprocessing import MinMaxScaler
X_train_scaled = MinMaxScaler().fit_transform(X_train)
X_test_scaled = MinMaxScaler().fit_transform(X_test)

In [14]:
X_train_scaled 

array([[0.        , 0.        , 0.        , ..., 0.83497297, 0.51779124,
        0.5155798 ],
       [0.        , 0.        , 0.        , ..., 0.72693168, 0.38067188,
        0.70650467],
       [0.        , 0.        , 0.        , ..., 0.56436342, 0.80798012,
        0.69823952],
       ...,
       [0.        , 0.        , 1.        , ..., 0.88419373, 0.2724652 ,
        0.74055707],
       [0.        , 0.        , 0.        , ..., 0.38035748, 0.58629009,
        0.733697  ],
       [0.        , 0.        , 0.        , ..., 0.25722845, 0.72706515,
        0.59980164]])

# Train the Model



In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train, y_train)

training_score = model.score(X_train_scaled, y_train)
testing_score = model.score(X_test_scaled, y_test)

print(f"Training Data Score: {training_score}")
print(f"Testing Data Score: {testing_score}")

Training Data Score: 0.26416173946214
Testing Data Score: 0.27631578947368424


In [18]:
import numpy as np

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [19]:
# Create the GridSearchCV model
# Create first pipeline for base without reducing features.
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([('classifier' , RandomForestClassifier())])
# pipe = Pipeline([('classifier', RandomForestClassifier())])

# Create param grid.
param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']},
    {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,40)),
    'classifier__max_features' : list(range(6,32,10))}
]

# Create grid search object
clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

# Fit on data

# Train the models with GridSearch
best_clf = clf.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 49 candidates, totalling 245 fits


In [20]:
print(best_clf.best_params_)
print(best_clf.best_score_)

{'classifier': RandomForestClassifier(max_features=16, n_estimators=90), 'classifier__max_features': 16, 'classifier__n_estimators': 90}
0.89433302527307


# Save the Model

In [22]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'best_clf.sav'
joblib.dump(best_clf, filename)

['best_clf.sav']