In [1]:
## Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
## install joblib. This will be used to save your model. 
## Restart your kernel after installing 
#!pip install joblib

In [3]:
# Basic dependiecies
import pandas as pd

In [4]:
# sklearn imports
from sklearn.model_selection import train_test_split


# Read the CSV and Perform Basic Data Cleaning

In [5]:
exoplanet_df = pd.read_csv("../Resources/exoplanet_data.csv")
# Drop the null columns where all values are null
exoplanet_df = exoplanet_df.dropna(axis='columns', how='all')
# Drop the null rows
exoplanet_df = exoplanet_df.dropna()

print(f'The total count of raw data given is: {len(exoplanet_df)}')
exoplanet_df.head()

The total count of raw data given is: 6991


Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [6]:
# Check for direct correlations within columns to see if they need be dropped.
#exoplanet_df.corr()

In [7]:
# The several error 2 columns are dropped because of a driect correlation inorder to have a more accureate prediction
#a = exoplanet_df.drop(['koi_period_err2', 'koi_time0bk_err2'], axis= 1)

In [8]:
#exoplanet_df.info()

# Select your features (columns)

In [9]:
# Set features. This will also be used as your x values.
selected_features_df = exoplanet_df.drop(['koi_disposition'], axis=1)
#selected_features = df[['names', 'of', 'selected', 'features', 'here']]
selected_features_df.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Create a Train Test Split

Use `koi_disposition` for the y values

In [10]:
X = selected_features_df
y = exoplanet_df['koi_disposition']
#.values.reshape(-1, 1)
print(X.shape, y.shape)

(6991, 40) (6991,)


In [11]:
# Create train and test values from our data set 
#from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [12]:
print(f'The length of the X train data is: {len(X_train)}')
X_train.head()

The length of the X train data is: 5243


Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
6080,1,0,0,0,12.496435,0.0002213,-0.0002213,132.0358,0.0143,-0.0143,...,-286,3.805,0.39,-0.13,2.73,0.535,-1.248,289.2308,44.412483,13.054
3001,0,0,0,0,11.615625,0.0001528,-0.0001528,131.96843,0.00823,-0.00823,...,-72,4.083,0.368,-0.092,1.453,0.218,-0.51,293.52756,41.111439,15.162
570,0,1,0,0,10.980246,6.93e-07,-6.93e-07,137.137607,5.3e-05,-5.3e-05,...,-159,4.462,0.098,-0.182,0.897,0.238,-0.119,282.79764,43.578129,14.212
4897,1,0,0,0,466.90824,0.01194,-0.01194,136.3731,0.019,-0.019,...,-146,4.456,0.102,-0.361,0.867,0.448,-0.103,297.65436,43.178551,15.202
625,0,1,1,1,1.061933,1.25e-06,-1.25e-06,133.850441,0.000978,-0.000978,...,-167,3.975,0.259,-0.111,1.851,0.383,-0.575,288.90253,44.632992,12.953


In [13]:
y_train

6080    FALSE POSITIVE
3001         CANDIDATE
570     FALSE POSITIVE
4897    FALSE POSITIVE
625     FALSE POSITIVE
             ...      
3315         CONFIRMED
2249         CONFIRMED
3751    FALSE POSITIVE
6016         CANDIDATE
5477         CANDIDATE
Name: koi_disposition, Length: 5243, dtype: object

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [14]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [15]:
# Create a StandardScater model and fit it to the training data
X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Y is categorical do not scale

In [16]:
X_train_scaled

array([[ 2.3636166 , -0.57155056, -0.50607237, ..., -0.60905163,
         0.17104541, -0.92407435],
       [-0.42308046, -0.57155056, -0.50607237, ...,  0.29580336,
        -0.74397694,  0.65138821],
       [-0.42308046,  1.7496265 , -0.50607237, ..., -1.96381125,
        -0.06023068, -0.05861626],
       ...,
       [-0.42308046,  1.7496265 , -0.50607237, ...,  0.77936028,
         0.27732824, -0.9920853 ],
       [-0.42308046, -0.57155056, -0.50607237, ...,  0.71398891,
        -0.40930913, -3.39488991],
       [-0.42308046, -0.57155056, -0.50607237, ...,  0.11014294,
        -0.6770074 ,  0.47575552]])

In [17]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

LabelEncoder()

In [21]:
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [40]:
from tensorflow.keras.utils import to_categorical

In [41]:
# Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(y_train_encoded)
y_test_categorical = to_categorical(y_test_encoded)

#  Defining our Model Architecture (the layers)



In [63]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [71]:
model = Sequential()

Next, we add our first layer. This layer requires you to specify both the number of inputs and the number of nodes that you want in the hidden layer.  Our final layer is the output layer. Here, we need to specify the activation function (typically softmax for classification) and the number of classes (labels) that we are trying to predic

In [72]:
number_inputs = 40
number_hidden_nodes = 100
number_classes = 3

In [73]:
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))
model.add(Dense(units=number_hidden_nodes, activation='relu'))
model.add(Dense(units=number_classes, activation='softmax'))

In [74]:
# Model Summary
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 100)               4100      
_________________________________________________________________
dense_17 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_18 (Dense)             (None, 3)                 303       
Total params: 14,503
Trainable params: 14,503
Non-trainable params: 0
_________________________________________________________________


In [75]:
# Use categorical crossentropy for categorical data and mean squared error for regression
# The output layer in this test is using software for logistic regression (categorical)
# If output layer activation was `linear` then you may want to use `mse` for loss

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model

In [76]:
 # Fit (train) the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=1000,
    shuffle=True,
    verbose=2)

Train on 5243 samples
Epoch 1/1000
5243/5243 - 0s - loss: 0.4899 - accuracy: 0.7833
Epoch 2/1000
5243/5243 - 0s - loss: 0.3078 - accuracy: 0.8730
Epoch 3/1000
5243/5243 - 0s - loss: 0.2728 - accuracy: 0.8827
Epoch 4/1000
5243/5243 - 0s - loss: 0.2621 - accuracy: 0.8865
Epoch 5/1000
5243/5243 - 0s - loss: 0.2576 - accuracy: 0.8947
Epoch 6/1000
5243/5243 - 0s - loss: 0.2447 - accuracy: 0.8938
Epoch 7/1000
5243/5243 - 0s - loss: 0.2369 - accuracy: 0.9001
Epoch 8/1000
5243/5243 - 0s - loss: 0.2287 - accuracy: 0.8981
Epoch 9/1000
5243/5243 - 0s - loss: 0.2217 - accuracy: 0.9069
Epoch 10/1000
5243/5243 - 0s - loss: 0.2131 - accuracy: 0.9102
Epoch 11/1000
5243/5243 - 0s - loss: 0.2115 - accuracy: 0.9090
Epoch 12/1000
5243/5243 - 0s - loss: 0.2027 - accuracy: 0.9132
Epoch 13/1000
5243/5243 - 0s - loss: 0.2039 - accuracy: 0.9092
Epoch 14/1000
5243/5243 - 0s - loss: 0.1954 - accuracy: 0.9163
Epoch 15/1000
5243/5243 - 0s - loss: 0.1986 - accuracy: 0.9140
Epoch 16/1000
5243/5243 - 0s - loss: 0.193

Epoch 131/1000
5243/5243 - 0s - loss: 0.0252 - accuracy: 0.9926
Epoch 132/1000
5243/5243 - 0s - loss: 0.0224 - accuracy: 0.9931
Epoch 133/1000
5243/5243 - 0s - loss: 0.0250 - accuracy: 0.9924
Epoch 134/1000
5243/5243 - 0s - loss: 0.0193 - accuracy: 0.9950
Epoch 135/1000
5243/5243 - 0s - loss: 0.0225 - accuracy: 0.9939
Epoch 136/1000
5243/5243 - 0s - loss: 0.0236 - accuracy: 0.9943
Epoch 137/1000
5243/5243 - 0s - loss: 0.0529 - accuracy: 0.9821
Epoch 138/1000
5243/5243 - 0s - loss: 0.0280 - accuracy: 0.9910
Epoch 139/1000
5243/5243 - 0s - loss: 0.0245 - accuracy: 0.9926
Epoch 140/1000
5243/5243 - 0s - loss: 0.0196 - accuracy: 0.9958
Epoch 141/1000
5243/5243 - 0s - loss: 0.0164 - accuracy: 0.9964
Epoch 142/1000
5243/5243 - 0s - loss: 0.0164 - accuracy: 0.9964
Epoch 143/1000
5243/5243 - 0s - loss: 0.0202 - accuracy: 0.9947
Epoch 144/1000
5243/5243 - 0s - loss: 0.0167 - accuracy: 0.9968
Epoch 145/1000
5243/5243 - 0s - loss: 0.0197 - accuracy: 0.9937
Epoch 146/1000
5243/5243 - 0s - loss: 0.

Epoch 260/1000
5243/5243 - 0s - loss: 0.0016 - accuracy: 1.0000
Epoch 261/1000
5243/5243 - 0s - loss: 0.0016 - accuracy: 1.0000
Epoch 262/1000
5243/5243 - 0s - loss: 0.0015 - accuracy: 1.0000
Epoch 263/1000
5243/5243 - 0s - loss: 0.0014 - accuracy: 1.0000
Epoch 264/1000
5243/5243 - 0s - loss: 0.0014 - accuracy: 1.0000
Epoch 265/1000
5243/5243 - 0s - loss: 0.0013 - accuracy: 1.0000
Epoch 266/1000
5243/5243 - 0s - loss: 0.0014 - accuracy: 1.0000
Epoch 267/1000
5243/5243 - 0s - loss: 0.0012 - accuracy: 1.0000
Epoch 268/1000
5243/5243 - 0s - loss: 0.0026 - accuracy: 1.0000
Epoch 269/1000
5243/5243 - 0s - loss: 0.0018 - accuracy: 1.0000
Epoch 270/1000
5243/5243 - 0s - loss: 0.0013 - accuracy: 1.0000
Epoch 271/1000
5243/5243 - 0s - loss: 0.0011 - accuracy: 1.0000
Epoch 272/1000
5243/5243 - 0s - loss: 0.0011 - accuracy: 1.0000
Epoch 273/1000
5243/5243 - 0s - loss: 0.0012 - accuracy: 1.0000
Epoch 274/1000
5243/5243 - 0s - loss: 0.0012 - accuracy: 1.0000
Epoch 275/1000
5243/5243 - 0s - loss: 0.

Epoch 385/1000
5243/5243 - 0s - loss: 3.7561e-04 - accuracy: 1.0000
Epoch 386/1000
5243/5243 - 0s - loss: 3.3999e-04 - accuracy: 1.0000
Epoch 387/1000
5243/5243 - 0s - loss: 3.1414e-04 - accuracy: 1.0000
Epoch 388/1000
5243/5243 - 0s - loss: 3.3116e-04 - accuracy: 1.0000
Epoch 389/1000
5243/5243 - 0s - loss: 3.4855e-04 - accuracy: 1.0000
Epoch 390/1000
5243/5243 - 0s - loss: 3.1090e-04 - accuracy: 1.0000
Epoch 391/1000
5243/5243 - 0s - loss: 2.9441e-04 - accuracy: 1.0000
Epoch 392/1000
5243/5243 - 0s - loss: 2.7435e-04 - accuracy: 1.0000
Epoch 393/1000
5243/5243 - 0s - loss: 0.1533 - accuracy: 0.9683
Epoch 394/1000
5243/5243 - 0s - loss: 0.0869 - accuracy: 0.9807
Epoch 395/1000
5243/5243 - 0s - loss: 0.0721 - accuracy: 0.9901
Epoch 396/1000
5243/5243 - 0s - loss: 0.0401 - accuracy: 0.9968
Epoch 397/1000
5243/5243 - 0s - loss: 0.0025 - accuracy: 0.9996
Epoch 398/1000
5243/5243 - 0s - loss: 0.0012 - accuracy: 1.0000
Epoch 399/1000
5243/5243 - 0s - loss: 0.0011 - accuracy: 1.0000
Epoch 40

Epoch 507/1000
5243/5243 - 0s - loss: 1.2831e-04 - accuracy: 1.0000
Epoch 508/1000
5243/5243 - 0s - loss: 1.3834e-04 - accuracy: 1.0000
Epoch 509/1000
5243/5243 - 0s - loss: 1.4539e-04 - accuracy: 1.0000
Epoch 510/1000
5243/5243 - 0s - loss: 1.1672e-04 - accuracy: 1.0000
Epoch 511/1000
5243/5243 - 0s - loss: 1.0942e-04 - accuracy: 1.0000
Epoch 512/1000
5243/5243 - 0s - loss: 1.0303e-04 - accuracy: 1.0000
Epoch 513/1000
5243/5243 - 0s - loss: 1.3656e-04 - accuracy: 1.0000
Epoch 514/1000
5243/5243 - 0s - loss: 0.1316 - accuracy: 0.9773
Epoch 515/1000
5243/5243 - 0s - loss: 0.0630 - accuracy: 0.9840
Epoch 516/1000
5243/5243 - 0s - loss: 0.0421 - accuracy: 0.9922
Epoch 517/1000
5243/5243 - 0s - loss: 0.0243 - accuracy: 0.9985
Epoch 518/1000
5243/5243 - 0s - loss: 0.0018 - accuracy: 0.9998
Epoch 519/1000
5243/5243 - 0s - loss: 7.4571e-04 - accuracy: 1.0000
Epoch 520/1000
5243/5243 - 0s - loss: 6.2784e-04 - accuracy: 1.0000
Epoch 521/1000
5243/5243 - 0s - loss: 5.4339e-04 - accuracy: 1.0000


Epoch 629/1000
5243/5243 - 0s - loss: 1.5790e-04 - accuracy: 1.0000
Epoch 630/1000
5243/5243 - 0s - loss: 1.5039e-04 - accuracy: 1.0000
Epoch 631/1000
5243/5243 - 0s - loss: 1.4836e-04 - accuracy: 1.0000
Epoch 632/1000
5243/5243 - 0s - loss: 1.3777e-04 - accuracy: 1.0000
Epoch 633/1000
5243/5243 - 0s - loss: 1.2987e-04 - accuracy: 1.0000
Epoch 634/1000
5243/5243 - 0s - loss: 1.2425e-04 - accuracy: 1.0000
Epoch 635/1000
5243/5243 - 0s - loss: 1.1854e-04 - accuracy: 1.0000
Epoch 636/1000
5243/5243 - 0s - loss: 1.1143e-04 - accuracy: 1.0000
Epoch 637/1000
5243/5243 - 0s - loss: 1.0645e-04 - accuracy: 1.0000
Epoch 638/1000
5243/5243 - 0s - loss: 1.0421e-04 - accuracy: 1.0000
Epoch 639/1000
5243/5243 - 0s - loss: 9.5234e-05 - accuracy: 1.0000
Epoch 640/1000
5243/5243 - 0s - loss: 9.0173e-05 - accuracy: 1.0000
Epoch 641/1000
5243/5243 - 0s - loss: 8.4504e-05 - accuracy: 1.0000
Epoch 642/1000
5243/5243 - 0s - loss: 8.0012e-05 - accuracy: 1.0000
Epoch 643/1000
5243/5243 - 0s - loss: 7.5501e-05

Epoch 751/1000
5243/5243 - 0s - loss: 5.8252e-05 - accuracy: 1.0000
Epoch 752/1000
5243/5243 - 0s - loss: 5.4800e-05 - accuracy: 1.0000
Epoch 753/1000
5243/5243 - 0s - loss: 5.2423e-05 - accuracy: 1.0000
Epoch 754/1000
5243/5243 - 0s - loss: 5.0753e-05 - accuracy: 1.0000
Epoch 755/1000
5243/5243 - 0s - loss: 4.6577e-05 - accuracy: 1.0000
Epoch 756/1000
5243/5243 - 0s - loss: 4.5846e-05 - accuracy: 1.0000
Epoch 757/1000
5243/5243 - 0s - loss: 4.3299e-05 - accuracy: 1.0000
Epoch 758/1000
5243/5243 - 0s - loss: 4.1906e-05 - accuracy: 1.0000
Epoch 759/1000
5243/5243 - 0s - loss: 3.8492e-05 - accuracy: 1.0000
Epoch 760/1000
5243/5243 - 0s - loss: 3.7307e-05 - accuracy: 1.0000
Epoch 761/1000
5243/5243 - 0s - loss: 3.7481e-05 - accuracy: 1.0000
Epoch 762/1000
5243/5243 - 0s - loss: 3.4302e-05 - accuracy: 1.0000
Epoch 763/1000
5243/5243 - 0s - loss: 3.3215e-05 - accuracy: 1.0000
Epoch 764/1000
5243/5243 - 0s - loss: 3.1242e-05 - accuracy: 1.0000
Epoch 765/1000
5243/5243 - 0s - loss: 2.9679e-05

5243/5243 - 0s - loss: 3.5484e-05 - accuracy: 1.0000
Epoch 873/1000
5243/5243 - 0s - loss: 3.4434e-05 - accuracy: 1.0000
Epoch 874/1000
5243/5243 - 0s - loss: 3.2564e-05 - accuracy: 1.0000
Epoch 875/1000
5243/5243 - 0s - loss: 3.0693e-05 - accuracy: 1.0000
Epoch 876/1000
5243/5243 - 0s - loss: 2.9690e-05 - accuracy: 1.0000
Epoch 877/1000
5243/5243 - 0s - loss: 2.7999e-05 - accuracy: 1.0000
Epoch 878/1000
5243/5243 - 0s - loss: 2.7291e-05 - accuracy: 1.0000
Epoch 879/1000
5243/5243 - 0s - loss: 2.5696e-05 - accuracy: 1.0000
Epoch 880/1000
5243/5243 - 0s - loss: 2.4192e-05 - accuracy: 1.0000
Epoch 881/1000
5243/5243 - 0s - loss: 2.3119e-05 - accuracy: 1.0000
Epoch 882/1000
5243/5243 - 0s - loss: 2.1750e-05 - accuracy: 1.0000
Epoch 883/1000
5243/5243 - 0s - loss: 2.0896e-05 - accuracy: 1.0000
Epoch 884/1000
5243/5243 - 0s - loss: 1.9519e-05 - accuracy: 1.0000
Epoch 885/1000
5243/5243 - 0s - loss: 1.8664e-05 - accuracy: 1.0000
Epoch 886/1000
5243/5243 - 0s - loss: 1.7818e-05 - accuracy: 1.

5243/5243 - 0s - loss: 2.3420e-04 - accuracy: 1.0000
Epoch 994/1000
5243/5243 - 0s - loss: 1.7251e-04 - accuracy: 1.0000
Epoch 995/1000
5243/5243 - 0s - loss: 1.4629e-04 - accuracy: 1.0000
Epoch 996/1000
5243/5243 - 0s - loss: 1.2841e-04 - accuracy: 1.0000
Epoch 997/1000
5243/5243 - 0s - loss: 1.1532e-04 - accuracy: 1.0000
Epoch 998/1000
5243/5243 - 0s - loss: 1.0582e-04 - accuracy: 1.0000
Epoch 999/1000
5243/5243 - 0s - loss: 9.7241e-05 - accuracy: 1.0000
Epoch 1000/1000
5243/5243 - 0s - loss: 8.9514e-05 - accuracy: 1.0000


<tensorflow.python.keras.callbacks.History at 0x2a20afd0e48>

# Quantifying the Model

In [77]:
 # Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1748/1748 - 0s - loss: 1.7545 - accuracy: 0.8844
Loss: 1.754466798812901, Accuracy: 0.8844393491744995


# Making Predictions with new data

In [None]:
import numpy as np
new_data = np.array([[0.2, 0.3, 0.4]])
print(f"Predicted class: {model.predict_classes(new_data)}")

# Save the Model

In [28]:
model.save("neural_network.h5")

['../Saved_Models/random_tree.sav']