In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import os


In [2]:
#Readibg the CSV and performing basic data cleaning
df = pd.read_csv("exoplanet_data.csv")
# Dropping the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Dropping the null rows
df = df.dropna()
df.head()
#Getting rid of space on FALSE_POSITIVES
df.replace(to_replace ="FALSE POSITIVE", 
                 value ="FALSE_POSITIVE",
          inplace=True) 


In [3]:
# Setting features that will also be used as my x values.
# Instead of selecting 40 columns I dropped the koi_disposition column since its being used for the Y. 
X = df.drop(columns = ['koi_disposition'])
Y= df["koi_disposition"]

print(X.shape, Y.shape)

(6991, 40) (6991,)


In [4]:
#Creating a train test split
from sklearn.model_selection import train_test_split

X = pd.get_dummies(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state= 40)

In [5]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3866,0,1,0,0,6.714248,8e-06,-8e-06,134.31703,0.00137,-0.00137,...,-179,4.508,0.102,-0.085,0.8,0.096,-0.096,299.08057,44.29501,17.376
2059,0,0,0,0,2.118894,8e-06,-8e-06,132.8042,0.0029,-0.0029,...,-182,4.583,0.038,-0.152,0.796,0.176,-0.075,288.21072,42.556808,15.855
3086,0,1,1,1,0.512583,3e-06,-3e-06,131.77571,0.00456,-0.00456,...,-232,4.471,0.056,-0.224,0.999,0.333,-0.111,295.46341,46.01123,15.484
6901,0,0,0,1,1.332544,1.2e-05,-1.2e-05,132.06788,0.00841,-0.00841,...,-191,4.543,0.099,-0.081,0.734,0.1,-0.082,294.94775,46.671951,15.84
5580,0,1,0,0,52.884577,0.001744,-0.001744,147.01058,0.00373,-0.00373,...,-233,4.42,0.067,-0.202,1.078,0.333,-0.133,290.40158,37.942009,15.694


In [6]:
# Scaling the data/ Preprocessing data set prior to fitting
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tensorflow.keras.utils import to_categorical

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

print(X.shape, Y.shape)

(6991, 40) (6991,)


In [7]:
# Label encoding the responses (y)
label_encoder = LabelEncoder()
label_encoder.fit(Y_train)

encoded_y_train = label_encoder.transform(Y_train)
encoded_y_test = label_encoder.transform(Y_test)

In [8]:
# Converting encoded labels to one hot encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [9]:
#Creating sequential model
from tensorflow.keras.models import Sequential

model = Sequential()

In [21]:
#Adding first layer specifying the number of inputs and nodes that I want in the hidden layer
from tensorflow.keras.layers import Dense


number_inputs = 40
number_hidden_nodes = 100
model.add(Dense(units=number_hidden_nodes,activation='relu', input_dim=number_inputs))
number_classes = 3
model.add(Dense(units=number_classes, activation='softmax'))

In [22]:
model.summary() 

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 120)               4920      
_________________________________________________________________
dense_1 (Dense)              (None, 120)               14520     
_________________________________________________________________
dense_2 (Dense)              (None, 120)               14520     
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 242       
_________________________________________________________________
dense_4 (Dense)              (None, 100)               300       
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 202       
_________________________________________________________________
dense_6 (Dense)              (None, 100)               3

In [23]:
#Compiling the model using a loss function and optimizer
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary() 

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 120)               4920      
_________________________________________________________________
dense_1 (Dense)              (None, 120)               14520     
_________________________________________________________________
dense_2 (Dense)              (None, 120)               14520     
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 242       
_________________________________________________________________
dense_4 (Dense)              (None, 100)               300       
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 202       
_________________________________________________________________
dense_6 (Dense)              (None, 100)               3

In [24]:
# Fitting (training) the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Train on 5243 samples
Epoch 1/100
5243/5243 - 2s - loss: 0.7760 - accuracy: 0.6201
Epoch 2/100
5243/5243 - 1s - loss: 0.4391 - accuracy: 0.7416
Epoch 3/100
5243/5243 - 1s - loss: 0.4096 - accuracy: 0.7465
Epoch 4/100
5243/5243 - 1s - loss: 0.4077 - accuracy: 0.7423
Epoch 5/100
5243/5243 - 1s - loss: 0.4070 - accuracy: 0.7458
Epoch 6/100
5243/5243 - 1s - loss: 0.4061 - accuracy: 0.7471
Epoch 7/100
5243/5243 - 1s - loss: 0.4059 - accuracy: 0.7456
Epoch 8/100
5243/5243 - 1s - loss: 0.4056 - accuracy: 0.7482
Epoch 9/100
5243/5243 - 1s - loss: 0.4059 - accuracy: 0.7507
Epoch 10/100
5243/5243 - 1s - loss: 0.4170 - accuracy: 0.7463
Epoch 11/100
5243/5243 - 1s - loss: 0.4058 - accuracy: 0.7492
Epoch 12/100
5243/5243 - 1s - loss: 0.4056 - accuracy: 0.7440
Epoch 13/100
5243/5243 - 1s - loss: 0.4057 - accuracy: 0.7498
Epoch 14/100
5243/5243 - 1s - loss: 0.4056 - accuracy: 0.7480
Epoch 15/100
5243/5243 - 1s - loss: 0.4054 - accuracy: 0.7490
Epoch 16/100
5243/5243 - 1s - loss: 0.4054 - accuracy: 0.

<tensorflow.python.keras.callbacks.History at 0x7ffaeb738350>

In [26]:
 # Evaluating the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1748/1 - 0s - loss: 0.3167 - accuracy: 0.7506
Loss: 0.38129041164908856, Accuracy: 0.7505720853805542


In [None]:
#Saving model
model2.save("model_2.sav")
