In [1]:
## Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
## install joblib. This will be used to save your model. 
## Restart your kernel after installing 
#!pip install joblib

In [3]:
# Basic dependiecies
import pandas as pd

In [4]:
# sklearn imports
from sklearn.model_selection import train_test_split


# Read the CSV and Perform Basic Data Cleaning

In [5]:
exoplanet_df = pd.read_csv("../Resources/exoplanet_data.csv")
# Drop the null columns where all values are null
exoplanet_df = exoplanet_df.dropna(axis='columns', how='all')
# Drop the null rows
exoplanet_df = exoplanet_df.dropna()

print(f'The total count of raw data given is: {len(exoplanet_df)}')
exoplanet_df.head()

The total count of raw data given is: 6991


Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [6]:
# Check for direct correlations within columns to see if they need be dropped.
#exoplanet_df.corr()

In [7]:
# The several error 2 columns are dropped because of a driect correlation inorder to have a more accureate prediction
#a = exoplanet_df.drop(['koi_period_err2', 'koi_time0bk_err2'], axis= 1)

In [8]:
#exoplanet_df.info()

# Select your features (columns)

In [9]:
# Set features. This will also be used as your x values.
selected_features_df = exoplanet_df.drop(['koi_disposition'], axis=1)
#selected_features = df[['names', 'of', 'selected', 'features', 'here']]
selected_features_df.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Create a Train Test Split

Use `koi_disposition` for the y values

In [10]:
X = selected_features_df
y = exoplanet_df['koi_disposition']
#.values.reshape(-1, 1)
print(X.shape, y.shape)

(6991, 40) (6991,)


In [11]:
# Create train and test values from our data set 
#from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [12]:
print(f'The length of the X train data is: {len(X_train)}')
X_train.head()

The length of the X train data is: 5243


Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
6080,1,0,0,0,12.496435,0.0002213,-0.0002213,132.0358,0.0143,-0.0143,...,-286,3.805,0.39,-0.13,2.73,0.535,-1.248,289.2308,44.412483,13.054
3001,0,0,0,0,11.615625,0.0001528,-0.0001528,131.96843,0.00823,-0.00823,...,-72,4.083,0.368,-0.092,1.453,0.218,-0.51,293.52756,41.111439,15.162
570,0,1,0,0,10.980246,6.93e-07,-6.93e-07,137.137607,5.3e-05,-5.3e-05,...,-159,4.462,0.098,-0.182,0.897,0.238,-0.119,282.79764,43.578129,14.212
4897,1,0,0,0,466.90824,0.01194,-0.01194,136.3731,0.019,-0.019,...,-146,4.456,0.102,-0.361,0.867,0.448,-0.103,297.65436,43.178551,15.202
625,0,1,1,1,1.061933,1.25e-06,-1.25e-06,133.850441,0.000978,-0.000978,...,-167,3.975,0.259,-0.111,1.851,0.383,-0.575,288.90253,44.632992,12.953


In [13]:
y_train

6080    FALSE POSITIVE
3001         CANDIDATE
570     FALSE POSITIVE
4897    FALSE POSITIVE
625     FALSE POSITIVE
             ...      
3315         CONFIRMED
2249         CONFIRMED
3751    FALSE POSITIVE
6016         CANDIDATE
5477         CANDIDATE
Name: koi_disposition, Length: 5243, dtype: object

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [14]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [15]:
# Create a StandardScater model and fit it to the training data
X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Y is categorical do not scale

In [16]:
X_train_scaled

array([[ 2.3636166 , -0.57155056, -0.50607237, ..., -0.60905163,
         0.17104541, -0.92407435],
       [-0.42308046, -0.57155056, -0.50607237, ...,  0.29580336,
        -0.74397694,  0.65138821],
       [-0.42308046,  1.7496265 , -0.50607237, ..., -1.96381125,
        -0.06023068, -0.05861626],
       ...,
       [-0.42308046,  1.7496265 , -0.50607237, ...,  0.77936028,
         0.27732824, -0.9920853 ],
       [-0.42308046, -0.57155056, -0.50607237, ...,  0.71398891,
        -0.40930913, -3.39488991],
       [-0.42308046, -0.57155056, -0.50607237, ...,  0.11014294,
        -0.6770074 ,  0.47575552]])

In [17]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

LabelEncoder()

In [21]:
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [22]:
y_train_encoded

array([2, 0, 2, ..., 2, 0, 0])

#  Defining our Model Architecture (the layers)



In [33]:
from tensorflow.keras.models import Sequential

model = Sequential()


 Next, we add our first layer. This layer requires you to specify both the number of inputs and the number of nodes that you want in the hidden layer.

In [34]:
from tensorflow.keras.layers import Dense
number_inputs = 40
number_hidden_nodes = 4
model.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))

 Our final layer is the output layer. Here, we need to specify the activation function (typically softmax for classification) and the number of classes (labels) that we are trying to predict (2 in this example).

In [35]:
number_classes = 3
model.add(Dense(units=number_classes, activation='softmax'))

In [36]:
# Model Summary
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 4)                 164       
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 15        
Total params: 179
Trainable params: 179
Non-trainable params: 0
_________________________________________________________________


In [37]:
# Use categorical crossentropy for categorical data and mean squared error for regression
# The output layer in this test is using software for logistic regression (categorical)
# If output layer activation was `linear` then you may want to use `mse` for loss

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model

In [38]:
 # Fit (train) the model
model.fit(
    X_train_scaled,
    y_train_encoded,
    epochs=1000,
    shuffle=True,
    verbose=2)

ValueError: You are passing a target array of shape (5243, 1) while using as loss `categorical_crossentropy`. `categorical_crossentropy` expects targets to be binary matrices (1s and 0s) of shape (samples, classes). If your targets are integer classes, you can convert them to the expected format via:
```
from keras.utils import to_categorical
y_binary = to_categorical(y_int)
```

Alternatively, you can use the loss function `sparse_categorical_crossentropy` instead, which does expect integer targets.

# Quantifying the Model

In [None]:
 # Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

# Making Predictions with new data

In [None]:
import numpy as np
new_data = np.array([[0.2, 0.3, 0.4]])
print(f"Predicted class: {model.predict_classes(new_data)}")

# Save the Model

In [28]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = '../Saved_Models/random_tree.sav'
joblib.dump(model, filename)

['../Saved_Models/random_tree.sav']