In [42]:
## Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [43]:
## install joblib. This will be used to save your model. 
## Restart your kernel after installing 
#!pip install joblib

In [44]:
# Basic dependiecies
import pandas as pd
# To save model
#import joblib

In [45]:
# sklearn imports
from sklearn.model_selection import train_test_split

## SCALER
from sklearn.preprocessing import StandardScaler, LabelEncoder
#from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [46]:
## TenserFlow imports
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Read the CSV and Perform Basic Data Cleaning

In [47]:
stars_df = pd.read_csv("../../ETL/Resources/cleaned_data/stars_classification.csv")
print(f'Count of pre-dropped data: {len(stars_df)}')

# Drop the null columns where all values are null
stars_df = stars_df.dropna(axis='columns', how='all')
# Drop the null rows
stars_df = stars_df.dropna()
stars_df = stars_df.drop(['Unnamed: 0'], axis=1)

print(f'The final count of data is: {len(stars_df)}')
stars_df.head()

Count of pre-dropped data: 240
The final count of data is: 240


Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,3068,0.0024,0.17,16.12,Red,M,0
1,3042,0.0005,0.1542,16.6,Red,M,0
2,2600,0.0003,0.102,18.7,Red,M,0
3,2800,0.0002,0.16,16.65,Red,M,0
4,1939,0.000138,0.103,20.06,Red,M,0


In [48]:
## Make Type its Text format for later to encode and predict
stars_df['Type'] = stars_df['Type'].map({0: 'Red Dwarf', 1: 'Brown Dwarf', 
                                         2: 'White Dwarf', 3: 'Main Sequence', 
                                         4: 'Super Giants', 5: 'Hyper Giants' })
stars_df.head()

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,3068,0.0024,0.17,16.12,Red,M,Red Dwarf
1,3042,0.0005,0.1542,16.6,Red,M,Red Dwarf
2,2600,0.0003,0.102,18.7,Red,M,Red Dwarf
3,2800,0.0002,0.16,16.65,Red,M,Red Dwarf
4,1939,0.000138,0.103,20.06,Red,M,Red Dwarf


In [49]:
stars_df['Color'].unique()

array(['Red', 'Blue White', 'White', 'Yellowish White',
       'Pale yellow orange', 'Blue', 'Whitish', 'yellow-white', 'Orange',
       'White-Yellow', 'Yellowish', 'Orange-Red'], dtype=object)

In [50]:
# Need to turn color into number (convert categorial data)
label_encoder = LabelEncoder()
label_encoder.fit(stars_df['Color'])

stars_df['Color'] = label_encoder.transform(stars_df['Color'])

In [51]:
stars_df['Color'].unique()

array([ 5,  1,  6, 10,  4,  0,  8, 11,  2,  7,  9,  3])

In [52]:
colors = ['Red', 'Blue White', 'White', 'Yellowish White',
       'Pale yellow orange', 'Blue', 'Whitish', 'yellow-white', 'Orange',
       'White-Yellow', 'Yellowish', 'Orange-Red']
color_number = [ 5,  1,  6, 10,  4,  0,  8, 11,  2,  7,  9,  3]

color_type_dict = {'color': colors, 'color_number': color_number}
color_type_df = pd.DataFrame(color_type_dict)
color_type_df

Unnamed: 0,color,color_number
0,Red,5
1,Blue White,1
2,White,6
3,Yellowish White,10
4,Pale yellow orange,4
5,Blue,0
6,Whitish,8
7,yellow-white,11
8,Orange,2
9,White-Yellow,7


In [53]:
## Need to scale due to number of color ranging from 0-16..?


# stars_df['Color'] = StandardScaler(stars_df['Color']).fit()
# stars_df['Color'] = X_scaler.transform(stars_df['Color'])
# stars_df['Color'].unique()

In [54]:
stars_df['Spectral_Class'].unique()

array(['M', 'B', 'A', 'F', 'O', 'K', 'G'], dtype=object)

In [55]:
# Need to turn spectrial class into number (convert categorial data)
label_encoder = LabelEncoder()
label_encoder.fit(stars_df['Spectral_Class'])

stars_df['Spectral_Class'] = label_encoder.transform(stars_df['Spectral_Class'])

In [56]:
stars_df['Spectral_Class'].unique()

array([5, 1, 0, 2, 6, 4, 3])

In [57]:
spec_class = ['M', 'B', 'A', 'F', 'O', 'K', 'G']
spec_number = [5, 1, 0, 2, 6, 4, 3]

spectral_class_dict = {'spectral_class': spec_class, 'spec_number': spec_number}
spectral_class_df = pd.DataFrame(spectral_class_dict)
spectral_class_df

Unnamed: 0,spectral_class,spec_number
0,M,5
1,B,1
2,A,0
3,F,2
4,O,6
5,K,4
6,G,3


In [58]:
# Check for direct correlations within columns to see if they need be dropped.
stars_df.corr()

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class
Temperature,1.0,0.393404,0.064216,-0.420261,-0.675345,-0.207852
L,0.393404,1.0,0.526516,-0.692619,-0.366836,0.27528
R,0.064216,0.526516,1.0,-0.608728,-0.032348,0.097124
A_M,-0.420261,-0.692619,-0.608728,1.0,0.344999,-0.08584
Color,-0.675345,-0.366836,-0.032348,0.344999,1.0,0.075775
Spectral_Class,-0.207852,0.27528,0.097124,-0.08584,0.075775,1.0


In [59]:
stars_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Temperature     240 non-null    int64  
 1   L               240 non-null    float64
 2   R               240 non-null    float64
 3   A_M             240 non-null    float64
 4   Color           240 non-null    int32  
 5   Spectral_Class  240 non-null    int32  
 6   Type            240 non-null    object 
dtypes: float64(3), int32(2), int64(1), object(1)
memory usage: 13.1+ KB


# Select your features (columns)

In [60]:
# Set features. This will also be used as your x values.
selected_features_df = stars_df.drop(['Type'], axis=1)
selected_features_df.head()

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class
0,3068,0.0024,0.17,16.12,5,5
1,3042,0.0005,0.1542,16.6,5,5
2,2600,0.0003,0.102,18.7,5,5
3,2800,0.0002,0.16,16.65,5,5
4,1939,0.000138,0.103,20.06,5,5


# Create a Train Test Split

Use `koi_disposition` for the y values

In [61]:
X = selected_features_df
y = stars_df['Type']
#.values.reshape(-1, 1)
print(X.shape, y.shape)

(240, 6) (240,)


In [62]:
# Create train and test values from our data set 
#from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [63]:
print(f'The length of the X train data is: {len(X_train)}')
X_train.head()

The length of the X train data is: 180


Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class
111,3605,126000.0,1124.0,-10.81,5,5
221,12749,332520.0,76.0,-7.02,0,6
140,13420,0.00059,0.00981,13.67,1,1
56,3660,363000.0,1673.0,-11.92,5,5
75,3180,0.001,0.35,11.76,5,5


In [64]:
y_train

111    Hyper Giants
221    Super Giants
140     White Dwarf
56     Hyper Giants
75      Brown Dwarf
           ...     
84      White Dwarf
234    Hyper Giants
1         Red Dwarf
222    Super Giants
166    Super Giants
Name: Type, Length: 180, dtype: object

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [65]:
# Save X-train to be able to scale the input data from website (only one row of data so not able to scale on its own)
X_train.to_csv('../../ETL/Resources/cleaned_data/X_train.csv')

In [66]:
# Create a Scaler model and fit it to the training data

X_scaler = StandardScaler().fit(X_train)

#X_scaler = MinMaxScaler().fit(X_train)

In [67]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Y is categorical do not scale

In [68]:
X_train_scaled

array([[-0.72585306,  0.09304437,  1.70486752, -1.440102  ,  0.51915961,
         0.58596913],
       [ 0.23602209,  1.22039512, -0.31526582, -1.08162193, -1.19864791,
         1.06539841],
       [ 0.30660588, -0.59476404, -0.46174513,  0.87535768, -0.85508641,
        -1.33174802],
       ...,
       [-0.78507612, -0.59476404, -0.46146681,  1.15249399,  0.51915961,
         0.58596913],
       [-0.118054  ,  1.27727578, -0.27285844, -1.0778385 , -1.19864791,
         1.06539841],
       [ 0.9906691 ,  0.23497309, -0.32104865, -0.95582285, -1.19864791,
         1.06539841]])

In [69]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

LabelEncoder()

In [70]:
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
y_train_encoded

array([1, 4, 5, 1, 0, 2, 5, 3, 5, 4, 4, 4, 1, 3, 2, 4, 3, 0, 5, 4, 2, 0,
       2, 2, 2, 2, 2, 2, 5, 3, 3, 1, 5, 2, 4, 1, 3, 0, 3, 0, 0, 0, 3, 0,
       0, 1, 1, 4, 0, 2, 2, 1, 3, 0, 1, 5, 0, 5, 3, 4, 1, 5, 2, 1, 3, 5,
       3, 2, 0, 1, 2, 3, 2, 2, 1, 3, 0, 0, 4, 4, 1, 2, 5, 1, 2, 0, 5, 0,
       1, 3, 1, 4, 1, 3, 1, 0, 0, 4, 2, 2, 4, 2, 0, 3, 4, 4, 3, 4, 0, 5,
       4, 1, 3, 3, 1, 0, 4, 0, 2, 3, 1, 2, 4, 4, 0, 2, 5, 0, 4, 4, 0, 1,
       2, 5, 5, 3, 5, 3, 5, 5, 2, 5, 5, 2, 3, 3, 1, 5, 4, 5, 5, 5, 0, 1,
       4, 1, 5, 0, 3, 1, 3, 2, 4, 0, 3, 1, 5, 5, 0, 4, 3, 2, 1, 4, 5, 5,
       1, 3, 4, 4])

In [71]:
# Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(y_train_encoded)
y_test_categorical = to_categorical(y_test_encoded)
y_train_categorical

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.]], dtype=float32)

#  Defining our Model Architecture (the layers)

In [72]:
model = Sequential()

Next, we add our first layer. This layer requires you to specify both the number of inputs and the number of nodes that you want in the hidden layer.  Our final layer is the output layer. Here, we need to specify the activation function (typically softmax for classification) and the number of classes (labels) that we are trying to predic

In [73]:
number_inputs = 6
number_hidden_nodes = 100
number_classes = 6

In [74]:
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))
model.add(Dense(units=number_hidden_nodes, activation='relu'))
model.add(Dense(units=number_classes, activation='softmax'))

In [75]:
# Use categorical crossentropy for categorical data and mean squared error for regression
# The output layer in this test is using software for logistic regression (categorical)
# If output layer activation was `linear` then you may want to use `mse` for loss

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [76]:
# Model Summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 100)               700       
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_5 (Dense)              (None, 6)                 606       
Total params: 11,406
Trainable params: 11,406
Non-trainable params: 0
_________________________________________________________________


# Train the model

In [77]:
# Fit (train) the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=20,
    shuffle=True,
    verbose=3)

Train on 180 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x2f14f8147f0>

# Quantifying the Model

In [78]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

60/60 - 0s - loss: 0.1917 - accuracy: 0.9833
Loss: 0.19174869259198507, Accuracy: 0.9833333492279053


In [79]:
X_test_scaled[:1].shape

(1, 6)

# Make Predictions

In [84]:
encoded_predictions = model.predict_classes(X_test_scaled[:10])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:10])}")
print(encoded_predictions)

Predicted classes: ['White Dwarf' 'Brown Dwarf' 'Super Giants' 'Red Dwarf' 'Red Dwarf'
 'Brown Dwarf' 'Hyper Giants' 'Hyper Giants' 'Brown Dwarf' 'Hyper Giants']
Actual Labels: ['White Dwarf', 'Brown Dwarf', 'Super Giants', 'Red Dwarf', 'Red Dwarf', 'Brown Dwarf', 'Hyper Giants', 'Hyper Giants', 'Brown Dwarf', 'Hyper Giants']
[5 0 4 3 3 0 1 1 0 1]


# Save the model

In [81]:
model.save("../neural_network.h5")

In [82]:
## Load the model
#from tensorflow.keras.models import load_model
#model = load_model("neural_network.h5")