# Binary Startup Classification Model

In [1]:
# Import Libraries and Dependancies
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

### Import/Prepare Data (Neural Network)

In [2]:
#Read in data and review df
df = pd.read_csv(Path('Resources/applicants_data.csv'))
df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [3]:
#Drop EIN and NAME from df not revelant to model and review df
df = df.drop(columns=['EIN','NAME'])
df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [4]:
#Create a list of all categorical variables in the df and then review
categorical_var = []
for c in df.columns:
    if df[c].dtypes == 'O':
        categorical_var.append(c)

display(categorical_var)
display(df[categorical_var].dtypes)

['APPLICATION_TYPE',
 'AFFILIATION',
 'CLASSIFICATION',
 'USE_CASE',
 'ORGANIZATION',
 'INCOME_AMT',
 'SPECIAL_CONSIDERATIONS']

APPLICATION_TYPE          object
AFFILIATION               object
CLASSIFICATION            object
USE_CASE                  object
ORGANIZATION              object
INCOME_AMT                object
SPECIAL_CONSIDERATIONS    object
dtype: object

In [5]:
#Encode categorical variables (OneHotEncoder), and then create new df to store encoded variables
enc = OneHotEncoder()
enc_data = enc.fit_transform(df[categorical_var]).toarray()
enc_df = pd.DataFrame(enc_data, columns=enc.get_feature_names(categorical_var))
enc_df.head()

Unnamed: 0,APPLICATION_TYPE_T10,APPLICATION_TYPE_T12,APPLICATION_TYPE_T13,APPLICATION_TYPE_T14,APPLICATION_TYPE_T15,APPLICATION_TYPE_T17,APPLICATION_TYPE_T19,APPLICATION_TYPE_T2,APPLICATION_TYPE_T25,APPLICATION_TYPE_T29,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
#add numerical columns from original df to enc_df 
enc_df = pd.concat([df.drop(columns=categorical_var), enc_df], axis=1)
enc_df.head()

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_T10,APPLICATION_TYPE_T12,APPLICATION_TYPE_T13,APPLICATION_TYPE_T14,APPLICATION_TYPE_T15,APPLICATION_TYPE_T17,APPLICATION_TYPE_T19,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,108590,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,5000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,6692,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,142590,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
#define the features(X) and target(y = "IS_SUCCESSFUL")
X = enc_df.drop(columns=['IS_SUCCESSFUL'])
y = enc_df['IS_SUCCESSFUL']
display(X.head())
display(y[:5])

Unnamed: 0,STATUS,ASK_AMT,APPLICATION_TYPE_T10,APPLICATION_TYPE_T12,APPLICATION_TYPE_T13,APPLICATION_TYPE_T14,APPLICATION_TYPE_T15,APPLICATION_TYPE_T17,APPLICATION_TYPE_T19,APPLICATION_TYPE_T2,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,108590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,5000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,6692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,142590,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


0    1
1    1
2    0
3    1
4    1
Name: IS_SUCCESSFUL, dtype: int64

In [8]:
#split datasets into training and testing with random_state=1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
#Scale training and testing dataset(X)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Compile and Evaluate a Binary Classification Model using a Nueral Network 

In [10]:
# Define input features and output/hidden layer nodes
num_input_features = len(X_train.iloc[0])
num_output_neurons = 1
hid_nodes_l1 = int(((num_input_features + num_output_neurons)/2) +1)
hid_nodes_l2 = int((hid_nodes_l1/2) +1)
display(num_input_features, num_output_neurons, hid_nodes_l1, hid_nodes_l2)

116

1

59

30

In [11]:
# Create 2-layer neural network using 'relu' and 'sigmoid'
nn = Sequential()
nn.add(Dense(units=hid_nodes_l1, input_dim=num_input_features, activation='relu'))
nn.add(Dense(units=hid_nodes_l2, activation='relu'))
nn.add(Dense(units=num_output_neurons, activation='sigmoid'))
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 59)                6903      
                                                                 
 dense_1 (Dense)             (None, 30)                1800      
                                                                 
 dense_2 (Dense)             (None, 1)                 31        
                                                                 
Total params: 8,734
Trainable params: 8,734
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Compile and fit using 'binary_crossentropy', 'adam', and 'accuracy' as metric. Fit for 50 epochs using X_train_scaled
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
nn.fit(x=X_train_scaled, y=y_train, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x246dae0d488>

In [13]:
# Evaluate the model original loss and accuracy metrics for test data
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=0)
print(f"Loss: {model_loss:.4f}, Accuracy: {model_accuracy:.4f}")

Loss: 0.5539, Accuracy: 0.7313


In [14]:
# Save and export nn to HDF5 file, named AlphabetSoup.h5
nn.save(Path('Resources/AlphabetSoup.h5'), save_format='h5')

### Optimize Neural Network Model


In [25]:
# Define input features and output/hidden layer nodes adding one more layer
num_input_features = len(X_train.iloc[0])
num_output_neurons = 1
hid_nodes_l1 = int(((num_input_features + num_output_neurons+1)/2))
hid_nodes_l2 = int((hid_nodes_l1+1)/2)
hid_nodes_l3 = int((hid_nodes_l2+1)/2)
display(num_input_features, num_output_neurons, hid_nodes_l1, hid_nodes_l2, hid_nodes_l3)

116

1

59

30

15

In [26]:
# Create 3-layer neural network using 'relu' and 'sigmoid'
nn1 = Sequential()
nn1.add(Dense(units=hid_nodes_l1, input_dim=num_input_features, activation='relu'))
nn1.add(Dense(units=hid_nodes_l2, activation='relu'))
nn1.add(Dense(units=hid_nodes_l3, activation='relu'))
nn1.add(Dense(units=num_output_neurons, activation='sigmoid'))
nn1.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 59)                6903      
                                                                 
 dense_11 (Dense)            (None, 30)                1800      
                                                                 
 dense_12 (Dense)            (None, 15)                465       
                                                                 
 dense_13 (Dense)            (None, 1)                 16        
                                                                 
Total params: 9,184
Trainable params: 9,184
Non-trainable params: 0
_________________________________________________________________


In [31]:
# Compile and fit using 'binary_crossentropy', 'adam', and 'accuracy' as metric. Fit for 50 epochs using X_train_scaled
nn1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
nn1.fit(x=X_train_scaled, y=y_train, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x246e86b1908>

In [32]:
# Evaluate the model alt 1 loss and accuracy metrics for test data
model_loss, model_accuracy = nn1.evaluate(x=X_test_scaled, y=y_test, verbose=0)
print(f"Loss: {model_loss:.4f}, Accuracy: {model_accuracy:.4f}")

Loss: 0.5639, Accuracy: 0.7303


In [33]:
# Save and export nn1 to HDF5 file, named AlphabetSoup.h5
nn1.save(Path('Resources/AlphabetSoupAlt1.h5'), save_format='h5')

### Alternate 1
Adding a layer did not affect the accuracy of the neural network by much.

In [50]:
# Define input features and output/hidden layer nodes adding one more layer (doubling total nodes)
num_input_features = len(X_train.iloc[0])
num_output_neurons = 1
hid_nodes_l1 = int((num_input_features+num_output_neurons)*(4/3))
hid_nodes_l2 = int((hid_nodes_l1+1)*(1/3))
print(num_input_features, num_output_neurons, hid_nodes_l1, hid_nodes_l2)

116 1 156 52


In [51]:
# Create 2-layer neural network using 'relu' and 'sigmoid'
nn2 = Sequential()
nn2.add(Dense(units=hid_nodes_l1, input_dim=num_input_features, activation='relu'))
nn2.add(Dense(units=hid_nodes_l2, activation='relu'))
nn2.add(Dense(units=num_output_neurons, activation='sigmoid'))
nn2.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_17 (Dense)            (None, 156)               18252     
                                                                 
 dense_18 (Dense)            (None, 52)                8164      
                                                                 
 dense_19 (Dense)            (None, 1)                 53        
                                                                 
Total params: 26,469
Trainable params: 26,469
Non-trainable params: 0
_________________________________________________________________


In [52]:
# Compile and fit using 'binary_crossentropy', 'adam', and 'accuracy' as metric. Fit for 50 epochs using X_train_scaled
nn2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
nn2.fit(x=X_train_scaled, y=y_train, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x246e6c18ac8>

In [53]:
# Evaluate the model alt 2 loss and accuracy metrics for test data
model_loss, model_accuracy = nn2.evaluate(x=X_test_scaled, y=y_test, verbose=0)
print(f"Loss: {model_loss:.4f}, Accuracy: {model_accuracy:.4f}")

Loss: 0.5642, Accuracy: 0.7292


In [54]:
# Save and export nn2 to HDF5 file, named AlphabetSoup.h5
nn2.save(Path('Resources/AlphabetSoupAlt2.h5'), save_format='h5')

### Alternate 2
Doubled the total nodes but the accuracy did not increase. Increasing the epochs(50 to 100) did not increase accuracy by much(about 1%).

In [58]:
print("Original Model Results")

# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=0)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Original Model Results
Loss: 0.5538591742515564, Accuracy: 0.7313119769096375


In [56]:
print("Alternative Model 1 Results")

# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn1.evaluate(x=X_test_scaled, y=y_test, verbose=0)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Alternative Model 1 Results
Loss: 0.5639289617538452, Accuracy: 0.7302623987197876


In [57]:
print("Alternative Model 2 Results")

# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_accuracy = nn2.evaluate(x=X_test_scaled, y=y_test, verbose=0)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Alternative Model 2 Results
Loss: 0.5641979575157166, Accuracy: 0.7292128205299377
