# Deliverable 3: Optimizing the Outcomes

In [59]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import os
from tensorflow.keras.callbacks import ModelCheckpoint
from pathlib import Path
import kerastuner as kt

#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("Resources/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [69]:
# What percentage of donations are acutally used for something? 
successes = len(application_df[application_df.IS_SUCCESSFUL == 1])
total = len(application_df)
successes/total

0.5324061926003674

## Preprocessing

In [3]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df.drop(columns=['EIN', 'NAME'], inplace=True)
application_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [4]:
# Determine the number of unique values in each column.
application_df.nunique()

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64

In [5]:
# Look at APPLICATION_TYPE value counts for binning
app_type_counts = application_df.APPLICATION_TYPE.value_counts()
app_type_counts

T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64

In [6]:
# Determine which values to replace if counts are less than ...?
replace_application = list(app_type_counts[app_type_counts < 1200].index)

# Replace in dataframe
for app in replace_application:
    application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(app,"Other")
    
# Check to make sure binning was successful
application_df.APPLICATION_TYPE.value_counts()

T3       27037
Other     4504
T4        1542
T6        1216
Name: APPLICATION_TYPE, dtype: int64

In [7]:
# Look at CLASSIFICATION value counts for binning
classification_counts = application_df.CLASSIFICATION.value_counts()
classification_counts

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: CLASSIFICATION, Length: 71, dtype: int64

In [8]:
# Determine which values to replace if counts are under ...  (Initial analysis was under 200)
replace_class = list(classification_counts[classification_counts < 2000].index)

# Replace in dataframe
for cls in replace_class:
    application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(cls,"Other")
    
# Check to make sure binning was successful
application_df.CLASSIFICATION.value_counts()

C1000    17326
C2000     6074
Other     6062
C1200     4837
Name: CLASSIFICATION, dtype: int64

In [9]:
# Generate our categorical variable lists
application_cat = application_df.dtypes[application_df.dtypes == 'object'].index.tolist()
application_cat

# Note that the special consideration has binary answers as it is, so one column will have to be removed after encoding

['APPLICATION_TYPE',
 'AFFILIATION',
 'CLASSIFICATION',
 'USE_CASE',
 'ORGANIZATION',
 'INCOME_AMT',
 'SPECIAL_CONSIDERATIONS']

In [10]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(application_df[application_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(application_cat)
encode_df.head()

Unnamed: 0,APPLICATION_TYPE_Other,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T6,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,AFFILIATION_Other,AFFILIATION_Regional,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
encode_df = encode_df.drop(columns="SPECIAL_CONSIDERATIONS_N")

In [12]:
# Merge one-hot encoded features and drop the originals
application_df = application_df.merge(encode_df, left_index=True, right_index=True)
application_df.drop(columns=application_cat, inplace=True)

In [13]:
application_df.head()

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T6,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,...,INCOME_AMT_0,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,108590,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,5000,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,6692,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,142590,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
min_ask = application_df.ASK_AMT.min()
max_ask = application_df.ASK_AMT.max()
median_ask = application_df.ASK_AMT.median()

print(f'Min: {min_ask}, Median: {median_ask}, Max: {max_ask}')


Min: 5000, Median: 5000.0, Max: 8597806340


In [15]:
# Bin ther ask amounts 
# Apply the bins, labeling with the arbitrary 1-5 categories for the network learning
bins = [0, 5000, 35000, 100000, 500000, max_ask]
labels = [1, 2, 3, 4, 5]

application_df["ASK_BIN"] = pd.cut(
    application_df["ASK_AMT"], bins=bins, labels=labels)

application_df[["ASK_AMT","ASK_BIN"]].head(10)

Unnamed: 0,ASK_AMT,ASK_BIN
0,5000,1
1,108590,4
2,5000,1
3,6692,2
4,142590,4
5,5000,1
6,31452,2
7,7508025,5
8,94389,3
9,5000,1


In [16]:
# Separate into 2 groups, the 5000 requests and the non-5000 requests
application_low = application_df[application_df.ASK_BIN == 1]
application_high = application_df[application_df.ASK_BIN != 1]

In [17]:
application_low = application_low.drop(columns="ASK_AMT")
application_low.head()


Unnamed: 0,STATUS,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T6,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_Y,ASK_BIN
0,1,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,1,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
5,1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
9,1,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
12,1,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [18]:
application_high = application_high.drop(columns="ASK_AMT")
application_high.head()


Unnamed: 0,STATUS,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T6,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_Y,ASK_BIN
1,1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,1,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4
6,1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
7,1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5


In [19]:
def df_trainer(df):
    # Remove the outcome and set up the training model
    y = df.IS_SUCCESSFUL
    X = df.drop(columns="IS_SUCCESSFUL")

    # Split the preprocessed data into a training and testing dataset, values are not balanced - apply stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=512, stratify=y)

    # Scale the data
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test


## Training the Low Model, where all requests were $5000

In [20]:
# Prior to splitting, need to remove the bins for request amount, since these all have the same amount
application_low = application_low.drop(columns='ASK_BIN')

In [21]:
X_train_scaled, X_test_scaled, y_train, y_test = df_trainer(application_low)

## Neural Network Application

In [22]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_layer1 = 80
hidden_layer2 = 30

nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1, input_dim=number_input_features, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                2800      
                                                                 
 dense_1 (Dense)             (None, 30)                2430      
                                                                 
 dense_2 (Dense)             (None, 1)                 31        
                                                                 
Total params: 5,261
Trainable params: 5,261
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Define Checkpoint Path and filenames
os.makedirs("checkpoints_3/", exist_ok=True)
checkpoint_path = "checkpoints_3/weights.{epoch:02d}.hdf5"

In [24]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True, 
    save_freq = 5
)

In [25]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=10, callbacks=[cp_callback])

Epoch 1/10
  1/596 [..............................] - ETA: 5:04 - loss: 0.6686 - accuracy: 0.5312
Epoch 1: saving model to checkpoints_3\weights.01.hdf5

Epoch 1: saving model to checkpoints_3\weights.01.hdf5

Epoch 1: saving model to checkpoints_3\weights.01.hdf5

Epoch 1: saving model to checkpoints_3\weights.01.hdf5
 20/596 [>.............................] - ETA: 1s - loss: 0.6756 - accuracy: 0.5781  
Epoch 1: saving model to checkpoints_3\weights.01.hdf5

Epoch 1: saving model to checkpoints_3\weights.01.hdf5

Epoch 1: saving model to checkpoints_3\weights.01.hdf5

Epoch 1: saving model to checkpoints_3\weights.01.hdf5
 40/596 [=>............................] - ETA: 1s - loss: 0.6452 - accuracy: 0.6320
Epoch 1: saving model to checkpoints_3\weights.01.hdf5

Epoch 1: saving model to checkpoints_3\weights.01.hdf5

Epoch 1: saving model to checkpoints_3\weights.01.hdf5

Epoch 1: saving model to checkpoints_3\weights.01.hdf5
 60/596 [==>...........................] - ETA: 1s - loss: 0.

In [26]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

199/199 - 0s - loss: 0.5575 - accuracy: 0.7321 - 216ms/epoch - 1ms/step
Loss: 0.557496964931488, Accuracy: 0.732125997543335


## Evaluating the Requests that were higher than $5000

In [27]:
X_train_scaled, X_test_scaled, y_train, y_test = df_trainer(application_high)


In [28]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_layer1 = 80
hidden_layer2 = 30

nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 80)                2880      
                                                                 
 dense_4 (Dense)             (None, 30)                2430      
                                                                 
 dense_5 (Dense)             (None, 1)                 31        
                                                                 
Total params: 5,341
Trainable params: 5,341
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Define Checkpoint Path and filenames
os.makedirs("checkpoints_4/", exist_ok=True)
checkpoint_path = "checkpoints_4/weights.{epoch:02d}.hdf5"


In [30]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=5
)


In [31]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=10, callbacks=[cp_callback])


Epoch 1/10
  1/209 [..............................] - ETA: 1:12 - loss: 0.7058 - accuracy: 0.4062
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
 22/209 [==>...........................] - ETA: 0s - loss: 0.6752 - accuracy: 0.5838  
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
 40/209 [====>.........................] - ETA: 0s - loss: 0.6560 - accuracy: 0.6359
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch

In [32]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


70/70 - 0s - loss: 0.6197 - accuracy: 0.6680 - 129ms/epoch - 2ms/step
Loss: 0.6196949481964111, Accuracy: 0.6680143475532532


This reduced the potential for prediction with the bins. Going to retry this without binned data.

In [33]:
# Read new
application_df = pd.read_csv("Resources/charity_data.csv")

# Drop names
application_df.drop(columns=['EIN', 'NAME'], inplace=True)

# Bin the types
replace_application = list(app_type_counts[app_type_counts < 1200].index)
for app in replace_application:
    application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(app,"Other")

# Bin the Classification
replace_class = list(classification_counts[classification_counts < 2000].index)
for cls in replace_class:
    application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(cls,"Other")

# Encode the categorical variables
application_cat = application_df.dtypes[application_df.dtypes == 'object'].index.tolist()

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(application_df[application_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(application_cat)

# Drop redundancies
encode_df = encode_df.drop(columns="SPECIAL_CONSIDERATIONS_N")

# Merge one-hot encoded features and drop the originals
application_df = application_df.merge(encode_df, left_index=True, right_index=True)
application_df.drop(columns=application_cat, inplace=True)

In [34]:
# Separate the 5000 from the rest
application_high = application_df[application_df.ASK_AMT != 5000]

In [35]:
# Split and Train
X_train_scaled, X_test_scaled, y_train, y_test = df_trainer(application_high)

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_layer1 = 80
hidden_layer2 = 30

nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

# Define Checkpoint Path and filenames
os.makedirs("checkpoints_4/", exist_ok=True)
checkpoint_path = "checkpoints_4/weights.{epoch:02d}.hdf5"


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 80)                2880      
                                                                 
 dense_7 (Dense)             (None, 30)                2430      
                                                                 
 dense_8 (Dense)             (None, 1)                 31        
                                                                 
Total params: 5,341
Trainable params: 5,341
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=5
)


In [37]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=10, callbacks=[cp_callback])


Epoch 1/10
  1/209 [..............................] - ETA: 1:11 - loss: 0.7264 - accuracy: 0.5312
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
 20/209 [=>............................] - ETA: 0s - loss: 0.7046 - accuracy: 0.5516  
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
 40/209 [====>.........................] - ETA: 0s - loss: 0.6742 - accuracy: 0.6141
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch

In [38]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


70/70 - 0s - loss: 0.6254 - accuracy: 0.6707 - 131ms/epoch - 2ms/step
Loss: 0.6254388689994812, Accuracy: 0.6707097887992859


## Changing the the ask amount - in this case, change the labels from numerical to string for the ASK Amounts

In [79]:
# Removing ASK_AMT and not including binning yielded a .7144 accuracy

# # With the ASK_BINS containing 8 bins, the accuracy was .7161
# bins = [0, 5000, 10000, 30000, 50000, 100000, 500000, 1000000, max_ask]
# labels = [0, 1, 2, 3, 4, 5, 6, 7]

# With the ASK_BINS containing 7 bins, the accuracy was .7194
bins = [0, 5000, 10000, 50000, 100000, 500000, 1000000, max_ask]
labels = ['ASK_0', 'ASK_1', 'ASK_2', 'ASK_3', 'ASK_4', 'ASK_5', 'ASK_6']

# # With the ASK_BINS containing 6 bins, the accuracy was .7192
# bins = [0, 5000, 10000, 50000, 100000, 500000, max_ask]
# labels = [0, 1, 2, 3, 4, 5]

# With the ASK_BINS containing 5 bins, the accuracy was .7181 
# bins = [0, 5000, 35000, 100000, 500000, max_ask]
# labels = [1, 2, 3, 4, 5]

# # Setting 3 bins, the accuracy was also .7181
# bins = [0, 5000, 100000, max_ask]
# labels = [1, 2, 3]

# # Setting 2 bins, the cutoff at 35000 yielded .7188
# bins = [0, 35000, max_ask]
# labels = [0, 1]

# # Setting 2 bins, the cutoff at 100000 yielded .7186
# bins = [0, 100000, max_ask]
# labels = [0, 1]


# Read new
application_df = pd.read_csv("Resources/charity_data.csv")

# Drop names
application_df.drop(columns=['EIN', 'NAME'], inplace=True)

# Bin the types
replace_application = list(app_type_counts[app_type_counts < 1200].index)
for app in replace_application:
    application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(
        app, "Other")

# Bin the Classification
replace_class = list(classification_counts[classification_counts < 2000].index)
for cls in replace_class:
    application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(
        cls, "Other")

# Apply the bins, labeling with the arbitrary 1-5 categories for the network learning
application_df["ASK_BIN"] = pd.cut(
    application_df["ASK_AMT"], bins=bins, labels=labels)

# Remove the Ask Amount - this seems to be a bad predictor, even when binned
application_df = application_df.drop(columns="ASK_AMT")





In [81]:
application_df.dtypes

APPLICATION_TYPE            object
AFFILIATION                 object
CLASSIFICATION              object
USE_CASE                    object
ORGANIZATION                object
STATUS                       int64
INCOME_AMT                  object
SPECIAL_CONSIDERATIONS      object
IS_SUCCESSFUL                int64
ASK_BIN                   category
dtype: object

In [83]:
# Encode the categorical variables
application_cat = application_df.dtypes[(application_df.dtypes == 'object') | (application_df.dtypes == 'category')].index.tolist()


In [84]:
application_df.dtypes

APPLICATION_TYPE            object
AFFILIATION                 object
CLASSIFICATION              object
USE_CASE                    object
ORGANIZATION                object
STATUS                       int64
INCOME_AMT                  object
SPECIAL_CONSIDERATIONS      object
IS_SUCCESSFUL                int64
ASK_BIN                   category
dtype: object

In [85]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(application_df[application_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(application_cat)

# Drop redundancies and Ask amount
encode_df = encode_df.drop(columns="SPECIAL_CONSIDERATIONS_N")


# Merge one-hot encoded features and drop the originals
application_df = application_df.merge(
    encode_df, left_index=True, right_index=True)
application_df.drop(columns=application_cat, inplace=True)


In [86]:
application_df.head()

Unnamed: 0,STATUS,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T6,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,...,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_Y,ASK_BIN_ASK_0,ASK_BIN_ASK_1,ASK_BIN_ASK_2,ASK_BIN_ASK_3,ASK_BIN_ASK_4,ASK_BIN_ASK_5,ASK_BIN_ASK_6
0,1,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [87]:
X_train_scaled, X_test_scaled, y_train, y_test = df_trainer(application_df)

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_layer1 = 80
hidden_layer2 = 30

nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

# Define Checkpoint Path and filenames

os.makedirs("checkpoints_4/", exist_ok=True)
checkpoint_path = "checkpoints_4/weights.{epoch:02d}.hdf5"


Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 80)                3360      
                                                                 
 dense_25 (Dense)            (None, 30)                2430      
                                                                 
 dense_26 (Dense)            (None, 1)                 31        
                                                                 
Total params: 5,821
Trainable params: 5,821
Non-trainable params: 0
_________________________________________________________________


In [88]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=5
)


In [89]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=10, callbacks=[cp_callback])


Epoch 1/10
  1/804 [..............................] - ETA: 5:05 - loss: 0.6672 - accuracy: 0.6250
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
 20/804 [..............................] - ETA: 2s - loss: 0.6492 - accuracy: 0.6656  
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
 36/804 [>.............................] - ETA: 2s - loss: 0.6339 - accuracy: 0.6797
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
 55/804 [=>............................] - ETA: 2s - loss: 0.6242 - accuracy: 0.6841
Epoch 1: saving model to checkpo

In [90]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


268/268 - 1s - loss: 0.5699 - accuracy: 0.7178 - 608ms/epoch - 2ms/step
Loss: 0.5699030160903931, Accuracy: 0.7177842855453491


## Remove the USE_CASE Parameters, which were determined to reduce accuracy

In [44]:
# With the ASK_BINS containing 7 bins with USE_CASE, the accuracy was .7194, but removing USE_CASE dropped it to .7131
bins = [0, 5000, 10000, 50000, 100000, 500000, 1000000, max_ask]
labels = [0, 1, 2, 3, 4, 5, 6]


# Read new
application_df = pd.read_csv("Resources/charity_data.csv")

# Drop names
application_df.drop(columns=['EIN', 'NAME'], inplace=True)

# Bin the types
# With count type filter < 2000, accuracy was .7083
# With count type filter < 1500, accuracy was .7173
# with count type filter < 1200, accuracy was .7194
# With count type filter < 1000, accuracy was .7243
# With count type filter < 500, accuracy was .7237
# With count type filter < 200, accuracy was .7247
# With count type filter < 50, accuracy was .7226
# With count type filter < 10, accuracy was .7210

replace_application = list(app_type_counts[app_type_counts < 50].index)
for app in replace_application:
    application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(
        app, "Other")

# Bin the Classification
# With classifcation at 5000, accuracy was 0.7247
# With classifcation at 4000, accuracy was 0.7215
# With classifcation at 2000, accuracy was 0.7247
# With classifcation at 1000, accuracy was 0.7243
# With classifcation at 500, accuracy was 0.7257
# With classifcation at 200, accuracy was 0.7269/.7358
# With classifcation at 100, accuracy was 0.7255

replace_class = list(classification_counts[classification_counts < 200].index)
for cls in replace_class:
    application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(
        cls, "Other")

# Apply the bins, labeling with the arbitrary 1-5 categories for the network learning
application_df["ASK_BIN"] = pd.cut(
    application_df["ASK_AMT"], bins=bins, labels=labels)

# Remove the Ask Amount - this seems to be a bad predictor, even when binned
application_df = application_df.drop(columns="ASK_AMT")

# Encode the categorical variables
application_cat = application_df.dtypes[application_df.dtypes == 'object'].index.tolist(
)

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(application_df[application_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(application_cat)

# Drop redundancies and Ask amount
encode_df = encode_df.drop(columns="SPECIAL_CONSIDERATIONS_N")


# Merge one-hot encoded features and drop the originals
application_df = application_df.merge(
    encode_df, left_index=True, right_index=True)
application_df.drop(columns=application_cat, inplace=True)

In [45]:
X_train_scaled, X_test_scaled, y_train, y_test = df_trainer(application_df)

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_layer1 = 80
hidden_layer2 = 30

nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

# Define Checkpoint Path and filenames

os.makedirs("checkpoints_4/", exist_ok=True)
checkpoint_path = "checkpoints_4/weights.{epoch:02d}.hdf5"


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 80)                3760      
                                                                 
 dense_13 (Dense)            (None, 30)                2430      
                                                                 
 dense_14 (Dense)            (None, 1)                 31        
                                                                 
Total params: 6,221
Trainable params: 6,221
Non-trainable params: 0
_________________________________________________________________


In [46]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=5
)


In [47]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=10, callbacks=[cp_callback])


Epoch 1/10
  1/804 [..............................] - ETA: 7:13 - loss: 0.8677 - accuracy: 0.4375
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
 20/804 [..............................] - ETA: 2s - loss: 0.7361 - accuracy: 0.5469  
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
 36/804 [>.............................] - ETA: 2s - loss: 0.6890 - accuracy: 0.5990
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
 55/804 [=>............................] - ETA: 2s - loss: 0.6623 - accuracy: 0.6307
Epoch 1: saving model to checkpo

In [48]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


268/268 - 0s - loss: 0.5552 - accuracy: 0.7266 - 231ms/epoch - 862us/step
Loss: 0.5551736950874329, Accuracy: 0.7266472578048706


# Adjusting the Model

In [49]:
#The Max Accuracy Achieved has been between .726 and .736
bins = [0, 5000, 10000, 50000, 100000, 500000, 1000000, max_ask]
labels = [0, 1, 2, 3, 4, 5, 6]


# Read new
application_df = pd.read_csv("Resources/charity_data.csv")

# Drop names
application_df.drop(columns=['EIN', 'NAME'], inplace=True)

# Bin the types
# Highest accuracies were either with 50 or 200 as the cutoff

replace_application = list(app_type_counts[app_type_counts < 50].index)
for app in replace_application:
    application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(
        app, "Other")

# Bin the Classification
# Highest accuracy was with 200 as the cutoff

replace_class = list(classification_counts[classification_counts < 200].index)
for cls in replace_class:
    application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(
        cls, "Other")

# Apply the bins, labeling with the arbitrary 1-5 categories for the network learning
application_df["ASK_BIN"] = pd.cut(
    application_df["ASK_AMT"], bins=bins, labels=labels)

# Remove the Ask Amount - this seems to be a bad predictor, even when binned
application_df = application_df.drop(columns="ASK_AMT")

# Encode the categorical variables
application_cat = application_df.dtypes[application_df.dtypes == 'object'].index.tolist(
)

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(application_df[application_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(application_cat)

# Drop redundancies and Ask amount
encode_df = encode_df.drop(columns="SPECIAL_CONSIDERATIONS_N")


# Merge one-hot encoded features and drop the originals
application_df = application_df.merge(
    encode_df, left_index=True, right_index=True)
application_df.drop(columns=application_cat, inplace=True)


X_train_scaled, X_test_scaled, y_train, y_test = df_trainer(application_df)


In [50]:
# Adding a third hidden layer, layers are 80/40/20, accuracy is .7225
# Trying Reverse pyramid relu, layers are 20/40/80, accuracy is .7251
# Trying from Tuner, 21/66/91 relu, accuracy is .7265
# Trying 22/66/91 tanh with 10 epochs, accuracy is .7264
# Trying the 22/66/91 tanh with 30 epochs, accuracy was .7258
# Trying 5 layers, based on tuner 56/76/36/21/36 with 10 epochs all relu, accuracy was .7230
# Trying 5 layers, based on 10/10/10/10/10 with 10 epochs all relu, accuracy was .7234
# Trying 5 layers, based on tuner 21/66/91/20/30 with 10 epochs all relu, accuracy was .7255
# Trying 5 layers, based on tuner 21/66/91/20/30 with 10 epochs all relu, accuracy was .7245
# Trying 3 layers, 21/66/91 relu/tanh/tanh, accuracy is .7222
# Trying 3 layers, 21/66/91 tanh/relu/relu, accuracy is .7254
# Trying 3 layers, 21/66/91 all exponential, acc is .4676
# Trying 3 layers, 21/66/91 all selu, acc is .7219
# Trying 3 layers, 88/88/88 all relu, acc is .7267
# Trying 3 layers, 80/30/4 all relu, acc is .7256
# Trying 3 layers, 8/3/4 all relu, acc is .7243
# Trying 3 layers, 80/30/80 all relu, acc is .7238
# Trying 4 layers, 80/60/40/20 all relu, acc is .7248
# Trying 5 layers, 80/60/40/20/10 all relu, acc is .7262
# Trying 5 layers, 80/60/40/20/10 all tanh, acc is .7249
# Trying 5 layers, 80/60/40/20/10 all softmax, acc is .7249
# Trying 4 layers, based on tuner 56/76/36/21 with 10 epochs all softmax, accuracy was .7258
# Trying 5 layers, based on tuner 56/76/36/21/36 with 10 epochs all softmax, accuracy was .5234
# Trying 5 layers, based on tuner 56/76/36/21/11 with 10 epochs all softmax, accuracy was .7243
# Trying 5 layers, based on tuner 56/76/36/21/11 with 10 epochs softmax/relu... , accuracy was .7270
# Trying 5 layers, based on tuner 56/76/36/21/11 with 10 epochs softmax/tanh... , accuracy was .7263
# Trying 3 layers, based on tuner 56/76/36 with 10 epochs softmax/tanh... , accuracy was .7265
# Trying 3 layers, based on tuner 56/76/36 with 10 epochs softmax/relu... , accuracy was .7264
# Trying 5 layers, based on tuner 56/76/36/21/11 with 10 epochs softmax/relu/relu/sigmoid/sigmoid , accuracy was .7272  ** Higest
# Trying 5 layers, based on tuner 56/76/36/21/11 with 10 epochs softmax/relu/relu/sigmoid/tanh , accuracy was .7244
# Trained 2 layers 56/76 softmax/relu, accuracy .7249
# Trained 2 layers 56/76 relu, accuracy .7270
# Trained 2 layers 56/76 tanh, accuracy .7271 * high and simple
# Trained 1 layer 150 tanh, acc .7243
# Trained 1 layer 150 relu, acc .7241
# Trained 1 layer 150 softmax, acc .7263
# Trying 5 layers, based on tuner 56/76/36/21/36 with 10 epochs softmax/relu..., accuracy was .7270









number_input_features = len(X_train_scaled[0])
hidden_layer1 = 56
hidden_layer2 = 76
hidden_layer3 = 36
hidden_layer4 = 21
hidden_layer5 = 11


nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='tanh'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='tanh'))
# nn.add(tf.keras.layers.Dense(units=hidden_layer3, activation='relu'))
# nn.add(tf.keras.layers.Dense(units=hidden_layer4, activation='relu'))
# nn.add(tf.keras.layers.Dense(units=hidden_layer5, activation='relu'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

# Define Checkpoint Path and filenames

os.makedirs("checkpoints_4/", exist_ok=True)
checkpoint_path = "checkpoints_4/weights.{epoch:02d}.hdf5"


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 56)                2632      
                                                                 
 dense_16 (Dense)            (None, 76)                4332      
                                                                 
 dense_17 (Dense)            (None, 1)                 77        
                                                                 
Total params: 7,041
Trainable params: 7,041
Non-trainable params: 0
_________________________________________________________________


In [51]:
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=5
)

fit_model = nn.fit(X_train_scaled, y_train, epochs=10, callbacks=[cp_callback])


Epoch 1/10
  1/804 [..............................] - ETA: 4:45 - loss: 0.7238 - accuracy: 0.5312
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
 20/804 [..............................] - ETA: 2s - loss: 0.6642 - accuracy: 0.6078  
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
 40/804 [>.............................] - ETA: 2s - loss: 0.6249 - accuracy: 0.6734
Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5

Epoch 1: saving model to checkpoints_4\weights.01.hdf5
 57/804 [=>............................] - ETA: 2s - loss: 0.6163 - accuracy: 0.6842
Epoch 1: saving model to checkpo

In [52]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


268/268 - 0s - loss: 0.5523 - accuracy: 0.7272 - 224ms/epoch - 837us/step
Loss: 0.5523378849029541, Accuracy: 0.7272303104400635


# Random Forests

In [53]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
barf = BalancedRandomForestClassifier(n_estimators=100, random_state=512)

y = application_df.IS_SUCCESSFUL
X = application_df.drop(columns='IS_SUCCESSFUL')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=512, stratify=y)


# Fit the model
barf.fit(X_train, y_train)
# calculate accuracy
y_pred = barf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

# Random Forests outperforms the NN in a fraction of the time

0.7313214692153926

In [54]:
# In a previous analysis, removing the Use Cases and ask amounts yielded higher accuracy

# Read new
application_df = pd.read_csv("Resources/charity_data.csv")

# Drop names
application_df.drop(columns=['EIN', 'NAME'], inplace=True)

# Bin the types
# Highest accuracies were either with 50 or 200 as the cutoff

replace_application = list(app_type_counts[app_type_counts < 50].index)
for app in replace_application:
    application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(
        app, "Other")

# Bin the Classification
# Highest accuracy was with 200 as the cutoff

replace_class = list(classification_counts[classification_counts < 200].index)
for cls in replace_class:
    application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(
        cls, "Other")


# Remove the Ask Amount - this seems to be a bad predictor, even when binned
application_df = application_df.drop(columns="ASK_AMT")

# Encode the categorical variables
application_cat = application_df.dtypes[application_df.dtypes == 'object'].index.tolist(
)

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(application_df[application_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(application_cat)

# Drop redundancies and Ask amount
encode_df = encode_df.drop(columns="SPECIAL_CONSIDERATIONS_N")


# Merge one-hot encoded features and drop the originals
application_df = application_df.merge(
    encode_df, left_index=True, right_index=True)
application_df.drop(columns=application_cat, inplace=True)



# Scale and Train the Data for Random Forests
y = application_df.IS_SUCCESSFUL
X = application_df.drop(columns='IS_SUCCESSFUL')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=512)


# Fit the model after removing ASK_AMT
barf.fit(X_train, y_train)
# calculate accuracy
y_pred = barf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


0.7323270979378913

In [55]:
# Removing the USE_CASE and ASK_AMT, acc = .7342
# Removing the USE_CASE using ASK_BIN, acc = .7322
# Removing USE_CASE, ASK_AMT, STATUS = .7341 * Consistently the highest
# Removing USE_CASE, ASK_AMT, STATUS, AFFILIATION .6523
# Removing USE_CASE, ASK_AMT, STATUS, ORGANIZATION .7310
# Removing USE_CASE, ASK_AMT, STATUS, CLASSIFICATION .7330
# Removing USE_CASE, ASK_AMT, STATUS, APPLICATION_TYPE  .7012
# Removing USE_CASE, ASK_AMT, STATUS, INCOME_AMT  .7286
# Removing USE_CASE, ASK_AMT, STATUS, APPLICATION_TYPE  .7012

# Read new
application_df = pd.read_csv("Resources/charity_data.csv")

# Drop names
application_df.drop(columns=['EIN', 'NAME', 'ASK_AMT', 'STATUS'], inplace=True)

# Bin the types
# Highest accuracies were either with 50 or 200 as the cutoff

replace_application = list(app_type_counts[app_type_counts < 50].index)
for app in replace_application:
    application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(
        app, "Other")

# Bin the Classification
# Highest accuracy was with 200 as the cutoff

replace_class = list(classification_counts[classification_counts < 200].index)
for cls in replace_class:
    application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(
        cls, "Other")


# # Apply the bins, labeling with the arbitrary 1-5 categories for the network learning
# application_df["ASK_BIN"] = pd.cut(
#     application_df["ASK_AMT"], bins=bins, labels=labels)

# # Remove the Ask Amount - this seems to be a bad predictor, even when binned
# application_df = application_df.drop(columns="ASK_AMT")

# Encode the categorical variables
application_cat = application_df.dtypes[application_df.dtypes == 'object'].index.tolist(
)

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(application_df[application_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(application_cat)

# Drop redundancies and Ask amount
encode_df = encode_df.drop(columns="SPECIAL_CONSIDERATIONS_N")


# Merge one-hot encoded features and drop the originals
application_df = application_df.merge(
    encode_df, left_index=True, right_index=True)
application_df.drop(columns=application_cat, inplace=True)


# Scale and Train the Data for Random Forests
y = application_df.IS_SUCCESSFUL
X = application_df.drop(columns='IS_SUCCESSFUL')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=512)


# Fit the model after removing ASK_AMT
barf.fit(X_train, y_train)
# calculate accuracy
y_pred = barf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


0.732856789003632

### Try the NN model with these parameters

In [56]:
X_train_scaled, X_test_scaled, y_train, y_test = df_trainer(application_df)

number_input_features = len(X_train_scaled[0])
hidden_layer1 = 56
hidden_layer2 = 76
hidden_layer3 = 36
hidden_layer4 = 21
hidden_layer5 = 11


nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='softmax'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer3, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer4, activation='sigmoid'))
nn.add(tf.keras.layers.Dense(units=hidden_layer5, activation='sigmoid'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

# Define Checkpoint Path and filenames

os.makedirs("checkpoints/", exist_ok=True)
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 56)                2520      
                                                                 
 dense_19 (Dense)            (None, 76)                4332      
                                                                 
 dense_20 (Dense)            (None, 36)                2772      
                                                                 
 dense_21 (Dense)            (None, 21)                777       
                                                                 
 dense_22 (Dense)            (None, 11)                242       
                                                                 
 dense_23 (Dense)            (None, 1)                 12        
                                                                 
Total params: 10,655
Trainable params: 10,655
Non-trai

In [57]:
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=5
)

fit_model = nn.fit(X_train_scaled, y_train, epochs=20, callbacks=[cp_callback])


Epoch 1/20
  1/804 [..............................] - ETA: 6:15 - loss: 0.6980 - accuracy: 0.5625
Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5
 15/804 [..............................] - ETA: 3s - loss: 0.7087 - accuracy: 0.5229  
Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5
 30/804 [>.............................] - ETA: 2s - loss: 0.7031 - accuracy: 0.5229
Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5
 45/804 [>.............................] - ETA: 2s - loss: 0.6962 - accuracy: 0.5417
Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5
 

In [58]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


268/268 - 0s - loss: 0.5575 - accuracy: 0.7273 - 264ms/epoch - 983us/step
Loss: 0.5574709177017212, Accuracy: 0.7273469567298889


# Comparing the OneHotEncoded Ask Bins vs. the Numerical Ask Bins

The above accuracy is for the numerical ask bins, read from a single column, though these numbers are just categories. Below, each category is treated categorically and encoded. 

In [92]:

# Read new
application_df = pd.read_csv("Resources/charity_data.csv")

# Drop names
application_df.drop(columns=['EIN', 'NAME', 'STATUS'], inplace=True)

# With the ASK_BINS containing 7 bins, the accuracy was .7194
bins = [0, 5000, 10000, 50000, 100000, 500000, 1000000, max_ask]
labels = ['ASK_0', 'ASK_1', 'ASK_2', 'ASK_3', 'ASK_4', 'ASK_5', 'ASK_6']


# Bin the types
replace_application = list(app_type_counts[app_type_counts < 50].index)
for app in replace_application:
    application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(
        app, "Other")

# Bin the Classification
replace_class = list(classification_counts[classification_counts < 200].index)
for cls in replace_class:
    application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(
        cls, "Other")

# Apply the bins, labeling with the arbitrary 1-5 categories for the network learning
application_df["ASK_BIN"] = pd.cut(
    application_df["ASK_AMT"], bins=bins, labels=labels)

# Remove the Ask Amount - this seems to be a bad predictor, even when binned
application_df = application_df.drop(columns="ASK_AMT")

# Encode the categorical variables
application_cat = application_df.dtypes[(application_df.dtypes == 'object') | (
    application_df.dtypes == 'category')].index.tolist()

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(application_df[application_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(application_cat)

# Drop redundancies and Ask amount
encode_df = encode_df.drop(columns="SPECIAL_CONSIDERATIONS_N")


# Merge one-hot encoded features and drop the originals
application_df = application_df.merge(
    encode_df, left_index=True, right_index=True)
application_df.drop(columns=application_cat, inplace=True)




## Random Forests

In [93]:
# Scale and Train the Data for Random Forests
y = application_df.IS_SUCCESSFUL
X = application_df.drop(columns='IS_SUCCESSFUL')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=512, stratify=y)


# Fit the model after removing ASK_AMT
barf.fit(X_train, y_train)
# calculate accuracy
y_pred = barf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


0.7239835140180255

## Neural Network

In [94]:
X_train_scaled, X_test_scaled, y_train, y_test = df_trainer(application_df)

number_input_features = len(X_train_scaled[0])
hidden_layer1 = 56
hidden_layer2 = 76
hidden_layer3 = 36
hidden_layer4 = 21
hidden_layer5 = 11


nn = tf.keras.models.Sequential()

# Layers
nn.add(tf.keras.layers.Dense(units=hidden_layer1,
       input_dim=number_input_features, activation='softmax'))
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer3, activation='relu'))
nn.add(tf.keras.layers.Dense(units=hidden_layer4, activation='sigmoid'))
nn.add(tf.keras.layers.Dense(units=hidden_layer5, activation='sigmoid'))
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

# Define Checkpoint Path and filenames

os.makedirs("checkpoints/", exist_ok=True)
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"


Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_27 (Dense)            (None, 56)                2352      
                                                                 
 dense_28 (Dense)            (None, 76)                4332      
                                                                 
 dense_29 (Dense)            (None, 36)                2772      
                                                                 
 dense_30 (Dense)            (None, 21)                777       
                                                                 
 dense_31 (Dense)            (None, 11)                242       
                                                                 
 dense_32 (Dense)            (None, 1)                 12        
                                                                 
Total params: 10,487
Trainable params: 10,487
Non-trai

In [95]:
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=5
)

fit_model = nn.fit(X_train_scaled, y_train, epochs=20, callbacks=[cp_callback])


Epoch 1/20
  1/804 [..............................] - ETA: 6:03 - loss: 0.7908 - accuracy: 0.4375
Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5
 15/804 [..............................] - ETA: 2s - loss: 0.7242 - accuracy: 0.5104  
Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5
 30/804 [>.............................] - ETA: 2s - loss: 0.7101 - accuracy: 0.5188
Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5
 45/804 [>.............................] - ETA: 2s - loss: 0.7046 - accuracy: 0.5160
Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5

Epoch 1: saving model to checkpoints\weights.01.hdf5
 

In [96]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


268/268 - 0s - loss: 0.5712 - accuracy: 0.7184 - 252ms/epoch - 939us/step
Loss: 0.5712328553199768, Accuracy: 0.718367338180542
