## Preprocessing

In [5]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Import pandas and read the charity_data.csv from the provided cloud URL.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

2025-01-27 20:50:48.673483: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [7]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df = application_df.drop(columns=['EIN', 'NAME'])

In [9]:
# Determine the number of unique values in each column.
unique_values = application_df.nunique()

# Display the number of unique values
print(unique_values)

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64


In [15]:
# Look at the value counts for 'APPLICATION_TYPE'
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()

# Set a threshold to consider as "Other" (for example, replace any type that appears less than 100 times)
threshold = 100

# Replace values with 'Other' if their count is below the threshold
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].apply(
    lambda x: x if application_type_counts[x] >= threshold else 'Other'
)

# Display the updated value counts
print(application_df['APPLICATION_TYPE'].value_counts())

APPLICATION_TYPE
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
T9         156
Other      120
Name: count, dtype: int64


In [19]:
# Choose a cutoff value (e.g., replace types that appear less than 100 times)
cutoff_value = 100

# Create a list of application types that appear less frequently than the cutoff
application_types_to_replace = application_df['APPLICATION_TYPE'].value_counts()[application_df['APPLICATION_TYPE'].value_counts() < cutoff_value].index.tolist()

# Replace in the dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app, "Other")

# Check to make sure replacement was successful
print(application_df['APPLICATION_TYPE'].value_counts())

APPLICATION_TYPE
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
T9         156
Other      120
Name: count, dtype: int64


In [21]:
# Look at CLASSIFICATION value counts to identify and replace with "Other"
#  YOUR CODE GOES HERE

# Choose a cutoff value (e.g., replace types that appear less than 100 times)
cutoff_value = 100

# Create a list of classifications that appear less frequently than the cutoff
classification_types_to_replace = application_df['CLASSIFICATION'].value_counts()[application_df['CLASSIFICATION'].value_counts() < cutoff_value].index.tolist()

# Replace in the dataframe
for classification in classification_types_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(classification, "Other")

# Check to make sure replacement was successful
print(application_df['CLASSIFICATION'].value_counts())


CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
Other      669
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
Name: count, dtype: int64


In [23]:
# You may find it helpful to look at CLASSIFICATION value counts >1
#  YOUR CODE GOES HERE

# Look at CLASSIFICATION value counts and filter those that appear more than once
classification_counts = application_df['CLASSIFICATION'].value_counts()

# Filter to keep only classifications with counts greater than 1
classification_counts_greater_than_1 = classification_counts[classification_counts > 1]

# Display the filtered counts
print(classification_counts_greater_than_1)


CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
Other      669
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
Name: count, dtype: int64


In [25]:
# Choose a cutoff value and create a list of classifications to be replaced
cutoff_value = 100
# use the variable name `classifications_to_replace`
# Create a list of classifications that appear less frequently than the cutoff
classifications_to_replace = application_df['CLASSIFICATION'].value_counts()[application_df['CLASSIFICATION'].value_counts() < cutoff_value].index.tolist()


# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Check to make sure replacement was successful
application_df['CLASSIFICATION'].value_counts()

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
Other      669
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
Name: count, dtype: int64

In [29]:
# Convert categorical data to numeric with `pd.get_dummies`

application_df_encoded = pd.get_dummies(application_df, columns=['APPLICATION_TYPE', 'CLASSIFICATION'])

# Check the first few rows of the modified DataFrame
application_df_encoded.head()

Unnamed: 0,AFFILIATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,...,CLASSIFICATION_C1270,CLASSIFICATION_C1700,CLASSIFICATION_C2000,CLASSIFICATION_C2100,CLASSIFICATION_C2700,CLASSIFICATION_C3000,CLASSIFICATION_C4000,CLASSIFICATION_C5000,CLASSIFICATION_C7000,CLASSIFICATION_Other
0,Independent,ProductDev,Association,1,0,N,5000,1,False,True,...,False,False,False,False,False,False,False,False,False,False
1,Independent,Preservation,Co-operative,1,1-9999,N,108590,1,False,False,...,False,False,True,False,False,False,False,False,False,False
2,CompanySponsored,ProductDev,Association,1,0,N,5000,0,False,False,...,False,False,False,False,False,True,False,False,False,False
3,CompanySponsored,Preservation,Trust,1,10000-24999,N,6692,1,False,False,...,False,False,True,False,False,False,False,False,False,False
4,Independent,Heathcare,Trust,1,100000-499999,N,142590,1,False,False,...,False,False,False,False,False,False,False,False,False,False


In [35]:
# Split our preprocessed data into our features and target arrays
X = application_df_encoded.drop(columns=['IS_SUCCESSFUL'])  # Features (everything except target)
y = application_df_encoded['IS_SUCCESSFUL']  # Target (the column we want to predict)


# Split the preprocessed data into a training and testing dataset (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Check the shapes of the resulting datasets
print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Testing target shape: {y_test.shape}")

Training features shape: (27439, 29)
Testing features shape: (6860, 29)
Training target shape: (27439,)
Testing target shape: (6860,)


In [43]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# First, apply pd.get_dummies() to both the training and testing data together
X_encoded = pd.get_dummies(application_df.drop(columns=['IS_SUCCESSFUL']))  # Encode features

# Split into features (X) and target (y) again after encoding
X = X_encoded
y = application_df['IS_SUCCESSFUL']

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now we can safely apply StandardScaler because X_train and X_test have the same columns
scaler = StandardScaler()

# Fit the scaler on the training data
X_train_scaled = scaler.fit_transform(X_train)

# Scale the testing data using the same scaler
X_test_scaled = scaler.transform(X_test)

# Check the shapes of the scaled data
print(f"Scaled training features shape: {X_train_scaled.shape}")
print(f"Scaled testing features shape: {X_test_scaled.shape}")


Scaled training features shape: (27439, 50)
Scaled testing features shape: (6860, 50)


## Compile, Train and Evaluate the Model

In [47]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = X_train_scaled.shape[1]

# Define the model
nn = tf.keras.models.Sequential()

# First hidden layer
# Number of hidden nodes can vary, here we'll choose 128 for example
nn.add(Dense(units=128, activation='relu', input_dim=input_features))

# Second hidden layer
# Number of hidden nodes can vary, here we'll choose 64 for example
nn.add(Dense(units=64, activation='relu'))

# Output layer
# For binary classification, use 1 unit with sigmoid activation
nn.add(Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [49]:
# Compile the model
nn.compile(optimizer='adam', 
           loss='binary_crossentropy', 
           metrics=['accuracy'])

In [51]:
# Train the model
history = nn.fit(X_train_scaled, 
                 y_train, 
                 epochs=50,         # Number of iterations over the dataset
                 batch_size=32,     # Number of samples per gradient update
                 validation_data=(X_test_scaled, y_test),  # Validation data to monitor performance
                 verbose=1)         # Show progress during training

Epoch 1/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7110 - loss: 0.5827 - val_accuracy: 0.7178 - val_loss: 0.5637
Epoch 2/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7331 - loss: 0.5501 - val_accuracy: 0.7233 - val_loss: 0.5613
Epoch 3/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7290 - loss: 0.5546 - val_accuracy: 0.7261 - val_loss: 0.5573
Epoch 4/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7291 - loss: 0.5508 - val_accuracy: 0.7246 - val_loss: 0.5550
Epoch 5/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7315 - loss: 0.5497 - val_accuracy: 0.7294 - val_loss: 0.5562
Epoch 6/50
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7361 - loss: 0.5430 - val_accuracy: 0.7273 - val_loss: 0.5522
Epoch 7/50
[1m858/858[0m 

In [53]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

215/215 - 0s - 2ms/step - accuracy: 0.7289 - loss: 0.5583
Loss: 0.5582510828971863, Accuracy: 0.728863000869751


In [57]:
# Export our model to HDF5 file
#  YOUR CODE GOES HERE
nn.save('model.h5')

print("Model saved to 'model.h5'")



Model saved to 'model.h5'
