In [1]:
# Import dependencies
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")

# Drop the 'EIN' and 'NAME' columns
application_df = application_df.drop(columns=["EIN", "NAME"])

In [3]:
# Examine value counts
application_counts = application_df["APPLICATION_TYPE"].value_counts()
print(application_counts)

# Choose cutoff
application_types_to_replace = application_counts[application_counts < 500].index

# Replace with "Other"
for app in application_types_to_replace:
    application_df["APPLICATION_TYPE"] = application_df["APPLICATION_TYPE"].replace(app, "Other")

# Check to make sure replacement was successful
print(application_df["APPLICATION_TYPE"].value_counts())


APPLICATION_TYPE
T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: count, dtype: int64
APPLICATION_TYPE
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: count, dtype: int64


In [4]:
# Examine value counts
classification_counts = application_df["CLASSIFICATION"].value_counts()
print(classification_counts)

# Choose cutoff
classifications_to_replace = classification_counts[classification_counts < 1000].index

# Replace with "Other"
for cls in classifications_to_replace:
    application_df["CLASSIFICATION"] = application_df["CLASSIFICATION"].replace(cls, "Other")

# Check to make sure replacement was successful
print(application_df["CLASSIFICATION"].value_counts())


CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C1248        1
C6100        1
C1820        1
C1900        1
C2150        1
Name: count, Length: 71, dtype: int64
CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: count, dtype: int64


In [5]:
# Apply log transformation to ASK_AMT to reduce skewness
import numpy as np
application_df["ASK_AMT"] = np.log1p(application_df["ASK_AMT"])

In [6]:
# One-hot encode the categorical variables
application_df = pd.get_dummies(application_df).astype(int)

# Split features (X) and target (y)
X = application_df.drop("IS_SUCCESSFUL", axis=1).values
y = application_df["IS_SUCCESSFUL"].values

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Scale the data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [12]:
from tensorflow.keras.callbacks import EarlyStopping

# Build the improved model
nn = tf.keras.models.Sequential()
nn.add(tf.keras.layers.Dense(units=64, activation="relu", input_dim=X_train_scaled.shape[1]))
nn.add(tf.keras.layers.Dense(units=32, activation="relu"))
nn.add(tf.keras.layers.Dense(units=16, activation="relu"))
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# EarlyStopping to prevent overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model (with validation split)
fit_model = nn.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=100,
    callbacks=[early_stop],
    verbose=1
)

# fit_model = nn.fit(X_train_scaled, y_train, epochs=150, verbose=1)


Epoch 1/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.6986 - loss: 0.5968 - val_accuracy: 0.7388 - val_loss: 0.5478
Epoch 2/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7252 - loss: 0.5560 - val_accuracy: 0.7370 - val_loss: 0.5520
Epoch 3/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7376 - loss: 0.5435 - val_accuracy: 0.7355 - val_loss: 0.5479
Epoch 4/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7302 - loss: 0.5516 - val_accuracy: 0.7362 - val_loss: 0.5443
Epoch 5/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7273 - loss: 0.5498 - val_accuracy: 0.7361 - val_loss: 0.5496
Epoch 6/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7327 - loss: 0.5444 - val_accuracy: 0.7364 - val_loss: 0.5488
Epoch 7/100
[1m644/64

In [13]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 1s - 2ms/step - accuracy: 0.7301 - loss: 0.5544
Loss: 0.5544053912162781, Accuracy: 0.7301457524299622


In [14]:
nn.save("AlphabetSoupCharity_Optimization.h5")

