## Preprocessing

In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from google.colab import files
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd
# Source of the raw charity application records (remote CSV).
charity_data_url = "https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv"

# Load the CSV into a DataFrame and preview the first rows.
raw_application_data = pd.read_csv(charity_data_url)
raw_application_data.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [None]:
# 'EIN' and 'NAME' are identification columns; remove them before any
# further preprocessing and display the resulting frame.
id_columns = ["EIN", "NAME"]
raw_application_data = raw_application_data.drop(id_columns, axis=1)
raw_application_data

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...
34294,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


In [None]:
# Count the distinct values in every column. The printed result is used to
# spot columns with many categories (candidates for binning before encoding).
column_unique_value_counts = raw_application_data.nunique()
print(column_unique_value_counts)

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64


In [None]:
# Frequency of each APPLICATION_TYPE value. The resulting Series (indexed by
# application type) is reused by the binning cell below to pick rare types.
application_type_frequencies = raw_application_data['APPLICATION_TYPE'].value_counts()
print(application_type_frequencies)

APPLICATION_TYPE
T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: count, dtype: int64


In [None]:
# Application types that occur fewer than `cutoff_value` times are pooled
# into a single "Other" category.
cutoff_value = 528

# Application types whose frequency is strictly below the cutoff
infrequent_application_types = application_type_frequencies[
    application_type_frequencies < cutoff_value
].index.tolist()

# Replace all rare types in one vectorised pass. Passing the whole list to
# replace() gives the same result as replacing one value per loop iteration,
# without rescanning and reassigning the column once per rare type.
raw_application_data['APPLICATION_TYPE'] = raw_application_data['APPLICATION_TYPE'].replace(
    infrequent_application_types, "Other"
)

# Check to make sure binning was successful
raw_application_data['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
Other,276


In [None]:
# Frequency of each CLASSIFICATION value. The resulting Series (indexed by
# classification code) is reused by the two cells below: first to drop rows
# with very rare codes, then to bin the remaining infrequent codes.
classification_type_frequencies = raw_application_data['CLASSIFICATION'].value_counts()
print(classification_type_frequencies)

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C1248        1
C6100        1
C1820        1
C1900        1
C2150        1
Name: count, Length: 71, dtype: int64


In [None]:
# Rows whose CLASSIFICATION code occurs at most `cutoff_value` times are
# removed from the data set (they are dropped, not pooled into "Other").
cutoff_value = 1

# Classification codes whose frequency is <= cutoff_value. Stored under its own
# name so the application-type list from the earlier binning cell is not clobbered.
rare_classifications = classification_type_frequencies[
    classification_type_frequencies <= cutoff_value
].index.tolist()

# Keep only rows whose code is not in that list. The explicit .copy() turns the
# filtered result into an independent DataFrame, so later column assignments
# on it do not raise SettingWithCopyWarning.
raw_application_data = raw_application_data[
    ~raw_application_data['CLASSIFICATION'].isin(rare_classifications)
].copy()

# Check to make sure the rows with low count values were dropped
raw_application_data['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
C7000,777
C1700,287
C4000,194
C5000,116
C1270,114


In [None]:
# Classification codes that occur fewer than `cutoff_value` times are pooled
# into a single "Other" category.
cutoff_value = 1883
infrequent_classifications = classification_type_frequencies[
    classification_type_frequencies < cutoff_value
].index.tolist()

# Build a new DataFrame with the binned column instead of assigning into the
# existing one: at this point the frame may be a filtered slice, and assigning
# a column on it raises SettingWithCopyWarning. assign() returns a fresh frame
# with the same column order, and the single list-based replace() covers every
# infrequent code in one pass (codes no longer present are simply ignored).
raw_application_data = raw_application_data.assign(
    CLASSIFICATION=raw_application_data['CLASSIFICATION'].replace(
        infrequent_classifications, "Other"
    )
)

# Check to make sure binning was successful
raw_application_data['CLASSIFICATION'].value_counts()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_application_data['CLASSIFICATION'] = raw_application_data['CLASSIFICATION'].replace(cls,"Other")


Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,2235
C3000,1918
C2100,1883


In [None]:
# Data type of every column in the working DataFrame
dataframe_column_types = raw_application_data.dtypes

# Columns stored with the 'object' dtype are treated as categorical features
is_object_column = dataframe_column_types == 'object'
categorical_feature_columns = list(dataframe_column_types.index[is_object_column])

print("Categorical Columns:")
print(categorical_feature_columns)

# One-hot encode the categorical columns as integer 0/1 indicator columns
encoded_data = pd.get_dummies(raw_application_data[categorical_feature_columns]).astype(int)

# Swap the original categorical columns for their indicator columns:
# the remaining (non-categorical) columns come first, followed by the dummies.
raw_application_data = pd.concat(
    [raw_application_data.drop(columns=categorical_feature_columns), encoded_data],
    axis=1,
)

# Preview the encoded DataFrame
raw_application_data.head()

Categorical Columns:
['APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION', 'USE_CASE', 'ORGANIZATION', 'INCOME_AMT', 'SPECIAL_CONSIDERATIONS']


Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,108590,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,1,5000,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,1,6692,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,1,142590,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [None]:
# Inspect the dtype of every column after the categorical columns were encoded.
raw_application_data.dtypes

Unnamed: 0,0
STATUS,int64
ASK_AMT,int64
IS_SUCCESSFUL,int64
APPLICATION_TYPE_Other,int64
APPLICATION_TYPE_T10,int64
APPLICATION_TYPE_T19,int64
APPLICATION_TYPE_T3,int64
APPLICATION_TYPE_T4,int64
APPLICATION_TYPE_T5,int64
APPLICATION_TYPE_T6,int64


In [None]:
# The target is the IS_SUCCESSFUL column; every other column is a feature.
labels = raw_application_data['IS_SUCCESSFUL']
features = raw_application_data.drop(columns=['IS_SUCCESSFUL'])

# Hold out a test set. The split is stratified on the target and uses a
# fixed random_state so it is repeatable.
X_train, X_test, y_train, test_labels = train_test_split(
    features, labels, stratify=labels, random_state=1
)

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and the fitted scaler is then applied to both splits.
standard_scaler = StandardScaler()
features_scaler = standard_scaler.fit(X_train)

features_train_scaled, features_test_scaled = (
    features_scaler.transform(split) for split in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

In [None]:
# Define the model: a feed-forward binary classifier with two hidden layers.
# The input width is declared with an explicit Input object rather than the
# `input_dim` argument on the first Dense layer, which Keras warns against
# for Sequential models. The layers themselves are unchanged.
neural_network_model = tf.keras.models.Sequential()

# Input: one value per feature column
neural_network_model.add(tf.keras.Input(shape=(X_train.shape[1],)))

# First hidden layer
neural_network_model.add(tf.keras.layers.Dense(units=80, activation="relu"))

# Second hidden layer
neural_network_model.add(tf.keras.layers.Dense(units=30, activation="relu"))

# Output layer: a single sigmoid unit for the binary target
neural_network_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
neural_network_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss,
# and accuracy reported as the tracked metric.
neural_network_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Fit the network on the scaled training features for 200 epochs.
# The value returned by fit() is kept in `trained_model`.
trained_model = neural_network_model.fit(x=features_train_scaled, y=y_train, epochs=200)

Epoch 1/200
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 8ms/step - accuracy: 0.7028 - loss: 0.5897
Epoch 2/200
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7273 - loss: 0.5569
Epoch 3/200
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7298 - loss: 0.5538
Epoch 4/200
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7280 - loss: 0.5536
Epoch 5/200
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7322 - loss: 0.5505
Epoch 6/200
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7297 - loss: 0.5525
Epoch 7/200
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7273 - loss: 0.5488
Epoch 8/200
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7325 - loss: 0.5475
Epoch 9/200
[1m804/804[0m [3

In [None]:
# Score the trained network on the held-out, scaled test split and report
# the resulting loss and accuracy.
model_loss, final_model_accuracy = neural_network_model.evaluate(
    x=features_test_scaled, y=test_labels, verbose=2
)
print(f"Loss: {model_loss}, Accuracy: {final_model_accuracy}")

268/268 - 1s - 3ms/step - accuracy: 0.7301 - loss: 0.5721
Loss: 0.5721433758735657, Accuracy: 0.7300735116004944


In [None]:
# Save the model to an HDF5 file, then hand that same file to
# files.download (imported from google.colab).
export_path = 'AlphabetSoupCharity_optimization1.h5'
neural_network_model.save(export_path)
files.download(export_path)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>