## Alphabet Soup Classification

In [475]:
# Import Dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [476]:
# Import Dataset
charity_df = pd.read_csv("charity_data.csv")
charity_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


### Data Preprocessing

#### Bucketing Categorical Values

In [477]:
# Classification and Application Type value counts
class_charity = charity_df.CLASSIFICATION.value_counts()
application_charity=charity_df.APPLICATION_TYPE.value_counts()
affiliation_charity=charity_df.AFFILIATION.value_counts()

In [478]:
# Determine which values to replace
replace_class = list(class_charity[class_charity < 1000].index)
replace_application = list(application_charity[application_charity < 1065].index)
replace_affiliation = list(affiliation_charity[affiliation_charity < 1500].index)

In [479]:
# Replace in DataFrame
for Class in replace_class:
    charity_df.CLASSIFICATION = charity_df.CLASSIFICATION.replace(Class, "Other")
    
for app in replace_application:
    charity_df.APPLICATION_TYPE = charity_df.APPLICATION_TYPE.replace(app, "Other")
    
for aff in replace_affiliation:
    charity_df.AFFILIATION = charity_df.AFFILIATION.replace(app, "Other")

In [480]:
# Check to see if binning was successful
charity_df.CLASSIFICATION.value_counts()

C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: CLASSIFICATION, dtype: int64

In [481]:
# Check to see if binning was successful
charity_df.APPLICATION_TYPE.value_counts()

T3       27037
Other     2266
T4        1542
T6        1216
T5        1173
T19       1065
Name: APPLICATION_TYPE, dtype: int64

In [482]:
charity_df.AFFILIATION.value_counts()

Independent         18480
CompanySponsored    15705
Family/Parent          64
National               33
Regional               13
Other                   4
Name: AFFILIATION, dtype: int64

#### Encoding Categorical Items

In [483]:
# Create the OneHotEncoder Instance
from sklearn.preprocessing import OneHotEncoder
enc=OneHotEncoder(sparse=False)

# Fit the encoder and produce encoded DataFrame
encode_class_df = pd.DataFrame(enc.fit_transform(charity_df.CLASSIFICATION.values.reshape(-1,1)))
encode_apptype_df = pd.DataFrame(enc.fit_transform(charity_df.APPLICATION_TYPE.values.reshape(-1,1)))
encode_afftype_df = pd.DataFrame(enc.fit_transform(charity_df.AFFILIATION.values.reshape(-1,1)))

# Rename encoded columns
encode_class_df.columns = enc.get_feature_names(["CLASSIFICATION"])
encode_apptype_df.columns = enc.get_feature_names(["APPLICATION_TYPE"])
encode_afftype_df.columns = enc.get_feature_names(["AFFILIATION"])

In [484]:
# Merge DataFrames and dropping the CLASSIFICATION and APPLICATION_TYPE columns
charity_df=charity_df.merge(encode_class_df, left_index=True, right_index=True).drop("CLASSIFICATION",1)
charity_df=charity_df.merge(encode_apptype_df, left_index=True, right_index=True).drop("APPLICATION_TYPE",1)
charity_df=charity_df.merge(encode_afftype_df, left_index=True, right_index=True).drop("AFFILIATION",1)
charity_df.head()

Unnamed: 0,EIN,NAME,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,CLASSIFICATION_CompanySponsored,...,APPLICATION_TYPE_Independent,APPLICATION_TYPE_National,APPLICATION_TYPE_Other,APPLICATION_TYPE_Regional,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,AFFILIATION_Other,AFFILIATION_Regional
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,ProductDev,Association,1,0,N,5000,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,Preservation,Co-operative,1,1-9999,N,108590,1,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,ProductDev,Association,1,0,N,5000,0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,Preservation,Trust,1,10000-24999,N,6692,1,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,Heathcare,Trust,1,100000-499999,N,142590,1,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


#### Dummify Variables

In [485]:
charity_df = pd.get_dummies(charity_df, columns=["USE_CASE", "ORGANIZATION", 
                                                 "SPECIAL_CONSIDERATIONS", "INCOME_AMT", "IS_SUCCESSFUL"])
charity_df.head()

Unnamed: 0,EIN,NAME,STATUS,ASK_AMT,CLASSIFICATION_CompanySponsored,CLASSIFICATION_Family/Parent,CLASSIFICATION_Independent,CLASSIFICATION_National,CLASSIFICATION_Other,CLASSIFICATION_Regional,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,IS_SUCCESSFUL_0,IS_SUCCESSFUL_1
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,1,5000,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,1,108590,0.0,0.0,1.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,1,5000,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,1,6692,0.0,0.0,1.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,1,142590,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1


#### Creating Separate DataFrame with names of Organizations

In [486]:
# Creating a DataFrame for all the names of the organizations
org_name = pd.DataFrame(charity_df, columns = ['EIN','NAME'])
org_name.set_index('EIN', inplace=True)
org_name.index.name=None
org_name.head()

Unnamed: 0,NAME
10520599,BLUE KNIGHTS MOTORCYCLE CLUB
10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR
10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS
10553066,SOUTHSIDE ATHLETIC ASSOCIATION
10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT


#### Creating Separate DataFrame EIN numbers as the Index/Drop Names of Organizations

In [487]:
charity_df = charity_df.drop(columns="NAME")
charity_df.set_index('EIN', inplace=True)
charity_df.index.name=None
charity_df.head()

Unnamed: 0,STATUS,ASK_AMT,CLASSIFICATION_CompanySponsored,CLASSIFICATION_Family/Parent,CLASSIFICATION_Independent,CLASSIFICATION_National,CLASSIFICATION_Other,CLASSIFICATION_Regional,APPLICATION_TYPE_CompanySponsored,APPLICATION_TYPE_Family/Parent,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,IS_SUCCESSFUL_0,IS_SUCCESSFUL_1
10520599,1,5000,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,1
10531628,1,108590,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,1
10547893,1,5000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
10553066,1,6692,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,1
10556103,1,142590,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1


### Defining Target and Measures


In [488]:
y = charity_df.IS_SUCCESSFUL_1.values
X = charity_df.drop(columns=["IS_SUCCESSFUL_1", "IS_SUCCESSFUL_0"]).values

# Split training/testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=85, stratify=y)

### Standardize Data

In [489]:
# Creating StandardScaler Instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Comparing a Random Forest to Deep Neural Network

In [490]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.714


### Creating Neural Network Model

In [491]:
# Define the model - deep neural net
import tensorflow as tf

number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 36
hidden_nodes_layer2 = 18
# hidden_nodes_layer3 = 9

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features,
                         activation="sigmoid")
)

# Second Hidden Layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# # Third Hidden Layer
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid"))

# Output Layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [492]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
268/268 - 0s - loss: 0.5526 - accuracy: 0.7322
Loss: 0.5525825023651123, Accuracy: 0.7322449088096619


In [493]:
nn.summary()

Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_77 (Dense)             (None, 36)                1476      
_________________________________________________________________
dense_78 (Dense)             (None, 18)                666       
_________________________________________________________________
dense_79 (Dense)             (None, 1)                 19        
Total params: 2,161
Trainable params: 2,161
Non-trainable params: 0
_________________________________________________________________
