<a href="https://colab.research.google.com/github/Bag0niku/Neural_Network_Charity_Analysis/blob/main/Charity_Funding_Neural_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up the Environment

In [1]:
%matplotlib
# Import our dependencies
import pandas as pd
import matplotlib as plt
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
import tensorflow as tf

filepath = "https://nn-charity-analysis.s3.us-west-2.amazonaws.com/charity_data.csv"

Using matplotlib backend: agg


# Import and clean the data for use in the Neural Network Model

In [2]:
# Import the data into a dataframe
df = pd.read_csv(filepath)
df

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


In [3]:
# Look for null values and incorrect datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34299 entries, 0 to 34298
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   EIN                     34299 non-null  int64 
 1   NAME                    34299 non-null  object
 2   APPLICATION_TYPE        34299 non-null  object
 3   AFFILIATION             34299 non-null  object
 4   CLASSIFICATION          34299 non-null  object
 5   USE_CASE                34299 non-null  object
 6   ORGANIZATION            34299 non-null  object
 7   STATUS                  34299 non-null  int64 
 8   INCOME_AMT              34299 non-null  object
 9   SPECIAL_CONSIDERATIONS  34299 non-null  object
 10  ASK_AMT                 34299 non-null  int64 
 11  IS_SUCCESSFUL           34299 non-null  int64 
dtypes: int64(4), object(8)
memory usage: 3.1+ MB


In [5]:
#  Transform the string categories into numerical values and scale the data.
# "APPLICATION_TYPE", "AFFILIATION", "CLASSIFICATION", "USE_CASE", "ORGANIZATION", "SPECIAL_CONSIDERATIONS", "INCOME_AMT"

cleaned_df = df.loc[:,["EIN", "NAME"]] 
for col in df.loc[:,["APPLICATION_TYPE", "AFFILIATION", "CLASSIFICATION", "USE_CASE", "ORGANIZATION", "SPECIAL_CONSIDERATIONS", "INCOME_AMT"]].columns:
    cleaned_df[col] = LabelEncoder().fit_transform(df[col])

cleaned_df = cleaned_df.join(df.loc[:, [x for x in df.columns.to_list() if x not in cleaned_df.columns.to_list()]])
cleaned_df

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,SPECIAL_CONSIDERATIONS,INCOME_AMT,STATUS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,0,2,1,4,0,0,0,1,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,10,2,36,3,1,0,1,1,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,12,0,51,4,0,0,0,1,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,10,0,36,3,3,0,2,1,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,10,2,1,1,3,0,3,1,142590,1
...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,11,2,1,4,0,0,0,1,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,11,0,51,4,0,0,0,1,5000,0
34296,996012607,PTA HAWAII CONGRESS,10,0,36,3,0,0,0,1,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,12,2,51,4,0,0,0,1,5000,1


In [6]:
# Name and EIN will be removed for the computation, they will not help
# the machine weigh options and metrics.
encoded_df = cleaned_df.drop(["EIN", "NAME"], axis=1)
encoded_df

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,SPECIAL_CONSIDERATIONS,INCOME_AMT,STATUS,ASK_AMT,IS_SUCCESSFUL
0,0,2,1,4,0,0,0,1,5000,1
1,10,2,36,3,1,0,1,1,108590,1
2,12,0,51,4,0,0,0,1,5000,0
3,10,0,36,3,3,0,2,1,6692,1
4,10,2,1,1,3,0,3,1,142590,1
...,...,...,...,...,...,...,...,...,...,...
34294,11,2,1,4,0,0,0,1,5000,0
34295,11,0,51,4,0,0,0,1,5000,0
34296,10,0,36,3,0,0,0,1,5000,0
34297,12,2,51,4,0,0,0,1,5000,1


In [7]:
# Checking the status of the data
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34299 entries, 0 to 34298
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   APPLICATION_TYPE        34299 non-null  int64
 1   AFFILIATION             34299 non-null  int64
 2   CLASSIFICATION          34299 non-null  int64
 3   USE_CASE                34299 non-null  int64
 4   ORGANIZATION            34299 non-null  int64
 5   SPECIAL_CONSIDERATIONS  34299 non-null  int64
 6   INCOME_AMT              34299 non-null  int64
 7   STATUS                  34299 non-null  int64
 8   ASK_AMT                 34299 non-null  int64
 9   IS_SUCCESSFUL           34299 non-null  int64
dtypes: int64(10)
memory usage: 2.6 MB


In [8]:
# does the data need scaled?
encoded_df.describe()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,SPECIAL_CONSIDERATIONS,INCOME_AMT,STATUS,ASK_AMT,IS_SUCCESSFUL
count,34299.0,34299.0,34299.0,34299.0,34299.0,34299.0,34299.0,34299.0,34299.0,34299.0
mean,10.135631,1.084696,15.153532,3.123152,2.073442,0.000787,1.242194,0.999854,2769199.0,0.532406
std,1.938174,1.000247,19.781661,0.517742,1.374908,0.028046,2.170626,0.012073,87130450.0,0.498956
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5000.0,0.0
25%,10.0,0.0,1.0,3.0,0.0,0.0,0.0,1.0,5000.0,0.0
50%,10.0,2.0,1.0,3.0,3.0,0.0,0.0,1.0,5000.0,1.0
75%,10.0,2.0,36.0,3.0,3.0,0.0,3.0,1.0,7742.0,1.0
max,16.0,5.0,70.0,4.0,3.0,1.0,8.0,1.0,8597806000.0,1.0


In [9]:
std_scaled_df = pd.DataFrame(StandardScaler().fit_transform(encoded_df), index=encoded_df.index, columns=encoded_df.columns)
std_scaled_df

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,SPECIAL_CONSIDERATIONS,INCOME_AMT,STATUS,ASK_AMT,IS_SUCCESSFUL
0,-5.229551,0.915091,-0.715498,1.693625,-1.508081,-0.028068,-0.572283,0.012075,-0.031725,0.937158
1,-0.069980,0.915091,1.053843,-0.237868,-0.780749,-0.028068,-0.111579,0.012075,-0.030536,0.937158
2,0.961935,-1.084444,1.812132,1.693625,-1.508081,-0.028068,-0.572283,0.012075,-0.031725,-1.067056
3,-0.069980,-1.084444,1.053843,-0.237868,0.673915,-0.028068,0.349124,0.012075,-0.031706,0.937158
4,-0.069980,0.915091,-0.715498,-4.100853,0.673915,-0.028068,0.809827,0.012075,-0.030146,0.937158
...,...,...,...,...,...,...,...,...,...,...
34294,0.445977,0.915091,-0.715498,1.693625,-1.508081,-0.028068,-0.572283,0.012075,-0.031725,-1.067056
34295,0.445977,-1.084444,1.812132,1.693625,-1.508081,-0.028068,-0.572283,0.012075,-0.031725,-1.067056
34296,-0.069980,-1.084444,1.053843,-0.237868,-1.508081,-0.028068,-0.572283,0.012075,-0.031725,-1.067056
34297,0.961935,0.915091,1.812132,1.693625,-1.508081,-0.028068,-0.572283,0.012075,-0.031725,0.937158


In [10]:
MinMax_scaled_df = pd.DataFrame(MinMaxScaler().fit_transform(encoded_df), index=encoded_df.index, columns=encoded_df.columns)
MinMax_scaled_df

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,SPECIAL_CONSIDERATIONS,INCOME_AMT,STATUS,ASK_AMT,IS_SUCCESSFUL
0,0.0000,0.4,0.014286,1.00,0.000000,0.0,0.000,1.0,0.000000e+00,1.0
1,0.6250,0.4,0.514286,0.75,0.333333,0.0,0.125,1.0,1.204843e-05,1.0
2,0.7500,0.0,0.728571,1.00,0.000000,0.0,0.000,1.0,0.000000e+00,0.0
3,0.6250,0.0,0.514286,0.75,1.000000,0.0,0.250,1.0,1.967945e-07,1.0
4,0.6250,0.4,0.014286,0.25,1.000000,0.0,0.375,1.0,1.600293e-05,1.0
...,...,...,...,...,...,...,...,...,...,...
34294,0.6875,0.4,0.014286,1.00,0.000000,0.0,0.000,1.0,0.000000e+00,0.0
34295,0.6875,0.0,0.728571,1.00,0.000000,0.0,0.000,1.0,0.000000e+00,0.0
34296,0.6250,0.0,0.514286,0.75,0.000000,0.0,0.000,1.0,0.000000e+00,0.0
34297,0.7500,0.4,0.728571,1.00,0.000000,0.0,0.000,1.0,0.000000e+00,1.0


# Build The Neural Network Model

In [12]:
nn_model = tf.keras.models.Sequential()

In [13]:
nn_model.add(tf.keras.layers.Dense(units="5", activation="relu", input_dim=10))

In [14]:
nn_model.add(tf.keras.layers.Dense(units="1", activation="sigmoid"))

In [15]:
nn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 5)                 55        
                                                                 
 dense_1 (Dense)             (None, 1)                 6         
                                                                 
Total params: 61
Trainable params: 61
Non-trainable params: 0
_________________________________________________________________
