### Preprocessing the Data for a Neural Network

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
from pathlib import Path

# Read the cleaned travel-package dataset into a DataFrame.
# Per the original author's note, this CSV already contains cleaned values
# for MaritalStatus and Gender.
file_path = Path("../Resources/TravelCleanedSkiLearnImputer.csv")
vacay_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows.
vacay_df.head(n=5)

Unnamed: 0,customerid,prodtaken,age,citytier,occupation,gender,numberofpersonvisiting,preferredpropertystar,maritalstatus,numberoftrips,passport,owncar,numberofchildrenvisiting,designation,monthlyincome,typeofcontact,durationofpitch,numberoffollowups,productpitched,pitchsatisfactionscore
0,200000,1,41.0,3,Salaried,Female,3,3.0,Single,1.0,1,1,0.0,Manager,20993.0,Self Enquiry,6.0,3.0,Deluxe,2
1,200001,0,49.0,1,Salaried,Male,3,4.0,Single,2.0,0,1,2.0,Manager,20130.0,Company Invited,14.0,4.0,Deluxe,3
2,200002,1,37.0,1,Free Lancer,Male,3,3.0,Single,7.0,1,0,0.0,Executive,17090.0,Self Enquiry,8.0,4.0,Basic,3
3,200003,0,33.0,1,Salaried,Female,2,3.0,Single,2.0,1,1,1.0,Executive,17909.0,Company Invited,9.0,3.0,Basic,5
4,200004,0,37.622265,1,Small Business,Male,2,4.0,Single,1.0,0,1,0.0,Executive,18468.0,Self Enquiry,8.0,3.0,Basic,5


In [5]:
# Remove the columns that are not used as model inputs: the customer
# identifier plus three descriptive columns.
# NOTE: this reassigns `vacay_df`, so the cell cannot be re-run without
# reloading the data first (the columns would already be gone).
columns_to_drop = [
    "customerid",
    "designation",
    "numberofpersonvisiting",
    "numberofchildrenvisiting",
]
vacay_df = vacay_df.drop(columns=columns_to_drop)
vacay_df.head()

Unnamed: 0,prodtaken,age,citytier,occupation,gender,preferredpropertystar,maritalstatus,numberoftrips,passport,owncar,monthlyincome,typeofcontact,durationofpitch,numberoffollowups,productpitched,pitchsatisfactionscore
0,1,41.0,3,Salaried,Female,3.0,Single,1.0,1,1,20993.0,Self Enquiry,6.0,3.0,Deluxe,2
1,0,49.0,1,Salaried,Male,4.0,Single,2.0,0,1,20130.0,Company Invited,14.0,4.0,Deluxe,3
2,1,37.0,1,Free Lancer,Male,3.0,Single,7.0,1,0,17090.0,Self Enquiry,8.0,4.0,Basic,3
3,0,33.0,1,Salaried,Female,3.0,Single,2.0,1,1,17909.0,Company Invited,9.0,3.0,Basic,5
4,0,37.622265,1,Small Business,Male,4.0,Single,1.0,0,1,18468.0,Self Enquiry,8.0,3.0,Basic,5


In [6]:
# Count the distinct values per column (column-wise, NaN excluded — these are
# the pandas defaults, spelled out here). Because NaN is not counted, a column
# holding missing values can gain an extra "<column>_nan" category later when
# it is one-hot encoded.
vacay_df.nunique(axis=0, dropna=True)

prodtaken                    2
age                         45
citytier                     3
occupation                   4
gender                       2
preferredpropertystar        4
maritalstatus                2
numberoftrips               13
passport                     2
owncar                       2
monthlyincome             2476
typeofcontact                2
durationofpitch             35
numberoffollowups            7
productpitched               5
pitchsatisfactionscore       5
dtype: int64

In [7]:
# Collect the names of all object-dtype (string) columns, in frame order;
# these are the categorical variables that will be one-hot encoded.
vacay_cat = [
    column_name
    for column_name, column_dtype in vacay_df.dtypes.items()
    if column_dtype == "object"
]
vacay_cat

['occupation', 'gender', 'maritalstatus', 'typeofcontact', 'productpitched']

In [8]:
# Create a OneHotEncoder instance.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4, so neither keyword
# works on every version. Keep the default (sparse) output and densify it
# explicitly instead.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list.
# NOTE: missing values are encoded as their own category, which is where the
# `typeofcontact_nan` column comes from.
encoded_values = enc.fit_transform(vacay_df[vacay_cat]).toarray()

# Look up the generated column names. `get_feature_names` was deprecated in
# scikit-learn 1.0 and removed in 1.2 in favour of `get_feature_names_out`;
# fall back to the old name only when the new one is unavailable.
if hasattr(enc, "get_feature_names_out"):
    encoded_names = enc.get_feature_names_out(vacay_cat)
else:
    encoded_names = enc.get_feature_names(vacay_cat)

# Build the encoded frame on the SAME index as the source rows so that the
# later index-based merge lines up even if `vacay_df` does not have a default
# 0..n-1 index.
encode_df = pd.DataFrame(
    encoded_values,
    columns=encoded_names,
    index=vacay_df.index,
)
encode_df.head()

Unnamed: 0,occupation_Free Lancer,occupation_Large Business,occupation_Salaried,occupation_Small Business,gender_Female,gender_Male,maritalstatus_Married,maritalstatus_Single,typeofcontact_Company Invited,typeofcontact_Self Enquiry,typeofcontact_nan,productpitched_Basic,productpitched_Deluxe,productpitched_King,productpitched_Standard,productpitched_Super Deluxe
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [9]:
# Merge one-hot encoded features (aligned on the row index) and drop the
# original categorical columns they replace.
vacay_df = vacay_df.merge(encode_df, left_index=True, right_index=True)

# Pass the labels via `columns=` rather than a positional axis argument:
# positional `axis` in DataFrame.drop emits a FutureWarning on pandas 1.3+
# and raises a TypeError on pandas 2.0+.
vacay_df = vacay_df.drop(columns=vacay_cat)
vacay_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,prodtaken,age,citytier,preferredpropertystar,numberoftrips,passport,owncar,monthlyincome,durationofpitch,numberoffollowups,...,maritalstatus_Married,maritalstatus_Single,typeofcontact_Company Invited,typeofcontact_Self Enquiry,typeofcontact_nan,productpitched_Basic,productpitched_Deluxe,productpitched_King,productpitched_Standard,productpitched_Super Deluxe
0,1,41.0,3,3.0,1.0,1,1,20993.0,6.0,3.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,49.0,1,4.0,2.0,0,1,20130.0,14.0,4.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1,37.0,1,3.0,7.0,1,0,17090.0,8.0,4.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0,33.0,1,3.0,2.0,1,1,17909.0,9.0,3.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0,37.622265,1,4.0,1.0,0,1,18468.0,8.0,3.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [11]:
# Split our preprocessed data into our features and target arrays.
# `prodtaken` is the binary target; every remaining column is a feature.
y = vacay_df["prodtaken"].values
# Use the `columns=` keyword: a positional axis argument to DataFrame.drop is
# deprecated since pandas 1.3 and rejected by pandas 2.0+.
X = vacay_df.drop(columns=["prodtaken"]).values

# Split the preprocessed data into a training and testing dataset.
# A fixed random_state keeps the split reproducible; the default test size
# of train_test_split is used.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics are used when
# computing the scaling parameters.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to each split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

### Compile, Train and Evaluate the Model

In [13]:
# Network dimensions: one input per feature column, two hidden layers.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 30

# Feed-forward binary classifier: two ReLU hidden layers followed by a single
# sigmoid unit that outputs the predicted probability of the positive class.
nn = tf.keras.models.Sequential(
    [
        # First hidden layer (also declares the input width)
        tf.keras.layers.Dense(
            units=hidden_nodes_layer1,
            input_dim=number_input_features,
            activation="relu",
        ),
        # Second hidden layer
        tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
        # Output layer
        tf.keras.layers.Dense(units=1, activation="sigmoid"),
    ]
)

# Print the layer-by-layer structure and parameter counts.
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                2160      
                                                                 
 dense_1 (Dense)             (None, 30)                2430      
                                                                 
 dense_2 (Dense)             (None, 1)                 31        
                                                                 
Total params: 4,621
Trainable params: 4,621
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Import checkpoint dependencies
import os
from tensorflow.keras.callbacks import ModelCheckpoint

#Define the checkpoint path and filenames
#os.makedirs("checkpoints/", exist_ok=True)
#checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

In [15]:
# Configure training: Adam optimizer, binary cross-entropy loss (matching the
# single sigmoid output unit), and accuracy reported as the metric.
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

#cp_callback = ModelCheckpoint(
    #filepath=checkpoint_path,
    #verbose=1,
    #save_weights_only=True,
    #save_freq = 'epoch',
    #period = 5)

In [16]:
# Fit the network on the scaled training data for 50 epochs and keep the
# returned training history object.
# (The checkpoint callback is currently disabled: callbacks=[cp_callback])
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [17]:
# Score the trained model on the held-out (scaled) test split. With one
# compiled metric, evaluate() returns the loss followed by the accuracy.
evaluation = nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.1465 - accuracy: 0.9403 - 131ms/epoch - 3ms/step
Loss: 0.14651383459568024, Accuracy: 0.9402618408203125


In [None]:
#Export our model to HDF5 file
#nn.save("test.h5")