#####  I used a database of an audiobook app from customers that made a purchase at least once. The Algorithm takes it as input and predict whether the customers will buy again from the audiobook company. With that, the company could target customers that are more likely to come back.

In [341]:
import numpy as np
from sklearn import preprocessing
import tensorflow as tf
import keras

### Extract the data

In [342]:
# Loading data
raw_csv_data = np.loadtxt('Audiobooks_data.csv',delimiter=',')
# Removing the first column 'ids' and last column targets to the main data
unscaled_inputs_all = raw_csv_data[:,1:-1]
#  Getting the last column targets
targets_all = raw_csv_data[:,-1]

### Balance the dataset

In [343]:
# Defining the number of '1' in the targets column
num_one_targets = int(np.sum(targets_all))

# Defining the number of '0' in the targets column and appending the difference from '1' to be removed
zero_targets_counter = 0
indices_to_remove = []
for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
        zero_targets_counter +=1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(int(i))
# Removing '0' from inputs and targets, and balancing with the number of '1'             
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

### Standardize inputs

In [344]:
# Standardizing the inputs
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

### Shuffle data

In [345]:
# Shuffling a variable
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

# Mapping the shuffled variable on inputs and targets also shuffling them
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

### Split into train, validation and test

In [346]:
# Creating the proportion of train, validation and targets
samples_count = shuffled_inputs.shape[0]
train_samples_count = int(0.8*samples_count)
validation_samples_count = int(0.1*samples_count)
test_samples_count = int(0.1*samples_count)

# Creating the train input and targets section
train_inputs = shuffled_inputs[: train_samples_count]
train_targets = shuffled_targets[: train_samples_count]

# Creating the validation input and targets section
validation_inputs = shuffled_inputs[train_samples_count : train_samples_count + validation_samples_count]
validation_targets = shuffled_targets[train_samples_count : train_samples_count + validation_samples_count]

# Creating the test input and targets section
test_inputs = shuffled_inputs[train_samples_count + validation_samples_count :]
test_targets = shuffled_targets[train_samples_count + validation_samples_count :]

# printing the targets to analise how disperse they are
print(np.sum(train_targets),train_samples_count,np.sum(train_targets)/train_samples_count)
print(np.sum(validation_targets),validation_samples_count,np.sum(validation_targets)/validation_samples_count)
print(np.sum(test_targets),test_samples_count,np.sum(test_targets)/test_samples_count)

1796.0 3579 0.5018161497625034
232.0 447 0.5190156599552572
209.0 447 0.46756152125279643


### Save in .npz

In [347]:
# Saving train, validation and test in npz files
np.savez('Audiobooks_data_train',inputs=train_inputs,targets=train_targets)
np.savez('Audiobooks_data_validation',inputs=validation_inputs,targets=validation_targets)
np.savez('Audiobooks_data_test',inputs=test_inputs,targets=test_targets)

### Data

In [348]:
# Loading train data and creating variables in float in int
npz = np.load('Audiobooks_data_train.npz')
train_input = npz['inputs'].astype(np.float64)
train_target = npz['targets'].astype(np.int32)

# Loading validation data and creating variables in float in int
npz = np.load('Audiobooks_data_validation.npz')
validation_input = npz['inputs'].astype(np.float64)
validation_target = npz['targets'].astype(np.int32)

# Loading test data and creating variables in float in int
npz = np.load('Audiobooks_data_test.npz')
test_input = npz['inputs'].astype(np.float64)
test_target = npz['targets'].astype(np.int32)

### Model

In [349]:
# Declaring variables for width of the inputs, outputs and hidden layers.
input_size = 10
output_size = 2
hidden_layer_size = 50

# Defining the actual model
model = tf.keras.Sequential([
                          # Biulding the neural network. It takes the inputs provided to the model and calculates
                          # the inputs and weights dot product and adds the bias. Also applies the activation function.
                          tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
                          # Creating a second hidden layer with the activation function.
                          tf.keras.layers.Dense(hidden_layer_size,activation='relu'),
                          # Creating the output layer. Because this program is a classifier, activation function of the
                          # output layer must transforme the values into probabilities, for that I used 'SoftMax'
                          tf.keras.layers.Dense(output_size,activation='softmax')
                         ])

### Optimizer

In [350]:
# Creating the optimizer and loss function, for optimazer I defined 'Adam'.
# Loss has to be for classifiers, then I defined 'sparse_categorical_crossentropy' -
# because I did not one-hot encoded the targets
# Including metrics to calculate throughout the training and testing processes.
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

### Training

In [351]:
# Creating a batch size
batch_size = 100
# Creating a variable to store the number of epoch
max_epochs = 100

# Defining a early stopping in the algorithm to stop before the overfitting, setting patience to 2
# to avoid an excessive restriction
early_stopping = tf.keras.callbacks.EarlyStopping(patience=1)

# Fitting the model
model.fit(train_input,
          train_targets,
          batch_size=batch_size,
          epochs=max_epochs,
          callbacks=[early_stopping],
          validation_data=(validation_inputs,validation_targets),
          verbose=2
         )

Epoch 1/100
36/36 - 1s - loss: 0.5750 - accuracy: 0.6806 - val_loss: 0.4916 - val_accuracy: 0.7763 - 1s/epoch - 29ms/step
Epoch 2/100
36/36 - 0s - loss: 0.4636 - accuracy: 0.7656 - val_loss: 0.4199 - val_accuracy: 0.7919 - 98ms/epoch - 3ms/step
Epoch 3/100
36/36 - 0s - loss: 0.4116 - accuracy: 0.7837 - val_loss: 0.3885 - val_accuracy: 0.8076 - 103ms/epoch - 3ms/step
Epoch 4/100
36/36 - 0s - loss: 0.3854 - accuracy: 0.7885 - val_loss: 0.3722 - val_accuracy: 0.8076 - 99ms/epoch - 3ms/step
Epoch 5/100
36/36 - 0s - loss: 0.3714 - accuracy: 0.7974 - val_loss: 0.3685 - val_accuracy: 0.8031 - 126ms/epoch - 3ms/step
Epoch 6/100
36/36 - 0s - loss: 0.3647 - accuracy: 0.7974 - val_loss: 0.3559 - val_accuracy: 0.8121 - 125ms/epoch - 3ms/step
Epoch 7/100
36/36 - 0s - loss: 0.3543 - accuracy: 0.8089 - val_loss: 0.3593 - val_accuracy: 0.8054 - 141ms/epoch - 4ms/step


<keras.callbacks.History at 0x2a1de3d2850>

In [352]:
# Testing the model, it returns the loss and metrics for the model in 'test_mode'
test_loss, test_accuracy = model.evaluate(test_input,test_targets)

