# Create a machine learning algorithm to predict if a customer will buy again or not using an audiobook app

How we approach this real-world problem:<br>
1. Preprocess the data:<br> balance the dataset<br>divide the dataset into training,<br> validation and test sets<br> save the data in NumPy's `.npz` format, which is easy to feed to TensorFlow<br>
2. Create a class that handles batching<br>
3. Create the ML algorithm

To balance this dataset we look at the targets: we count the total number of samples with target 1 and keep only the same number of samples with target 0, removing the surplus zeros.

# Extract the data from CSV

In [81]:
import numpy as np
from sklearn import preprocessing
# Read the whole comma-separated file into one 2-D float array.
raw_csv_data = np.loadtxt("Audiobooks_data.csv", delimiter=",")

# The last column holds the targets; the first column is skipped
# (presumably an ID column - confirm) and everything in between is model input.
targets_all = raw_csv_data[:, -1]
unscaled_input_all = raw_csv_data[:, 1:-1]

# Balance the dataset

In [82]:
# With 0/1 targets, the sum of the target column is the number of 1-targets.
num_one_target = int(np.sum(targets_all))

# Row positions of every 0-target, in their original order.
zero_target_positions = np.flatnonzero(targets_all == 0)
zero_target_counter = int(zero_target_positions.size)

# Keep the first `num_one_target` zeros and mark every later zero for removal,
# so that both classes end up with the same number of rows.
indices_to_remove = zero_target_positions[num_one_target:].tolist()

# Drop the surplus rows from the inputs and the targets alike.
unscaled_inputs_equal_priors = np.delete(unscaled_input_all, indices_to_remove, axis=0)
target_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)


# Standardize the inputs

In [83]:
# Standardize the balanced inputs with sklearn's preprocessing.scale.
# NOTE(review): scaling is applied before the train/validation/test split further down,
# so the scaling statistics presumably include validation and test rows - confirm this is intended.
scaled_inputs=preprocessing.scale(unscaled_inputs_equal_priors)


# Shuffle the data

In [84]:
# Keep the same information but in a different order, because later on we will do batching.
# A dedicated, seeded generator makes the shuffle (and therefore the train/validation/test
# split and the saved .npz files) reproducible on every run, without touching NumPy's
# global random state.
SHUFFLE_SEED = 42
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)

# A random permutation of the row indices, applied identically to inputs and targets
# so that every input row stays paired with its own target.
shuffled_indices = shuffle_rng.permutation(scaled_inputs.shape[0])
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = target_equal_priors[shuffled_indices]


# Split the dataset into train, validation and test

In [85]:
# Total number of shuffled samples that will be divided between the three splits.
samples_count = len(shuffled_inputs)
print(samples_count)

4474


In [86]:
# 80-10-10 split: 80% of the samples are used for training (int() truncates the fraction).
train_samples_count = int(samples_count * 0.8)
print(train_samples_count)

3579


In [87]:
# 10% of the samples are used for validation (int() truncates the fraction).
validatio_samples_count = int(samples_count * 0.1)
print(validatio_samples_count)

447


In [88]:
# Whatever is left after training and validation goes to the test set,
# so the three sizes always add up to samples_count.
test_samples_count = samples_count - (train_samples_count + validatio_samples_count)
print(test_samples_count)

448


In [89]:
# The split sizes are already decided; now carve the shuffled data into three
# consecutive slices: [0, train) -> train, [train, validation_end) -> validation, rest -> test.
validation_end = train_samples_count + validatio_samples_count

train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

validation_input = shuffled_inputs[train_samples_count:validation_end]
validation_target = shuffled_targets[train_samples_count:validation_end]

test_input = shuffled_inputs[validation_end:]
test_target = shuffled_targets[validation_end:]

# Balance check: for every split print the number of 1-targets, the split size
# and the fraction of 1-targets (a value close to 0.5 means the split is balanced).
for split_targets, split_size in (
    (train_targets, train_samples_count),
    (validation_target, validatio_samples_count),
    (test_target, test_samples_count),
):
    ones_in_split = np.sum(split_targets)
    print(ones_in_split, split_size, ones_in_split / split_size)

1804.0 3579 0.5040514110086617
209.0 447 0.46756152125279643
224.0 448 0.5


# Save three datasets in npz

In [90]:
# Write each split to its own .npz archive (np.savez appends the ".npz" extension),
# storing the arrays under the keys "inputs" and "targets".
for npz_name, split_inputs, split_targets in (
    ("Audiobook_data_train", train_inputs, train_targets),
    ("Audiobook_data_validation", validation_input, validation_target),
    ("Audiobook_data_test", test_input, test_target),
):
    np.savez(npz_name, inputs=split_inputs, targets=split_targets)

# Create methods that will batch the data 

In [91]:
#We use an iterator: the data object is an instance of this class, and its __next__ method defines the operation each loop step performs
class AudioBooks_Data_Reader():
    """Iterator that serves one saved Audiobook ``.npz`` split in batches.

    Usage::

        for inputs_batch, targets_one_hot in AudioBooks_Data_Reader('train', 100):
            ...

    Parameters
    ----------
    dataset : str
        Suffix of the file to load; the file read is ``Audiobook_data_<dataset>.npz``
        in the current working directory (e.g. 'train', 'validation', 'test').
    batch_size : int or None
        Number of samples per batch. ``None`` (the default) means a single batch
        containing the whole split, which is what we want for validation and testing.

    Raises
    ------
    ValueError
        If the resulting batch size is not positive (explicit ``batch_size <= 0``,
        or an empty split with ``batch_size=None``).

    Notes
    -----
    ``batch_count`` uses floor division, so trailing samples that do not fill a
    complete batch are never served.
    """

    # Number of target classes used for the one-hot encoding
    CLASSES_NUM = 2

    #If you do not put input it will automatically take none
    #The init method loads the data from .npz
    def __init__(self,dataset,batch_size=None):
        # The file name must match, including letter case, the names written with
        # np.savez ("Audiobook_data_train", ...), otherwise loading fails on
        # case-sensitive file systems. The context manager closes the archive
        # once both arrays have been read into memory.
        with np.load("Audiobook_data_{0}.npz".format(dataset)) as npz:
            # Built-in float/int replace the np.float/np.int aliases, which were
            # removed from NumPy (they were plain aliases of the built-ins).
            self.inputs = npz['inputs'].astype(float)
            self.targets = npz['targets'].astype(int)

        #Count the batch size
        #If batchsize is none we are either validating or testing so we want all data in a single batch
        if batch_size is None:
            self.batch_size = self.inputs.shape[0]
        else:
            self.batch_size = batch_size

        # Fail early with a clear message instead of a ZeroDivisionError (size 0)
        # or a silently empty iterator (negative size).
        if self.batch_size <= 0:
            raise ValueError(
                "batch_size must be a positive integer, got {0} "
                "(is the dataset empty?)".format(self.batch_size))

        self.curr_batch = 0
        # Only complete batches are counted
        self.batch_count = self.inputs.shape[0] // self.batch_size

    #A method which loads the next batch
    def __next__(self):
        """Return the next ``(inputs_batch, targets_one_hot)`` pair.

        Raises ``StopIteration`` after the last complete batch and rewinds to the
        first batch, so the same object can be looped over again in the next epoch.
        """
        if self.curr_batch >= self.batch_count:
            self.curr_batch = 0
            raise StopIteration()

        # You slice the dataset in batches and then the "next" function loads them one after the other
        start = self.curr_batch * self.batch_size
        batch_slice = slice(start, start + self.batch_size)
        inputs_batch = self.inputs[batch_slice]
        targets_batch = self.targets[batch_slice]
        self.curr_batch += 1

        # One-hot encode the targets: row i gets a 1 in the column given by targets_batch[i].
        # For a 0/1 target this is a bit superfluous, but the same code works for any
        # classification task with more classes.
        batch_len = targets_batch.shape[0]
        targets_one_hot = np.zeros((batch_len, self.CLASSES_NUM))
        targets_one_hot[np.arange(batch_len), targets_batch] = 1

        # The function will return the inputs batch and the one-hot encoded targets
        return inputs_batch, targets_one_hot

    # A method needed for iterating over the batches, as we will put them in a loop
    # This tells Python that the class we're defining is iterable, i.e. that we can use it like:
    # for input, output in data:
        # do things
    # An iterator in Python is a class with a method __next__ that defines exactly how to iterate through its objects
    def __iter__(self):
        """Return the reader itself; it is its own iterator."""
        return self
        
        
            
        
    
    
    

# Create the machine learning algorithm

In [92]:
# The code below uses the TensorFlow 1.x graph API (placeholders, sessions).
# On TensorFlow 2.x those names only exist under tensorflow.compat.v1, so import
# through the compatibility module when it is available and fall back to the plain
# import on old 1.x installations that do not ship it.
try:
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()
except (ImportError, AttributeError):
    import tensorflow as tf

# Network dimensions: number of input features, number of output classes and
# width of each of the two hidden layers.
input_size = 10
output_size = 2
hidden_layer_size = 50

# Clear any graph left over from a previous run of this cell, so that the
# variable names below can be created again without a name clash.
tf.reset_default_graph()

# Placeholders are fed at run time; None leaves the batch dimension open.
inputs = tf.placeholder(tf.float32, [None, input_size])
targets = tf.placeholder(tf.int32, [None, output_size])

# First hidden layer: linear combination followed by ReLU
weights_1 = tf.get_variable("weights_1", [input_size, hidden_layer_size])
biases_1 = tf.get_variable("biases_1", [hidden_layer_size])

outputs_1 = tf.nn.relu(tf.matmul(inputs,weights_1) + biases_1)


# Second hidden layer: same shape of computation, fed by the first layer
weights_2 = tf.get_variable("weights_2", [hidden_layer_size, hidden_layer_size])
biases_2 = tf.get_variable("biases_2",[hidden_layer_size])

outputs_2 = tf.nn.relu(tf.matmul(outputs_1,weights_2) + biases_2)


# Output layer: no activation here, `outputs` are passed to the loss as logits
weights_3 = tf.get_variable("weights_3", [hidden_layer_size, output_size])
biases_3 = tf.get_variable("biases_3", [output_size])

outputs = tf.matmul(outputs_2, weights_3) + biases_3


# Per-sample cross-entropy between the logits and the one-hot targets
loss = tf.nn.softmax_cross_entropy_with_logits(logits = outputs, labels = targets)

# Average over the batch to get a single scalar to minimize
mean_loss = tf.reduce_mean(loss)

optimize = tf.train.AdamOptimizer(learning_rate=0.001).minimize(mean_loss)

# A prediction is correct when the largest output is in the same column as the 1 of the one-hot target
out_equals_target = tf.equal(tf.argmax(outputs,1), tf.argmax(targets,1))

# Fraction of correct predictions in the batch
accuracy = tf.reduce_mean(tf.cast(out_equals_target, tf.float32))


# The session stays open on purpose: the training and test cells below reuse `sess`
sess = tf.InteractiveSession()

initializer = tf.global_variables_initializer()

sess.run(initializer)


# Training configuration: mini-batch size and an upper bound on the number of epochs
batch_size = 100
max_epochs = 50

# Start from a huge "previous" loss so that the first epoch can never trigger early stopping
prev_validation_loss = 9999999.

# Training data is served in mini-batches; validation data as one single batch
train_data = AudioBooks_Data_Reader('train', batch_size)
validation_data = AudioBooks_Data_Reader('validation')

for epoch_counter in range(max_epochs):

    # Training pass: one optimization step per batch, summing the batch losses
    curr_epoch_loss = 0.
    for input_batch, target_batch in train_data:
        train_feed = {inputs: input_batch, targets: target_batch}
        _, batch_loss = sess.run([optimize, mean_loss], feed_dict=train_feed)
        curr_epoch_loss += batch_loss

    # Turn the sum into the average loss per batch
    curr_epoch_loss /= train_data.batch_count

    # Validation pass: forward propagation only, the optimizer is not run
    validation_loss = 0.
    validation_accuracy = 0.
    for input_batch, target_batch in validation_data:
        validation_feed = {inputs: input_batch, targets: target_batch}
        validation_loss, validation_accuracy = sess.run([mean_loss, accuracy],
                                                        feed_dict=validation_feed)

    epoch_report = ''.join([
        'Epoch ', str(epoch_counter+1),
        '. Training loss: ', '{0:.3f}'.format(curr_epoch_loss),
        '. Validation loss: ', '{0:.3f}'.format(validation_loss),
        '. Validation accuracy: ', '{0:.2f}'.format(validation_accuracy * 100.), '%',
    ])
    print(epoch_report)

    # Early stopping: quit as soon as the validation loss gets worse than in the previous epoch
    if validation_loss > prev_validation_loss:
        break

    prev_validation_loss = validation_loss

print('End of training.')



Epoch 1. Training loss: 0.652. Validation loss: 0.508. Validation accuracy: 71.81%
Epoch 2. Training loss: 0.500. Validation loss: 0.431. Validation accuracy: 76.51%
Epoch 3. Training loss: 0.442. Validation loss: 0.386. Validation accuracy: 78.30%
Epoch 4. Training loss: 0.409. Validation loss: 0.359. Validation accuracy: 79.64%
Epoch 5. Training loss: 0.389. Validation loss: 0.344. Validation accuracy: 80.76%
Epoch 6. Training loss: 0.376. Validation loss: 0.335. Validation accuracy: 81.43%
Epoch 7. Training loss: 0.367. Validation loss: 0.329. Validation accuracy: 81.21%
Epoch 8. Training loss: 0.361. Validation loss: 0.324. Validation accuracy: 80.76%
Epoch 9. Training loss: 0.356. Validation loss: 0.321. Validation accuracy: 80.98%
Epoch 10. Training loss: 0.352. Validation loss: 0.317. Validation accuracy: 80.76%
Epoch 11. Training loss: 0.348. Validation loss: 0.315. Validation accuracy: 81.21%
Epoch 12. Training loss: 0.345. Validation loss: 0.313. Validation accuracy: 81.66%
E

# Test the model

In [93]:
# Create a reader for the test split; without a batch size it serves the whole
# split as one single batch, so the loop below runs exactly one iteration.
test_data = AudioBooks_Data_Reader('test')

for input_batch, target_batch in test_data:
    test_feed = {inputs: input_batch, targets: target_batch}
    # Fetching a list returns a list, hence the [0] below
    test_accuracy = sess.run([accuracy], feed_dict=test_feed)

# Convert the fraction of correct predictions to a percentage
test_accuracy_percent = test_accuracy[0] * 100

print('Test accuracy: '+'{0:.2f}'.format(test_accuracy_percent)+'%')

Test accuracy: 81.03%
