<a href="https://colab.research.google.com/github/AlphAxe/Audiobooks_Business_problem/blob/master/Audiobooks_Business_problem_with_Machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import packages for data preprocessing**

In [1]:
import numpy as np
from sklearn import preprocessing
import pandas as pd


**Loading Data**

In [2]:
# Parse the comma-separated file into a single numeric array.
raw_data = np.loadtxt(fname='/content/Audiobooks_data.csv', delimiter=',')

# Inputs are every column except the first and the last; targets are the last column.
# NOTE(review): the first column is presumably a record ID — confirm against the CSV.
last_column = raw_data.shape[1] - 1
unscaled_all_input = raw_data[:, 1:last_column]
target_all = raw_data[:, last_column]


**Balancing of Data**

Let's balance the data, i.e. keep the same number of samples for both target classes.

In [3]:
# Number of samples whose target is 1 (targets are summed, then truncated to int).
num_targets_one = int(np.sum(target_all))

# Positions of every target equal to 0, in ascending order.
zero_target_positions = np.flatnonzero(target_all == 0)
targets_zero_count = int(zero_target_positions.size)

# Keep only the first `num_targets_one` zero-target samples; every later
# zero-target position is marked for removal so both classes end up with the
# same count. max(..., 0) guards the slice against a negative start.
indexs_to_remove = zero_target_positions[max(num_targets_one, 0):].tolist()



In [4]:
# Remove the surplus rows. The same index list is applied to inputs and targets
# so that each remaining input row still lines up with its target.
unscaled_inputs_equal_prior, targets_equal_prior = (
    np.delete(full_array, indexs_to_remove, axis=0)
    for full_array in (unscaled_all_input, target_all)
)

**Standardization of inputs**

In [5]:
# Standardize the balanced inputs column by column (centre on the mean, scale by
# the standard deviation); the keyword arguments spell out scale()'s defaults.
# NOTE(review): this runs before the train/validation/test split, so the scaling
# statistics are computed over all samples, including the held-out ones.
scaled_inputs = preprocessing.scale(
    unscaled_inputs_equal_prior,
    axis=0,
    with_mean=True,
    with_std=True,
)

**Shuffle the input data for better training**

In [6]:
# Fixed seed so the shuffle — and therefore the train/validation/test split and
# every number reported afterwards — is the same on every run of the notebook.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

# A random ordering of the row indices 0 .. n_samples - 1.
shuffled_indexes = shuffle_rng.permutation(scaled_inputs.shape[0])

# Apply the same ordering to inputs and targets so rows stay paired.
shuffled_inputs = scaled_inputs[shuffled_indexes]
shuffled_targets = targets_equal_prior[shuffled_indexes]


**Splitting the dataset into training, validation, and test sets in an 80/10/10 % ratio**

In [7]:
# Split sizes: 80 % and 10 % are truncated to whole samples; the test split
# takes whatever remains so the three sizes always add up to the total.
total_samples = shuffled_inputs.shape[0]
train_samples_counts = int(0.8 * total_samples)
valid_samples_counts = int(0.1 * total_samples)
test_samples_counts = total_samples - train_samples_counts - valid_samples_counts

# Row positions at which the shuffled arrays are cut: [train | validation | test].
split_points = [train_samples_counts, train_samples_counts + valid_samples_counts]

train_inputs, valid_inputs, test_inputs = np.split(shuffled_inputs, split_points)
train_targets, valid_targets, test_targets = np.split(shuffled_targets, split_points)


**Check whether each split is balanced**

In [9]:
# For each split print: sum of the targets, number of samples in the split, and
# the sum of targets divided by that number (a value near 0.5 means the two
# classes are about equally represented).
for split_targets, split_count in (
    (train_targets, train_samples_counts),
    (valid_targets, valid_samples_counts),
    (test_targets, test_samples_counts),
):
    print(np.sum(split_targets), split_count, np.sum(split_targets / split_count))

1794.0 3579 0.5012573344509639
213.0 447 0.4765100671140939
230.0 448 0.513392857142857


**Save all the datasets in .npz format**

In [10]:
# Write each split to its own .npz archive (np.savez appends the ".npz"
# extension) with the arrays stored under the keys "inputs" and "targets".
for archive_name, archive_inputs, archive_targets in (
    ('audiobooks_data_train', train_inputs, train_targets),
    ('audiobooks_data_validation', valid_inputs, valid_targets),
    ('audiobooks_data_test', test_inputs, test_targets),
):
    np.savez(archive_name, inputs=archive_inputs, targets=archive_targets)

**Loading the preprocessed data for modelling purpose**

In [11]:
# TensorFlow is only needed from the modelling stage onwards.
import tensorflow as tf

# Load the three archives written by the preprocessing stage. The paths are
# relative to the working directory, matching the names passed to np.savez,
# so the notebook also runs outside Colab's /content directory.
training_data = np.load('audiobooks_data_train.npz')
validation_data = np.load('audiobooks_data_validation.npz')
testing_data = np.load('audiobooks_data_test.npz')

In [12]:
# Cast inputs to float and targets to int. The builtin types are used because
# the np.float / np.int aliases were removed in NumPy 1.24; the resulting
# dtypes are the same as the aliases produced.
training_inputs = training_data['inputs'].astype(float)
training_targets = training_data['targets'].astype(int)

In [13]:
# Cast inputs to float and targets to int. The builtin types are used because
# the np.float / np.int aliases were removed in NumPy 1.24; the resulting
# dtypes are the same as the aliases produced.
validation_inputs = validation_data['inputs'].astype(float)
validation_targets = validation_data['targets'].astype(int)

In [14]:
# Cast inputs to float and targets to int. The builtin types are used because
# the np.float / np.int aliases were removed in NumPy 1.24; the resulting
# dtypes are the same as the aliases produced.
testing_inputs = testing_data['inputs'].astype(float)
testing_targets = testing_data['targets'].astype(int)

**Configure Model**

In [15]:
# Network dimensions.
# NOTE(review): input_size is not passed to any layer in this cell; confirm
# whether it is used elsewhere or only kept for reference.
input_size = 10
output_size = 2
hidden_layer_size = 50

# Feed-forward classifier: two ReLU hidden layers of equal width, followed by
# a softmax layer with one unit per target class.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(output_size, activation='softmax'))

# Adam optimizer; sparse categorical cross-entropy takes integer class labels
# directly, so the targets do not need one-hot encoding.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)



**Training Model**

In [20]:
batch_size = 100
max_epochs = 100  # upper bound; early stopping normally ends training sooner

# With patience=1 training continues for at most one epoch without improvement
# in the monitored quantity (the callback's default monitor is used).
early_stopping = tf.keras.callbacks.EarlyStopping(patience=1)

model.fit(
    x=training_inputs,
    y=training_targets,
    batch_size=batch_size,
    epochs=max_epochs,
    callbacks=[early_stopping],
    validation_data=(validation_inputs, validation_targets),
    verbose=2,  # one summary line per epoch
)

Epoch 1/100
36/36 - 0s - loss: 0.2053 - accuracy: 0.9206 - val_loss: 0.2156 - val_accuracy: 0.9239
Epoch 2/100
36/36 - 0s - loss: 0.2076 - accuracy: 0.9195 - val_loss: 0.2087 - val_accuracy: 0.9239
Epoch 3/100
36/36 - 0s - loss: 0.2080 - accuracy: 0.9204 - val_loss: 0.2300 - val_accuracy: 0.9239


<tensorflow.python.keras.callbacks.History at 0x7fe98d53bfd0>

**Testing**

In [21]:
# Evaluate on the held-out test split; evaluate() returns the loss followed by
# the metrics given to compile(), here accuracy.
test_loss, test_accuracy = model.evaluate(x=testing_inputs, y=testing_targets)

