# Loading The Data

In [1]:
import numpy as np
from sklearn import preprocessing
raw_csv=np.loadtxt('Audiobooks_data.csv',delimiter=',')
unscaled_data=raw_csv[:,1:-1]
target_all=raw_csv[:,-1]

In [2]:
unscaled_data

array([[2.160e+03, 2.160e+03, 1.013e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.404e+03, 2.808e+03, 6.660e+00, ..., 0.000e+00, 0.000e+00,
        1.820e+02],
       [3.240e+02, 3.240e+02, 1.013e+01, ..., 0.000e+00, 1.000e+00,
        3.340e+02],
       ...,
       [1.080e+03, 1.080e+03, 6.550e+00, ..., 0.000e+00, 0.000e+00,
        2.900e+01],
       [2.160e+03, 2.160e+03, 6.140e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.620e+03, 1.620e+03, 5.330e+00, ..., 0.000e+00, 0.000e+00,
        9.000e+01]])

In [15]:
target_all

array([1., 1., 1., ..., 0., 0., 0.])

In [14]:
target_all.shape

(14084,)

# Fixing the imbalanced dataset by removing some rows with 0 targets

In [16]:
n_1_target=int(np.sum(target_all))
zero_counter=0
indices_to_remove=[]
for i in range(target_all.shape[0]):
    if target_all[i]==0:
        zero_counter+=1
        if zero_counter>n_1_target:
            indices_to_remove.append(i)
            
            

In [18]:
unscaled_inputs_equal_prior=np.delete(unscaled_data,indices_to_remove,axis=0)
targets_equal_prior=np.delete(target_all,indices_to_remove,axis=0)

# Standardizing the dataset

In [22]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()

In [23]:
sc.fit(unscaled_inputs_equal_prior)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [24]:
scaled_data=sc.transform(unscaled_inputs_equal_prior)

In [25]:
scaled_data

array([[ 1.18956512,  0.36398846,  0.67728889, ..., -0.8635056 ,
        -0.20536617, -0.77240946],
       [-0.33022754,  1.10843845, -0.08841391, ..., -0.8635056 ,
        -0.20536617,  1.16499791],
       [-2.50135991, -1.74528653,  0.67728889, ..., -0.8635056 ,
         2.23179102,  2.78305242],
       ...,
       [ 1.18956512,  0.36398846,  0.67728889, ..., -0.20129479,
        -0.20536617, -0.62337812],
       [ 1.18956512,  0.36398846,  0.27347444, ..., -0.20129479,
        -0.20536617,  0.21758442],
       [ 1.18956512,  0.36398846,  0.20727535, ..., -0.20129479,
        -0.20536617, -0.51692717]])

In [26]:
scaled_data.shape

(4474, 10)

# Shuffling the dataset

In [29]:
shuffled_indices=np.arange(scaled_data.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs=scaled_data[shuffled_indices]
shuffled_target=targets_equal_prior[shuffled_indices]

# Splitting into train/validation/test

In [32]:
sample_count=shuffled_inputs.shape[0]
train_count=int(0.8*sample_count)
validation_count=int(0.1*sample_count)
test_count=sample_count-train_count-validation_count

train_inputs=shuffled_inputs[:train_count]
train_target=shuffled_target[:train_count]

validation_inputs=shuffled_inputs[train_count:train_count+validation_count]
validation_target=shuffled_target[train_count:train_count+validation_count]

test_inputs=shuffled_inputs[train_count+validation_count:]
test_target=shuffled_target[train_count+validation_count:]

print(np.sum(train_target),train_count,np.sum(train_target)/train_count)
print(np.sum(validation_target),validation_count,np.sum(validation_target)/validation_count)
print(np.sum(validation_target),test_count,np.sum(test_target)/test_count)

1769.0 3579 0.49427214305671974
247.0 447 0.5525727069351231
247.0 448 0.49330357142857145


# Saving as npz file

In [33]:
np.savez('Audiobook data train',inputs=train_inputs,targets=train_target)
np.savez('Audiobook data Validation',inputs=validation_inputs,targets=validation_target)
np.savez('Audiobook data test',inputs=test_inputs,targets=test_target)


# Loading the npz file

In [34]:
npz=np.load('Audiobook data train.npz')

train_inputs=npz['inputs'].astype(np.float)
train_targets=npz['targets'].astype(np.int)


npz=np.load('Audiobook data Validation.npz')

validation_inputs=npz['inputs'].astype(np.float)
validation_targets=npz['targets'].astype(np.int)


npz=np.load('Audiobook data test.npz')

test_inputs=npz['inputs'].astype(np.float)
test_targets=npz['targets'].astype(np.int)

# Building the model and training it

In [35]:
input_size = 10
output_size = 2

hidden_layer_size = 100
    

model = tf.keras.Sequential([
    
    
   
   
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 1st hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 2nd hidden layer
    
   
    tf.keras.layers.Dense(output_size, activation='softmax') # output layer
])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [36]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [39]:
batch_size=100
max_epochs=100

early_stopping=tf.keras.callbacks.EarlyStopping(patience=2)
model.fit(train_inputs,train_targets,batch_size=batch_size,epochs=max_epochs,callbacks=[early_stopping],validation_data=(validation_inputs,validation_targets),verbose=2)

<IPython.core.display.Javascript object>

Train on 3579 samples, validate on 447 samples
Epoch 1/100
3579/3579 - 0s - loss: 0.2046 - accuracy: 0.9218 - val_loss: 0.1952 - val_accuracy: 0.9284
Epoch 2/100
3579/3579 - 0s - loss: 0.2059 - accuracy: 0.9215 - val_loss: 0.2179 - val_accuracy: 0.9284
Epoch 3/100
3579/3579 - 0s - loss: 0.2027 - accuracy: 0.9204 - val_loss: 0.2367 - val_accuracy: 0.9217


<tensorflow.python.keras.callbacks.History at 0x2352c549948>

In [41]:
test_loss,test_accuracy=model.evaluate(test_inputs,test_targets)

