In [1]:
##Import relevant libraries 

import numpy as np
from sklearn import preprocessing



In [4]:
##Load dataset
raw_csv_data = np.loadtxt("C:/Users/Administrator/Downloads/Audiobooks_data.csv", delimiter = ',')

unscaled_inputs_all = raw_csv_data[:, 1:-1]
targets_all = raw_csv_data[:,-1]

In [60]:
##Balance the dataset

num_one_targets = int(np.sum(targets_all))
zero_targets_counter = 0
indices_to_remove = []

for i in range (targets_all.shape[0]):
    if targets_all[i] ==0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)

unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis = 0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis = 0)

In [71]:
##Standardize inputs using sklearn

scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

In [72]:
scaled_inputs = scaled_inputs[:targets_equal_priors.shape[0], :]
shuffled_indices = np.arange(targets_equal_priors.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]


In [73]:
##Split dataset into train, validation and test


from sklearn.model_selection import train_test_split

# Suppose shuffled_inputs and shuffled_targets are your full dataset
# Example:
# shuffled_inputs = np.array([...])
# shuffled_targets = np.array([...])

# First, split off the test set (10% of total samples)
train_val_inputs, test_inputs, train_val_targets, test_targets = train_test_split(
    shuffled_inputs, shuffled_targets, test_size=0.1, 
    stratify=shuffled_targets, random_state=42
)

# Now, split the remaining 90% into train (80%) and validation (10% of total)
# Validation fraction relative to train_val set:
val_fraction = 0.1111  # 0.1 / 0.9 ≈ 11.11%

train_inputs, validation_inputs, train_targets, validation_targets = train_test_split(
    train_val_inputs, train_val_targets, test_size=val_fraction, 
    stratify=train_val_targets, random_state=42
)

# Check sizes and fractions of 1s
print("Train set: ", np.sum(train_targets), train_targets.shape[0], np.sum(train_targets)/train_targets.shape[0])
print("Validation set: ", np.sum(validation_targets), validation_targets.shape[0], np.sum(validation_targets)/validation_targets.shape[0])
print("Test set: ", np.sum(test_targets), test_targets.shape[0], np.sum(test_targets)/test_targets.shape[0])


Train set:  1789.0 3578 0.5
Validation set:  224.0 448 0.5
Test set:  224.0 448 0.5


In [74]:
##Save all three datasets in .npz format

np.savez('Audiobooks_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Audiobooks_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('Audiobooks_data_test', inputs=test_inputs, targets=test_targets)

In [75]:
##Time to build the model using tensorflow

import tensorflow as tf

In [76]:
##Load the three .npz datasets

npz = np.load("C:/Users/Administrator/Audiobooks_data_train.npz")

train_inputs = npz['inputs'].astype(np.float64)
train_targets = npz['targets'].astype(np.int32)

npz = np.load("C:/Users/Administrator/Audiobooks_data_validation.npz")   

validation_inputs = npz['inputs'].astype(np.float64)
validation_targets = npz['targets'].astype(np.int32)

npz = np.load("C:/Users/Administrator/Audiobooks_data_test.npz")   

test_inputs = npz['inputs'].astype(np.float64)
test_targets = npz['targets'].astype(np.int32)


In [77]:
##Model

input_size= 10
output_size = 2
hidden_layer_size = 100

model = tf.keras.Sequential([tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
                            tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
                             tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
                             tf.keras.layers.Dense(output_size, activation='softmax')
                            ])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

batch_size = 32
max_epochs = 100
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)
model.fit(train_inputs, train_targets,
         batch_size = batch_size, epochs = max_epochs,
          callbacks = early_stopping,
         validation_data=(validation_inputs, validation_targets),
         verbose = 2)

Epoch 1/100
112/112 - 1s - 12ms/step - accuracy: 0.7457 - loss: 0.4587 - val_accuracy: 0.8259 - val_loss: 0.3321
Epoch 2/100
112/112 - 0s - 2ms/step - accuracy: 0.7937 - loss: 0.3748 - val_accuracy: 0.7991 - val_loss: 0.3227
Epoch 3/100
112/112 - 0s - 2ms/step - accuracy: 0.8046 - loss: 0.3557 - val_accuracy: 0.8482 - val_loss: 0.2991
Epoch 4/100
112/112 - 0s - 2ms/step - accuracy: 0.8007 - loss: 0.3535 - val_accuracy: 0.8504 - val_loss: 0.2964
Epoch 5/100
112/112 - 0s - 2ms/step - accuracy: 0.8004 - loss: 0.3487 - val_accuracy: 0.8147 - val_loss: 0.3139
Epoch 6/100
112/112 - 0s - 2ms/step - accuracy: 0.8133 - loss: 0.3398 - val_accuracy: 0.8549 - val_loss: 0.2832
Epoch 7/100
112/112 - 0s - 2ms/step - accuracy: 0.8083 - loss: 0.3431 - val_accuracy: 0.8371 - val_loss: 0.3081
Epoch 8/100
112/112 - 0s - 2ms/step - accuracy: 0.8102 - loss: 0.3385 - val_accuracy: 0.8571 - val_loss: 0.2963


<keras.src.callbacks.history.History at 0x1bdff891550>

In [78]:
##Testing the model

test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8504 - loss: 0.3199 
