# import relevant packages

In [38]:
import tensorflow as tf

import tensorflow_datasets as tfds

from sklearn import preprocessing

# data

In [39]:
raw_csv_data = np.loadtxt("Audiobooks_data.csv", delimiter = ',') #inputs are all variable in csv except first and last col

# extract inputs and targets

In [40]:
unscaled_inputs_all = raw_csv_data[:,1:-1] #takes all columns excluding the ID and target ( 1st and last col )
targets_all = raw_csv_data[:,-1] #last column (target column)

# balance the dataset

In [41]:
num_one_targets = int(np.sum(targets_all)) ###count number of targets that are ones since we know less than 0 & keep as many zeros as ones
num_zero_targets = 0 ###counter for targets equal to 0 
indices_to_remove = [] ###variable that declares indices to remove

for i in range(targets_all.shape[0]): #shape on 0 axis is basically length of the vector so shows the number of all targets 
    if targets_all[i] == 0: # in loop increase counter by 1 if the target at position i is 0 
        num_zero_targets += 1
        if num_zero_targets > num_one_targets: #in same if add another if which adds an index for the variable indices to remove if the zeros counter is over the number of ones 
            indices_to_remove.append(i) #after counter for 0 matches # of ones, all indexes will be removed, indices_to_remove will contain all indexes we dont need 
            
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0) #deleting balances the dataset 

targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

# standardize the inputs

In [42]:
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors) #the scale method standardizes the dataset along each variable ( so basically all inputs will be standardized)

# shuffle the data

In [43]:
#keep same information in a different order.Possible that dataset collected in order of date so batch to make random
shuffled_indices = np.arange(scaled_inputs.shape[0]) #take indexes from axis 0 of scaled input shape and place them in a variable 
np.random.shuffle(shuffled_indices) #use np.random.shuffle method to shuffle them

shuffled_inputs = scaled_inputs[shuffled_indices] #equal to s
shuffled_targets = targets_equal_priors[shuffled_indices]

# split data into train, validation, and test

In [44]:
samples_count = shuffled_inputs.shape[0]

train_samples_count = int(0.8* samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

int(samples_count)

train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]

validation_inputs = shuffled_inputs[train_samples_count:train_samples_count + validation_samples_count]
validation_targets = shuffled_inputs[train_samples_count:train_samples_count + validation_samples_count]


test_inputs = shuffled_inputs[train_samples_count+validation_samples_count:]
test_targets = shuffled_inputs[train_samples_count+validation_samples_count:]

In [45]:
print(np.sum(train_targets), train_samples_count, np.sum(train_targets) / train_samples_count)
print(np.sum(validation_targets), validation_samples_count, np.sum(validation_targets) / validation_samples_count)
print(np.sum(test_targets), test_samples_count, np.sum(test_targets) / test_samples_count)

1763.0 3579 0.49259569712210116
-80.98237910092038 447 -0.18116863333539235
130.82591270747614 448 0.2920221265791878


# save 3 datasets in *.npz

In [49]:
np.savez("Audiobooks_train_data", inputs=train_inputs, targets=train_targets)
np.savez("Audiobooks_validation_data", inputs=validation_inputs, targets=validation_targets)
np.savez("Audiobooks_test_data", inputs=test_inputs, targets=test_targets)