In [1]:
### Import and show unprocessed data as DataFrame ###

import pandas as pd

unproc = pd.read_csv('data/audiobooks_data_unprocessed.csv')
print(unproc.shape)
unproc.head()

(14084, 12)


Unnamed: 0,customer_id,book_len_total,book_len_avg,spent_total,spent_avg,left_review,review_score,minutes_listened,completion_amt,support_requests,last_visited_minus_purchase_date,purchased_again
0,873,2160.0,2160,10.13,10.13,0,,0.0,0.0,0,0,1
1,611,1404.0,2808,6.66,13.33,1,6.5,0.0,0.0,0,182,1
2,705,324.0,324,10.13,10.13,1,9.0,0.0,0.0,1,334,1
3,391,1620.0,1620,15.31,15.31,0,9.0,0.0,0.0,0,183,1
4,819,432.0,1296,7.11,21.33,1,9.0,0.0,0.0,0,0,1


In [2]:
### DataFrame preprocessing to fill empty cells and remove headers ###

proc = unproc.copy()

# Fill empty cells in review_score col with average of the column
avg_review_score = round(proc['review_score'].mean(), 2)
proc['review_score'] = proc['review_score'].fillna(avg_review_score)

# Shuffle rows (this may have already been done for this dataset and technically isn't
# needed due to random.choice() being used in a later step while balancing the dataset.)
proc = proc.sample(frac=1, random_state=42)

# Remove headers and save as new df
proc.to_csv('data/audiobooks_data_processed.csv', index=False, header=False)

# Reloading df without headers
proc = pd.read_csv('data/audiobooks_data_processed.csv')
print(proc.shape)
proc.head()

(14083, 12)


Unnamed: 0,11612,1620.0,1620,10.38,10.38.1,0,8.91,0.0,475.2,0.1,0.2,1
0,14444,1242.0,2484,5.99,11.99,0,9.0,0.0,0.0,0,329,1
1,11619,1188.0,1188,6.4,6.4,0,8.91,0.0,475.2,0,0,0
2,21304,2160.0,2160,5.33,5.33,0,8.91,0.37,567.0,0,33,0
3,9160,1080.0,1080,6.4,6.4,0,8.91,0.0,0.0,0,0,0
4,28393,216.0,216,8.0,8.0,0,8.91,0.0,0.0,0,0,0


In [3]:
### Extract csv data (remainder of preprocessing done with np and sklearn.preprocessing ###

import numpy as np
from sklearn import preprocessing

raw_csv_data = np.loadtxt('data/audiobooks_data_processed.csv', delimiter=',') # Note: Rows have already been shuffled
X_unscaled_unequal_priors = raw_csv_data[:, 1:-1] # Extract all but customer_id and purchased_again columns
y_unequal_priors = raw_csv_data[:, -1]

In [4]:
### Balance the dataset (remove some 0-rows to match # of 1-rows), otherwise we'll have way more nonreturning customers (0s) than returning (1s) ###
### Unbalanced dataset (unequal priors) could cause the model to just always guess the class that has more rows in the training data ###

# See how much data we have for each target
num_returning = int(sum(y_unequal_priors))
num_nonreturning = np.count_nonzero(y_unequal_priors == 0) # Or could just do: len(y_unequal_priors) - num_returning
print(f'Prebalanced data: {num_returning = } | {num_nonreturning = }')

# num_nonreturning > num_returning, so get a random num_returning # of indices for rows representing returning customers
# Note: Rows were already shuffled in pandas preprocessing, but the first instances of nonreturning indices is insufficient since
#       the data will be sliced in a couple steps during train/val/test splitting. Alternatively instead of np.random.choice()
#       being used here, np.random.shuffle() could be used later right before that train/val/test splitting step.
nonreturning_indices = np.where(y_unequal_priors == 0)[0]
np.random.seed(42)
indices_to_remove = np.random.choice(nonreturning_indices, num_nonreturning - num_returning, replace=False)

# Remove those rows to make a balanced dataset with equal priors
y = np.delete(y_unequal_priors, indices_to_remove, axis=0)
X_unscaled = np.delete(X_unscaled_unequal_priors, indices_to_remove, axis=0)
print(f'Balanced data: {y.shape = } | {X_unscaled.shape = }')

Prebalanced data: num_returning = 2237 | num_nonreturning = 11847
Balanced data: y.shape = (4474,) | X_unscaled.shape = (4474, 10)


In [5]:
### Scale/standardize the inputs ###

# sklearn.preprocessing is easier than using tf's .map(*custom_scaling_function*) method
# .scale() will, for each column, subtract the col's mean and divide by the col's std deviation
# This makes each column/feature have a mean of 0 and a std deviation of 1, ensuring each col/feature contributes to the model's calculations equally
X = preprocessing.scale(X_unscaled)
print(f'Unscaled mean={round(np.mean(X_unscaled), 2)} and std={round(np.std(X_unscaled), 2)}.')
print(f'Scaled mean={round(np.mean(X), 2)} and std={round(np.std(X), 2)}. This applies to every column too.')

Unscaled mean=361.08 and std=749.66.
Scaled mean=-0.0 and std=1.0. This applies to every column too.


In [6]:
### Split data into train, val, and test (80%/10%/10% split) ###

# Calculate number of samples to split into each dataset
num_samples = len(y)
num_train = int(0.8 * num_samples)
num_val = int(0.1 * num_samples)
num_test = num_samples - num_train - num_val

# Split the data
X_train, y_train = X[:num_train], y[:num_train]
X_val, y_val = X[num_train:num_train + num_val], y[num_train:num_train + num_val]
X_test, y_test = X[num_train + num_val:], y[num_train + num_val:]

print(f'{num_samples=}')
print(f'{X_train.shape=}, {X_val.shape=}, {X_test.shape=}')
print(f'{y_train.shape=}, {y_val.shape=}, {y_test.shape=}')

num_samples=4474
X_train.shape=(3579, 10), X_val.shape=(447, 10), X_test.shape=(448, 10)
y_train.shape=(3579,), y_val.shape=(447,), y_test.shape=(448,)


In [7]:
### Save the inputs and targets to an .npz file ###

np.savez(
    'data/audiobooks_data_split.npz',
    X_train=X_train, 
    X_val=X_val, 
    X_test=X_test, 
    y_train=y_train, 
    y_val=y_val, 
    y_test=y_test
)