In [1]:
### Extract csv data (remainder of preprocessing done with np and sklearn.preprocessing) ###

from collections import Counter
from imblearn.over_sampling import SMOTE
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Read the comma-separated file into a single numeric array.
raw_csv_data = np.loadtxt('data/audiobooks_data_processed.csv', delimiter=',')

# Inputs: every column except the first (customer_id) and the last (purchased_again).
feature_columns = slice(1, -1)
X_unscaled_unequal_priors = raw_csv_data[:, feature_columns]

# Targets: the last column, stored as small integers.
target_column = raw_csv_data[:, -1]
y_unequal_priors = target_column.astype(np.int8)

In [None]:
### Rebalance the classes with SMOTE (oversampling that synthesizes extra minority-class samples) ###

# Tally how many samples each target value has before resampling
priors_count_dict = Counter(y_unequal_priors)
num_nonreturning = priors_count_dict[0]
num_returning = priors_count_dict[1]
print(f'Prebalanced data: {num_returning = } | {num_nonreturning = }')

# Oversample with a fixed seed so the synthetic samples are reproducible.
# NOTE(review): the resampling runs on the full dataset, before any test data is
# held out -- setting test data aside first would likely be more prudent.
smote = SMOTE(random_state=42)
X_unscaled, y = smote.fit_resample(
    X_unscaled_unequal_priors,
    y_unequal_priors,
)

# Tally again after resampling and report the resulting shapes
balanced_priors_count_dict = Counter(y)
num_nonreturning_bal = balanced_priors_count_dict[0]
num_returning_bal = balanced_priors_count_dict[1]
print(f'Balanced data: {num_returning_bal = } | {num_nonreturning_bal = }')
print(f'Balanced data: {y.shape = } | {X_unscaled.shape = }')


Prebalanced data: num_returning = 2237 | num_nonreturning = 11847
Balanced data: num_returning_bal = 11847 | num_nonreturning_bal = 11847
Balanced data: y.shape = (23694,) | X_unscaled.shape = (23694, 10)


In [3]:
### Shuffle and split into train/val/test, scale/standardize the inputs, and save as .npz ###

# Shuffle and split the data (80/10/10), using `stratify` to keep the class balance in every subset.
# The split is done on the *unscaled* inputs so the scaler below can be fit on training rows only.
# NOTE(review): X_unscaled was already oversampled with SMOTE before this split, so the val/test
# subsets may contain synthetic samples -- consider resampling only the training split.
X_train_val_unscaled, X_test_unscaled, y_train_val, y_test = train_test_split(
    X_unscaled, y, test_size=0.1, stratify=y, random_state=42
)
# 1/9 of the remaining 90% is 10% of the full dataset, which yields the 80/10/10 split.
X_train_unscaled, X_val_unscaled, y_train, y_val = train_test_split(
    X_train_val_unscaled, y_train_val, test_size=1/9, stratify=y_train_val, random_state=42
)

# Standardize with the per-feature mean/std of the TRAINING split only, then apply that same
# transform to val/test. Fitting on the full dataset would leak val/test statistics into training.
scaler = preprocessing.StandardScaler().fit(X_train_unscaled)
X_train = scaler.transform(X_train_unscaled)
X_val = scaler.transform(X_val_unscaled)
X_test = scaler.transform(X_test_unscaled)

# Keep these names defined in case other cells read them; both use the training-fit scaler,
# so their mean/std are only approximately 0/1.
X_train_val = scaler.transform(X_train_val_unscaled)
X = scaler.transform(X_unscaled)

# Print info on the data
print(f'Number of samples={len(y)}')
print(f'{X_train.shape=}, {y_train.shape=}, Priors Count={dict(Counter(y_train))}')
print(f'{X_val.shape=}, {y_val.shape=}, Priors Count={dict(Counter(y_val))}')
print(f'{X_test.shape=}, {y_test.shape=}, Priors Count={dict(Counter(y_test))}')

### Save the inputs and targets to an .npz file ###
np.savez(
    'data/audiobooks_data_split_smote.npz',
    X_train=X_train,
    X_val=X_val,
    X_test=X_test,
    y_train=y_train,
    y_val=y_val,
    y_test=y_test
)

Number of samples=23694
X_train.shape=(18954, 10), y_train.shape=(18954,), Priors Count={1: 9477, 0: 9477}
X_val.shape=(2370, 10), y_val.shape=(2370,), Priors Count={0: 1185, 1: 1185}
X_test.shape=(2370, 10), y_test.shape=(2370,), Priors Count={1: 1185, 0: 1185}
