# Data Preparation and Processing

## Import Libraries

In [19]:
import numpy as np
from random import randint
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler

In [20]:
# Containers for the training data: one age per sample and one 0/1 label per sample.
train_samples, train_labels = [], []

#### Example Data:

- An experimental drug was tested on individuals aged 13 to 100 in a clinical trial.
- The trial had 2,100 participants. Half were under 65 years old, and half were 65 years or older.
- Around 95% of patients 65 or older experienced side effects.
- Around 95% of patients under 65 experienced no side effects.

In [21]:
# Each entry is (number of younger/older pairs, label for the younger person,
# label for the older person).
# Labels: (1) experienced side effects, (0) experienced no side effects.
train_cohorts = (
    (50, 1, 0),    # the ~5% outliers: younger WITH side effects, older WITHOUT
    (1000, 0, 1),  # the ~95% majority: younger WITHOUT side effects, older WITH
)

for pair_count, younger_label, older_label in train_cohorts:
    for _ in range(pair_count):
        # Draw one younger age (13-64) and then one older age (65-100);
        # samples and labels are appended in that same younger/older order.
        random_younger = randint(13, 64)
        random_older = randint(65, 100)
        train_samples.extend((random_younger, random_older))
        train_labels.extend((younger_label, older_label))

In [22]:
# Convert both lists to NumPy arrays and shuffle them together, so each age
# stays paired with its label after the reordering.
train_samples, train_labels = shuffle(
    np.asarray(train_samples),
    np.asarray(train_labels),
)

In [23]:
# Normalization
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_train_samples = scaler.fit_transform(train_samples.reshape(-1, 1))

In [7]:
# Sanity check: after reshape(-1, 1) the scaled samples should be a single column,
# one row per generated sample.
scaled_train_samples.shape

(2100, 1)

In [8]:
# Sanity check: the labels are a flat array; its length should match the number of samples.
train_labels.shape

(2100,)

In [32]:
# Let's save our arries to use it into our next sections
with open('drug_train_samples.npy', 'wb') as savNP:
    np.save(savNP, scaled_train_samples)
    np.save(savNP, train_labels)
print('Train arrays has been created!')

Train arrays has been created!


Now, let's create the test samples and apply the same process we used for the training data in the previous cells.

Test set: it will help us measure our model's accuracy on unseen data before we promote the model to production.

In [26]:
# Containers for the test data: one age per sample and one 0/1 label per sample.
test_samples, test_labels = [], []

In [27]:
# Each entry is (number of younger/older pairs, label for the younger person,
# label for the older person).
# Labels: (1) experienced side effects, (0) experienced no side effects.
test_cohorts = (
    (10, 1, 0),   # the ~5% outliers: younger WITH side effects, older WITHOUT
    (200, 0, 1),  # the ~95% majority: younger WITHOUT side effects, older WITH
)

for pair_count, younger_label, older_label in test_cohorts:
    for _ in range(pair_count):
        # Draw one younger age (13-64) and then one older age (65-100);
        # samples and labels are appended in that same younger/older order.
        random_younger = randint(13, 64)
        random_older = randint(65, 100)
        test_samples.extend((random_younger, random_older))
        test_labels.extend((younger_label, older_label))

In [28]:
# Convert both lists to NumPy arrays and shuffle them together, so each age
# stays paired with its label after the reordering.
test_samples, test_labels = shuffle(
    np.asarray(test_samples),
    np.asarray(test_labels),
)

In [29]:
# Scale the test ages with the scaler that was already fitted on the training ages.
# Use transform(), not fit_transform(): re-fitting here would overwrite the
# training min/max stored in `scaler` and scale the test set by its own range,
# so the same age could map to a different value in train and test.
# NOTE: a test age outside the training range would map outside [0, 1].
scaled_test_samples = scaler.transform(test_samples.reshape(-1, 1))

In [30]:
# Store the arrays into file
with open("drug_test_samples.npy", 'wb') as testNP:
    np.save(testNP, scaled_test_samples)
    np.save(testNP, test_labels)
print('Test arrays has been created!')

Test arrays has been created!
