In [8]:
import numpy as np
import tensorflow_datasets as tfds
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam

[Dataset documentation](https://www.tensorflow.org/datasets/catalog/titanic)


In [2]:
# Load the Titanic data through TFDS and keep its "train" split; with
# as_supervised=True every example is a (features, target) pair.
ds = tfds.load("titanic", shuffle_files=True, as_supervised=True)["train"]

# Materialise the dataset as a plain list of (dict of features, target) tuples:
# lists are easier to manipulate and add no significant performance overhead.
ds = list(ds.as_numpy_iterator())

# For every feature, gather the set of distinct values found in the dataset.
allf = {name: set() for name in ds[0][0]}
for features, _target in ds:
    for name, value in features.items():
        allf[name].add(value)

In [3]:
# Class balance: `vivant` counts the examples whose target is truthy, `ded`
# counts all the others.
vivant = sum(1 for elem in ds if elem[1])
ded = len(ds) - vivant
print(vivant, ded)
# Percentage of examples in the `ded` class, i.e. the accuracy reached by
# always predicting that class. Computed from the counts above instead of
# hard-coded numbers so it stays correct if the dataset changes.
print(ded / len(ds) * 100)

500 809
61.80290297937356


In [4]:
# Print the distinct values collected for each feature, one feature per line.
for name, values in allf.items():
    print(name, " : ", values)

age  :  {0.75, 1.0, 2.0, 3.0, 0.6667, 5.0, 6.0, 0.8333, 7.0, 9.0, 10.0, 11.0, 4.0, 0.3333, 14.0, 15.0, 16.0, 17.0, 18.0, 18.5, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.5, 28.0, 26.5, 30.0, 31.0, 32.0, 33.0, 29.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 38.5, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 54.0, 55.0, 56.0, 57.0, 58.0, 53.0, 60.0, 61.0, 62.0, 63.0, 60.5, 65.0, 64.0, 59.0, 66.0, 12.0, 70.0, 71.0, 70.5, 14.5, 74.0, 67.0, 76.0, 80.0, 19.0, 20.5, 22.5, 23.5, 24.5, 30.5, 0.4167, 0.9167, 0.1667, 32.5, 34.0, 34.5, 36.5, 40.5, 8.0, 43.0, 45.5, 55.5, 11.5, 13.0, -1.0}
boat  :  {b'6', b'14', b'13 15', b'10', b'5', b'8', b'3', b'15', b'Unknown', b'4', b'C', b'13', b'12', b'7', b'C D', b'11', b'1', b'13 15 B', b'8 10', b'5 7', b'D', b'16', b'A', b'2', b'9', b'B', b'15 16', b'5 9'}
body  :  {1, 4, 7, 9, 14, 15, 16, 17, 18, 19, 22, 32, 35, 37, 38, 43, 45, 46, 47, 50, 51, 52, 53, 58, 61, 62, 67, 68, 69, 70, 72, 75, 79, 80, 81, 89, 96, 97, 98, 101, 103, 108, 109

Available features:
  - age (float32)
  - boat (string)
  - body (int32)
  - cabin (string)
  - embarked (int64)
  - fare (float32)
  - home.dest (string)
  - name (string)
  - parch (int32) - number of parents and children on board
  - pclass (int64) - passenger class
  - sex (int64)
  - sibsp (int32) - number of siblings and spouses on board
  - ticket (string)

In [16]:
def preprocessing(ds, one_hot_keys=(), ignored_keys=()):
    """Convert a list of ``(features, target)`` examples into float32 arrays.

    Features are visited in the key order of the first example's feature
    dict. Each feature is either skipped, one-hot encoded, or cast to a
    single float32 column.

    Args:
        ds: Sequence of ``(features, target)`` pairs where ``features`` is a
            dict. Every example must provide each non-ignored key found in
            the first example.
        one_hot_keys: Feature names to one-hot encode. One column is created
            per distinct value, with columns ordered by the sorted values.
        ignored_keys: Feature names to leave out entirely. A key listed here
            is skipped even if it also appears in ``one_hot_keys``.

    Returns:
        A tuple ``(xs, ys)``. ``xs`` is a float32 array of shape
        ``(len(ds), n_columns)`` and ``ys`` is a float32 array of shape
        ``(len(ds), 1)`` holding the targets. For an empty ``ds`` the shapes
        are ``(0, 0)`` and ``(0, 1)``.
    """
    n_samples = len(ds)
    if n_samples == 0:
        # Nothing to inspect: there is no first example to read the keys from.
        return (np.empty((0, 0), dtype=np.float32),
                np.empty((0, 1), dtype=np.float32))

    # Per-feature column blocks, joined once at the end instead of growing
    # the result matrix with a concatenate per feature.
    blocks = []
    for key in ds[0][0].keys():
        if key in ignored_keys:
            continue
        values = [features[key] for features, _ in ds]
        if key in one_hot_keys:
            categories = sorted(set(values))
            # Dict lookup replaces a linear list.index() search per sample.
            column_of = {category: j for j, category in enumerate(categories)}
            block = np.zeros((n_samples, len(categories)), dtype=np.float32)
            block[np.arange(n_samples), [column_of[v] for v in values]] = 1.
        else:
            block = np.array(values, dtype=np.float32).reshape(n_samples, 1)
        blocks.append(block)

    if blocks:
        xs = np.concatenate(blocks, axis=1)
    else:
        # Every feature was ignored: keep the sample axis, with zero columns.
        xs = np.empty((n_samples, 0), dtype=np.float32)

    ys = np.array([target for _, target in ds], dtype=np.float32)
    return (xs, ys.reshape(n_samples, 1))

In [17]:
# Dataset preprocessing: decide how each feature is handled, then build the
# input and target arrays.

ignored_keys = ["name", "ticket", "home.dest"]  # features left out of the inputs
one_hot_keys = ["boat", "cabin"]  # features to one-hot encode (to be filled in)

xs, ys = preprocessing(ds, one_hot_keys, ignored_keys)
print(xs.shape, ys.shape)

(1309, 223) (1309, 1)


In [20]:
# build model
model = Sequential()

# One hidden layer of 1000 tanh units, followed by a single sigmoid output
# unit to match the binary cross-entropy loss used below.
model.add(Dense(1000, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))

# `learning_rate` is the supported argument name; the `lr` alias is deprecated
# and rejected by newer Keras releases.
opt = Adam(learning_rate=0.001)

model.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy', AUC()],
              )

In [21]:
# train model
VAL_SPLIT = 0.1
BATCH_SIZE = 16

# Fit for 10 epochs, passing VAL_SPLIT as the fraction of the data that Keras
# sets aside for validation.
model.fit(
    xs,
    ys,
    batch_size=BATCH_SIZE,
    epochs=10,
    validation_split=VAL_SPLIT,
)

Train on 1178 samples, validate on 131 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f03bdf088d0>