In [1]:
import functools

import numpy as np
import tensorflow as tf

In [2]:
# Remote locations of the Titanic training and evaluation CSV files.
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

# get_file fetches each URL, stores it under the given file name and returns
# the path of the local copy.
train_file_path = tf.keras.utils.get_file(fname="train.csv", origin=TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file(fname="eval.csv", origin=TEST_DATA_URL)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv
Downloading data from https://storage.googleapis.com/tf-datasets/titanic/eval.csv


In [3]:
# Show where the two downloaded CSV files were stored locally.
train_file_path, test_file_path

('C:\\Users\\jjgk9\\.keras\\datasets\\train.csv',
 'C:\\Users\\jjgk9\\.keras\\datasets\\eval.csv')

In [4]:
# Make printed NumPy values easier to read: 3 decimal places and no
# scientific notation for small numbers.
np.set_printoptions(precision=3, suppress=True)

## Load data
CSV 파일의 형식을 확인한다.

In [7]:
# Preview the first 10 lines of the training CSV (the amount `head` prints by
# default). Done in pure Python because the `head` shell command is not
# available on every platform and an unquoted path breaks on spaces.
with open(train_file_path, encoding="utf-8") as csv_preview:
  # zip stops after 10 lines without reading the rest of the file.
  for _, line in zip(range(10), csv_preview):
    print(line, end="")

survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
1,female,35.0,1,0,53.1,First,C,Southampton,n
0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
0,male,2.0,3,1,21.075,Third,unknown,Southampton,n
1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
1,female,4.0,1,1,16.7,Third,G,Southampton,n


일반적으로 CSV 파일은 pandas로 불러온 뒤 `numpy` 배열로 변환해 `tensorflow`에 넘길 수 있다. 파일이 매우 크거나 `tf.data` 파이프라인과 함께 사용해야 하는 경우에는 `tf.data.experimental.make_csv_dataset` 함수를 사용한다.

In [8]:
# Name of the CSV column that holds the label; get_dataset passes it to
# make_csv_dataset as `label_name`.
LABEL_COLUMN = 'survived'
# Label values; the `survived` column in the CSV preview contains 0 and 1.
LABELS = [0, 1]

In [9]:
def get_dataset(file_path, **kwargs):
  """Build a batched dataset from a CSV file with `make_csv_dataset`.

  Args:
    file_path: Path of the CSV file, forwarded as the first argument of
      `tf.data.experimental.make_csv_dataset`.
    **kwargs: Extra keyword arguments for `make_csv_dataset`. They take
      precedence over the defaults set here, so e.g. `batch_size=32`
      overrides the default instead of raising a duplicate-keyword
      `TypeError`.

  Returns:
    The dataset returned by `make_csv_dataset` for the merged options.
  """
  options = {
      "batch_size": 5,  # kept small so printed batches stay readable
      "label_name": LABEL_COLUMN,
      "na_value": "?",
      "num_epochs": 1,
      # NOTE(review): CSV parsing errors are skipped rather than raised;
      # pass ignore_errors=False to surface malformed rows.
      "ignore_errors": True,
  }
  # Caller-supplied options win over the defaults above.
  options.update(kwargs)
  return tf.data.experimental.make_csv_dataset(file_path, **options)

# Build the training and evaluation datasets with get_dataset's default options.
raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)

Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.experimental_determinstic`.


In [14]:
# Display the dataset's repr to inspect the shape and dtype of each column.
raw_train_data

<PrefetchDataset shapes: (OrderedDict([(sex, (None,)), (age, (None,)), (n_siblings_spouses, (None,)), (parch, (None,)), (fare, (None,)), (class, (None,)), (deck, (None,)), (embark_town, (None,)), (alone, (None,))]), (None,)), types: (OrderedDict([(sex, tf.string), (age, tf.float32), (n_siblings_spouses, tf.int32), (parch, tf.int32), (fare, tf.float32), (class, tf.string), (deck, tf.string), (embark_town, tf.string), (alone, tf.string)]), tf.int32)>

In [26]:
def show_batch(dataset):
  """Print every feature column of the first batch taken from `dataset`.

  Each printed line holds the feature name, left-aligned in a 20-character
  field, followed by that feature's values converted with `.numpy()`. The
  label half of each `(features, label)` element is not printed.
  """
  row_template = "{:20s}: {}"
  for features, _ in dataset.take(1):
    for name, values in features.items():
      print(row_template.format(name, values.numpy()))

In [24]:
# Print the feature columns of one training batch.
# NOTE(review): the saved output below still contains a "batch :" debug line
# that show_batch no longer prints; re-run this cell to refresh it.
show_batch(raw_train_data)

batch :  OrderedDict([('sex', <tf.Tensor: id=267, shape=(5,), dtype=string, numpy=array([b'male', b'male', b'male', b'male', b'male'], dtype=object)>), ('age', <tf.Tensor: id=259, shape=(5,), dtype=float32, numpy=array([21., 37., 36., 35., 28.], dtype=float32)>), ('n_siblings_spouses', <tf.Tensor: id=265, shape=(5,), dtype=int32, numpy=array([0, 2, 1, 0, 0])>), ('parch', <tf.Tensor: id=266, shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 0])>), ('fare', <tf.Tensor: id=264, shape=(5,), dtype=float32, numpy=array([ 7.796,  7.925, 15.55 , 26.   ,  7.75 ], dtype=float32)>), ('class', <tf.Tensor: id=261, shape=(5,), dtype=string, numpy=array([b'Third', b'Third', b'Third', b'Second', b'Third'], dtype=object)>), ('deck', <tf.Tensor: id=262, shape=(5,), dtype=string, numpy=array([b'unknown', b'unknown', b'unknown', b'unknown', b'F'], dtype=object)>), ('embark_town', <tf.Tensor: id=263, shape=(5,), dtype=string, numpy=
array([b'Southampton', b'Southampton', b'Southampton', b'Southampton',
   

첫 줄에 열 이름(헤더)이 포함된 CSV 파일은 열 이름을 자동으로 인식한다. 헤더가 없는 데이터셋이라면 `column_names` 인자로 열 이름을 직접 지정해 데이터를 읽을 수 있다.

In [27]:
# Column names listed explicitly, in the same order as the CSV header.
CSV_COLUMNS = [
    'survived',
    'sex',
    'age',
    'n_siblings_spouses',
    'parch',
    'fare',
    'class',
    'deck',
    'embark_town',
    'alone',
]

# column_names travels through get_dataset's **kwargs to make_csv_dataset.
temp_dataset = get_dataset(train_file_path, column_names=CSV_COLUMNS)

show_batch(temp_dataset)

sex                 : [b'male' b'male' b'male' b'male' b'male']
age                 : [35. 19. 36. 28. 20.]
n_siblings_spouses  : [0 3 0 0 0]
parch               : [0 2 0 0 0]
fare                : [ 26.288 263.     12.875  30.5     8.05 ]
class               : [b'First' b'First' b'Second' b'First' b'Third']
deck                : [b'E' b'C' b'D' b'C' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Cherbourg' b'Southampton' b'Southampton']
alone               : [b'y' b'n' b'y' b'y' b'y']


## Data preprocessing
