# Chapter 13 - Loading and Preprocessing Data with TensorFlow Code Reproduction

In [1]:
# Impor umum
import numpy as np
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Build a dataset whose elements are the slices of an in-memory tensor
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)
print("Dataset awal:", list(dataset.as_numpy_iterator()))

# Chain transformations one step at a time: three passes over the data,
# then groups of seven consecutive elements
dataset = dataset.repeat(3)
dataset = dataset.batch(7)
print("Dataset setelah repeat(3) dan batch(7):")
for batch in dataset:
    print(batch)

# map -> shuffle -> batch -> prefetch, written as a single chain
dataset = (
    tf.data.Dataset.range(10)
    .map(lambda value: value * 2)      # double every element
    .shuffle(buffer_size=5, seed=42)   # shuffle through a 5-element buffer
    .batch(7)                          # group the shuffled elements by seven
    .prefetch(1)                       # keep one batch prepared in advance
)

print("\nDataset setelah map, shuffle, batch, dan prefetch:")
for batch in dataset:
    print(batch)

Dataset awal: [np.int32(0), np.int32(1), np.int32(2), np.int32(3), np.int32(4), np.int32(5), np.int32(6), np.int32(7), np.int32(8), np.int32(9)]
Dataset setelah repeat(3) dan batch(7):
tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)

Dataset setelah map, shuffle, batch, dan prefetch:
tf.Tensor([ 0  4  6 12 14 18  2], shape=(7,), dtype=int64)
tf.Tensor([16  8 10], shape=(3,), dtype=int64)


Interleaving Files (Membaca Beberapa File secara Bersamaan)
Contoh bagaimana membaca dari beberapa file CSV secara efisien.

In [3]:
# (Asumsikan Anda sudah memiliki data housing dan menyimpannya ke CSV)
# Kode ini mendemonstrasikan konsepnya
# filepath_dataset = tf.data.Dataset.list_files(file_pattern)
# n_readers = 5
# dataset = filepath_dataset.interleave(
#     lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
#     cycle_length=n_readers)

In [4]:
# Write two raw byte-string records to a TFRecord file
with tf.io.TFRecordWriter("my_data.tfrecord") as writer:
    for raw_record in (b"Data pertama", b"Data kedua"):
        writer.write(raw_record)

# Protobuf classes needed to assemble a tf.train.Example, taken from the
# already-imported `tf` namespace
BytesList = tf.train.BytesList
FloatList = tf.train.FloatList
Int64List = tf.train.Int64List
Feature = tf.train.Feature
Features = tf.train.Features
Example = tf.train.Example

# One contact: a name, an integer id and a list of e-mail addresses
contact_fields = {
    "name": Feature(bytes_list=BytesList(value=[b"Alice"])),
    "id": Feature(int64_list=Int64List(value=[123])),
    "emails": Feature(bytes_list=BytesList(value=[b"a@b.com", b"c@d.com"])),
}
person_example = Example(features=Features(feature=contact_fields))

# Serialize the protobuf to a binary string and store it as a single record
with tf.io.TFRecordWriter("my_contacts.tfrecord") as writer:
    serialized_person = person_example.SerializeToString()
    writer.write(serialized_person)

In [5]:
# Read the serialized records back from the TFRecord file
filepaths = ["my_contacts.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)

# Parsing schema: which features to extract and with which type
feature_description = {
    "name": tf.io.FixedLenFeature([], tf.string, default_value=""),
    "id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
    # Variable-length feature
    "emails": tf.io.VarLenFeature(tf.string),
}


def parse_examples(serialized_examples):
    """Parse serialized Example protos using ``feature_description``.

    Args:
        serialized_examples: string tensor holding serialized
            ``tf.train.Example`` protos.

    Returns:
        The result of ``tf.io.parse_example``: a dict keyed by the names in
        ``feature_description``.
    """
    parsed = tf.io.parse_example(serialized_examples, feature_description)
    return parsed


# Apply the parser to every record and show the result
dataset_parsed = dataset.map(parse_examples)
print("\nDataset setelah di-parse dari TFRecord:")
for record in dataset_parsed:
    print(record)


Dataset setelah di-parse dari TFRecord:
{'emails': SparseTensor(indices=tf.Tensor(
[[0]
 [1]], shape=(2, 1), dtype=int64), values=tf.Tensor([b'a@b.com' b'c@d.com'], shape=(2,), dtype=string), dense_shape=tf.Tensor([2], shape=(1,), dtype=int64)), 'id': <tf.Tensor: shape=(), dtype=int64, numpy=123>, 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'Alice'>}


In [6]:
# Sample vocabulary and the integer index assigned to each category
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
indices = tf.range(len(vocab), dtype=tf.int64)

# 1. One-hot encoding
# Lookup table mapping each category string to its integer index
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
num_oov_buckets = 2 # Extra buckets for strings that are not in the vocabulary
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)
# Total number of distinct indices the table can return (vocabulary + OOV buckets)
n_categories = len(vocab) + num_oov_buckets
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
print("\nHasil mapping string ke integer:", cat_indices)
# One-hot encode the integer indices
cat_one_hot = tf.one_hot(cat_indices, depth=n_categories)
print("Hasil one-hot encoding:\n", cat_one_hot)


# 2. Embedding
# Trainable embedding layer: one `embedding_dim`-sized vector per category index.
# No initial matrix is supplied, so the layer uses its default initializer
# (the previously computed, never-used `embed_init` tensor has been removed).
embedding_dim = 2
embedding_layer = keras.layers.Embedding(input_dim=n_categories,
                                         output_dim=embedding_dim)
print("\nHasil embedding:\n", embedding_layer(cat_indices))


Hasil mapping string ke integer: tf.Tensor([3 5 1 1], shape=(4,), dtype=int64)
Hasil one-hot encoding:
 tf.Tensor(
[[0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]], shape=(4, 7), dtype=float32)

Hasil embedding:
 tf.Tensor(
[[-0.01203145  0.03223113]
 [-0.01718166  0.03693749]
 [ 0.01563598 -0.02709633]
 [ 0.01563598 -0.02709633]], shape=(4, 2), dtype=float32)


In [7]:
# Normalization layer and a tiny feature matrix (3 samples, 2 features)
norm_layer = keras.layers.Normalization()
X_train = np.array([
    [0.1, 0.2],
    [0.3, 0.4],
    [0.5, 0.6],
])

# adapt() lets the layer learn the per-feature statistics from the training
# data; calling the layer afterwards applies the normalization
norm_layer.adapt(X_train)
X_normalized = norm_layer(X_train)
print("\nHasil normalisasi fitur:\n", X_normalized)


Hasil normalisasi fitur:
 tf.Tensor(
[[-1.2247449 -1.2247448]
 [ 0.         0.       ]
 [ 1.2247448  1.2247449]], shape=(3, 2), dtype=float32)


In [None]:
# Load the California housing data; the targets are reshaped to a column vector
housing = fetch_california_housing()
housing_targets = housing.target.reshape(-1, 1)

# First split: full training set vs. test set
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing_targets, random_state=42)
# Second split: training set vs. validation set
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

# Fit the scaler on the training features only and keep its statistics
scaler = StandardScaler().fit(X_train)
X_mean, X_std = scaler.mean_, scaler.scale_

# Helper that splits a dataset across several CSV files
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV files and write them to disk.

    The files are created in the current working directory and are named
    ``my_<name_prefix>_<NN>.csv`` where ``NN`` is the zero-padded part index.

    Args:
        data: 2-D array-like of numeric values (one sample per row).
        name_prefix: string inserted in every file name (e.g. ``"train"``).
        header: optional header line (without trailing newline) written at the
            top of every file; nothing is written when it is ``None``.
        n_parts: number of files to produce. Rows are distributed as evenly as
            possible; when there are fewer rows than parts, some files contain
            only the header.

    Returns:
        The list of file paths that were written, in part order.
    """
    data = np.asarray(data)
    filepaths = []
    # array_split tolerates a row count that is not a multiple of n_parts
    row_chunks = np.array_split(np.arange(len(data)), n_parts)
    for part_idx, row_indices in enumerate(row_chunks):
        part_path = "my_{}_{:02d}.csv".format(name_prefix, part_idx)
        with open(part_path, "w", encoding="utf-8") as csv_file:
            if header is not None:
                csv_file.write(header + "\n")
            for row in data[row_indices]:
                # Convert to built-in float first: repr() of a NumPy scalar can
                # include the type name (e.g. "np.float64(1.0)"), which is not
                # valid CSV content.
                csv_file.write(",".join(repr(float(value)) for value in row) + "\n")
        filepaths.append(part_path)
    return filepaths

# (Langkah ini mengasumsikan data telah disimpan ke file CSV)
# Kita akan membuat pipeline untuk membaca file-file tersebut.

# 1. Membuat pipeline dataset
# (Gantilah 'my_data_*.csv' dengan path file Anda jika menjalankan secara nyata)
# filepaths = tf.data.Dataset.list_files("my_data_*.csv")
# dataset = filepaths.interleave(...)

# 2. Fungsi pra-pemrosesan
# (Fungsi ini akan mem-parse setiap baris CSV, melakukan scaling, dan mengembalikan fitur & label)
# @tf.function
# def preprocess(line):
#     defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
#     fields = tf.io.decode_csv(line, record_defaults=defs)
#     x = tf.stack(fields[:-1])
#     y = tf.stack(fields[-1:])
#     return (x - X_mean) / X_std, y


# 3. Membangun pipeline akhir dari data di memori (untuk demonstrasi)
# train_data = np.c_[X_train, y_train]
# valid_data = np.c_[X_valid, y_valid]
# test_data = np.c_[X_test, y_test]

def build_pipeline(X_data, y_data, n_epochs=None, shuffle_buffer_size=None, batch_size=32):
    """Build a tf.data input pipeline from in-memory features and labels.

    Steps, in order: slice ``(X_data, y_data)`` into per-sample pairs,
    optionally shuffle, batch, repeat, and prefetch one batch.

    Args:
        X_data: features; sliced along the first axis.
        y_data: labels; sliced along the first axis together with ``X_data``.
        n_epochs: forwarded as-is to ``Dataset.repeat``. Per the tf.data
            documentation, ``None`` (the default) repeats indefinitely.
        shuffle_buffer_size: shuffle buffer size; shuffling is skipped when
            this is ``None`` or any other falsy value.
        batch_size: number of samples per batch.

    Returns:
        The resulting ``tf.data.Dataset`` of ``(features, labels)`` batches.
    """
    pipeline = tf.data.Dataset.from_tensor_slices((X_data, y_data))
    if shuffle_buffer_size:
        pipeline = pipeline.shuffle(shuffle_buffer_size)
    batched = pipeline.batch(batch_size)
    repeated = batched.repeat(n_epochs)
    return repeated.prefetch(1)

# Build the training, validation and test pipelines.
# - The features are standardized with the statistics fitted on the training
#   set (X_mean / X_std); without this step those statistics are never used
#   and the model is trained on raw, unscaled features.
# - n_epochs=1 keeps every dataset finite. build_pipeline's default
#   (n_epochs=None) repeats forever, so fit() without steps_per_epoch and
#   evaluate() could never reach the end of the data. Keras iterates over a
#   finite dataset again at every epoch, so no explicit repetition is needed.
train_set = build_pipeline((X_train - X_mean) / X_std, y_train, n_epochs=1,
                           shuffle_buffer_size=len(X_train))
valid_set = build_pipeline((X_valid - X_mean) / X_std, y_valid, n_epochs=1)
test_set = build_pipeline((X_test - X_mean) / X_std, y_test, n_epochs=1)


# 4. Train a model using the pipelines
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1)
])
model.compile(loss="mse", optimizer="nadam")
history = model.fit(train_set, epochs=10, validation_data=valid_set)

# Evaluate the model on the test-set pipeline
print("\nMengevaluasi model pada test set:")
model.evaluate(test_set)