In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import os
import sklearn
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl,np,pd,sklearn,tf,keras:
    print(module.__name__, module.__version__)

2.0.0-rc1
sys.version_info(major=3, minor=6, micro=3, releaselevel='final', serial=0)
matplotlib 2.1.0
numpy 1.18.0
pandas 0.20.3
sklearn 0.21.3
tensorflow 2.0.0-rc1
tensorflow_core.keras 2.2.4-tf


tfrecord 文件格式
-> tf.train.Example
     ->tf.train.Features->{"key":tf.train.Feature}
        -> tf.train.Feature -> tf.train.ByteList/FloatList/Int64List

In [2]:
favorite_books = [name.encode('utf-8') for name in ["machine learning","cc150"]]

favorite_books_bytelist = tf.train.BytesList(value = favorite_books)
print(favorite_books)

hours_floatlist = tf.train.FloatList(value = [15.5,9.5,7.0,8.0])
print(hours_floatlist)

age_int64list = tf.train.Int64List(value = [42])
print(age_int64list)

features = tf.train.Features(
    feature = {
        "favorite_books":tf.train.Feature(bytes_list=favorite_books_bytelist),
        "hours":tf.train.Feature(float_list = hours_floatlist),
        "age":tf.train.Feature(int64_list = age_int64list),
    }
)

print(features)

example = tf.train.Example(features = features)
print(example)

serialized_example = example.SerializeToString()
print(serialized_example)

[b'machine learning', b'cc150']
value: 15.5
value: 9.5
value: 7.0
value: 8.0

value: 42

feature {
  key: "age"
  value {
    int64_list {
      value: 42
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine learning"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 15.5
      value: 9.5
      value: 7.0
      value: 8.0
    }
  }
}

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 42
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 15.5
        value: 9.5
        value: 7.0
        value: 8.0
      }
    }
  }
}

b'\n\\\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n

In [3]:
output_dir = 'tfrecord_basic'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
filename = "test.tfrecord"
filename_fullpath = os.path.join(output_dir,filename)
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for i in range(3):
        writer.write(serialized_example)

In [4]:
dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    print(serialized_example_tensor)

tf.Tensor(b'\n\\\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)
tf.Tensor(b'\n\\\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)
tf.Tensor(b'\n\\\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)


In [5]:
expected_features = {
    "favorite_books":tf.io.VarLenFeature(dtype = tf.string),
    'hours':tf.io.VarLenFeature(dtype = tf.float32),
    "age":tf.io.FixedLenFeature([],dtype = tf.int64),
}
dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    example = tf.io.parse_single_example(
        serialized_example_tensor,
        expected_features)
    books = tf.sparse.to_dense(example["favorite_books"],default_value = b"")
    for book in books:
        print(book.numpy().decode("UTF-8"))

machine learning
cc150
machine learning
cc150
machine learning
cc150


In [6]:
#压缩形式存储和读取
filename_fullpath_zip = filename_fullpath + '.zip'
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter(filename_fullpath_zip,options) as writer:
    for i in range(3):
        writer.write(serialized_example)

In [7]:
dataset_zip = tf.data.TFRecordDataset([filename_fullpath_zip],compression_type="GZIP")
for serialized_example_tensor in dataset_zip:
    example = tf.io.parse_single_example(
        serialized_example_tensor,
        expected_features)
    books = tf.sparse.to_dense(example["favorite_books"],default_value = b"")
    for book in books:
        print(book.numpy().decode("UTF-8"))

machine learning
cc150
machine learning
cc150
machine learning
cc150


In [12]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
print(housing.DESCR)
print(housing.data.shape)
print(housing.target.shape)
print(housing.data[:5,:])
print(housing.target[:5])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [13]:
from sklearn.model_selection import train_test_split
x_train_all, x_test, y_train_all, y_test = train_test_split(housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, random_state = 11)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_valid.shape, y_valid.shape)

(11610, 8) (11610,)
(5160, 8) (5160,)
(3870, 8) (3870,)


In [14]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaler = scaler.fit_transform(x_train)
x_valid_scaler = scaler.transform(x_valid)
x_test_scaler = scaler.transform(x_test)

In [18]:
train=np.c_[x_train_scaler,y_train]
valid=np.c_[x_valid_scaler,y_valid]
test=np.c_[x_test_scaler,y_test]

np.savetxt("data/csv/train.csv",train,delimiter=',')
np.savetxt("data/csv/valid.csv",valid,delimiter=',')
np.savetxt("data/csv/test.csv",test,delimiter=',')
# print(train.shape)
# print(train[:5,:])
# print(x_train_scaler[:5,:])
# print(y_train[:5])

In [19]:
def parse_csv_line(line,n_fields=9):
    defs = [tf.constant(np.nan)]*n_fields
    parsed_fields = tf.io.decode_csv(line,record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])
    y = tf.stack(parsed_fields[-1:])
    return x,y

# def csv_reader_dataset(filenames,n_readers=5,batch_size=32,n_parse_threads=5,
#                       shuffle_buffer_size=10000):
#     dataset = tf.data.Dataset.list_files(filenames)
#     dataset = dataset.repeat()
#     dataset = dataset.interleave(
#         lambda filename:tf.data.TextLineDataset(filename),#tf.data.TextLineDataset(filename).skip(1)
#         cycle_length = n_readers)
#     dataset.shuffle(shuffle_buffer_size)
#     dataset = dataset.map(parse_csv_line, num_parallel_calls = n_parse_threads)
#     dataset = dataset.batch(batch_size)
    
#     x = tf.stack(parsed_fields[0:-1])
#     y = tf.stack(parsed_fields[-1:])
#     return x,y

def csv_reader_dataset(filenames,n_readers=5,batch_size=32,n_parse_threads=5,
                      shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename:tf.data.TextLineDataset(filename),
        cycle_length = n_readers)
    dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line,num_parallel_calls = n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

In [21]:
batch_size=3
train_filenames=["data/csv/train.csv"]
valid_filenames=["data/csv/valid.csv"]
test_filenames=["data/csv/test.csv"]
train_set = csv_reader_dataset(train_filenames,batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames,batch_size=batch_size)
test_set = csv_reader_dataset(test_filenames,batch_size=batch_size)
# for items in train_set.take(1):
#     print(items)

In [26]:
def serialize_example(x,y):
    """Converts x,y to tf.train.Example and serialize"""
    input_features = tf.train.FloatList(value = x)
    label = tf.train.FloatList(value = y)
    features = tf.train.Features(
        feature = {
            "input_features":tf.train.Feature(float_list = input_features),
            "label":tf.train.Feature(float_list = label)
        }
    )
    example = tf.train.Example(features = features)
    return example.SerializeToString()

def csv_to_tfrecords(csv_filename,tfrecords_filename,compression_type = None):
    options = tf.io.TFRecordOptions(compression_type=compression_type)
    data=np.loadtxt(csv_filename,delimiter=',')
    
    with tf.io.TFRecordWriter(tfrecords_filename,options) as writer:
        for line in data:
            x=line[:-1]
            y=line[-1:]
            writer.write(serialize_example(x,y))
        
csv_to_tfrecords("data/csv/train.csv","data/tfrecords/train.tfrecords")
csv_to_tfrecords("data/csv/valid.csv","data/tfrecords/valid.tfrecords")
csv_to_tfrecords("data/csv/test.csv","data/tfrecords/test.tfrecords")

csv_to_tfrecords("data/csv/train.csv","data/zip_tfrecords/train.tfrecords",compression_type="GZIP")
csv_to_tfrecords("data/csv/valid.csv","data/zip_tfrecords/valid.tfrecords",compression_type="GZIP")
csv_to_tfrecords("data/csv/test.csv","data/zip_tfrecords/test.tfrecords",compression_type="GZIP")

In [28]:
expected_features = {
    "input_features":tf.io.FixedLenFeature([8],dtype = tf.float32),
    "label":tf.io.FixedLenFeature([1],dtype = tf.float32)
}

def parse_example(serialized_example):
    example = tf.io.parse_single_example(serialized_example, expected_features)
    return example["input_features"],example["label"]

def tfrecords_reader_dataset(filenames,n_readers=5,batch_size=32,n_parse_threads=5,
                             shuffle_buffer_size=10000, compression_type = None):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename:tf.data.TFRecordDataset(
            filename,compression_type = compression_type),
        cycle_length = n_readers)
    dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_example,num_parallel_calls = n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

tfr_train_filenames=["data/zip_tfrecords/train.tfrecords"]
tfr_valid_filenames=["data/zip_tfrecords/valid.tfrecords"]
tfr_test_filenames=["data/zip_tfrecords/test.tfrecords"]

train_dataset=tfrecords_reader_dataset(tfr_train_filenames,n_readers=1,compression_type="GZIP")
valid_dataset=tfrecords_reader_dataset(tfr_valid_filenames,n_readers=1,compression_type="GZIP")
test_dataset=tfrecords_reader_dataset(tfr_test_filenames,n_readers=1,compression_type="GZIP")

# for item in train_dataset.take(1):
#     print(item)

In [30]:
model = keras.Sequential([
    keras.layers.Dense(30,activation="relu",input_shape=[8]),
    keras.layers.Dense(1),
])

model.compile(loss="mean_squared_error",optimizer="sgd")
callbacks = [keras.callbacks.EarlyStopping(patience=5,min_delta=1e-2)]

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 30)                270       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 31        
Total params: 301
Trainable params: 301
Non-trainable params: 0
_________________________________________________________________


In [33]:
history = model.fit(train_dataset,
                    validation_data = valid_dataset,
                    steps_per_epoch = 11160//32,
                    validation_steps = 3870//32,
                    epochs = 100,
                    callbacks = callbacks)

Train for 348 steps, validate for 120 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


In [34]:
model.evaluate(test_dataset,steps = 5160//32)



0.37270829386962867