In [16]:
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import pandas as pd
import time
import sklearn
import tensorflow as tf

print(tf.__version__)

for module in np, sklearn, tf.keras, pd:
    print(module.__name__, module.__version__)

2.0.0
numpy 1.17.4
sklearn 0.20.0
tensorflow_core.keras 2.2.4-tf
pandas 0.25.3


In [17]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
print(housing.DESCR)
print(housing.data.shape)
print(housing.target.shape)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(housing.data, housing.target, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, random_state=11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scalered = scaler.fit_transform(x_train)
x_valid_scalered = scaler.transform(x_valid)
x_test_scalered = scaler.transform(x_test)
print(x_train_scalered.shape)
print(len(x_train_scalered.shape))
print(type(x_train_scalered))

(11610, 8)
2
<class 'numpy.ndarray'>


In [20]:
output_dir = 'generate_csv'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

def save_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    path_format = os.path.join(output_dir, '{}_{:02d}.csv')
    filenames = []
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, 'wt', encoding='utf-8') as f:
            if header is not None:
                f.write(header + '\n')
            for row_index in row_indices:
                f.write(','.join([repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames
                
train_data = np.c_[x_train_scalered, y_train]
valid_data = np.c_[x_valid_scalered, y_valid]
test_data = np.c_[x_test_scalered, y_test]
header_col = housing.feature_names + ['MidianHouseValue']
header_str = ','.join(header_col)
print(header_col)
print(header_str)

train_filenames = save_csv(output_dir, train_data, 'train', header_str, 20)
valid_filenames = save_csv(output_dir, valid_data, 'valid', header_str, 10)
test_filenames = save_csv(output_dir, test_data, 'test', header_str, 10)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'MidianHouseValue']
MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MidianHouseValue


In [21]:
filenames_dataset = tf.data.Dataset.list_files(train_filenames)
dataset = filenames_dataset.interleave(
    lambda x: tf.data.TextLineDataset(x).skip(1),
    cycle_length=5
)
for filename in filenames_dataset:
    print(filename)
for line in dataset.take(15):
    print(line.numpy())

tf.Tensor(b'generate_csv/train_05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_00.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_08.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_03.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_12.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_15.csv', 

In [22]:
def parse_csv_line(line, n_fields=9):
    defs=[tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line,defs)
    x = tf.stack(parsed_fields[0:-1])
    y = tf.stack(parsed_fields[-1:])
    return x, y

parse_csv_line(b'-1.0775077698160966,-0.44874070548966555,-0.5680568205591913,-0.14269262164909954,-0.09666677138213985,0.12326468238687088,-0.3144863716683942,-0.4818958888413162,0.978', 9)

(<tf.Tensor: id=782, shape=(8,), dtype=float32, numpy=
 array([-1.0775077 , -0.4487407 , -0.5680568 , -0.14269263, -0.09666677,
         0.12326469, -0.31448638, -0.4818959 ], dtype=float32)>,
 <tf.Tensor: id=783, shape=(1,), dtype=float32, numpy=array([0.978], dtype=float32)>)

In [27]:
def csv_reader_dataset(filenames, n_readers=5, batch_size=32, n_parsed_threads=5, shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers
    )
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parsed_threads)
    dataset = dataset.batch(batch_size)
    return dataset

import pprint
train_set = csv_reader_dataset(train_filenames, batch_size=3)
for x_batch,y_batch in train_set.take(2):
    pprint.pprint(x_batch)
    pprint.pprint(y_batch)
    

<tf.Tensor: id=1203, shape=(3, 8), dtype=float32, numpy=
array([[-0.8816899 , -1.4900438 , -0.43490088, -0.09926124, -0.40514407,
        -0.15319672,  1.6730974 , -0.7415562 ],
       [ 0.4845059 , -0.68904144,  0.2768636 , -0.12265252, -0.05402432,
        -0.00813306,  1.318505  , -1.5405111 ],
       [-1.051079  ,  1.0731637 , -0.42531   , -0.2708388 , -0.6510186 ,
         0.11650889, -0.72040135,  1.1260008 ]], dtype=float32)>
<tf.Tensor: id=1204, shape=(3, 1), dtype=float32, numpy=
array([[1.327],
       [1.905],
       [0.597]], dtype=float32)>
<tf.Tensor: id=1205, shape=(3, 8), dtype=float32, numpy=
array([[ 0.39973143,  0.51246214,  0.05106371, -0.0889643 , -0.6129126 ,
         0.14187497, -0.6597474 ,  0.5817128 ],
       [ 1.533983  , -0.4487407 ,  0.70806104, -0.18414612, -0.5131111 ,
        -0.0761603 , -1.3316067 ,  1.2658179 ],
       [ 1.1067004 , -0.12833975,  0.04851111, -0.3518474 , -0.9413502 ,
         0.00929013, -0.8043838 ,  0.83637965]], dtype=float32)>
<tf.

In [28]:
train_set = csv_reader_dataset(train_filenames)
valid_set = csv_reader_dataset(valid_filenames)
test_set = csv_reader_dataset(test_filenames)

In [30]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(30, activation='relu', input_shape=[8]),
    tf.keras.layers.Dense(1)
])
callbacks = [tf.keras.callbacks.EarlyStopping(patience=5, min_delta=1e-2)]
model.compile(optimizer='sgd', loss='mse')
model.summary()
history = model.fit(train_set,validation_data=valid_set,
                    steps_per_epoch= 11160 // 32,
                    validation_steps=3870 // 32,
                    epochs=100,callbacks=callbacks)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 30)                270       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 31        
Total params: 301
Trainable params: 301
Non-trainable params: 0
_________________________________________________________________
Train for 348 steps, validate for 120 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


In [31]:
model.evaluate(test_set,steps=5160 // 32)



0.405245676590419