In [None]:
import pandas as pd
import numpy as np

# Directory that holds the generated dataset and its three splits.
FILE_PATH = '../datasets/cylinder/'
# Passed to the estimator as its model_dir (and wiped before training).
OUTDIR = '../models/cylinder'

# Full dataset plus the train / validation / test files derived from it.
DATASET = '{}cylinder.csv'.format(FILE_PATH)
TRAIN_SET = '{}cylinder.train.csv'.format(FILE_PATH)
VALIDATION_SET = '{}cylinder.validation.csv'.format(FILE_PATH)
TEST_SET = '{}cylinder.test.csv'.format(FILE_PATH)

In [None]:
def generate_data(count, seed=None, path=None):
    """Generate a synthetic cylinder dataset and write it to a CSV file.

    Every row holds a 'Radius' and a 'Height', each drawn uniformly from
    [0.5, 2.0), together with the matching 'Volume' = pi * Radius**2 * Height.

    Args:
        count: Number of rows to generate.
        seed: Optional integer seed. When given, a private ``RandomState`` is
            used so the generated data is reproducible; when ``None`` NumPy's
            global random state is used.
        path: Destination CSV path. Defaults to the module-level ``DATASET``.

    Returns:
        None. The data is only written to disk, without the index column.
    """
    # np.random exposes the same rand() as a RandomState instance, so both
    # the seeded and the unseeded path share the line below.
    rng = np.random if seed is None else np.random.RandomState(seed)
    ndata = rng.rand(count, 2) * 1.5 + 0.5
    df = pd.DataFrame(data=ndata, columns=['Radius', 'Height'])
    # Use np.pi instead of the truncated literal 3.14, which would bake a
    # ~0.05% systematic error into every label the model is trained on.
    df['Volume'] = np.pi * df['Radius'] ** 2 * df['Height']
    df.to_csv(DATASET if path is None else path, index=False)

In [None]:
# Generate 100,000 synthetic rows and write them to the dataset CSV.
generate_data(100000)

In [None]:
# Reload the generated dataset from disk and preview its first rows.
df = pd.read_csv(DATASET)
df.head()

In [None]:
import hashlib

def split(df):
    """Split ``df`` into train/validation/test sets (~70/15/15) and save them.

    Each row gets an id derived from its 'Radius' and 'Height'; ``df_split``
    hashes that id to decide which set the row belongs to, so the assignment
    depends only on the row's values, not on its position.

    Args:
        df: DataFrame with at least 'Radius' and 'Height' columns. It is not
            modified.

    Returns:
        None. The three sets are written to ``TRAIN_SET``, ``VALIDATION_SET``
        and ``TEST_SET`` without the helper 'Id' column and without the index.
    """
    train_ratio = 0.7
    test_validation_ratio = 0.15

    # assign() builds a new frame, so the caller's DataFrame does not
    # silently gain an 'Id' column.
    df_with_id = df.assign(Id=df['Radius'] * 1000 + df['Height'])

    # df_split returns (rows NOT selected, rows selected), where the selected
    # rows are those whose hash byte is below 256 * ratio, i.e. roughly
    # `ratio` of the data. The selected part is therefore the training set.
    temp_set, train_set = df_split(df_with_id, train_ratio, 'Id')

    # temp_set only contains rows whose hash byte is >= 256 * train_ratio and
    # the same hash is reused here, so the threshold has to be cumulative:
    # bytes in [0.70, 0.85) * 256 -> validation, the rest -> test.
    test_set, validation_set = df_split(
        temp_set, train_ratio + test_validation_ratio, 'Id')

    # drop() without inplace=True returns new frames instead of mutating the
    # .loc slices produced by df_split.
    train_set.drop('Id', axis=1).to_csv(TRAIN_SET, index=False)
    validation_set.drop('Id', axis=1).to_csv(VALIDATION_SET, index=False)
    test_set.drop('Id', axis=1).to_csv(TEST_SET, index=False)
    

def test_set_check(identifier, ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * ratio
    
def df_split(data, ratio, id_column, hash=hashlib.md5):
    """Partition ``data`` in two using a hash of each row's id.

    Args:
        data: DataFrame to partition.
        ratio: Fraction handed to ``test_set_check`` as the selection threshold.
        id_column: Name of the column holding the per-row id.
        hash: Hash constructor forwarded to ``test_set_check``.

    Returns:
        A tuple ``(not_selected, selected)``: first the rows for which
        ``test_set_check`` returned False, then the rows for which it
        returned True.
    """
    def _is_selected(row_id):
        return test_set_check(row_id, ratio, hash)

    selected_mask = data[id_column].apply(_is_selected)
    not_selected_rows = data.loc[~selected_mask]
    selected_rows = data.loc[selected_mask]
    return not_selected_rows, selected_rows
    
    

In [None]:
# Write the train / validation / test CSV files derived from the full dataset.
split(df)

In [None]:
import tensorflow as tf

# Column layout of the CSV files: every column but the last is an input
# feature, the last one is the value to predict.
CSV_COLUMNS = ['Radius', 'Height', 'Volume']
FEATURES = CSV_COLUMNS[:-1]
LABEL = CSV_COLUMNS[-1]

In [None]:
# Load the training split from disk and preview its first rows.
df_train = pd.read_csv(TRAIN_SET)
df_train.head()

In [None]:
# Load the validation split from disk and preview its first rows.
df_valid = pd.read_csv(VALIDATION_SET)
df_valid.head()

In [None]:
# Load the test split from disk and preview its first rows.
df_test = pd.read_csv(TEST_SET)
df_test.head()

In [None]:
# Render figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plot
# Apply matplotlib's 'dark_background' style to the figures drawn afterwards.
plot.style.use(['dark_background'])

In [None]:
# Volume against radius for the training split.
df_train.plot.scatter(x='Radius', y='Volume', figsize=(10, 10), alpha=0.4)

In [None]:
# Volume against height for the training split.
df_train.plot.scatter(x='Height', y='Volume', figsize=(10, 10), alpha=0.4)

In [None]:
def make_train_input_fn(df, num_epochs):
    """Build the input function used for training.

    Args:
        df: DataFrame containing the feature columns and the ``LABEL`` column.
            The whole frame (label column included) is passed as ``x``.
        num_epochs: Number of passes over ``df``, forwarded unchanged.

    Returns:
        The input function created by ``pandas_input_fn``: shuffled,
        batch size 128, queue capacity 1000.
    """
    input_fn_kwargs = dict(
        x=df,
        y=df[LABEL],
        batch_size=128,
        num_epochs=num_epochs,
        shuffle=True,
        queue_capacity=1000,
    )
    return tf.estimator.inputs.pandas_input_fn(**input_fn_kwargs)

In [None]:
def make_eval_input_fn(df):
    """Build the input function used for evaluation.

    Args:
        df: DataFrame containing the feature columns and the ``LABEL`` column.
            The whole frame (label column included) is passed as ``x``.

    Returns:
        The input function created by ``pandas_input_fn``: not shuffled,
        batch size 128, queue capacity 1000. ``num_epochs`` is not set here,
        so the library default applies.
    """
    input_fn_kwargs = dict(
        x=df,
        y=df[LABEL],
        batch_size=128,
        shuffle=False,
        queue_capacity=1000,
    )
    return tf.estimator.inputs.pandas_input_fn(**input_fn_kwargs)

In [None]:
def make_prediction_input_fn(df):
    """Build the input function used for prediction (no labels).

    Args:
        df: DataFrame passed as ``x`` in full; no label series is supplied.

    Returns:
        The input function created by ``pandas_input_fn``: not shuffled,
        batch size 128, queue capacity 1000. ``num_epochs`` is not set here,
        so the library default applies.
    """
    input_fn_kwargs = dict(
        x=df,
        y=None,
        batch_size=128,
        shuffle=False,
        queue_capacity=1000,
    )
    return tf.estimator.inputs.pandas_input_fn(**input_fn_kwargs)

In [None]:
def make_feature_cols():
    """Return a list with one numeric feature column per name in ``FEATURES``."""
    return list(map(tf.feature_column.numeric_column, FEATURES))

In [None]:
# shutil is used below but was never imported anywhere in the notebook, which
# raises NameError on a fresh kernel; import it where it is needed.
import shutil

tf.logging.set_verbosity(tf.logging.INFO)
# Start fresh each time: remove whatever a previous run left in the model
# directory (a missing directory is not an error).
shutil.rmtree(OUTDIR, ignore_errors=True)

# Regressor with three hidden layers (32, 8 and 2 units) over the numeric
# feature columns; its files go to OUTDIR.
model = tf.estimator.DNNRegressor(
    hidden_units=[32, 8, 2],
    feature_columns=make_feature_cols(),
    model_dir=OUTDIR)

# Train for 100 passes over the training split; the trailing ';' suppresses
# the cell's output value.
model.train(input_fn=make_train_input_fn(df_train, num_epochs=100));

In [None]:
def eval_rmse(model, df):
    """Evaluate ``model`` on ``df`` and print the resulting RMSE.

    Args:
        model: Object whose ``evaluate(input_fn=...)`` returns a mapping with
            an 'average_loss' entry.
        df: DataFrame handed to ``make_eval_input_fn``.

    Returns:
        None. The value is only printed.
    """
    eval_input_fn = make_eval_input_fn(df)
    average_loss = model.evaluate(input_fn=eval_input_fn)['average_loss']
    # NOTE(review): presumably 'average_loss' is the mean squared error for
    # this regressor, which is what makes its square root an RMSE - confirm.
    rmse = np.sqrt(average_loss)
    print('RMSE on dataset = {}'.format(rmse))

In [None]:
# Report the RMSE of the trained model on the validation split.
eval_rmse(model, df_valid)

In [None]:
# Run the model over the test split and print every prediction it yields.
# NOTE(review): this prints one line per prediction, which floods the output
# for a large test split - consider limiting it (e.g. itertools.islice).
predictions = model.predict(input_fn=make_prediction_input_fn(df_test))
for prediction in predictions:
    print(prediction)

In [None]:
# Two hand-picked cylinders (radius, height) to sanity-check the model.
# Note that 2.0 sits at the upper edge of what generate_data samples
# (radius and height are drawn from [0.5, 2.0)).
df_predict = pd.DataFrame(
    data=[[1.0, 1.0],[2.0,2.0]],
    columns=FEATURES
)

df_predict.head()

In [None]:
# Predict the volume of the hand-picked cylinders and print each result.
results = model.predict(input_fn=make_prediction_input_fn(df_predict))

for result in results:
    print(result)