In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import pandas as pd
import numpy as np
import sklearn
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
#from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

### Preprocessing

In [2]:
dfTrain = pd.read_csv('dfTrain.csv', index_col=0).reset_index()
dfTest = pd.read_csv('dfTest.csv', index_col=0).reset_index()

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
def changeCate(df, col, values, nor = False):
    if not nor:
        for value in values:
            df.loc[df[col] == value, col] = 'other'
    else:
        df.loc[~df[col].isin(values), col] = 'other'
    return df

In [4]:
def preprocess(df):
    drop_col = list(df.filter(regex = 'trafficSource.adwordsClickInfo.+').columns.values)
    drop_col.extend(['index', 'date', 'device.deviceCategory', 'geoNetwork.city',
                     'geoNetwork.continent', 'geoNetwork.country', 'geoNetwork.metro', 'geoNetwork.region',
                     'trafficSource.adContent', 'trafficSource.keyword', 'trafficSource.referralPath'])
    df = df.drop(drop_col, axis = 1)
    df = changeCate(df, 'channelGrouping', ['Affiliates', 'Paid Search', 'Display', '(Other)'])
    df = changeCate(df, 'device.browser', ['Safari', 'Chrome'], nor = True)
    df = changeCate(df, 'device.operatingSystem', ['iOS', 'Windows', 'Android', 'Macintosh'], nor = True)
    df = changeCate(df, 'geoNetwork.networkDomain', ['(not set)', 'unknown.unknown'], nor = True)
    df = changeCate(df, 'geoNetwork.subContinent', ['Southeast Asia', 'Northern America', 'Southern Asia',
                                                    'Eastern Asia', 'Western Europe', 'Western Asia', 'Southern Europe',
                                                    'Northern Europe', 'South America', 'Eastern Europe'], nor = True)
    df = changeCate(df, 'trafficSource.campaign', ['(not set)'], nor = True)
    df['trafficSource.isTrueDirect'].fillna(False, inplace=True)
    df = changeCate(df, 'trafficSource.medium', ['organic', 'referral', '(none)'], nor = True)
    df = changeCate(df, 'trafficSource.source', ['google', 'youtube.com', '(direct)', 'mall.googleplex.com'], nor = True)
    return df

In [5]:
train = preprocess(dfTrain)
cateCols = ['channelGrouping', 'device.browser', 'device.isMobile', 'device.operatingSystem', 
            'geoNetwork.networkDomain', 'geoNetwork.subContinent', 'totals.bounces', 'totals.newVisits',
            'trafficSource.campaign', 'trafficSource.isTrueDirect', 'trafficSource.medium', 'trafficSource.source']
for col in cateCols:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train[col].values.astype('str')))
    train[col] = lbl.transform(list(train[col].values.astype('str')))

numCols = ['visitNumber', 'visitStartTime', 'totals.hits', 'totals.pageviews']

In [6]:
test = preprocess(dfTest)
for col in cateCols:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(test[col].values.astype('str')))
    test[col] = lbl.transform(list(test[col].values.astype('str')))

In [7]:
for cols in cateCols:
    train[cols] = train[cols].astype('category')
#train[cateCols] = train[cateCols].astype('category')
trainset, valset = train_test_split(train, test_size=0.2, random_state = 1)
train_X = trainset[cateCols + numCols]
train_y = trainset['hasRevenue'].values.astype(int)
val_X = valset[cateCols + numCols]
val_y = valset['hasRevenue'].values.astype(int)

In [8]:
for cols in cateCols:
    test[cols] = test[cols].astype('category')
test_X = test[cateCols + numCols]
test_y = test['hasRevenue'].values.astype(int)

#### Original

In [14]:
train_num_cols = train_X[numCols]
train_cat_cols = train_X[cateCols]
new_y = train_y

In [15]:
val_num_cols = val_X[numCols]
val_cat_cols = val_X[cateCols]
new_val_y = val_y

In [16]:
test_num_cols = test_X[numCols]
test_cat_cols = test_X[cateCols]
new_test_y = test_y

#### Oversampling

In [9]:
ros = RandomOverSampler(random_state=0, sampling_strategy = 0.05)
X_resampled, y_resampled = ros.fit_resample(train_X, train_y)
X_resampled = pd.DataFrame(X_resampled, columns=train_X.columns)
train_num_cols = X_resampled[numCols]
train_cat_cols = X_resampled[cateCols]
new_y = y_resampled

#### SMOTE 0.05

In [11]:
sm = SMOTE(random_state=0,sampling_strategy = 0.05)
X_smote, y_smote = sm.fit_resample(train_X, train_y)
X_smote = pd.DataFrame(X_smote, columns=train_X.columns)
train_num_cols = X_smote[numCols]
train_cat_cols = X_smote[cateCols]
new_y = y_smote

#### SMOTE all

In [12]:
sm = SMOTE(random_state=0)
X_smote, y_smote = sm.fit_resample(train_X, train_y)
X_smote = pd.DataFrame(X_smote, columns=train_X.columns)
train_num_cols = X_smote[numCols]
train_cat_cols = X_smote[cateCols]
new_y = y_smote

### DNN Classifier

In [14]:
data_scaler = StandardScaler()
#train_num_cols = train_X[numCols]
data_scaler.fit(train_num_cols)
train_num_cols = data_scaler.transform(train_num_cols)
#val_num_cols = data_scaler.transform(val_X[numCols])
val_num_cols = data_scaler.transform(val_num_cols)
test_num_cols = data_scaler.transform(test_num_cols)
train_num_cols_df = pd.DataFrame(train_num_cols,columns = numCols)
val_num_cols_df = pd.DataFrame(val_num_cols,columns = numCols)
test_num_cols_df = pd.DataFrame(test_num_cols,columns = numCols)

In [15]:
#train_cat_cols = train_X[cateCols]
one_hot = OneHotEncoder(handle_unknown='ignore')
one_hot.fit(train_cat_cols)
train_cat_cols = one_hot.transform(train_cat_cols).toarray()
#val_cat_cols = one_hot.transform(val_X[cateCols]).toarray()
val_cat_cols = one_hot.transform(val_cat_cols).toarray()
test_cat_cols = one_hot.transform(test_cat_cols).toarray()

In [16]:
#feature_names = list(one_hot.get_feature_names())
feature_names = [str(i) for i in list(one_hot.active_features_)]
train_cat_cols_df = pd.DataFrame(train_cat_cols,columns = feature_names)
val_cat_cols_df = pd.DataFrame(val_cat_cols,columns = feature_names)
test_cat_cols_df = pd.DataFrame(test_cat_cols,columns = feature_names)
newcateCols = list(train_cat_cols_df.columns)

In [17]:
feature_cols = [tf.contrib.layers.real_valued_column(k) for k in numCols+newcateCols]

Instructions for updating:
Use the retry module or similar alternatives.


In [18]:
len(feature_cols)

50

In [19]:
train_dnn = pd.concat([train_num_cols_df,train_cat_cols_df],axis = 1)
val_dnn = pd.concat([val_num_cols_df,val_cat_cols_df],axis = 1)
test_dnn = pd.concat([test_num_cols_df,test_cat_cols_df],axis = 1)

In [128]:
# define DNN classifier: l1 > l2, without dropout
layer1 = [32,64,128,256,512,1024]
layer2 = [16,32,64,128,256,512,1024]
classifiers = []
for l1 in layer1:
    for l2 in layer2:
        if l1 > l2:
            classifiers.append(tf.estimator.DNNClassifier(
             feature_columns=feature_cols,
             hidden_units=[l1, l2],
             #model_dir="./models/dnnclassifier"
            ))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp06l4zv2g', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fceafb2ce80>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp0y835rk1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_clus

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpoi0iwy60', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fceac32b198>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmph1f1kf92', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_clus

In [41]:
# define DNN classifier: l1 > l2, with dropout
layer1 = [32,64,128,256,512,1024]
layer2 = [16,32,64,128,256,512,1024]
classifiers = []
for l1 in layer1:
    for l2 in layer2:
        if l1 > l2:
            classifiers.append(tf.estimator.DNNClassifier(
             feature_columns=feature_cols,
             hidden_units=[l1, l2],
             dropout = 0.1,
             #model_dir="./models/dnnclassifier"
            ))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp5u_r_vn0', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fcf3fc5c630>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp7yz5flg2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_clus

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpi_3olfdw', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fcf240e4d68>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpwdtw0gca', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_clus

In [80]:
# DNN Classifier on original dataset with batch
classifier = tf.estimator.DNNClassifier(
 feature_columns=feature_cols,
 hidden_units=[1024, 16],
 #model_dir="./models/oridnnclassifier"
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpzsr9v0c9', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fced4b44da0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [20]:
# DNN Classifier on original dataset without batch
classifier = tf.estimator.DNNClassifier(
 feature_columns=feature_cols,
 hidden_units=[128, 16],
 model_dir="./models/oridnnclassifier"
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './models/oridnnclassifier', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f57e6e44358>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [17]:
# DNN Classifier on random oversampling dataset
classifier = tf.estimator.DNNClassifier(
 feature_columns=feature_cols,
 hidden_units=[32, 16],
 model_dir="./models/overdnnclassifier"
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './models/overdnnclassifier', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f07b300b9b0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [48]:
# DNN Classifier on SMOTE dataset
classifier = tf.estimator.DNNClassifier(
 feature_columns=feature_cols,
 hidden_units=[256, 16],
 model_dir="./models/smote1dnnclassifier"
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './models/smote1dnnclassifier', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f07b3038630>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [23]:
train_dnn_columns = list(train_dnn.columns)
val_dnn_columns = list(val_dnn.columns)
test_dnn_columns = list(test_dnn.columns)

In [139]:
# input with batch
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {k: train_dnn[k].values for k in train_dnn_columns},
    y = new_y,
    batch_size=256,
    num_epochs=1,
    shuffle=False
)
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {k: val_dnn[k].values for k in val_dnn_columns},
    y = new_val_y,
    batch_size=256,
    num_epochs=1,
    shuffle=False
)

In [24]:
# input without batch
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {k: train_dnn[k].values for k in train_dnn_columns},
    y = new_y,
    num_epochs=1,
    shuffle=False
)
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {k: val_dnn[k].values for k in val_dnn_columns},
    y = new_val_y,
    num_epochs=1,
    shuffle=False
)

In [129]:
# Tune hyperparameters of DNN


count = 1
aucs = []
aauc = []
prc = []
for classifier in classifiers:
    print('classifier: {}'.format(count))
    classifier.train(input_fn=train_input_fn, steps=10000)
    ev = classifier.evaluate(input_fn=test_input_fn)
    aauc.append(ev['auc'])
    prc.append(ev['auc_precision_recall'])
    print('ev: {}'.format(ev))
    count +=1
    predictions = classifier.predict(input_fn=test_input_fn)
    pred = list(predictions)
    predicted_classes = [p["classes"] for p in pred]
    probs = [p["probabilities"] for p in pred]
    prob = [p[1] for p in probs]
    pred_class = []
    for i in predicted_classes:
        pred_class.append(int(i[0]))
    print(classification_report(new_val_y,pred_class))
    #auc = roc_auc_score(new_val_y, pred_class)
    auc = roc_auc_score(new_val_y, prob)
    aucs.append(auc)
    
    print(auc)

classifier: 1
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp06l4zv2g/model.ckpt.
INFO:tensorflow:loss = 56.374504, step = 0
INFO:tensorflow:global_step/sec: 86.4865
INFO:tensorflow:loss = 2.8844814, step = 100 (1.157 sec)
INFO:tensorflow:global_step/sec: 118.894
INFO:tensorflow:loss = 7.1246667, step = 200 (0.841 sec)
INFO:tensorflow:global_step/sec: 118.443
INFO:tensorflow:loss = 3.9526439, step = 300 (0.844 sec)
INFO:tensorflow:global_step/sec: 120.527
INFO:tensorflow:loss = 5.486334, step = 400 (0.830 sec)
INFO:tensorflow:global_step/sec: 115.389
INFO:tensorflow:loss = 3.12123, step = 500 (0.867 sec)
INFO:tensorflow:global_step/sec: 120.6
INFO:tensorflow:loss = 6.108175, step = 600 (0.829 sec)
INFO:tensorflow:global_step/sec: 113.871
INFO:ten

INFO:tensorflow:loss = 3.074279, step = 1600 (0.895 sec)
INFO:tensorflow:global_step/sec: 117.51
INFO:tensorflow:loss = 2.2210736, step = 1700 (0.851 sec)
INFO:tensorflow:global_step/sec: 121.861
INFO:tensorflow:loss = 3.0845373, step = 1800 (0.821 sec)
INFO:tensorflow:global_step/sec: 126.691
INFO:tensorflow:loss = 3.145428, step = 1900 (0.789 sec)
INFO:tensorflow:global_step/sec: 126.519
INFO:tensorflow:loss = 4.520192, step = 2000 (0.791 sec)
INFO:tensorflow:global_step/sec: 118.909
INFO:tensorflow:loss = 1.4421222, step = 2100 (0.840 sec)
INFO:tensorflow:global_step/sec: 114.076
INFO:tensorflow:loss = 9.620087, step = 2200 (0.877 sec)
INFO:tensorflow:global_step/sec: 116.156
INFO:tensorflow:loss = 6.4544687, step = 2300 (0.861 sec)
INFO:tensorflow:global_step/sec: 117.232
INFO:tensorflow:loss = 1.281802, step = 2400 (0.853 sec)
INFO:tensorflow:global_step/sec: 114.288
INFO:tensorflow:loss = 2.8140712, step = 2500 (0.875 sec)
INFO:tensorflow:global_step/sec: 114.463
INFO:tensorflow:

INFO:tensorflow:loss = 1.4900408, step = 3500 (0.752 sec)
INFO:tensorflow:global_step/sec: 124.496
INFO:tensorflow:loss = 5.3644557, step = 3600 (0.804 sec)
INFO:tensorflow:global_step/sec: 116.999
INFO:tensorflow:loss = 1.9616966, step = 3700 (0.854 sec)
INFO:tensorflow:global_step/sec: 114.941
INFO:tensorflow:loss = 4.478167, step = 3800 (0.870 sec)
INFO:tensorflow:global_step/sec: 118
INFO:tensorflow:loss = 4.6890626, step = 3900 (0.848 sec)
INFO:tensorflow:global_step/sec: 119.775
INFO:tensorflow:loss = 1.2695642, step = 4000 (0.835 sec)
INFO:tensorflow:global_step/sec: 117.3
INFO:tensorflow:loss = 0.9533674, step = 4100 (0.853 sec)
INFO:tensorflow:global_step/sec: 118.986
INFO:tensorflow:loss = 2.3107972, step = 4200 (0.840 sec)
INFO:tensorflow:global_step/sec: 119.805
INFO:tensorflow:loss = 4.264905, step = 4300 (0.835 sec)
INFO:tensorflow:global_step/sec: 122.101
INFO:tensorflow:loss = 3.1075828, step = 4400 (0.819 sec)
INFO:tensorflow:global_step/sec: 121.517
INFO:tensorflow:lo

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp11tjfj_v/model.ckpt-4519
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
             precision    recall  f1-score   support

          0       0.99      1.00      0.99    142739
          1       0.62      0.15      0.24      1846

avg / total       0.98      0.99      0.98    144585

0.9863840670882709
classifier: 5
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp452dzhvu/model.ckpt.
INFO:tensorflow:loss = 72.532974, step = 0
INFO:tensorflow:global_step/sec: 90.0526
INFO:tensorflow:loss = 2.7377515, step = 100 (1.112 sec)
INFO:tensorflow:global_step/sec: 130.384
INFO:tensorflow:loss = 7

INFO:tensorflow:global_step/sec: 127.774
INFO:tensorflow:loss = 1.060128, step = 1200 (0.783 sec)
INFO:tensorflow:global_step/sec: 121.073
INFO:tensorflow:loss = 2.760914, step = 1300 (0.826 sec)
INFO:tensorflow:global_step/sec: 118.279
INFO:tensorflow:loss = 2.105317, step = 1400 (0.846 sec)
INFO:tensorflow:global_step/sec: 112.683
INFO:tensorflow:loss = 4.0841827, step = 1500 (0.887 sec)
INFO:tensorflow:global_step/sec: 119.972
INFO:tensorflow:loss = 3.107001, step = 1600 (0.834 sec)
INFO:tensorflow:global_step/sec: 117.78
INFO:tensorflow:loss = 2.1119077, step = 1700 (0.849 sec)
INFO:tensorflow:global_step/sec: 118.032
INFO:tensorflow:loss = 3.3721597, step = 1800 (0.847 sec)
INFO:tensorflow:global_step/sec: 116.117
INFO:tensorflow:loss = 3.249062, step = 1900 (0.861 sec)
INFO:tensorflow:global_step/sec: 117.897
INFO:tensorflow:loss = 5.0543346, step = 2000 (0.848 sec)
INFO:tensorflow:global_step/sec: 119.068
INFO:tensorflow:loss = 1.4872034, step = 2100 (0.840 sec)
INFO:tensorflow:

INFO:tensorflow:global_step/sec: 121.044
INFO:tensorflow:loss = 4.934226, step = 3100 (0.826 sec)
INFO:tensorflow:global_step/sec: 114.333
INFO:tensorflow:loss = 4.3376193, step = 3200 (0.875 sec)
INFO:tensorflow:global_step/sec: 117.41
INFO:tensorflow:loss = 3.4331691, step = 3300 (0.854 sec)
INFO:tensorflow:global_step/sec: 118.933
INFO:tensorflow:loss = 3.1543412, step = 3400 (0.839 sec)
INFO:tensorflow:global_step/sec: 117.716
INFO:tensorflow:loss = 1.419305, step = 3500 (0.850 sec)
INFO:tensorflow:global_step/sec: 122.675
INFO:tensorflow:loss = 5.277373, step = 3600 (0.815 sec)
INFO:tensorflow:global_step/sec: 132.825
INFO:tensorflow:loss = 2.0120668, step = 3700 (0.753 sec)
INFO:tensorflow:global_step/sec: 127.679
INFO:tensorflow:loss = 4.1060743, step = 3800 (0.783 sec)
INFO:tensorflow:global_step/sec: 123.223
INFO:tensorflow:loss = 4.7555923, step = 3900 (0.812 sec)
INFO:tensorflow:global_step/sec: 118.702
INFO:tensorflow:loss = 1.3285298, step = 4000 (0.843 sec)
INFO:tensorflo

INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-13-04:15:30
INFO:tensorflow:Saving dict for global step 4519: accuracy = 0.9878065, accuracy_baseline = 0.98723245, auc = 0.98222643, auc_precision_recall = 0.44598582, average_loss = 0.028566856, global_step = 4519, label/mean = 0.012767577, loss = 3.655167, prediction/mean = 0.012203695
ev: {'accuracy': 0.9878065, 'accuracy_baseline': 0.98723245, 'auc': 0.98222643, 'auc_precision_recall': 0.44598582, 'average_loss': 0.028566856, 'label/mean': 0.012767577, 'loss': 3.655167, 'prediction/mean': 0.012203695, 'global_step': 4519}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp09v41djn/model.ckpt-4519
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
             precision    recall  f1-score   support

          0       0.99      1.00      0.99    142739
 

INFO:tensorflow:global_step/sec: 130.35
INFO:tensorflow:loss = 3.0864751, step = 500 (0.767 sec)
INFO:tensorflow:global_step/sec: 130.384
INFO:tensorflow:loss = 4.921526, step = 600 (0.769 sec)
INFO:tensorflow:global_step/sec: 126.192
INFO:tensorflow:loss = 2.3645368, step = 700 (0.791 sec)
INFO:tensorflow:global_step/sec: 131.54
INFO:tensorflow:loss = 4.113914, step = 800 (0.760 sec)
INFO:tensorflow:global_step/sec: 121.121
INFO:tensorflow:loss = 5.8964915, step = 900 (0.826 sec)
INFO:tensorflow:global_step/sec: 121.833
INFO:tensorflow:loss = 2.8638353, step = 1000 (0.821 sec)
INFO:tensorflow:global_step/sec: 121.912
INFO:tensorflow:loss = 3.3193212, step = 1100 (0.820 sec)
INFO:tensorflow:global_step/sec: 122.968
INFO:tensorflow:loss = 1.0839465, step = 1200 (0.813 sec)
INFO:tensorflow:global_step/sec: 120.7
INFO:tensorflow:loss = 2.2492175, step = 1300 (0.829 sec)
INFO:tensorflow:global_step/sec: 119.449
INFO:tensorflow:loss = 2.043005, step = 1400 (0.837 sec)
INFO:tensorflow:global

INFO:tensorflow:global_step/sec: 129.495
INFO:tensorflow:loss = 1.2114197, step = 2400 (0.772 sec)
INFO:tensorflow:global_step/sec: 120.023
INFO:tensorflow:loss = 2.4438305, step = 2500 (0.835 sec)
INFO:tensorflow:global_step/sec: 117.385
INFO:tensorflow:loss = 2.9159963, step = 2600 (0.850 sec)
INFO:tensorflow:global_step/sec: 116.493
INFO:tensorflow:loss = 4.7678413, step = 2700 (0.858 sec)
INFO:tensorflow:global_step/sec: 122.633
INFO:tensorflow:loss = 1.4689592, step = 2800 (0.816 sec)
INFO:tensorflow:global_step/sec: 124.965
INFO:tensorflow:loss = 3.64643, step = 2900 (0.800 sec)
INFO:tensorflow:global_step/sec: 117.902
INFO:tensorflow:loss = 0.8312998, step = 3000 (0.849 sec)
INFO:tensorflow:global_step/sec: 121.312
INFO:tensorflow:loss = 4.998019, step = 3100 (0.824 sec)
INFO:tensorflow:global_step/sec: 116.049
INFO:tensorflow:loss = 4.581694, step = 3200 (0.862 sec)
INFO:tensorflow:global_step/sec: 118.607
INFO:tensorflow:loss = 3.2685547, step = 3300 (0.843 sec)
INFO:tensorflo

INFO:tensorflow:global_step/sec: 112.879
INFO:tensorflow:loss = 4.3246317, step = 4300 (0.886 sec)
INFO:tensorflow:global_step/sec: 112.877
INFO:tensorflow:loss = 3.124923, step = 4400 (0.886 sec)
INFO:tensorflow:global_step/sec: 116.785
INFO:tensorflow:loss = 3.1852474, step = 4500 (0.857 sec)
INFO:tensorflow:Saving checkpoints for 4519 into /tmp/tmplnkj50s4/model.ckpt.
INFO:tensorflow:Loss for final step: 0.62044984.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-13-04:19:33
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmplnkj50s4/model.ckpt-4519
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-13-04:19:45
INFO:tensorflow:Saving dict for global step 4519: accuracy = 0.987848, accuracy_baseline = 0.98723245, auc = 0.98225504, auc_precision_recall = 0.4506749, average_loss = 0.028527979, global_s

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmph1f1kf92/model.ckpt.
INFO:tensorflow:loss = 84.967186, step = 0
INFO:tensorflow:global_step/sec: 85.8345
INFO:tensorflow:loss = 3.0740187, step = 100 (1.166 sec)
INFO:tensorflow:global_step/sec: 122.056
INFO:tensorflow:loss = 7.3854136, step = 200 (0.819 sec)
INFO:tensorflow:global_step/sec: 123.459
INFO:tensorflow:loss = 4.108783, step = 300 (0.810 sec)
INFO:tensorflow:global_step/sec: 124.42
INFO:tensorflow:loss = 4.9807143, step = 400 (0.804 sec)
INFO:tensorflow:global_step/sec: 120.793
INFO:tensorflow:loss = 3.0304952, step = 500 (0.827 sec)
INFO:tensorflow:global_step/sec: 114.773
INFO:tensorflow:loss = 4.7104864, step = 600 (0.872 sec)
INFO:tensorflow:global_step/sec: 115.432
INFO:tensorflow:loss = 2.5031986, step = 700 (0.866 

INFO:tensorflow:global_step/sec: 117.216
INFO:tensorflow:loss = 2.3249261, step = 1700 (0.853 sec)
INFO:tensorflow:global_step/sec: 119.109
INFO:tensorflow:loss = 3.3708434, step = 1800 (0.840 sec)
INFO:tensorflow:global_step/sec: 116.638
INFO:tensorflow:loss = 3.3299365, step = 1900 (0.857 sec)
INFO:tensorflow:global_step/sec: 114.894
INFO:tensorflow:loss = 5.262789, step = 2000 (0.870 sec)
INFO:tensorflow:global_step/sec: 114.473
INFO:tensorflow:loss = 1.3434122, step = 2100 (0.874 sec)
INFO:tensorflow:global_step/sec: 117.034
INFO:tensorflow:loss = 8.911432, step = 2200 (0.854 sec)
INFO:tensorflow:global_step/sec: 125.323
INFO:tensorflow:loss = 6.453104, step = 2300 (0.798 sec)
INFO:tensorflow:global_step/sec: 125.172
INFO:tensorflow:loss = 1.2855037, step = 2400 (0.799 sec)
INFO:tensorflow:global_step/sec: 121.187
INFO:tensorflow:loss = 2.5599666, step = 2500 (0.825 sec)
INFO:tensorflow:global_step/sec: 129.654
INFO:tensorflow:loss = 2.826715, step = 2600 (0.771 sec)
INFO:tensorflo

INFO:tensorflow:global_step/sec: 119.193
INFO:tensorflow:loss = 5.5648837, step = 3600 (0.839 sec)
INFO:tensorflow:global_step/sec: 113.816
INFO:tensorflow:loss = 1.9486079, step = 3700 (0.878 sec)
INFO:tensorflow:global_step/sec: 117.8
INFO:tensorflow:loss = 4.144261, step = 3800 (0.849 sec)
INFO:tensorflow:global_step/sec: 118.803
INFO:tensorflow:loss = 4.511428, step = 3900 (0.842 sec)
INFO:tensorflow:global_step/sec: 119.61
INFO:tensorflow:loss = 1.2888125, step = 4000 (0.836 sec)
INFO:tensorflow:global_step/sec: 127.417
INFO:tensorflow:loss = 1.0296581, step = 4100 (0.785 sec)
INFO:tensorflow:global_step/sec: 120.551
INFO:tensorflow:loss = 2.232595, step = 4200 (0.830 sec)
INFO:tensorflow:global_step/sec: 117.366
INFO:tensorflow:loss = 4.1737385, step = 4300 (0.852 sec)
INFO:tensorflow:global_step/sec: 119.531
INFO:tensorflow:loss = 3.0512345, step = 4400 (0.837 sec)
INFO:tensorflow:global_step/sec: 123.668
INFO:tensorflow:loss = 3.251141, step = 4500 (0.809 sec)
INFO:tensorflow:S

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp859e9wa_/model.ckpt-4519
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
             precision    recall  f1-score   support

          0       0.99      1.00      0.99    142739
          1       0.59      0.16      0.25      1846

avg / total       0.98      0.99      0.98    144585

0.9864231245784143
classifier: 18
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpnwrh1hft/model.ckpt.
INFO:tensorflow:loss = 84.410835, step = 0
INFO:tensorflow:global_step/sec: 83.4119
INFO:tensorflow:loss = 2.9633193, step = 100 (1.200 sec)
INFO:tensorflow:global_step/sec: 114.371
INFO:tensorflow:loss = 

INFO:tensorflow:global_step/sec: 119.907
INFO:tensorflow:loss = 1.0468816, step = 1200 (0.835 sec)
INFO:tensorflow:global_step/sec: 120.973
INFO:tensorflow:loss = 1.9527804, step = 1300 (0.826 sec)
INFO:tensorflow:global_step/sec: 120.064
INFO:tensorflow:loss = 2.1275349, step = 1400 (0.833 sec)
INFO:tensorflow:global_step/sec: 115.065
INFO:tensorflow:loss = 3.8597133, step = 1500 (0.869 sec)
INFO:tensorflow:global_step/sec: 117.515
INFO:tensorflow:loss = 3.1796234, step = 1600 (0.851 sec)
INFO:tensorflow:global_step/sec: 119.48
INFO:tensorflow:loss = 2.0849664, step = 1700 (0.837 sec)
INFO:tensorflow:global_step/sec: 127.136
INFO:tensorflow:loss = 3.4008472, step = 1800 (0.787 sec)
INFO:tensorflow:global_step/sec: 121.206
INFO:tensorflow:loss = 3.3650317, step = 1900 (0.825 sec)
INFO:tensorflow:global_step/sec: 120.761
INFO:tensorflow:loss = 5.163628, step = 2000 (0.828 sec)
INFO:tensorflow:global_step/sec: 120.094
INFO:tensorflow:loss = 1.3855906, step = 2100 (0.833 sec)
INFO:tensorf

INFO:tensorflow:global_step/sec: 114.938
INFO:tensorflow:loss = 5.073212, step = 3100 (0.870 sec)
INFO:tensorflow:global_step/sec: 114.858
INFO:tensorflow:loss = 4.6602583, step = 3200 (0.871 sec)
INFO:tensorflow:global_step/sec: 115.922
INFO:tensorflow:loss = 3.3454714, step = 3300 (0.862 sec)
INFO:tensorflow:global_step/sec: 115.505
INFO:tensorflow:loss = 3.1446261, step = 3400 (0.866 sec)
INFO:tensorflow:global_step/sec: 114.037
INFO:tensorflow:loss = 1.4100351, step = 3500 (0.877 sec)
INFO:tensorflow:global_step/sec: 115.321
INFO:tensorflow:loss = 5.5882807, step = 3600 (0.867 sec)
INFO:tensorflow:global_step/sec: 117.185
INFO:tensorflow:loss = 1.9939243, step = 3700 (0.853 sec)
INFO:tensorflow:global_step/sec: 114.552
INFO:tensorflow:loss = 4.051032, step = 3800 (0.873 sec)
INFO:tensorflow:global_step/sec: 118.069
INFO:tensorflow:loss = 4.766689, step = 3900 (0.847 sec)
INFO:tensorflow:global_step/sec: 118.054
INFO:tensorflow:loss = 1.3137717, step = 4000 (0.847 sec)
INFO:tensorfl

INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-13-04:29:21
INFO:tensorflow:Saving dict for global step 4519: accuracy = 0.9878687, accuracy_baseline = 0.98723245, auc = 0.98240644, auc_precision_recall = 0.4457744, average_loss = 0.028651416, global_step = 4519, label/mean = 0.012767577, loss = 3.6659868, prediction/mean = 0.012648934
ev: {'accuracy': 0.9878687, 'accuracy_baseline': 0.98723245, 'auc': 0.98240644, 'auc_precision_recall': 0.4457744, 'average_loss': 0.028651416, 'label/mean': 0.012767577, 'loss': 3.6659868, 'prediction/mean': 0.012648934, 'global_step': 4519}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp04nyzw05/model.ckpt-4519
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
             precision    recall  f1-score   support

          0       0.99      1.00      0.99    142739
 

In [45]:
pd.DataFrame(prc)

Unnamed: 0,0
0,0.442769
1,0.45053
2,0.443785
3,0.470575
4,0.445463
5,0.446296
6,0.44715
7,0.447887
8,0.451004
9,0.447333


In [46]:
pd.DataFrame(aauc)

Unnamed: 0,0
0,0.981272
1,0.982025
2,0.982142
3,0.981182
4,0.981647
5,0.982501
6,0.982819
7,0.982728
8,0.982797
9,0.981801


In [25]:
classifier.train(input_fn=train_input_fn, steps=10000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into ./models/oridnnclassifier/model.ckpt.
INFO:tensorflow:loss = 84.17191, step = 0
INFO:tensorflow:global_step/sec: 96.2992
INFO:tensorflow:loss = 2.9296398, step = 100 (1.040 sec)
INFO:tensorflow:global_step/sec: 128.39
INFO:tensorflow:loss = 7.4242144, step = 200 (0.779 sec)
INFO:tensorflow:global_step/sec: 129.111
INFO:tensorflow:loss = 4.2442513, step = 300 (0.775 sec)
INFO:tensorflow:global_step/sec: 129.328
INFO:tensorflow:loss = 5.2015076, step = 400 (0.773 sec)
INFO:tensorflow:global_step/sec: 126.989
INFO:tensorflow:loss = 2.8330088, step = 500 (0.788 sec)
INFO:tensorflow:global_step/sec: 132.532
INFO:tensorflow:loss = 4.959489, step = 600 (0.754 sec)
INFO:tensorflow:global_step/sec: 129.196
INFO:tenso

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f57e6e44278>

In [26]:
ev = classifier.evaluate(input_fn=test_input_fn)
print('ev: {}'.format(ev))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-13-04:56:19
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./models/oridnnclassifier/model.ckpt-4519
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-13-04:56:30
INFO:tensorflow:Saving dict for global step 4519: accuracy = 0.98799324, accuracy_baseline = 0.98723245, auc = 0.98177665, auc_precision_recall = 0.44959185, average_loss = 0.02855999, global_step = 4519, label/mean = 0.012767577, loss = 3.6542888, prediction/mean = 0.011957345
ev: {'accuracy': 0.98799324, 'accuracy_baseline': 0.98723245, 'auc': 0.98177665, 'auc_precision_recall': 0.44959185, 'average_loss': 0.02855999, 'label/mean': 0.012767577, 'loss': 3.6542888, 'prediction/mean': 0.011957345, 'global_step': 4519}


In [27]:
predictions = classifier.predict(input_fn=test_input_fn)
pred = list(predictions)
predicted_classes = [p["classes"] for p in pred]
probs = [p["probabilities"] for p in pred]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./models/oridnnclassifier/model.ckpt-4519
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [122]:
pred_class = []
for i in predicted_classes:
    pred_class.append(int(i[0]))
prob = [p[1] for p in probs]

In [52]:
print(classification_report(new_val_y,pred_class))
print(roc_auc_score(new_val_y, prob))

             precision    recall  f1-score   support

          0       0.99      1.00      0.99    142739
          1       0.59      0.16      0.25      1846

avg / total       0.98      0.99      0.98    144585

0.9865409706828632


In [89]:
# Find the best threshold
from sklearn.metrics import f1_score
thresholds = [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
for t in thresholds:
    print('threshold {}'.format(t))
    print(f1_score(new_val_y, np.array(prob)>t))

threshold 0
0.025213240365769544
threshold 0.1
0.44092707744488413
threshold 0.2
0.49258746787902746
threshold 0.3
0.4795682343870471
threshold 0.4
0.4167758846657929
threshold 0.5
0.20099143758449756
threshold 0.6
0.0
threshold 0.7
0.0
threshold 0.8
0.0
threshold 0.9
0.0
threshold 1
0.0


  'precision', 'predicted', average, warn_for)


In [90]:
# confusion metrix with the best threshold
print(classification_report(new_val_y,np.array(prob)>0.2))

             precision    recall  f1-score   support

          0       1.00      0.99      0.99    142739
          1       0.39      0.67      0.49      1846

avg / total       0.99      0.98      0.98    144585



##### Test

In [126]:
ttest_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {k: test_dnn[k].values for k in test_dnn_columns},
    y = new_test_y,
    #batch_size=256,
    num_epochs=1,
    shuffle=False
)

predictions = classifier.predict(input_fn=ttest_input_fn)
pred = list(predictions)
predicted_classes = [p["classes"] for p in pred]
probs = [p["probabilities"] for p in pred]
pred_class = []
for i in predicted_classes:
    pred_class.append(int(i[0]))
prob = [p[1] for p in probs]

print(classification_report(new_test_y,np.array(prob)>0.2))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp315wyqlr/model.ckpt-4519
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
             precision    recall  f1-score   support

          0       1.00      0.99      0.99    178438
          1       0.39      0.69      0.50      2293

avg / total       0.99      0.98      0.98    180731



### DNN regressor

In [17]:
train_regression = trainset[trainset['hasRevenue']]
traindnn_x_r = train_regression[cateCols + numCols]
traindnn_y_r = np.log(train_regression['totals.transactionRevenue'].values + 1)
val_regression = valset[valset['hasRevenue']]
valdnn_x_r = val_regression[cateCols + numCols]
valdnn_y_r = np.log(val_regression['totals.transactionRevenue'].values + 1)
test_regression = test[test['hasRevenue']]
testdnn_x_r = test_regression[cateCols + numCols]
testdnn_y_r = np.log(test_regression['totals.transactionRevenue'].values + 1)


In [18]:
data_scaler = StandardScaler()
train_num_r = traindnn_x_r[numCols]
data_scaler.fit(train_num_r)
train_num_r = data_scaler.transform(train_num_r)
val_num_r = data_scaler.transform(valdnn_x_r[numCols])
test_num_r = data_scaler.transform(testdnn_x_r[numCols])
train_num_r_df = pd.DataFrame(train_num_r,columns = numCols)
val_num_r_df = pd.DataFrame(val_num_r,columns = numCols)
test_num_r_df = pd.DataFrame(test_num_r,columns = numCols)


In [19]:
train_cat_r = traindnn_x_r[cateCols]
one_hot = OneHotEncoder(handle_unknown='ignore')
one_hot.fit(train_cat_r)
train_cat_r = one_hot.transform(train_cat_r).toarray()
val_cat_r = one_hot.transform(valdnn_x_r[cateCols]).toarray()
test_cat_r = one_hot.transform(testdnn_x_r[cateCols]).toarray()
feature_names = [str(i) for i in list(one_hot.active_features_)]
train_cat_r_df = pd.DataFrame(train_cat_r,columns = feature_names)
val_cat_r_df = pd.DataFrame(val_cat_r,columns = feature_names)
test_cat_r_df = pd.DataFrame(test_cat_r,columns = feature_names)
newcateCols = list(train_cat_r_df.columns)
feature_cols = [tf.contrib.layers.real_valued_column(k) for k in numCols+newcateCols]

Instructions for updating:
Use the retry module or similar alternatives.


In [20]:
train_dnn_r = pd.concat([train_num_r_df,train_cat_r_df],axis = 1)
val_dnn_r = pd.concat([val_num_r_df,val_cat_r_df],axis = 1)
test_dnn_r = pd.concat([test_num_r_df,test_cat_r_df],axis = 1)

In [178]:
# best hyperparameters with batch
regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols, 
                                          hidden_units=[512, 32], 
                                          #model_dir='./models/dnnregressor'
                                     )

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpkco9bzwr', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f07d973a048>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [21]:
# best hyperparameters without batch
regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols, 
                                          hidden_units=[256, 16], 
                                          #model_dir='./models/dnnregressor'
                                     )

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpppx_z5t8', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc93a43d400>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [102]:
layer1 = [32,64,128,256,512,1024]
layer2 = [16,32,64,128,256,512,1024]
regressors = []
for l1 in layer1:
    for l2 in layer2:
        if l1 > l2:
            regressors.append(tf.estimator.DNNRegressor(feature_columns=feature_cols, 
                                          hidden_units=[128, 32], 
                                          #model_dir='./models/dnnregressor'
                                                       ))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpfaen6b0h', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fcea6a9a828>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpv8qqhb6p', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_clus

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp0z1b0mut', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fcea49df710>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpqjpg3yj6', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_clus

In [23]:
train_dnn_r_columns = list(train_dnn_r.columns)

In [24]:
# input with batch
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {k: train_dnn_r[k].values for k in train_dnn_r_columns},
    y = traindnn_y_r,
    batch_size=256,
    num_epochs=1,
    shuffle=False,
)
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {k: val_dnn_r[k].values for k in train_dnn_r_columns},
    y = valdnn_y_r,
    batch_size=256,
    num_epochs=1,
    shuffle=False
)

In [25]:
# input without batch
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {k: train_dnn_r[k].values for k in train_dnn_r_columns},
    y = traindnn_y_r,
    num_epochs=1,
    shuffle=False,
)
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {k: val_dnn_r[k].values for k in train_dnn_r_columns},
    y = valdnn_y_r,
    num_epochs=1,
    shuffle=False
)

In [106]:
count = 1
rmses = []
for regressor in regressors:
    print('regressor: {}'.format(count))
    regressor.train(input_fn=train_input_fn, steps=10000)
    ev = regressor.evaluate(input_fn=test_input_fn)
    print('ev: {}'.format(ev))
    predictions = regressor.predict(input_fn=test_input_fn)
    pred = list(predictions)
    preds = [p['predictions'] for p in pred]
    rmse = np.sqrt(metrics.mean_squared_error(valdnn_y_r, preds))
    print(rmse)
    rmses.append(rmse)
    count += 1

regressor: 1
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpfaen6b0h/model.ckpt.
INFO:tensorflow:loss = 36546.97, step = 0
INFO:tensorflow:Saving checkpoints for 58 into /tmp/tmpfaen6b0h/model.ckpt.
INFO:tensorflow:Loss for final step: 104.04146.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-13-01:58:20
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpfaen6b0h/model.ckpt-58
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-13-01:58:20
INFO:tensorflow:Saving dict for global step 58: average_loss = 1.3705137, global_step = 58, loss = 168.66455
ev: {'average_lo

INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp08_tj8q8/model.ckpt.
INFO:tensorflow:loss = 40190.652, step = 0
INFO:tensorflow:Saving checkpoints for 58 into /tmp/tmp08_tj8q8/model.ckpt.
INFO:tensorflow:Loss for final step: 102.05165.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-13-01:58:51
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp08_tj8q8/model.ckpt-58
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-13-01:58:52
INFO:tensorflow:Saving dict for global step 58: average_loss = 1.3251204, global_step = 58, loss = 163.07816
ev: {'average_loss': 1.3251204, 'loss': 163.07816, 'global_step': 58}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
IN

INFO:tensorflow:loss = 37124.105, step = 0
INFO:tensorflow:Saving checkpoints for 58 into /tmp/tmp0z1b0mut/model.ckpt.
INFO:tensorflow:Loss for final step: 102.36057.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-13-01:59:24
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp0z1b0mut/model.ckpt-58
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-13-01:59:24
INFO:tensorflow:Saving dict for global step 58: average_loss = 1.3864226, global_step = 58, loss = 170.6224
ev: {'average_loss': 1.3864226, 'loss': 170.6224, 'global_step': 58}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp0z1b0mut/model.ckpt-58
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
1.1774

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-13-01:59:56
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmps3164g23/model.ckpt-58
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-13-01:59:57
INFO:tensorflow:Saving dict for global step 58: average_loss = 1.3455526, global_step = 58, loss = 165.59268
ev: {'average_loss': 1.3455526, 'loss': 165.59268, 'global_step': 58}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmps3164g23/model.ckpt-58
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
1.1599796734016494
regressor: 20
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph

In [107]:
pd.DataFrame(rmses)

Unnamed: 0,0
0,1.170689
1,1.168338
2,1.176855
3,1.187454
4,1.161746
5,1.173437
6,1.151139
7,1.178751
8,1.166644
9,1.189327


In [26]:
import time
starttime = time.time()
regressor.train(input_fn=train_input_fn, steps=10000)
print('time: {}'.format(time.time()-starttime))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpppx_z5t8/model.ckpt.
INFO:tensorflow:loss = 39884.508, step = 0
INFO:tensorflow:Saving checkpoints for 58 into /tmp/tmpppx_z5t8/model.ckpt.
INFO:tensorflow:Loss for final step: 98.71941.
time: 22.096778869628906


In [110]:
ev = regressor.evaluate(input_fn=test_input_fn)
print('ev: {}'.format(ev))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-13-02:04:59
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpivehxmp4/model.ckpt-58
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-13-02:05:00
INFO:tensorflow:Saving dict for global step 58: average_loss = 1.329801, global_step = 58, loss = 163.65416
ev: {'average_loss': 1.329801, 'loss': 163.65416, 'global_step': 58}


In [111]:
predictions = regressor.predict(input_fn=test_input_fn)
pred = list(predictions)
preds = [p['predictions'] for p in pred]
preds = [p[0] for p in preds]
print(np.sqrt(metrics.mean_squared_error(valdnn_y_r, preds)))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpivehxmp4/model.ckpt-58
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
1.1531700349546707


##### Test

In [115]:
# without batch
ttest_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {k: test_dnn_r[k].values for k in train_dnn_r_columns},
    y = testdnn_y_r,
    num_epochs=1,
    shuffle=False
)
predictions = regressor.predict(input_fn=ttest_input_fn)
pred = list(predictions)
preds = [p['predictions'] for p in pred]
preds = [p[0] for p in preds]
print(np.sqrt(metrics.mean_squared_error(testdnn_y_r, preds)))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpivehxmp4/model.ckpt-58
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
1.1152783859169886


### DNN classifier + regressor

In [196]:
# with batch
newtest_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {k: val_dnn[k].values for k in val_dnn_columns},
    y = new_val_y,
    batch_size=256,
    num_epochs=1,
    shuffle=False
)
newpredictions = regressor.predict(input_fn=newtest_input_fn)
newpred = list(newpredictions)
newpreds = [p['predictions'] for p in newpred]
newpreds = [p[0] for p in newpreds]
print(np.sqrt(metrics.mean_squared_error(new_val_y, newpreds)))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpkco9bzwr/model.ckpt-29
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
16.979684155871023


In [198]:
val_dnn['pred_hasRevenue'] = np.array(prob)>0.2
val_dnn['pred_Revenue'] = newpreds
allpred = val_dnn['pred_hasRevenue'].values * val_dnn['pred_Revenue'].values
allpred[allpred<0] = 0
print(np.sqrt(metrics.mean_squared_error(new_val_y, allpred)))

2.7715747169208598


In [116]:
# without batch
newtest_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {k: val_dnn[k].values for k in val_dnn_columns},
    y = new_val_y,
    num_epochs=1,
    shuffle=False
)
newpredictions = regressor.predict(input_fn=newtest_input_fn)
newpred = list(newpredictions)
newpreds = [p['predictions'] for p in newpred]
newpreds = [p[0] for p in newpreds]
print(np.sqrt(metrics.mean_squared_error(new_val_y, newpreds)))


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpivehxmp4/model.ckpt-58
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
16.781610810326576


In [123]:
# first get the prediction prob of val set from classification
val_dnn['pred_hasRevenue'] = np.array(prob)>0.2
val_dnn['pred_Revenue'] = newpreds
allpred = val_dnn['pred_hasRevenue'].values * val_dnn['pred_Revenue'].values
allpred[allpred<0] = 0
print(np.sqrt(metrics.mean_squared_error(new_val_y, allpred)))

2.602888802759607


##### Test

In [124]:
newtest_input_fn = tf.estimator.inputs.numpy_input_fn(
    x = {k: test_dnn[k].values for k in test_dnn_columns},
    y = new_test_y,
    num_epochs=1,
    shuffle=False
)
newpredictions = regressor.predict(input_fn=newtest_input_fn)
newpred = list(newpredictions)
newpreds = [p['predictions'] for p in newpred]
newpreds = [p[0] for p in newpreds]
print(np.sqrt(metrics.mean_squared_error(new_test_y, newpreds)))


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpivehxmp4/model.ckpt-58
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
16.78067963215717


In [127]:
# first get the prediction prob of test set from classification
test_dnn['pred_hasRevenue'] = np.array(prob)>0.2
test_dnn['pred_Revenue'] = newpreds
allpred = test_dnn['pred_hasRevenue'].values * test_dnn['pred_Revenue'].values
allpred[allpred<0] = 0
print(np.sqrt(metrics.mean_squared_error(new_test_y, allpred)))

2.5988783539651905
