In [2]:
from datalab.context import Context
import google.datalab.storage as storage
import google.datalab.bigquery as bq
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import pandas as pd
import numpy as np
import shutil
import time

import argparse
import tensorflow as tf


In [2]:
# tf.enable_eager_execution() #enable eager execution to inspect this program as we run it 
#disable for estimators

In [3]:
%bq tables describe --name bigquery-public-data.google_analytics_sample.ga_sessions_20170801

To compare to a BQML logistic regression we will set up so that our data will split into 2 classes on the label for total transactions, either there was a transaction (1) or it was NULL (0)

Create Model as CUSTOM tf model

In [3]:
%bq query -n train
SELECT
  IF(totals.transactions IS NULL, 0, 1) AS label,
  IFNULL(device.operatingSystem, "") AS os,
  device.isMobile AS is_mobile,
  IFNULL(geoNetwork.country, "") AS country,
  IFNULL(totals.pageviews, 0) AS pageviews
FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
  _TABLE_SUFFIX BETWEEN '20160801' AND '20170630'

In [4]:
traindf = train.execute().result().to_dataframe()

In [5]:
traindf[:2]

Unnamed: 0,label,os,is_mobile,country,pageviews
0,0,Android,True,Chad,1
1,0,Android,True,Chad,1


In [6]:
traindf['label'].unique()

array([0, 1])

In [7]:
print("No purchase =", len(traindf[traindf['label'] == 0]['os']))
print("Purchase =", len(traindf[traindf['label'] == 1]['os']))
print("Training Set Length =", len(traindf['os']))
print(traindf['is_mobile'].dtype)
print(traindf['country'].dtype)

No purchase = 818807
Purchase = 10478
Training Set Length = 829285
bool
object


balance training set and representational testing set can help neural net accuracy?

------------------

In [8]:
traindf.loc[traindf['is_mobile'] == True, 'mobile'] = "Yes"
traindf.loc[traindf['is_mobile'] == False, 'mobile'] = "No"
traindf=traindf.drop(columns='is_mobile')
traindf.head()

Unnamed: 0,label,os,country,pageviews,mobile
0,0,Android,Chad,1,Yes
1,0,Android,Chad,1,Yes
2,0,Android,Chad,7,Yes
3,0,Android,Chad,5,Yes
4,0,Samsung,Chad,1,Yes


In [9]:
#traindf['is_mobile']=traindf['is_mobile'].astype('str')
#print(traindf['is_mobile'].dtype)
print(traindf['mobile'].dtype)

object


In [10]:
#investigate inputs
#only when eager execution is on 
def easy_input_function(df, label_key, num_epochs, shuffle, batch_size):
  label = df[label_key]
  ds = tf.data.Dataset.from_tensor_slices((dict(df),label))

  if shuffle:
    ds = ds.shuffle(10000)

  ds = ds.batch(batch_size).repeat(num_epochs)

  return ds

In [12]:
ds = easy_input_function(traindf, label_key='label', num_epochs=5, shuffle=True, batch_size=10)

for feature_batch, label_batch in ds.take(1):
  print('Some feature keys:', list(feature_batch.keys())[:5])
  print()
  print('A batch of OS :', feature_batch['os'])
  print()
  print('A batch of Labels:', label_batch )


Some feature keys: ['is_mobile', 'pageviews', 'os', 'label', 'country']

A batch of OS : tf.Tensor(
[b'Windows' b'Windows' b'Android' b'Linux' b'Linux' b'Windows'
 b'Macintosh' b'Android' b'Windows' b'iOS'], shape=(10,), dtype=string)

A batch of Labels: tf.Tensor([0 0 0 0 0 0 0 0 0 0], shape=(10,), dtype=int32)


In [14]:
ds #ds is a tensorflow dataset, here we inspect the dataset

<RepeatDataset shapes: ({is_mobile: (?,), pageviews: (?,), os: (?,), label: (?,), country: (?,)}, (?,)), types: ({is_mobile: tf.bool, pageviews: tf.int32, os: tf.string, label: tf.int32, country: tf.string}, tf.int32)>

In [10]:
traindf.loc[:,['mobile', 'pageviews', 'os', 'label', 'country']][:2]

Unnamed: 0,mobile,pageviews,os,label,country
0,Yes,1,Android,0,Chad
1,Yes,1,Android,0,Chad


In [11]:
#x type must be
train_input_fn = tf.estimator.inputs.pandas_input_fn(
  x=traindf, 
  y=traindf['label'], 
  num_epochs=None, 
  shuffle=True)

In [12]:
train_input_fn() #here we can see that mobile is string

({'country': <tf.Tensor 'random_shuffle_queue_DequeueMany:3' shape=(128,) dtype=string>,
  'label': <tf.Tensor 'random_shuffle_queue_DequeueMany:1' shape=(128,) dtype=int64>,
  'mobile': <tf.Tensor 'random_shuffle_queue_DequeueMany:5' shape=(128,) dtype=string>,
  'os': <tf.Tensor 'random_shuffle_queue_DequeueMany:2' shape=(128,) dtype=string>,
  'pageviews': <tf.Tensor 'random_shuffle_queue_DequeueMany:4' shape=(128,) dtype=int64>},
 <tf.Tensor 'random_shuffle_queue_DequeueMany:6' shape=(128,) dtype=int64>)

In [13]:
#traindf['country'].unique().tolist() #list of all countries
#traindf['os'].unique().tolist()
print(len(traindf['mobile'].unique()))
print(len(traindf['country'].unique()),len(traindf['os'].unique()))

2
221 20


In [14]:
mobile = tf.feature_column.categorical_column_with_vocabulary_list(key="mobile",vocabulary_list=traindf['mobile'].unique().tolist())
country = tf.feature_column.categorical_column_with_vocabulary_list(key="country", vocabulary_list=traindf['country'].unique().tolist())
os = tf.feature_column.categorical_column_with_vocabulary_list(key="os", vocabulary_list=traindf['os'].unique().tolist())

em_mobile = tf.feature_column.embedding_column(mobile, dimension=2) #embedding required for custom estimators
em_country = tf.feature_column.embedding_column(country, dimension=221)
em_os = tf.feature_column.embedding_column(os, dimension=20)
pageviews = tf.feature_column.numeric_column(key="pageviews")#

feature_columns=[em_mobile,em_country,em_os,pageviews]

In [15]:
#print(feature_columns)

In [16]:
#--------------------
# inputs are input tensors
#def neural_net(features):
#  x=traindf.loc[:,['is_mobile', 'pageviews', 'os', 'label', 'country']] #input - features only
#  x=features['is_mobile', 'pageviews', 'os', 'label', 'country']
#  x=feature_columns
#  x=features({"x": features}) #this is a tensor
#  h1 = tf.layers.dense(inputs=x, units=300, name='h1') #hidden layer 1, units are hidden units
#  h2 = tf.layers.dense(inputs=h1, units=100, name='h2') #hidden layer 2
#  h3 = tf.layers.dense(inputs=h2, units=50, name='h3') #hidden layer 3
#  logits = tf.layers.dense(inputs=h3, units=10, name='y') #units are number of input classes, 
#  #logits returned are probability of being a particular class
#  return logits

In [74]:
def model_fn(features, labels, mode, params):
##  logits = neural_net(features) #pass features to get logits
  net = tf.feature_column.input_layer(features, params['feature_columns'])
  for units in params['hidden_units']:
    net = tf.layers.dense(net, units=units, activation=tf.nn.relu)

# Compute logits (1 per class).
  logits = tf.layers.dense(net, params['n_classes'], activation=None)
  targets = tf.reshape(tf.one_hot(labels, params['n_classes']),logits.get_shape())
  ratio = 10478.0/(818807.0+10478.0) #0.01263 = true (1)/total, inverse total/true = 79.145, 78.14 false/true
  
  pred_probas = tf.nn.softmax(logits) #prediction probability
  pred_classes = tf.argmax(logits, axis=1) #predition classes
#Predict Section  
  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
      'class_ids': pred_classes[:, tf.newaxis],
      'probabilities': tf.nn.softmax(logits),
      'logits': logits,
    }
    spec = tf.estimator.EstimatorSpec(mode, predictions=predictions)
#Train and evaluate section
  else:
    classes_weights = tf.constant([0.1, (1.0 - 0.1 )]) # 0.01 bad, 0.1 is ok, 0.2 bad
#####    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,logits=logits)

  #The argument pos_weight is used as a multiplier for the positive targets.
#So, assuming you have 5 positive examples in your dataset and 7 negative, 
#if you set the pos_weight=2, then your loss would be as if you had 
#10 positive examples and 7 negative.

#weighted loss up or down weight cost of positive error relative to negative error, 
#pos_weights > 1 decreases false neg, pos weights < 1 decreases false pos

####    loss_op = tf.losses.sparse_softmax_cross_entropy(logits=logits,labels=labels,weights=10)
    cross_entropy = tf.nn.weighted_cross_entropy_with_logits(logits=logits, targets=targets, pos_weight=classes_weights)
    loss_op = tf.reduce_mean(cross_entropy)
  
    optimizer = tf.train.AdamOptimizer(learning_rate=params["learning_rate"]) #custom AdamOptimizer  
    #The AdamOptimizer class creates additional variables, called "slots", to hold values for the "m" and "v" accumulators
    train_op = optimizer.minimize(loss=loss_op, global_step=tf.train.get_global_step())
    metrics ={"accuracy": tf.metrics.accuracy(labels, pred_classes),"precision": tf.metrics.precision(labels, pred_classes),"recall": tf.metrics.recall(labels, pred_classes),"auc": tf.metrics.auc(labels, pred_classes)} #evaluation metrics
    spec = tf.estimator.EstimatorSpec( ### custom estimator
      mode=mode,
      predictions={
        'class_ids': pred_classes[:, tf.newaxis],
        'probabilities': tf.nn.softmax(logits),
        'logits': logits,},
      loss=loss_op,
      train_op=train_op,
      eval_metric_ops=metrics) #in train and evaluate you return loss, and metrics in addtion to logits
  return spec

In [75]:
#my_feature_columns = ["is_mobile", "pageviews", "os", "label", "country"]
#'hidden units': [10, 10] = two hidden layers of 10 nodes each
#params = {"learning_rate":1e-4} #model params
params={'feature_columns': feature_columns,'hidden_units': [40, 40, 40, 10, 10, 20, 20, 20],'n_classes': 2,'learning_rate':1e-3, }  # # 4 hidden layers of 10 nodes each. The model must choose between 3 classes.
#estimator
model = tf.estimator.Estimator(model_fn, model_dir='../saved_models/test', params=params) #store the model

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_evaluation_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc955976f28>, '_session_config': None, '_save_checkpoints_secs': 600, '_tf_random_seed': None, '_model_dir': '../saved_models/test', '_save_checkpoints_steps': None, '_service': None, '_keep_checkpoint_every_n_hours': 10000, '_num_ps_replicas': 0, '_train_distribute': None, '_keep_checkpoint_max': 5, '_is_chief': True, '_master': '', '_num_worker_replicas': 1, '_global_id_in_cluster': 0, '_log_step_count_steps': 100, '_task_id': 0, '_task_type': 'worker', '_save_summary_steps': 100}


In [76]:
model.train(input_fn=train_input_fn, steps=10000) #steps is iterations to train model on input data

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into ../saved_models/test/model.ckpt.
INFO:tensorflow:step = 1, loss = 0.38038832
INFO:tensorflow:global_step/sec: 105.868
INFO:tensorflow:step = 101, loss = 0.016573168 (0.946 sec)
INFO:tensorflow:global_step/sec: 167.608
INFO:tensorflow:step = 201, loss = 1.4145176e-05 (0.597 sec)
INFO:tensorflow:global_step/sec: 168.765
INFO:tensorflow:step = 301, loss = 1.408697e-05 (0.592 sec)
INFO:tensorflow:global_step/sec: 169.483
INFO:tensorflow:step = 401, loss = 3.7615464e-05 (0.590 sec)
INFO:tensorflow:global_step/sec: 171.263
INFO:tensorflow:step = 501, loss = 1.0991283e-05 (0.584 sec)
INFO:tensorflow:global_step/sec: 171.28
INFO:tensorflow:step = 601, loss = 0.0007501724 (0.585 sec)
INFO:tensorflow:global_step/sec: 

INFO:tensorflow:step = 7801, loss = 0.0045247786 (0.592 sec)
INFO:tensorflow:global_step/sec: 172.532
INFO:tensorflow:step = 7901, loss = 0.014441513 (0.579 sec)
INFO:tensorflow:global_step/sec: 167.631
INFO:tensorflow:step = 8001, loss = 0.0033725048 (0.597 sec)
INFO:tensorflow:global_step/sec: 165.058
INFO:tensorflow:step = 8101, loss = 0.020260558 (0.606 sec)
INFO:tensorflow:global_step/sec: 168.415
INFO:tensorflow:step = 8201, loss = 0.040351156 (0.594 sec)
INFO:tensorflow:global_step/sec: 172.588
INFO:tensorflow:step = 8301, loss = 0.036842957 (0.580 sec)
INFO:tensorflow:global_step/sec: 171.362
INFO:tensorflow:step = 8401, loss = 0.00089004077 (0.583 sec)
INFO:tensorflow:global_step/sec: 169.672
INFO:tensorflow:step = 8501, loss = 0.009204607 (0.590 sec)
INFO:tensorflow:global_step/sec: 168.708
INFO:tensorflow:step = 8601, loss = 0.010221897 (0.593 sec)
INFO:tensorflow:global_step/sec: 174.333
INFO:tensorflow:step = 8701, loss = 0.04359413 (0.574 sec)
INFO:tensorflow:global_step/

<tensorflow.python.estimator.estimator.Estimator at 0x7fc956301c18>

Just like in BQML after training we have some loss for each step, our final loss rate here is 0.08 but in BQML it went down to 0.04

Evaluate

In [20]:
%bq query -n evaluate
SELECT
  IF(totals.transactions IS NULL, 0, 1) AS label,
  IFNULL(device.operatingSystem, "") AS os,
  device.isMobile AS is_mobile,
  IFNULL(geoNetwork.country, "") AS country,
  IFNULL(totals.pageviews, 0) AS pageviews
FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
  _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'

In [21]:
evaldf=evaluate.execute().result().to_dataframe()

In [22]:
evaldf[:2]

Unnamed: 0,label,os,is_mobile,country,pageviews
0,0,Android,True,Fiji,1
1,0,iOS,True,Guam,1


In [23]:
evaldf['label'].unique()

array([0, 1])

In [24]:
print("No purchase =", len(evaldf[evaldf['label'] == 0]['os']))
print("Purchase =", len(evaldf[evaldf['label'] == 1]['os']))
print("Evaluation Set Length =", len(evaldf['os']))

No purchase = 73294
Purchase = 1074
Evaluation Set Length = 74368


In [25]:
evaldf.loc[evaldf['is_mobile'] == True, 'mobile'] = "Yes"
evaldf.loc[evaldf['is_mobile'] == False, 'mobile'] = "No"
evaldf=evaldf.drop(columns='is_mobile')
evaldf.head()

Unnamed: 0,label,os,country,pageviews,mobile
0,0,Android,Fiji,1,Yes
1,0,iOS,Guam,1,Yes
2,0,iOS,Guam,2,Yes
3,0,Android,Guam,8,Yes
4,0,Windows,Guam,3,No


In [26]:
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
  x=evaldf, 
  y=evaldf['label'], 
  num_epochs=None, 
  shuffle=True)

In [27]:
eval_input_fn()

({'country': <tf.Tensor 'random_shuffle_queue_DequeueMany_1:3' shape=(128,) dtype=string>,
  'label': <tf.Tensor 'random_shuffle_queue_DequeueMany_1:1' shape=(128,) dtype=int64>,
  'mobile': <tf.Tensor 'random_shuffle_queue_DequeueMany_1:5' shape=(128,) dtype=string>,
  'os': <tf.Tensor 'random_shuffle_queue_DequeueMany_1:2' shape=(128,) dtype=string>,
  'pageviews': <tf.Tensor 'random_shuffle_queue_DequeueMany_1:4' shape=(128,) dtype=int64>},
 <tf.Tensor 'random_shuffle_queue_DequeueMany_1:6' shape=(128,) dtype=int64>)

In [28]:
#feature_columns=[em_mobile,em_country,em_os,pageviews]

In [77]:
result=model.evaluate(input_fn=eval_input_fn,steps=10000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-11-18-05:52:31
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ../saved_models/test/model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1000/10000]
INFO:tensorflow:Evaluation [2000/10000]
INFO:tensorflow:Evaluation [3000/10000]
INFO:tensorflow:Evaluation [4000/10000]
INFO:tensorflow:Evaluation [5000/10000]
INFO:tensorflow:Evaluation [6000/10000]
INFO:tensorflow:Evaluation [7000/10000]
INFO:tensorflow:Evaluation [8000/10000]
INFO:tensorflow:Evaluation [9000/10000]
INFO:tensorflow:Evaluation [10000/10000]
INFO:tensorflow:Finished evaluation at 2018-11-18-05:53:16
INFO:tensorflow:Saving dict for global step 10000: accuracy = 0.97395, auc = 0.83005244, global_step = 10000, loss = 0.023727657, precision = 0.3218722, recall = 0.6817512


In [39]:
result

{'accuracy': 0.92153907,
 'auc': 0.8345326,
 'global_step': 1000,
 'loss': 0.15353142,
 'precision': 0.090100214,
 'recall': 0.7457364}

In [56]:
print("Classification accuracy: {0:.2%}".format(result["accuracy"]))

Classification accuracy: 98.77%


Predict

In [32]:
#predict can use roughly the same data as evaluate
#for predict we use the evaluated predicted classifications

In [79]:
#some_data=evaldf[:10] #first 10 rows
some_data=evaldf
expected =some_data['label']
pred_input_fn = tf.estimator.inputs.pandas_input_fn(
  x=some_data, 
#  y=evaldf['label'], 
  num_epochs=1, 
  shuffle=False)

In [80]:
predictions = model.predict(input_fn=pred_input_fn)
predictions

<generator object Estimator.predict at 0x7fc93c7d9258>

In [83]:
#cls_pred = list(predictions)[:]
#cls_pred

In [84]:
#[cls_pred[i]['class_ids'] for i in range(len(cls_pred))]

In [41]:
#a=[p['class_ids'][0] for p in cls_pred]
#print(a)
#print(list(expected))

In [42]:
#template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')

#for pred_dict, expec in zip(cls_pred, expected):
#    class_id = pred_dict['class_ids'][0]
#    probability = pred_dict['probabilities'][class_id]

#    print(template.format(class_id,100 * probability, expec))

In [60]:
#confusion_matrix= tf.confusion_matrix(a, expected, num_classes=2)

In [52]:
## columns = prediction
## rows = real
#sess = tf.Session()
#with sess.as_default():
#  print(sess.run(confusion_matrix))

[[73294  1074]
 [    0     0]]


In [None]:
#                | predicted positive | predicted negative
#----------------|--------------------|--------------------
#actual positive |     TP             |     FN
#actual negative |     FP             |     TN