In [44]:
import itertools

import click
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_ranking as tfr
from sklearn.metrics import f1_score

import logging
logger = logging.getLogger(__name__)


tf.enable_eager_execution()
tf.executing_eagerly()

# Store the paths to files containing training and test instances.
# As noted above, we will assume the data is in the LibSVM format
# and that the content of each file is sorted by query ID.

_TRAIN_DATA_PATH = ''
_TEST_DATA_PATH = ''

# Define a loss function. To find a complete list of available
# loss functions or to learn how to add your own custom function
# please refer to the tensorflow_ranking.losses module.
_LOSS = "pairwise_logistic_loss"
# _LOSS = "sigmoid_cross_entropy_loss"

# In the TF-Ranking framework, a training instance is represented
# by a Tensor that contains features from a list of documents
# associated with a single query. For simplicity, we fix the shape
# of these Tensors to a maximum list size and call it "list_size,"
# the maximum number of documents per query in the dataset.
# In this demo, we take the following approach:
#   * If a query has fewer documents, its Tensor will be padded
#     appropriately.
#   * If a query has more documents, we shuffle its list of
#     documents and trim the list down to the prescribed list_size.
_LIST_SIZE = 100

# The total number of features per query-document pair.
# We set this number to the number of features in the MSLR-Web30K
# dataset.

# Parameters to the scoring function.
_BATCH_SIZE = 1000
_HIDDEN_LAYER_DIMS = ["20", "10"]


# _OUT_DIR = "../models/tfranking/"

def input_fn(path):
    train_dataset = tf.data.Dataset.from_generator(
        tfr.data.libsvm_generator(path, _NUM_FEATURES, _LIST_SIZE),
        output_types=(
            {str(k): tf.float32 for k in range(1, _NUM_FEATURES + 1)},
            tf.float32
        ),
        output_shapes=(
            {str(k): tf.TensorShape([_LIST_SIZE, 1])
             for k in range(1, _NUM_FEATURES + 1)},
            tf.TensorShape([_LIST_SIZE])
        )
    )

    train_dataset = train_dataset.batch(_BATCH_SIZE)
    return train_dataset.make_one_shot_iterator().get_next()


def example_feature_columns():
    """Returns the example feature columns."""
    feature_names = [
        "%d" % (i + 1) for i in range(0, _NUM_FEATURES)
    ]
    return {
        name: tf.feature_column.numeric_column(
            name, shape=(1,), default_value=0.0) for name in feature_names
    }


def make_score_fn():
    """Returns a scoring function to build `EstimatorSpec`."""

    def _score_fn(context_features, group_features, mode, params, config):
        """Defines the network to score a documents."""
        del params
        del config
        # Define input layer.
        example_input = [
            tf.layers.flatten(group_features[name])
            for name in sorted(example_feature_columns())
        ]
        input_layer = tf.concat(example_input, 1)

        cur_layer = input_layer
        for i, layer_width in enumerate(int(d) for d in _HIDDEN_LAYER_DIMS):
            cur_layer = tf.layers.dense(
                cur_layer,
                units=layer_width,
                activation="tanh")

        logits = tf.layers.dense(cur_layer, units=1)
        return logits

    return _score_fn


def eval_metric_fns():
    """Returns a dict from name to metric functions.

    This can be customized as follows. Care must be taken when handling padded
    lists.

    def _auc(labels, predictions, features):
    is_label_valid = tf_reshape(tf.greater_equal(labels, 0.), [-1, 1])
    clean_labels = tf.boolean_mask(tf.reshape(labels, [-1, 1], is_label_valid)
    clean_pred = tf.boolean_maks(tf.reshape(predictions, [-1, 1], is_label_valid)
    return tf.metrics.auc(clean_labels, tf.sigmoid(clean_pred), ...)
    metric_fns["auc"] = _auc

    Returns:
    A dict mapping from metric name to a metric function with above signature.
    """
    metric_fns = {}
    metric_fns.update({
        "metric/ndcg@%d" % topn: tfr.metrics.make_ranking_metric_fn(
            tfr.metrics.RankingMetricKey.NDCG, topn=topn)
        for topn in [1, 3, 5, 10]
    })

    return metric_fns


def get_estimator(hparams):
    """Create a ranking estimator.

    Args:
    hparams: (tf.contrib.training.HParams) a hyperparameters object.

    Returns:
    tf.learn `Estimator`.
    """

    def _train_op_fn(loss):
        """Defines train op used in ranking head."""
        return tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            learning_rate=hparams.learning_rate,
            optimizer="Adagrad")

    ranking_head = tfr.head.create_ranking_head(
        loss_fn=tfr.losses.make_loss_fn(_LOSS),
        eval_metric_fns=eval_metric_fns(),
        train_op_fn=_train_op_fn)

    return tf.estimator.Estimator(
        model_fn=tfr.model.make_groupwise_ranking_fn(
            group_score_fn=make_score_fn(),
            group_size=1,
            transform_fn=None,
            ranking_head=ranking_head),
        params=hparams)


def ltr_to_submission(df, features, ranker, path):
    features = features + ['sid']

    preds = ranker.predict(input_fn=lambda: input_fn(path))
    import itertools
    import numpy as np
    # Not sure how to get all preds because it runs infinit
    # So I take all till list size
    preds_slice = itertools.islice(preds, len(df))
    count = 0
    a = np.zeros((len(df), _LIST_SIZE))

    for i in preds_slice:
        a[count] = i
        count += 1

    test_X = df[features]

    test_X = test_X.assign(yhat=a[:, 0])

    df_end = pd.DataFrame(columns=['yhat'], index=df.sid.unique())

    df_end = test_X.sort_values(['sid', 'yhat'], ascending=False).groupby('sid').first()[[
        'yhat', 'transport_mode'
    ]]

    from sklearn.metrics import f1_score
    score = f1_score(df.groupby("sid").first()['click_mode'], df_end.transport_mode, average='weighted')
    print('F1 Score is: {}'.format(score))

    return df_end


In [45]:
df_train_train = pd.read_pickle("../data/processed/ranking/train_all_row_sample_50.pickle")
df_train_test = pd.read_pickle("../data/processed/ranking/train_all_row_sample_50.pickle")

_TRAIN_DATA_PATH="../data/processed/ranking/train_all_row_sample_50.libsvm"
_TEST_DATA_PATH="../data/processed/ranking/train_all_row_sample_50.libsvm"

with open('../data/processed/ranking/features_tfranking.txt') as f:
    features = f.read().splitlines()

_NUM_FEATURES = len(features)

hparams = tf.contrib.training.HParams(learning_rate=0.05)
ranker = get_estimator(hparams)

ranker.train(input_fn=lambda: input_fn(_TRAIN_DATA_PATH), steps=10)

df_preds = ltr_to_submission(df_train_test, features, ranker, _TEST_DATA_PATH)


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp98ufht78', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f048c03d9b0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Use groupwise dnn v2.


Exception ignored in: <generator object EstimatorV2.predict at 0x7f04645bfde0>
Traceback (most recent call last):
  File "/home/sandro/anaconda3/envs/tf_ranking/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 639, in predict
    for key, value in six.iteritems(preds_evaluated)
  File "/home/sandro/anaconda3/envs/tf_ranking/lib/python3.7/contextlib.py", line 130, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/sandro/anaconda3/envs/tf_ranking/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 5253, in get_controller
    yield g
  File "/home/sandro/anaconda3/envs/tf_ranking/lib/python3.7/contextlib.py", line 130, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/sandro/anaconda3/envs/tf_ranking/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 5069, in get_controller
    type(default))
AssertionError: Nesting violated for default stack of <class 'tensorflow.python.framewo

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmp98ufht78/model.ckpt.
INFO:tensorflow:loss = 0.0, step = 1
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp98ufht78/model.ckpt.
INFO:tensorflow:Loss for final step: 0.0.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Use groupwise dnn v2.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp98ufht78/model.ckpt-1
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
F1 Score is: 0.18365367965367965


  'precision', 'predicted', average, warn_for)


In [26]:
df = df_train_test.copy()
preds = ranker.predict(input_fn=lambda: input_fn(_TEST_DATA_PATH))
import itertools
import numpy as np
# Not sure how to get all preds because it runs infinit
# So I take all till list size
preds_slice = itertools.islice(preds, len(df))
count = 0
a = np.zeros((len(df), _LIST_SIZE))

for i in preds_slice:
    a[count] = i
    count += 1

test_X = df[features]


test_X = test_X.assign(yhat=a[:, 0])

df_end = pd.DataFrame(columns=['yhat'], index=df.sid.unique())

df_end = test_X.sort_values(['sid', 'yhat'], ascending=False).groupby('sid').first()[[
    'yhat', 'transport_mode'
]]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Use groupwise dnn v2.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpbl8iiks3/model.ckpt-1
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [27]:
a.shape

(50, 100)

In [28]:
a[:6,0]

array([-0.53100163, -0.53100163, -0.53100163, -0.53100163, -0.53100163,
        0.81304371])

In [43]:
fo = open(_TRAIN_DATA_PATH)
i=0
for f in fo:
    if i != 10:
        print(f)
    else:
        break
    i+=1

100 qid:188866 1:3 2:9260 3:1935 4:700

1 qid:344500 1:7 2:65325 3:9417 4:1700

1 qid:375157 1:6 2:8929 3:2699 4:700

1 qid:583940 1:7 2:39781 3:5461 4:900

100 qid:607867 1:3 2:11766 3:1657 4:700

1 qid:667946 1:5 2:1121 3:1108 4:700

100 qid:749281 1:3 2:17540 3:1626 4:700

100 qid:951159 1:2 2:34244 3:5376 4:600

100 qid:1298768 1:6 2:5157 3:1558 4:700

100 qid:1299815 1:7 2:20826 3:4818 4:700



In [36]:
df[features + ['click_mode']].head(10)

Unnamed: 0,transport_mode,distance_plan,eta,price,sid,click_mode
1471480,3,11766,1657,700.0,607867,1
561630,1,14964,4636,500.0,3322978,9
941835,3,19347,1904,700.0,3114958,2
1102346,6,3007,908,700.0,2815994,1
730756,3,9260,1935,700.0,188866,1
904418,2,9410,2440,400.0,2321721,9
751764,3,18386,1891,700.0,2695250,2
1339280,1,2818,1272,200.0,3236439,1
569957,6,4812,1454,700.0,3311561,1
1324049,6,5157,1558,700.0,1298768,1


In [34]:
test_X.head(10)

Unnamed: 0,transport_mode,distance_plan,eta,price,sid,yhat
1471480,3,11766,1657,700.0,607867,-0.531002
561630,1,14964,4636,500.0,3322978,-0.531002
941835,3,19347,1904,700.0,3114958,-0.531002
1102346,6,3007,908,700.0,2815994,-0.531002
730756,3,9260,1935,700.0,188866,-0.531002
904418,2,9410,2440,400.0,2321721,0.813044
751764,3,18386,1891,700.0,2695250,-0.531002
1339280,1,2818,1272,200.0,3236439,-0.531002
569957,6,4812,1454,700.0,3311561,0.287385
1324049,6,5157,1558,700.0,1298768,-0.531002


In [46]:
df[df.sid == 3322978]

Unnamed: 0,sid,click_time,click_mode,distance_plan,eta,price,transport_mode,plan_time_x,pid,req_time,...,max_temp,min_temp,weather,wind,weather_dy,weather_dyq,weather_q,weather_qdy,weather_xq,weather_xydy
561630,3322978,2018-10-16 09:06:46,9,14964,4636,500.0,1,2018-10-16 09:06:34,202997.0,2018-10-16 09:06:34,...,17,7,dyq,12,0,1,0,0,0,0


In [42]:
test_X.sort_values(['sid', 'yhat'], ascending=False)

Unnamed: 0,transport_mode,distance_plan,eta,price,sid,yhat
561630,1,14964,4636,500.0,3322978,-0.531002
569957,6,4812,1454,700.0,3311561,0.287385
1339280,1,2818,1272,200.0,3236439,-0.531002
1342851,4,18423,2979,5000.0,3221503,-0.531002
947585,3,1420,260,700.0,3123348,-0.531002
941835,3,19347,1904,700.0,3114958,-0.531002
1603907,4,18462,1873,5400.0,3081396,0.287385
1636971,1,7208,3044,400.0,3053400,0.26057
819189,7,19396,3308,700.0,3014885,0.287385
1197166,6,1,1,700.0,2997560,0.287385
