In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tempfile

# Import and prepare data

First we read `data.json`

We then start processing this data so we can prepare it for a DNN Classiffier. We prep 4 columns

- edge_count: Total number of edges
- edge_match: Nodes that we know that share the location of this node
- edge_mutual_match: Nodes that we know share the location of this node AND mutually follows this node
- edge_single_match: Nodes that we know share the location of this node BUT does not know this node

In [2]:
df = pd.read_json('data.json')

locations = list(set(df['location']))
df['edge_count'] =  df['edges'].apply(lambda x: len(x))
df['edge_match'] = df.apply(lambda x: len([1 for y in x['edges'] if df.iloc[y]['location']==x['location']]), axis=1)
df['edge_mutual_match'] = df.apply(lambda x: 
                                    len([1 for y in x['edges'] if 
                                         df.iloc[y]['location']==x['location'] and 
                                         int(x['pk']) in df.iloc[y]['edges']]), 
                                    axis=1
                                   )
df['edge_single_match'] = df.apply(lambda x: 
                                    x['edge_match'] -x['edge_mutual_match'], 
                                    axis=1)
df['location_num'] = df['location'].apply(lambda x: locations.index(x))
df

Unnamed: 0,edges,location,name,pk,edge_count,edge_match,edge_mutual_match,edge_single_match,location_num
0,"[59, 374, 450, 239]",Eindhoven,Robert Mulloy,0,4,3,1,2,1
1,"[483, 340]",Amsterdam,Frank Salmon,1,2,0,0,0,0
2,"[186, 424]",Amsterdam,Sheila Mandell,2,2,0,0,0,0
3,"[356, 93, 236, 456, 140, 30]",Eindhoven,Barbara Bolf,3,6,4,3,1,1
4,"[123, 38, 439, 7, 17, 109, 476, 394, 196, 307]",Amsterdam,Joseph Patterson,4,10,5,5,0,0
5,"[102, 399, 245, 499, 155, 380, 157, 416]",Eindhoven,Marilyn Spencer,5,8,7,4,3,1
6,"[410, 58, 337, 291, 29, 101]",Eindhoven,Aaron Roberts,6,6,2,2,0,1
7,"[280, 87, 179, 417, 4, 69, 105, 210, 243, 467]",Amsterdam,Marjorie Shipley,7,10,2,2,0,0
8,"[150, 238, 29, 251, 279]",Amsterdam,Kyra Benjamen,8,5,2,1,1,0
9,"[97, 140, 152, 73]",Eindhoven,Jerome Smith,9,4,2,1,1,1


# Start defining our features

In [3]:
base_columns = [
    tf.feature_column.numeric_column("edge_count"),
    tf.feature_column.numeric_column("edge_mutual_match"),
    tf.feature_column.numeric_column("edge_single_match")
]

In [4]:
m = tf.estimator.DNNClassifier(
    feature_columns=base_columns,
    hidden_units=[10, 50, 10],
    n_classes=len(locations)
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpy6knz1jb', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [5]:
def input_fn():
    df_data = df.dropna()
    return tf.estimator.inputs.numpy_input_fn(
        x={
            "edge_count": np.array(df['edge_count']),
            "edge_mutual_match": np.array(df['edge_mutual_match']),
            "edge_single_match": np.array(df['edge_single_match'])
        },
        y=df_data['location_num'],
        num_epochs=None,
        shuffle=True
    )

In [6]:
m.train(input_fn=input_fn(), steps=10000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpy6knz1jb/model.ckpt.
INFO:tensorflow:loss = 156.791, step = 1
INFO:tensorflow:global_step/sec: 738.249
INFO:tensorflow:loss = 93.9771, step = 101 (0.137 sec)
INFO:tensorflow:global_step/sec: 624.935
INFO:tensorflow:loss = 79.6364, step = 201 (0.160 sec)
INFO:tensorflow:global_step/sec: 739.921
INFO:tensorflow:loss = 80.1293, step = 301 (0.135 sec)
INFO:tensorflow:global_step/sec: 735.331
INFO:tensorflow:loss = 90.3535, step = 401 (0.136 sec)
INFO:tensorflow:global_step/sec: 754.542
INFO:tensorflow:loss = 99.3667, step = 501 (0.132 sec)
INFO:tensorflow:global_step/sec: 737.6
INFO:tensorflow:loss = 82.6181, step = 601 (0.136 sec)
INFO:tensorflow:global_step/sec: 722.725
INFO:tensorflow:loss = 86.2122, step = 701 (0.138 sec)
INFO:tensorflow:global_step/sec: 483.691
INFO:tensorflow:loss = 79.7949, step = 801 (0.210 sec)
INFO:tensorflow:global_step/sec: 533.696
INFO:tensorflow:loss = 87.7103, s

INFO:tensorflow:loss = 83.9329, step = 8401 (0.134 sec)
INFO:tensorflow:global_step/sec: 766.191
INFO:tensorflow:loss = 73.2151, step = 8501 (0.130 sec)
INFO:tensorflow:global_step/sec: 645.305
INFO:tensorflow:loss = 79.9508, step = 8601 (0.154 sec)
INFO:tensorflow:global_step/sec: 799.759
INFO:tensorflow:loss = 71.5809, step = 8701 (0.126 sec)
INFO:tensorflow:global_step/sec: 790.853
INFO:tensorflow:loss = 80.5636, step = 8801 (0.126 sec)
INFO:tensorflow:global_step/sec: 771.115
INFO:tensorflow:loss = 78.0522, step = 8901 (0.130 sec)
INFO:tensorflow:global_step/sec: 811.86
INFO:tensorflow:loss = 88.2074, step = 9001 (0.123 sec)
INFO:tensorflow:global_step/sec: 829.847
INFO:tensorflow:loss = 72.5455, step = 9101 (0.120 sec)
INFO:tensorflow:global_step/sec: 802.407
INFO:tensorflow:loss = 82.8764, step = 9201 (0.125 sec)
INFO:tensorflow:global_step/sec: 759.714
INFO:tensorflow:loss = 84.812, step = 9301 (0.130 sec)
INFO:tensorflow:global_step/sec: 654.138
INFO:tensorflow:loss = 104.68, s

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f1b28672358>

## Start predicting

We now have a properly trained DNN. Time to start predicting

In [19]:
new_prods = np.array(
[
    [10, 5, 2],
    [9, 1, 7]
])

predict_input_fn =  tf.estimator.inputs.numpy_input_fn(
    x={
        "edge_count": new_prods[::,0],
        "edge_mutual_match": new_prods[::,1],
        "edge_single_match": new_prods[::, 2]
    },
    num_epochs=1,
    shuffle=False
)

In [24]:
predictions = list(m.predict(input_fn=predict_input_fn))
predicted_classes = [p["classes"] for p in predictions]
print("New Samples, Class Predictions: {}\n".format(locations[predicted_classes))

INFO:tensorflow:Restoring parameters from /tmp/tmpy6knz1jb/model.ckpt-10000


TypeError: list indices must be integers or slices, not list