In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tempfile

# Import and prepare data

First we read `data.json`

We then process this data to prepare it for a DNN classifier. We derive 4 columns:

- edge_count: Total number of edges
- edge_match: Number of this node's edges that point to a node with the same location as this node
- edge_mutual_match: Number of this node's edges that point to a node with the same location AND that has an edge back to this node
- edge_single_match: Number of this node's edges that point to a node with the same location BUT that has no edge back to this node

In [2]:
df = pd.read_json('data.json')

# Sorted so that the location -> class id mapping is the same on every run;
# the iteration order of a set of strings can change between interpreter
# sessions, which would silently relabel the classes.
locations = sorted(set(df['location']))
location_ids = {name: idx for idx, name in enumerate(locations)}

# Edge values are used as positional row numbers (as df.iloc did), so plain
# lists give the same lookups without indexing the DataFrame once per edge.
node_locations = df['location'].tolist()
node_edges = df['edges'].tolist()
node_pks = [int(pk) for pk in df['pk']]

# edge_count: total number of edges of the node.
df['edge_count'] = df['edges'].apply(len)

# edge_match: edges pointing to a node that has the same location.
df['edge_match'] = [
    sum(1 for y in edges if node_locations[y] == location)
    for edges, location in zip(node_edges, node_locations)
]

# edge_mutual_match: same-location edges whose target also lists this node's
# pk among its own edges.
df['edge_mutual_match'] = [
    sum(
        1 for y in edges
        if node_locations[y] == location and pk in node_edges[y]
    )
    for edges, location, pk in zip(node_edges, node_locations, node_pks)
]

# edge_single_match: same-location edges that are not mutual.
df['edge_single_match'] = df['edge_match'] - df['edge_mutual_match']

# Integer class label used as the training target.
df['location_num'] = df['location'].map(location_ids)

# Show a sample instead of dumping the whole frame.
df.head(10)

Unnamed: 0,edges,location,name,pk,edge_count,edge_match,edge_mutual_match,edge_single_match,location_num
0,"[59, 374, 450, 239]",Eindhoven,Robert Mulloy,0,4,3,1,2,1
1,"[483, 340]",Amsterdam,Frank Salmon,1,2,0,0,0,0
2,"[186, 424]",Amsterdam,Sheila Mandell,2,2,0,0,0,0
3,"[356, 93, 236, 456, 140, 30]",Eindhoven,Barbara Bolf,3,6,4,3,1,1
4,"[123, 38, 439, 7, 17, 109, 476, 394, 196, 307]",Amsterdam,Joseph Patterson,4,10,5,5,0,0
5,"[102, 399, 245, 499, 155, 380, 157, 416]",Eindhoven,Marilyn Spencer,5,8,7,4,3,1
6,"[410, 58, 337, 291, 29, 101]",Eindhoven,Aaron Roberts,6,6,2,2,0,1
7,"[280, 87, 179, 417, 4, 69, 105, 210, 243, 467]",Amsterdam,Marjorie Shipley,7,10,2,2,0,0
8,"[150, 238, 29, 251, 279]",Amsterdam,Kyra Benjamen,8,5,2,1,1,0
9,"[97, 140, 152, 73]",Eindhoven,Jerome Smith,9,4,2,1,1,1


# Start defining our features

In [3]:
# One numeric feature column per engineered edge statistic fed to the model.
base_columns = [
    tf.feature_column.numeric_column(feature_name)
    for feature_name in (
        "edge_count",
        "edge_mutual_match",
        "edge_single_match",
    )
]

In [23]:
# Fully connected classifier with three hidden layers (10, 30 and 10 units)
# and one output class per distinct location.
m = tf.estimator.DNNClassifier(
    n_classes=len(locations),
    hidden_units=[10, 30, 10],
    feature_columns=base_columns,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpksdilqqk', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


In [24]:
def input_fn():
    """Build the training input function for the estimator.

    Rows of the module-level ``df`` that contain missing values are dropped,
    and both the features and the labels are taken from that same filtered
    frame so they stay row-aligned.

    Returns:
        The callable produced by ``tf.estimator.inputs.numpy_input_fn``. It
        repeats the data indefinitely (``num_epochs=None``) and shuffles it,
        so the caller must bound training with ``steps``.
    """
    df_data = df.dropna()
    return tf.estimator.inputs.numpy_input_fn(
        x={
            # Features must come from the filtered frame, not from `df`,
            # otherwise x and y differ in length once any row is dropped.
            "edge_count": np.array(df_data['edge_count']),
            "edge_mutual_match": np.array(df_data['edge_mutual_match']),
            "edge_single_match": np.array(df_data['edge_single_match'])
        },
        # Plain array: after dropna() the Series index may have gaps, and
        # the labels should be selected by position like the features.
        y=np.array(df_data['location_num']),
        num_epochs=None,
        shuffle=True
    )

In [25]:
# Build the training input function once, then train for a fixed number of steps.
train_input_fn = input_fn()
m.train(input_fn=train_input_fn, steps=10000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpksdilqqk/model.ckpt.
INFO:tensorflow:loss = 178.346, step = 1
INFO:tensorflow:global_step/sec: 698.025
INFO:tensorflow:loss = 94.1448, step = 101 (0.143 sec)
INFO:tensorflow:global_step/sec: 690.664
INFO:tensorflow:loss = 78.2101, step = 201 (0.146 sec)
INFO:tensorflow:global_step/sec: 522.39
INFO:tensorflow:loss = 83.4975, step = 301 (0.191 sec)
INFO:tensorflow:global_step/sec: 599.756
INFO:tensorflow:loss = 89.5809, step = 401 (0.167 sec)
INFO:tensorflow:global_step/sec: 613.344
INFO:tensorflow:loss = 82.6917, step = 501 (0.163 sec)
INFO:tensorflow:global_step/sec: 669.043
INFO:tensorflow:loss = 87.9747, step = 601 (0.150 sec)
INFO:tensorflow:global_step/sec: 643.894
INFO:tensorflow:loss = 76.6252, step = 701 (0.155 sec)
INFO:tensorflow:global_step/sec: 672.387
INFO:tensorflow:loss = 79.0786, step = 801 (0.149 sec)
INFO:tensorflow:global_step/sec: 583.279
INFO:tensorflow:loss = 88.1132, 

INFO:tensorflow:loss = 77.4241, step = 8401 (0.171 sec)
INFO:tensorflow:global_step/sec: 496.686
INFO:tensorflow:loss = 85.316, step = 8501 (0.202 sec)
INFO:tensorflow:global_step/sec: 545.544
INFO:tensorflow:loss = 83.2689, step = 8601 (0.181 sec)
INFO:tensorflow:global_step/sec: 728.283
INFO:tensorflow:loss = 80.2291, step = 8701 (0.137 sec)
INFO:tensorflow:global_step/sec: 745.098
INFO:tensorflow:loss = 81.1729, step = 8801 (0.135 sec)
INFO:tensorflow:global_step/sec: 741.274
INFO:tensorflow:loss = 65.3589, step = 8901 (0.134 sec)
INFO:tensorflow:global_step/sec: 758.948
INFO:tensorflow:loss = 90.3007, step = 9001 (0.133 sec)
INFO:tensorflow:global_step/sec: 750.535
INFO:tensorflow:loss = 73.4736, step = 9101 (0.132 sec)
INFO:tensorflow:global_step/sec: 740.483
INFO:tensorflow:loss = 82.5838, step = 9201 (0.135 sec)
INFO:tensorflow:global_step/sec: 754.495
INFO:tensorflow:loss = 87.3754, step = 9301 (0.133 sec)
INFO:tensorflow:global_step/sec: 779.993
INFO:tensorflow:loss = 77.8132,

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7fab9b6d8dd8>

## Start predicting

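We now have a properly trained DNN. Time to start predicting. Each row of `new_preds` below is one new sample, given as `[edge_count, edge_mutual_match, edge_single_match]`.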

In [21]:
# One row per sample to classify; the columns are, in order,
# edge_count, edge_mutual_match and edge_single_match.
new_preds = np.array([[10, 1, 0], [9, 3, 1]])

feature_names = ("edge_count", "edge_mutual_match", "edge_single_match")

# Single pass over the samples in their original order.
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={
        feature_name: new_preds[:, column]
        for column, feature_name in enumerate(feature_names)
    },
    shuffle=False,
    num_epochs=1,
)

In [22]:
# Materialise the prediction generator so every result can be inspected.
predictions = list(m.predict(input_fn=predict_input_fn))

# Keep only the predicted class entry of each prediction dict.
predicted_classes = []
for prediction in predictions:
    predicted_classes.append(prediction["classes"])

print("New Samples, Class Predictions: {}\n".format(predicted_classes))

INFO:tensorflow:Restoring parameters from /tmp/tmpmzgi3_n7/model.ckpt-10000
New Samples, Class Predictions: [array([b'0'], dtype=object), array([b'1'], dtype=object)]

