# Interpreting unknown locations through mutual relationships

This notebook attempts to infer the location of a person who has not declared one in their profile. This is done through the following steps:

- Find all edges in our graph
- Determine the number of followbacks (edges that are reciprocated)
- Determine the number of followbacks who live in our specified location

## Importing data

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib
import json

%matplotlib inline

In [3]:
# `location` is the city each neighbour's declared location is compared against;
# `difficulty` is the fraction of a person's edges that must point at that city
# before a match is predicted.
location = "Eindhoven"
difficulty = 0.85

In [4]:
# Load the records from data.json into the frame every later cell works on.
df = pd.read_json('data.json')
# Preview the first rows only; rendering the whole frame bloats the notebook.
df.head(10)

Unnamed: 0,edges,location,name,pk
0,"[59, 374, 450, 239]",Eindhoven,Robert Mulloy,0
1,"[483, 340]",Amsterdam,Frank Salmon,1
2,"[186, 424]",Amsterdam,Sheila Mandell,2
3,"[356, 93, 236, 456, 140, 30]",Eindhoven,Barbara Bolf,3
4,"[123, 38, 439, 7, 17, 109, 476, 394, 196, 307]",Amsterdam,Joseph Patterson,4
5,"[102, 399, 245, 499, 155, 380, 157, 416]",Eindhoven,Marilyn Spencer,5
6,"[410, 58, 337, 291, 29, 101]",Eindhoven,Aaron Roberts,6
7,"[280, 87, 179, 417, 4, 69, 105, 210, 243, 467]",Amsterdam,Marjorie Shipley,7
8,"[150, 238, 29, 251, 279]",Amsterdam,Kyra Benjamen,8
9,"[97, 140, 152, 73]",Eindhoven,Jerome Smith,9


In [4]:
# Row count per value of the `location` column, followed by the numeric summary.
people_per_location = df.groupby('location').size()
print(people_per_location)
df.describe()

location
             163
Amsterdam     57
Eindhoven    238
Helmond       42
dtype: int64


Unnamed: 0,pk
count,500.0
mean,249.5
std,144.481833
min,0.0
25%,124.75
50%,249.5
75%,374.25
max,499.0


## Parse our data and get accurate counts

In [5]:
# Length of each row's `edges` list; repeated values in a list are counted as-is.
df['edge_count'] = df['edges'].map(len)
# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0,edges,location,name,pk,edge_count
0,"[258, 122, 111, 290, 327, 114, 464, 469]",Eindhoven,Chung Yin,0,8
1,"[235, 473, 472, 195, 21]",Helmond,Maria Johns,1,5
2,"[98, 205, 151, 321, 246, 400, 13, 453]",Eindhoven,Charles Miller,2,8
3,"[264, 167, 395, 365, 395, 302, 179, 356, 174]",Eindhoven,Sheldon Pulsifer,3,9
4,"[19, 301, 402, 386, 50]",Eindhoven,Curtis Rogers,4,5
5,"[80, 19, 407, 163, 357, 198]",Eindhoven,Karen Harris,5,6
6,"[460, 494, 382, 131, 383, 192, 69, 55]",,Maria Veale,6,8
7,"[353, 457, 13, 184]",Eindhoven,Sharon Ceaser,7,4
8,"[92, 10, 285, 339, 470, 279]",Helmond,Ronald Estrada,8,6
9,"[275, 159, 301, 280, 171, 190, 52, 233, 99, 318]",Eindhoven,Mathew Rhoden,9,10


## Finding a bi-directional edge

In [6]:
def edge_is_mutual(x):
    """Count the edges of row ``x`` that are reciprocated.

    For every value in ``x.edges`` the row at that position of the
    module-level ``df`` is looked up (``df.iloc``); the edge counts when that
    row's own ``edges`` contains ``x.pk``. A value that appears several times
    in ``x.edges`` is counted each time it appears.

    Returns:
        int: number of reciprocated edges (0 when ``x.edges`` is empty).
    """
    return sum(1 for target in x.edges if x.pk in df.iloc[target].edges)

# Row-wise application: each row receives its count of reciprocated edges.
df['followbacks'] = df.apply(edge_is_mutual, axis=1)
# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0,edges,location,name,pk,edge_count,followbacks
0,"[258, 122, 111, 290, 327, 114, 464, 469]",Eindhoven,Chung Yin,0,8,8
1,"[235, 473, 472, 195, 21]",Helmond,Maria Johns,1,5,4
2,"[98, 205, 151, 321, 246, 400, 13, 453]",Eindhoven,Charles Miller,2,8,6
3,"[264, 167, 395, 365, 395, 302, 179, 356, 174]",Eindhoven,Sheldon Pulsifer,3,9,7
4,"[19, 301, 402, 386, 50]",Eindhoven,Curtis Rogers,4,5,5
5,"[80, 19, 407, 163, 357, 198]",Eindhoven,Karen Harris,5,6,6
6,"[460, 494, 382, 131, 383, 192, 69, 55]",,Maria Veale,6,8,7
7,"[353, 457, 13, 184]",Eindhoven,Sharon Ceaser,7,4,4
8,"[92, 10, 285, 339, 470, 279]",Helmond,Ronald Estrada,8,6,5
9,"[275, 159, 301, 280, 171, 190, 52, 233, 99, 318]",Eindhoven,Mathew Rhoden,9,10,9


In [7]:
def is_mutual_with_location(x):
    """Count reciprocated edges of ``x`` whose target is in the configured location.

    An edge counts when the row it points at (looked up by position in the
    module-level ``df``) lists ``x.pk`` in its own ``edges`` and its
    ``location`` equals the module-level ``location``.

    Returns:
        int: number of such edges.
    """
    matches = 0
    for target in x.edges:
        other = df.iloc[target]  # look the neighbour up once per edge
        if x.pk in other.edges and other.location == location:
            matches += 1
    return matches

# Row-wise application: reciprocated edges whose target is in the configured location.
df['followbacks with location'] = df.apply(is_mutual_with_location, axis=1)

# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0,edges,location,name,pk,edge_count,followbacks,followbacks with location
0,"[258, 122, 111, 290, 327, 114, 464, 469]",Eindhoven,Chung Yin,0,8,8,4
1,"[235, 473, 472, 195, 21]",Helmond,Maria Johns,1,5,4,3
2,"[98, 205, 151, 321, 246, 400, 13, 453]",Eindhoven,Charles Miller,2,8,6,2
3,"[264, 167, 395, 365, 395, 302, 179, 356, 174]",Eindhoven,Sheldon Pulsifer,3,9,7,5
4,"[19, 301, 402, 386, 50]",Eindhoven,Curtis Rogers,4,5,5,3
5,"[80, 19, 407, 163, 357, 198]",Eindhoven,Karen Harris,5,6,6,0
6,"[460, 494, 382, 131, 383, 192, 69, 55]",,Maria Veale,6,8,7,5
7,"[353, 457, 13, 184]",Eindhoven,Sharon Ceaser,7,4,4,2
8,"[92, 10, 285, 339, 470, 279]",Helmond,Ronald Estrada,8,6,5,3
9,"[275, 159, 301, 280, 171, 190, 52, 233, 99, 318]",Eindhoven,Mathew Rhoden,9,10,9,3


In [8]:
def is_single_with_location(x):
    """Count one-way edges of ``x`` whose target is in the configured location.

    An edge counts when the row it points at (looked up by position in the
    module-level ``df``) has ``location`` equal to the module-level
    ``location`` and does NOT list ``x.pk`` in its own ``edges``.

    The target city is read from the module-level ``location`` rather than a
    hard-coded local, so this column follows the same configuration as
    ``is_mutual_with_location``.

    Returns:
        int: number of such edges.
    """
    count = 0
    for target in x.edges:
        other = df.iloc[target]  # look the neighbour up once per edge
        if other.location == location and x.pk not in other.edges:
            count += 1
    return count

# Row-wise application: unreciprocated edges whose target is in the configured location.
df['follows with location'] = df.apply(is_single_with_location, axis=1)

# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0,edges,location,name,pk,edge_count,followbacks,followbacks with location,follows with location
0,"[258, 122, 111, 290, 327, 114, 464, 469]",Eindhoven,Chung Yin,0,8,8,4,0
1,"[235, 473, 472, 195, 21]",Helmond,Maria Johns,1,5,4,3,1
2,"[98, 205, 151, 321, 246, 400, 13, 453]",Eindhoven,Charles Miller,2,8,6,2,1
3,"[264, 167, 395, 365, 395, 302, 179, 356, 174]",Eindhoven,Sheldon Pulsifer,3,9,7,5,1
4,"[19, 301, 402, 386, 50]",Eindhoven,Curtis Rogers,4,5,5,3,0
5,"[80, 19, 407, 163, 357, 198]",Eindhoven,Karen Harris,5,6,6,0,0
6,"[460, 494, 382, 131, 383, 192, 69, 55]",,Maria Veale,6,8,7,5,0
7,"[353, 457, 13, 184]",Eindhoven,Sharon Ceaser,7,4,4,2,0
8,"[92, 10, 285, 339, 470, 279]",Helmond,Ronald Estrada,8,6,5,3,0
9,"[275, 159, 301, 280, 171, 190, 52, 233, 99, 318]",Eindhoven,Mathew Rhoden,9,10,9,3,1


In [9]:
def predict_location_is_match(x):
    """Predict whether row ``x`` belongs to the configured location.

    The prediction is positive when the edges pointing at the location
    (``'followbacks with location'`` + ``'follows with location'``) make up at
    least ``difficulty`` (module-level) of ``x['edge_count']``.

    A row with no edges carries no evidence; without the guard the test would
    read ``0 >= difficulty * 0`` and always predict a match, so such rows are
    predicted as non-matches.

    Returns:
        bool: True when a match is predicted.
    """
    total_edges = x['edge_count']
    if total_edges == 0:
        return False
    in_location = x['followbacks with location'] + x['follows with location']
    return bool(in_location >= difficulty * total_edges)
        
    
# Row-wise application: one boolean prediction per row.
df['predictions'] = df.apply(predict_location_is_match, axis=1)

In [10]:
def check_location_match(x):
    """Return True when the prediction for row ``x`` agrees with its declared location.

    A prediction is correct when ``x['predictions']`` is truthy and
    ``x['location']`` equals the module-level ``location``, or when it is
    falsy and the declared location differs. The target city comes from the
    module-level ``location`` instead of a hard-coded literal, so the scoring
    follows the same configuration as the feature columns.

    NOTE(review): any declared location other than the target, including an
    empty string, is treated as "not in the target location", so a negative
    prediction on such a row is scored as correct.

    Returns:
        bool: whether prediction and declared location agree.
    """
    predicted_match = bool(x['predictions'])
    declared_match = x['location'] == location
    return predicted_match == declared_match
    
# Row-wise application: store for each row whether check_location_match returned True.
df['prediction correct'] = df.apply(check_location_match, axis=1)

## Outcome

This is the accuracy of our current prediction heuristic, measured against the locations people declared in their profiles.

In [3]:
# Summing the True/False column counts its True entries.
correct_count = df['prediction correct'].sum()
total = len(df)

print("{0} out of {1} predictions were correct".format(correct_count, total))
print("{0}% was correct".format(correct_count / total * 100))

NameError: name 'df' is not defined