# Load the dataset.

In [43]:
with open("./datasets/football_key.tsv") as f:
    # Each line is of form: <country_id> <country_name>
    def fmt(line):
        return (int(line[0])-1, line[1].strip('"'))
    data_key = [fmt(line.strip().split()) for line in f if line[0] != '*']

In [44]:
with open("./datasets/football_pairs.tsv") as f:
    # Each line is of form: <country_a_id> <country_b_id> <number_of_players>
    def fmt(pair):
        return (int(pair[0])-1, int(pair[1])-1, 1)
    data_pairs = [fmt(line.strip().split()) for line in f if line[0] != '*']

# Turn into useful format

Edit the `neighbours[]` below and `similarity` func below that to create a new metric.

In [45]:
neighbours = [set() for _ in range(len(data_key))]
for p in data_pairs:
    neighbours[p[0]].add(p[1])
    neighbours[p[1]].add(p[0])

In [46]:
def similarity_CN(x, y, ignore_set=None):
    # Common neighbours
    # x, y are indices to neighbours[]
    if ignore_set is None:
        ignore_set = set()
    return len((neighbours[x] & neighbours[y]) - set(t[1] for t in ignore_set))

# Compute similarity matrix.

In [47]:
def compute_similarities(ignore_set=None):
    # S_CN[x][y] contains the similarity of nodes x and y using the Common Neighbours (CN) metric.
    S_CN = [[0 for _ in range(len(data_key))] for _ in range(len(data_key))]
    for i in range(len(data_key)-1):
        for j in range(0, len(data_key)):
            S_CN[i][j] = similarity_CN(i, j, ignore_set=ignore_set)
    return S_CN
        

In [48]:
# A quick eyeball check of a subset of the data.
S_CN = compute_similarities()
num_to_print = len(data_key)//2
print(' '*4 + ' '.join(d[1] for d in data_key[:num_to_print]))
print('\n'.join(data_key[i][1] + ' ' + ','.join('{:>3}'.format(c) for c in S_CN[i][:num_to_print]) for i in range(num_to_print)))

    ARG AUT BEL BGR BRA CHE CHL CMR COL DEU DNK ESP FRA GBR GRE HRV IRN
ARG   4,  2,  1,  1,  3,  0,  1,  2,  2,  2,  2,  2,  1,  1,  0,  2,  0
AUT   2,  8,  3,  2,  3,  1,  1,  4,  2,  6,  4,  5,  4,  3,  1,  4,  1
BEL   1,  3,  6,  1,  2,  1,  1,  3,  1,  5,  3,  5,  3,  3,  1,  2,  1
BGR   1,  2,  1,  4,  2,  0,  0,  4,  1,  1,  3,  1,  1,  0,  0,  3,  1
BRA   3,  3,  2,  2,  7,  0,  1,  5,  2,  3,  2,  3,  1,  1,  0,  2,  0
CHE   0,  1,  1,  0,  0,  2,  0,  0,  0,  2,  0,  2,  2,  1,  0,  0,  0
CHL   1,  1,  1,  0,  1,  0,  3,  1,  3,  2,  1,  2,  1,  2,  0,  1,  0
CMR   2,  4,  3,  4,  5,  0,  1,  9,  2,  4,  4,  3,  3,  2,  0,  5,  1
COL   2,  2,  1,  1,  2,  0,  3,  2,  5,  3,  2,  3,  2,  2,  0,  2,  0
DEU   2,  6,  5,  1,  3,  2,  2,  4,  3, 19,  3, 13,  9, 10,  3,  3,  0
DNK   2,  4,  3,  3,  2,  0,  1,  4,  2,  3,  7,  3,  3,  3,  0,  5,  1
ESP   2,  5,  5,  1,  3,  2,  2,  3,  3, 13,  3, 18,  9,  9,  3,  3,  1
FRA   1,  4,  3,  1,  1,  2,  1,  3,  2,  9,  3,  9, 13,  5,  1,

# Create test sets.

Split the list of links into 10 random partitions, as the paper does, to get comparable measurements. Also create a set of all links which are not in the dataset.

In [49]:
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for it in range(0, len(l), n):
        yield l[it:it + n]
        
e = []
predict = []
for i in range(len(data_key)):
    for j in range(i+1, len(data_key)):
        if i in neighbours[j]:
            e.append((i, j))
        else:
            predict.append((i, j))
            
# e now contains all link pairs
# predict contains all non-existing links from the original data
# each pair is a tuple (a, b), where a < b

# We now randomly shuffle this list
import random
random.shuffle(e)

print('len(e)', len(e))
print('len(e)//10 =', len(e)//10)

# Create e_prime, a list of 10 partitions
e_prime = list(chunks(e, len(e)//10 + 1))

# The following is a quick eyeball test to make sure the partitions look ok.
print('10 subsets:')
for i in range(len(e_prime)):
    entry = e_prime[i]
    print(entry)


len(e) 118
len(e)//10 = 11
10 subsets:
[(9, 34), (7, 17), (10, 24), (17, 34), (1, 9), (10, 13), (13, 34), (9, 25), (2, 24), (9, 23), (13, 17), (13, 18)]
[(0, 11), (5, 34), (9, 24), (13, 32), (23, 31), (4, 8), (5, 23), (9, 12), (28, 31), (24, 28), (13, 24), (10, 31)]
[(11, 15), (1, 11), (0, 17), (23, 34), (13, 25), (10, 29), (10, 17), (2, 28), (4, 12), (24, 34), (8, 11), (11, 23)]
[(4, 17), (3, 11), (12, 20), (7, 26), (19, 33), (25, 29), (9, 17), (13, 33), (4, 11), (17, 21), (17, 23), (3, 31)]
[(1, 17), (12, 33), (17, 25), (31, 34), (0, 8), (4, 26), (11, 27), (11, 21), (8, 17), (9, 30), (17, 33), (15, 31)]
[(7, 19), (23, 24), (10, 11), (13, 29), (2, 23), (9, 28), (21, 30), (12, 34), (12, 30), (24, 32), (11, 17), (1, 12)]
[(11, 34), (2, 12), (9, 11), (6, 17), (7, 31), (7, 12), (4, 19), (2, 9), (17, 24), (0, 6), (19, 20), (23, 32)]
[(12, 29), (7, 9), (1, 34), (9, 10), (8, 32), (9, 21), (1, 7), (7, 14), (9, 15), (4, 27), (11, 33), (3, 9)]
[(12, 17), (7, 11), (11, 25), (13, 28), (22, 27), (

In [54]:
aucs = []
n1s = []
n2s = []
n3s = []
ns = []

# Column headings.
print('\t\tn1   \tn2   \tn3   \tAUC')

# Iterate across the 10 folds.
for i in range(10):
    test = e_prime[i]
    S_CN = compute_similarities(ignore_set=None)
    
    n1 = 0 # missing_link > nonexistant_link
    n2 = 0 # missing_link = nonexistant_link
    n3 = 0 # missing_link < nonexistant_link
    n = 0 # total link comparisons
    for missing_link in test:
        a_score = S_CN[missing_link[0]][missing_link[1]]
        for nonexistant_link in predict:
            b_score = S_CN[nonexistant_link[0]][nonexistant_link[1]]
            if abs(a_score-b_score) < 0.0005:
                n2 += 1
            elif a_score > b_score:
                n1 += 1
            else:
                n3 += 1
            n += 1
    auc = (n1 + 0.5*n2)/(n)
    aucs.append(auc)
    n1s.append(n1)
    n2s.append(n2)
    n3s.append(n3)
    ns.append(n)
    print('Fold {:<2}:\t{:<5}\t{:<5}\t{:<5}\t{:<.6f}'.format(i+1, n1, n2, n3, auc))

def avg(seq):
    return sum(seq)/len(seq)

print('Average:\t{:<5}\t{:<5}\t{:<5}\t{:<.6f}'.format(int(round(avg(n1s))), int(round(avg(n2s))), int(round(avg(n3s))), avg(aucs)))

		n1   	n2   	n3   	AUC
Fold 1 :	4626 	449  	649  	0.847397
Fold 2 :	3068 	1037 	1619 	0.626572
Fold 3 :	4223 	646  	855  	0.794200
Fold 4 :	2559 	1184 	1981 	0.550489
Fold 5 :	3127 	1020 	1577 	0.635395
Fold 6 :	4170 	641  	913  	0.784504
Fold 7 :	3210 	965  	1549 	0.645091
Fold 8 :	3059 	1038 	1627 	0.625087
Fold 9 :	3577 	852  	1295 	0.699336
Fold 10:	1834 	1083 	1853 	0.498008
Average:	3345 	892  	1392 	0.670608
