# Load the dataset.

In [195]:
with open("research/datasets/football_key.tsv") as f:
    # Each line is of form: <country_id> <country_name>
    def fmt(line):
        return (int(line[0])-1, line[1].strip('"'))
    data_key = [fmt(line.strip().split()) for line in f if line[0] != '*']

In [196]:
with open("research/datasets/football_pairs.tsv") as f:
    # Each line is of form: <country_a_id> <country_b_id> <number_of_players>
    def fmt(pair):
        return (int(pair[0])-1, int(pair[1])-1, 1)
    data_pairs = [fmt(line.strip().split()) for line in f if line[0] != '*']

# Turn into useful format

Edit the `neighbours[]` below and `similarity` func below that to create a new metric.

In [197]:
neighbours = [set() for _ in range(len(data_key))]
for p in data_pairs:
    neighbours[p[0]].add(p[1])
    neighbours[p[1]].add(p[0])

In [198]:
def similarity_CN(x, y):
    # Common neighbours
    # x, y are indices to neighbours[]
    return len(neighbours[x] & neighbours[y])

# Compute similarities.

In [199]:
# S_CN[x][y] contains the similarity of nodes x and y using the Common Neighbours (CN) metric.
S_CN = [[0 for _ in range(len(data_key))] for _ in range(len(data_key))]
for i in range(len(data_key)-1):
    for j in range(0, len(data_key)):
        S_CN[i][j] = similarity_CN(i, j)
        

In [200]:
# A quick eyeball check of a subset of the data.
num_to_print = len(data_key)//2
print(' '*4 + ' '.join(d[1] for d in data_key[:num_to_print]))
print('\n'.join(data_key[i][1] + ' ' + ','.join('{:>3}'.format(c) for c in S_CN[i][:num_to_print]) for i in range(num_to_print)))

    ARG AUT BEL BGR BRA CHE CHL CMR COL DEU DNK ESP FRA GBR GRE HRV IRN
ARG   4,  2,  1,  1,  3,  0,  1,  2,  2,  2,  2,  2,  1,  1,  0,  2,  0
AUT   2,  8,  3,  2,  3,  1,  1,  4,  2,  6,  4,  5,  4,  3,  1,  4,  1
BEL   1,  3,  6,  1,  2,  1,  1,  3,  1,  5,  3,  5,  3,  3,  1,  2,  1
BGR   1,  2,  1,  4,  2,  0,  0,  4,  1,  1,  3,  1,  1,  0,  0,  3,  1
BRA   3,  3,  2,  2,  7,  0,  1,  5,  2,  3,  2,  3,  1,  1,  0,  2,  0
CHE   0,  1,  1,  0,  0,  2,  0,  0,  0,  2,  0,  2,  2,  1,  0,  0,  0
CHL   1,  1,  1,  0,  1,  0,  3,  1,  3,  2,  1,  2,  1,  2,  0,  1,  0
CMR   2,  4,  3,  4,  5,  0,  1,  9,  2,  4,  4,  3,  3,  2,  0,  5,  1
COL   2,  2,  1,  1,  2,  0,  3,  2,  5,  3,  2,  3,  2,  2,  0,  2,  0
DEU   2,  6,  5,  1,  3,  2,  2,  4,  3, 19,  3, 13,  9, 10,  3,  3,  0
DNK   2,  4,  3,  3,  2,  0,  1,  4,  2,  3,  7,  3,  3,  3,  0,  5,  1
ESP   2,  5,  5,  1,  3,  2,  2,  3,  3, 13,  3, 18,  9,  9,  3,  3,  1
FRA   1,  4,  3,  1,  1,  2,  1,  3,  2,  9,  3,  9, 13,  5,  1,

# Create test sets.

Split the list of links into 10 random partitions, as the paper does, to get comparable measurements. Also create a set of all links which are not in the dataset.

In [201]:
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for it in range(0, len(l), n):
        yield l[it:it + n]
        
e = []
predict = []
for i in range(len(data_key)):
    for j in range(i+1, len(data_key)):
        if i in neighbours[j]:
            e.append((i, j))
        else:
            predict.append((i, j))
            
# e now contains all link pairs
# predict contains all non-existing links from the original data
# each pair is a tuple (a, b), where a < b

# We now randomly shuffle this list
import random
random.shuffle(e)

print('len(e)', len(e))
print('len(e)//10 =', len(e)//10)

# Create e_prime, a list of 10 partitions
e_prime = []
for _ in range(10):
    e_prime.append(list(chunks(e, len(e)//10 + 1)))

# TODO(iandioch): Figure out why the following line is necessary?
e_prime = e_prime[0]

# The following is a quick eyeball test to make sure the partitions look ok.
print('10 subsets:')
for i in range(len(e_prime)):
    entry = e_prime[i]
    print(entry)


len(e) 118
len(e)//10 = 11
10 subsets:
[(4, 19), (24, 28), (5, 23), (19, 20), (6, 17), (21, 26), (13, 25), (10, 17), (17, 24), (7, 19), (21, 30), (28, 31)]
[(11, 21), (9, 33), (23, 32), (14, 25), (15, 17), (9, 21), (11, 33), (17, 34), (12, 17), (4, 27), (1, 7), (11, 28)]
[(15, 31), (9, 16), (2, 24), (4, 26), (24, 34), (11, 27), (13, 18), (13, 15), (9, 30), (31, 34), (11, 23), (7, 17)]
[(10, 11), (3, 31), (10, 24), (7, 9), (17, 33), (12, 21), (25, 29), (22, 27), (1, 12), (2, 23), (11, 24), (7, 26)]
[(1, 34), (23, 24), (0, 17), (8, 11), (23, 34), (9, 15), (9, 11), (3, 9), (12, 34), (19, 33), (13, 24), (10, 29)]
[(12, 29), (1, 17), (9, 23), (9, 10), (12, 33), (7, 12), (9, 12), (13, 29), (11, 34), (4, 8), (8, 32), (0, 6)]
[(2, 17), (1, 9), (17, 21), (13, 34), (9, 28), (11, 15), (17, 23), (9, 17), (13, 33), (6, 32), (13, 17), (9, 34)]
[(4, 17), (4, 12), (1, 13), (24, 32), (2, 9), (11, 17), (14, 28), (12, 23), (8, 17), (9, 24), (3, 11), (13, 28)]
[(9, 25), (7, 14), (17, 25), (12, 20), (10, 1

In [202]:
aucs = []
n1s = []
n2s = []
n3s = []
ns = []

# Column headings.
print('\t\tn1   \tn2   \tn3   \tAUC')

# Iterate across the 10 folds.
for i in range(10):
    test = e_prime[i]
    
    n1 = 0 # missing_link > nonexistant_link
    n2 = 0 # missing_link = nonexistant_link
    n3 = 0 # missing_link < nonexistant_link
    n = 0 # total link comparisons
    for missing_link in test:
        a_score = S_CN[missing_link[0]][missing_link[1]]
        for nonexistant_link in predict:
            b_score = S_CN[nonexistant_link[0]][nonexistant_link[1]]
            if abs(a_score-b_score) < 0.0005:
                n2 += 1
            elif a_score > b_score:
                n1 += 1
            else:
                n3 += 1
            n += 1
    auc = (n1 + 0.5*n2)/(n)
    aucs.append(auc)
    n1s.append(n1)
    n2s.append(n2)
    n3s.append(n3)
    ns.append(n)
    print('Fold {:<2}:\t{:<5}\t{:<5}\t{:<5}\t{:<.6f}'.format(i+1, n1, n2, n3, auc))

def avg(seq):
    return sum(seq)/len(seq)

print('Average:\t{:<5}\t{:<5}\t{:<5}\t{:<.6f}'.format(avg(n1s), avg(n2s), avg(n3s), avg(aucs)))

		n1   	n2   	n3   	AUC
Fold 1 :	2222 	1311 	2191 	0.502708
Fold 2 :	3893 	749  	1082 	0.745545
Fold 3 :	2612 	1193 	1919 	0.560535
Fold 4 :	3450 	885  	1389 	0.680031
Fold 5 :	4171 	631  	922  	0.783805
Fold 6 :	3645 	813  	1266 	0.707809
Fold 7 :	4413 	521  	790  	0.816474
Fold 8 :	3796 	772  	1156 	0.730608
Fold 9 :	2887 	1100 	1737 	0.600454
Fold 10:	2364 	940  	1466 	0.594130
Average:	3345.3	891.5	1391.8	0.672210
