In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict
from scipy import sparse
import scipy.sparse as sps
import collections

# Get data

In [None]:
# Fetch the Gowalla friendship edge list from SNAP and unpack it next to the notebook.
!wget https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz
# -f overwrites a previously extracted .txt so the cell can be re-run;
# stdout is discarded via /dev/null.
!gzip -df loc-gowalla_edges.txt.gz > /dev/null

In [None]:
# Fetch the Gowalla check-in log from SNAP and unpack it next to the notebook.
!wget https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz
# -f overwrites a previously extracted .txt so the cell can be re-run;
# stdout is discarded via /dev/null.
!gzip -df loc-gowalla_totalCheckins.txt.gz > /dev/null

In [None]:
# Load the friendship graph: each line is a tab-separated "from<TAB>to" pair of node ids.
edges = pd.read_csv('loc-gowalla_edges.txt', sep='\t', names=['from', 'to'])

# Size the matrix from the largest id seen in either column rather than from the
# number of distinct 'from' ids: the latter is only correct when ids are contiguous
# from 0 and every node appears at least once as a source.
N = int(max(edges['from'].max(), edges['to'].max())) + 1 if len(edges) else 0

r_ind = edges['from'].values
col_ind = edges['to'].values
data = np.ones(r_ind.shape[0])

# Similarity matrix S: one stored entry of 1.0 per listed edge.
# NOTE(review): the csr_matrix constructor sums duplicate (row, col) pairs, so a
# repeated edge would get a similarity > 1 - confirm the file has no duplicates.
S = sps.csr_matrix((data, (r_ind, col_ind)), shape=(N, N))

# Every diagonal entry (the self-similarity used by the clustering cell) is set to -1.
S.setdiag(-1)

# Affinity propagation

In [None]:
# Affinity propagation on the sparse similarity matrix S.
# A and R are copies of S whose stored values are reset to zero, so they start with
# exactly the same sparsity pattern (indptr / indices) as S.
A, R = S.copy(), S.copy()
A.data, R.data = np.zeros(len(S.data), dtype=np.float64), np.zeros(len(S.data), dtype=np.float64)

max_iterations = 30

# NOTE(review): the updates below overwrite A and R outright each iteration (no
# damping term is applied) and the loop always runs max_iterations times (no
# convergence check).
for n in range(max_iterations):
    # ---- update R from S + A ----
    # Named avail_plus_sim (not `sum`) so the builtin sum() is not shadowed for the
    # rest of the kernel session.
    avail_plus_sim = S + A
    masked = avail_plus_sim.copy()

    # Column of the largest value in every row.
    # NOTE(review): argmax runs over the whole sparse row, which presumably also
    # considers entries that are not stored (implicit zeros) - confirm this is intended.
    indices = np.asarray(np.argmax(avail_plus_sim, -1)).flatten()

    # Knock out each row's maximum to find the runner-up column.
    masked[np.arange(N), indices] = -np.inf
    indices_tmp = np.asarray(np.argmax(masked, -1)).flatten()

    max1 = np.asarray(avail_plus_sim[np.arange(N), indices]).flatten()
    max2 = np.asarray(avail_plus_sim[np.arange(N), indices_tmp]).flatten()

    # Row by row: every stored entry becomes S - (row maximum); the entry sitting at
    # the arg-max column (when that column is stored in the row) uses the runner-up.
    # The loop index is called `row` so it does not overwrite the r_ind edge array
    # created by the loading cell.
    # Slicing R.data and S.data with the same [i:j] assumes R still has S's layout.
    for row, (i, j) in enumerate(zip(R.indptr, R.indptr[1:])):
        R.data[i:j] = S.data[i:j] - max1[row]
        if indices[row] in R.indices[i:j]:
            R[row, indices[row]] = S[row, indices[row]] - max2[row]

    # ---- update A from R ----
    # Start from R with the diagonal zeroed and negative values clipped to zero.
    A = R.copy()
    A.setdiag(0)
    A[A < 0] = 0

    # Per-column totals of the clipped values, and the same totals plus R's diagonal.
    sums = np.asarray(np.sum(A, axis=0)).flatten()
    sums_d = R.diagonal() + sums

    # For each stored entry: min(0, column total incl. R's diagonal - the entry's own
    # clipped value). A.indices holds the column of every stored entry.
    A.data = np.minimum(0, sums_d[A.indices] - A.data)
    # The diagonal is then replaced by the plain column totals.
    A.setdiag(sums)

# Label of node i = column index with the largest A + R value in row i.
result = A + R
labels = [np.argmax(result[i]) for i in range(N)]

# Check accuracy

In [31]:
# Load the check-in log. The frame is bound to `checkins` and the last column is
# named 'location_id' because that is what every later use (here and in the
# accuracy cell) reads; the previous `checkins_df` / 'location' names made this
# cell fail on a fresh kernel.
checkins = pd.read_csv('loc-gowalla_totalCheckins.txt', delimiter = '\t', names = ['user', 'time', 'latitude', 'longitude', 'location_id'])

# Hold out 10% of the users that have at least one check-in.
# A fixed seed keeps the split (and therefore the reported accuracy) reproducible.
SPLIT_SEED = 42
users = checkins['user'].unique()
np.random.default_rng(SPLIT_SEED).shuffle(users)
test_users = users[:len(users) // 10]

# Set membership is O(1); `user in ndarray` scans the whole array on every check.
test_user_set = set(test_users.tolist())

# cluster label -> list of non-test node ids carrying that label.
clusters = collections.defaultdict(list)
for user, cluster_id in enumerate(labels):
  if user not in test_user_set:
    clusters[cluster_id].append(user)

# cluster label -> Counter of location ids visited by the cluster's non-test members.
# Every cluster gets an (initially empty) Counter, as the per-cluster loop used to do.
loc_counter = collections.defaultdict(
    collections.Counter,
    {cluster_id: collections.Counter() for cluster_id in clusters},
)

# One pass over the check-ins instead of re-filtering the full table per cluster:
# keep rows of non-test users whose id has a label, look up each row's cluster,
# then group the location ids by that cluster.
label_of_node = np.asarray(labels)
is_train_row = ~checkins['user'].isin(test_users) & checkins['user'].between(0, len(label_of_node) - 1)
train_checkins = checkins[is_train_row]
train_cluster_ids = label_of_node[train_checkins['user'].to_numpy()]
for cluster_id, locations in train_checkins['location_id'].groupby(train_cluster_ids):
    loc_counter[cluster_id].update(locations.to_numpy())

In [32]:
# Score the clustering on the held-out users: for each test user, take the TOP_K
# most frequent locations of the user's cluster and measure which fraction of
# those TOP_K the user actually checked in at.
TOP_K = 10

# Build every test user's set of visited locations in a single groupby pass;
# filtering the full check-in table once per test user is quadratic.
visited_by_user = (
    checkins[checkins['user'].isin(test_users)]
    .groupby('user')['location_id']
    .apply(set)
    .to_dict()
)

accuracies = []
for user in test_users:
    i = labels[user]
    recommended = {location_id for location_id, _ in loc_counter[i].most_common(TOP_K)}
    # Always divided by TOP_K, even when the cluster has fewer than TOP_K known locations.
    accuracy = len(recommended & visited_by_user.get(user, set())) / TOP_K
    accuracies.append(accuracy)
print(f'accuracy: {np.mean(accuracies)}')

accuracy: 0.03497992342889159
