## Affinity Propagation 
### Part II


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#### Utils

In [2]:
def loadCheckins(path='Gowalla_totalCheckins.txt'):
    """Load the check-in log and list the distinct users found in it.

    Only the first and the fifth tab-separated columns of the file are
    parsed; they are exposed as ``userId`` and ``placeId``. The remaining
    columns are skipped while reading, so they never occupy memory.

    Parameters
    ----------
    path : str, optional
        Location of the tab-separated, header-less check-in file. Defaults
        to ``'Gowalla_totalCheckins.txt'`` in the working directory.

    Returns
    -------
    checkins : pandas.DataFrame
        One row per check-in with the columns ``userId`` and ``placeId``.
    users : numpy.ndarray
        Sorted unique values of ``checkins['userId']``.
    """
    checkins = pd.read_csv(path, sep='\t', header=None, usecols=[0, 4])
    # With header=None the two surviving columns are labelled by their
    # position in the file (0 and 4) and arrive in file order.
    checkins.columns = ['userId', 'placeId']
    return checkins, np.unique(checkins['userId'].values)
    

In [3]:
def loadClusters(nodesN, path='Gowalla_exemplars.txt'):
    """Load the exemplar assigned to every user.

    The file is read as a single header-less column named ``exemplarId``.
    A ``userId`` column holding ``0 .. nodesN - 1`` is then attached by row
    position, so row ``i`` describes user ``i``.

    Parameters
    ----------
    nodesN : int
        Number of consecutive user ids to attach. The ids are aligned on
        the row index: rows beyond ``nodesN`` get ``NaN`` in ``userId`` and
        ids beyond the number of rows are dropped.
    path : str, optional
        Location of the exemplar file, one value per line. Defaults to
        ``'Gowalla_exemplars.txt'`` in the working directory.

    Returns
    -------
    pandas.DataFrame
        Columns ``exemplarId`` and ``userId``, indexed by row position.
    """
    # A file with one value per line already parses as a single column.
    # Do not pass sep='\n': current pandas raises ValueError when the line
    # terminator is used as the separator.
    clusters = pd.read_csv(path, header=None, names=['exemplarId'])
    clusters["userId"] = pd.Series(range(nodesN))
    return clusters

In [4]:
def loadData(df1, df2, key):
    """Inner-join two frames on a column that both of them contain.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Left and right operands of the join.
    key : str
        Name of the column used as the join key on both sides.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``key`` value occurs in both frames.
    """
    join_columns = [key]
    merged = df1.merge(
        df2,
        how='inner',
        left_on=join_columns,
        right_on=join_columns,
    )
    return merged

In [5]:
def getTop(df, n, groupby, cols):
    """Collect, for every group, the ``n`` most frequent values of ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the grouping column(s) and ``cols``.
    n : int
        Maximum number of values kept per group.
    groupby : str or list
        Column(s) passed to ``DataFrame.groupby``.
    cols : str
        Column whose values are counted inside each group.

    Returns
    -------
    pandas.Series
        Indexed by group key; each element is an array with up to ``n``
        values, most frequent first.
    """
    def most_frequent(group):
        # Count the occurrences, keep the n largest counts and return the
        # counted values themselves rather than the counts.
        frequencies = group[cols].value_counts()
        return frequencies.nlargest(n).index.values

    grouped = df.groupby(groupby)
    return grouped.apply(most_frequent)

In [6]:
def getAcc(users, df, top, userClusters=None):
    """Percentage of recommended places that the users actually visited.

    For every user the top places of the user's cluster are looked up in
    ``top``; each of those places counts as a hit when it occurs among the
    user's own check-ins in ``df``.

    Parameters
    ----------
    users : iterable
        User ids to evaluate.
    df : pandas.DataFrame
        Check-ins with at least the columns ``userId`` and ``placeId``.
    top : pandas.Series
        Maps an exemplar id to an array of that cluster's top places.
    userClusters : pandas.DataFrame, optional
        Frame with an ``exemplarId`` column whose index labels are the user
        ids. When omitted, the notebook-level ``clusters`` frame is used.

    Returns
    -------
    float
        ``100 * hits / recommended places``. ``0.0`` when no place was
        recommended at all (for example when ``users`` is empty).
    """
    if userClusters is None:
        # Fallback for existing three-argument calls in the notebook.
        userClusters = clusters
    exemplarOf = userClusters['exemplarId']

    # Group the check-ins once instead of filtering the frame per user.
    placesByUser = {
        userId: places.values
        for userId, places in df.groupby('userId')['placeId']
    }
    noPlaces = np.array([])

    positive = 0
    total = 0
    for u in users:
        top_places = top.loc[exemplarOf.loc[u]]
        u_places = placesByUser.get(u, noPlaces)

        hits = np.isin(top_places, u_places)
        total += top_places.shape[0]
        positive += int(np.count_nonzero(hits))

    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return positive / total * 100

#### Загрузка данных

In [7]:
# Read the check-in log once; `users` holds the distinct user ids.
checkins, users = loadCheckins()
checkins.head(5)

Unnamed: 0,userId,placeId
0,0,22847
1,0,420315
2,0,316637
3,0,16516
4,0,5535878


In [8]:
# Number of consecutive user ids attached to the exemplar rows.
# NOTE(review): presumably the node count of the Gowalla graph — confirm.
nodesN = 196591
clusters = loadClusters(nodesN)
clusters.head(5)

Unnamed: 0,exemplarId,userId
0,220,0
1,1149,1
2,220,2
3,1149,3
4,267,4


In [9]:
# Attach each user's exemplar to every one of the user's check-ins.
df = loadData(clusters, checkins, 'userId')
df.head(5)

Unnamed: 0,exemplarId,userId,placeId
0,220,0,22847
1,220,0,420315
2,220,0,316637
3,220,0,16516
4,220,0,5535878


#### Подсчет метрики

In [10]:
# Hold out 10% of the users; the fixed seed keeps the split reproducible.
u_train, u_test = train_test_split(
    users,
    test_size=0.1,
    random_state=42,
)

In [11]:
# Check-ins made by the held-out users only.
df_test = df.loc[np.isin(df['userId'], u_test)].reset_index(drop=True)
df_test.head(5)

Unnamed: 0,exemplarId,userId,placeId
0,527,39,19542
1,527,39,270037
2,527,39,201028
3,527,39,10570
4,527,39,352264


In [12]:
# Check-ins made by the training users only.
df_train = df.loc[np.isin(df['userId'], u_train)].reset_index(drop=True)
df_train.head(5)

Unnamed: 0,exemplarId,userId,placeId
0,220,0,22847
1,220,0,420315
2,220,0,316637
3,220,0,16516
4,220,0,5535878


In [13]:
# Ten most frequent places per cluster.
# NOTE(review): `top` is built from the full `df`, which still contains the
# check-ins of the held-out users, while `df_train` is not used anywhere in
# the visible cells. The score computed on `u_test` may therefore be
# optimistic — confirm whether `df_train` was intended here. Switching to it
# would also require handling exemplars that have no training rows in `top`.
top = getTop(df, 10, 'exemplarId', 'placeId')
top[:5]

exemplarId
0    [32308, 21714, 69249, 12359, 769117, 1317751, ...
1    [9961, 691005, 69516, 894754, 26995, 75386, 15...
2    [420315, 17208, 130876, 9371, 21714, 74955, 90...
3    [68552, 91717, 25281, 1247537, 703421, 26729, ...
4    [29667, 210176, 57155, 85915, 9073, 693413, 63...
dtype: object

In [14]:
# Percentage of recommended top places that the held-out users visited.
acc = getAcc(users=u_test, df=df_test, top=top)
acc

28.252508029660177

#### Сведения о кластеризации

Количество кластеров:

In [15]:
# Every distinct exemplar id stands for one cluster.
clustersN = len(np.unique(clusters['exemplarId'].values))
clustersN

47504

Количество одиночных кластеров:

In [16]:
# Number of rows (users) assigned to each exemplar, i.e. the cluster sizes.
clustersWidth = clusters['exemplarId'].value_counts()

In [17]:
# Count the clusters that consist of exactly one user.
int((clustersWidth == 1).sum())

22059