In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import MeanShift
from scipy.spatial import distance

In [2]:
# Load the check-in records from the local CSV file.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the file carries a saved index; index_col=0 would absorb it — confirm.
data = pd.read_csv('checkins.csv')

In [3]:
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,984222,15824,5222,38.895112,-77.036366,2012-04-21 17:43:47
1,984234,44652,5222,33.800745,-84.41052,2012-04-21 17:43:43
2,984291,105054,5222,45.523452,-122.676207,2012-04-21 17:39:22
3,984318,2146539,5222,40.764462,-111.904565,2012-04-21 17:35:46
4,984232,93870,380645,33.448377,-112.074037,2012-04-21 17:38:18


In [4]:
# (rows, columns) of the loaded frame.
data.shape

(396634, 6)

In [5]:
# Keep only the two coordinate columns and the first 100000 rows.
coordinate_columns = ['latitude', 'longitude']
data = data.loc[:, coordinate_columns].iloc[:100000]

# MeanShift

In [6]:
# Mean-shift clusterer with a kernel bandwidth of 0.1; n_jobs=-1 asks
# scikit-learn to use every available processor.
# NOTE(review): the bandwidth is presumably in the units of the fitted
# columns (degrees of latitude/longitude) — confirm.
model = MeanShift(bandwidth=0.1, n_jobs=-1)

In [7]:
# Cluster the selected coordinate rows; populates model.labels_ and
# model.cluster_centers_, which the later cells read.
model.fit(data)

MeanShift(bandwidth=0.1, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=-1, seeds=None)

In [8]:
# Office coordinates as (latitude, longitude) rows, one office per row.
# A plain ndarray is used instead of np.matrix: indexing a matrix row yields a
# 2-D (1, 2) object, which SciPy's vector distance functions (1-D input only)
# can reject, and NumPy no longer recommends the matrix class.
FS_offices = np.array([[33.751277, -118.188740],
                       [25.867736, -80.324116],
                       [51.503016, -0.075479],
                       [52.378894, 4.885084],
                       [39.366487, 117.036146],
                       [-33.868457, 151.205134]])

In [9]:
# Distinct cluster labels (np.unique returns them sorted) together with the
# number of points assigned to each label.
labels, count = np.unique(model.labels_, return_counts=True)

In [10]:
# Keep (label, size) pairs only for clusters holding more than 15 points.
tuples = [(cluster_label, size) for cluster_label, size in zip(labels, count) if size > 15]

In [11]:
# Build one (distance, cluster label) pair for every combination of a
# retained cluster and an office, in cluster-major / office-minor order.
# The offices are coerced to a plain 2-D ndarray first so that each row handed
# to distance.euclidean is a genuine 1-D vector: a row taken from an np.matrix
# stays 2-D, and SciPy's euclidean() only accepts 1-D input.
office_coords = np.asarray(FS_offices)
nearest_points = [
    (distance.euclidean(model.cluster_centers_[cluster_label], office), cluster_label)
    for cluster_label, _size in tuples
    for office in office_coords
]

In [12]:
# Order the (distance, label) pairs ascending: by distance, ties by label.
nearest_points.sort()

In [13]:
# Map each cluster label to its center rendered as "latitude longitude".
# The center is looked up by the label value itself, matching how the
# distance cell indexes model.cluster_centers_. Using the position within the
# unique-label array instead would pick the wrong center whenever the labels
# are not exactly 0..k-1 (for example when some center received no points).
cluster_2_coordinate = {}
for cluster_label in labels:
    center = model.cluster_centers_[cluster_label]
    cluster_2_coordinate[cluster_label] = ' '.join([str(center[0]), str(center[1])])

In [14]:
# Coordinate strings of the clusters behind the first 20 (distance, label)
# pairs; a cluster can appear more than once because pairs are per office.
closest_pairs = nearest_points[:20]
ans = [cluster_2_coordinate[cluster_label] for _dist, cluster_label in closest_pairs]

In [15]:
# Show the 20 selected cluster centers as "latitude longitude" strings.
ans

['-33.8606304286 151.204775929',
 '52.3729639903 4.89231722258',
 '25.8456722643 -80.3188905964',
 '51.5029912609 -0.12553728871',
 '33.8098779553 -118.148923807',
 '25.78581242 -80.2179380368',
 '25.7053497211 -80.2834287382',
 '26.0100982493 -80.1999905857',
 '33.8883253428 -118.048928172',
 '33.8729860116 -118.362091147',
 '33.9725748214 -118.168370667',
 '26.1388437868 -80.3343468368',
 '33.983935874 -118.007404973',
 '26.1208626586 -80.1589066802',
 '33.8173064339 -117.891249171',
 '34.0603975546 -118.248709027',
 '33.6743026598 -117.858789268',
 '26.200584641 -80.2507161256',
 '34.0354869531 -118.438997719',
 '34.1314601492 -118.118011806']

In [16]:
# Save the first entry of ans to ans.txt.
with open('ans.txt', 'w') as answer_file:
    answer_file.write(ans[0])

In [17]:
# Inspect the first 20 (distance, cluster label) pairs.
nearest_points[:20]

[(0.007834758163107856, 406),
 (0.009353316185992226, 373),
 (0.022674066158385495, 415),
 (0.05005829482278787, 58),
 (0.07084773242719973, 51),
 (0.13410903336184654, 29),
 (0.16740596425035326, 166),
 (0.18887596060185083, 92),
 (0.19577945647763628, 87),
 (0.21181053682436798, 42),
 (0.2222332907317907, 293),
 (0.2713007595066735, 315),
 (0.2949788868004569, 119),
 (0.3022701186924605, 55),
 (0.30473050307840693, 27),
 (0.3148837903362732, 11),
 (0.3388104702511318, 32),
 (0.3408456533220572, 160),
 (0.37868750125029754, 17),
 (0.3867062248427277, 47)]