In [118]:
import pandas as pd
import numpy as np
from sklearn.cluster import MeanShift
from numpy.linalg import norm

In [45]:
# Load the pipe-delimited Foursquare check-in dump. The path is relative to the
# notebook's working directory and uses forward slashes, which resolve on both
# Windows and POSIX (the previous backslash form only worked on Windows).
# low_memory=False makes pandas parse the file in one pass instead of chunk by
# chunk, avoiding per-chunk dtype guessing.
df = pd.read_csv(
    "./201309_foursquare_dataset_umn/fsq/umn_foursquare_datasets/checkins.dat",
    sep="|",
    low_memory=False,
)

In [46]:
# Cells that contain exactly this run of spaces are treated as missing:
# they are turned into NaN, and every row holding any NaN is then discarded.
blank_field = '                   '
df = df.replace(blank_field, np.nan)
df = df.dropna()

In [47]:
# Preview the first five rows that survive the missing-value filter.
df.head()

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
2,984222,15824.0,5222.0,38.8951118,-77.0363658,2012-04-21 17:43:47
4,984234,44652.0,5222.0,33.800745,-84.41052,2012-04-21 17:43:43
8,984291,105054.0,5222.0,45.5234515,-122.6762071,2012-04-21 17:39:22
10,984318,2146539.0,5222.0,40.764462,-111.904565,2012-04-21 17:35:46
11,984232,93870.0,380645.0,33.4483771,-112.0740373,2012-04-21 17:38:18


In [48]:
# Summarise column dtypes and non-null counts. In the recorded output the
# latitude/longitude columns are still object (string) dtype and the column
# labels still carry padding whitespace.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 396634 entries, 2 to 1021965
Data columns (total 6 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0      id                  396634 non-null  object 
 1    user_id               396634 non-null  float64
 2    venue_id              396634 non-null  float64
 3        latitude          396634 non-null  object 
 4        longitude         396634 non-null  object 
 5        created_at        396634 non-null  object 
dtypes: float64(2), object(4)
memory usage: 21.2+ MB


In [67]:
# Show the column labels. NOTE(review): the execution counters show this cell
# ran (In [67]) after the label clean-up cell that follows it (In [65]), which
# is why the recorded output has no padding; on a fresh top-to-bottom run this
# cell would display the labels before they are cleaned.
df.columns

Index(['id', 'user_id', 'venue_id', 'latitude', 'longitude', 'created_at'], dtype='object')

In [65]:
# The raw header is padded with spaces (see the df.info() output), so strip the
# labels to make columns addressable as 'latitude', 'longitude', etc.
# The previous `x()` tried to *call* each label string and raised TypeError.
# Stripping is idempotent, so re-running this cell is harmless.
df.columns = df.columns.str.strip()

In [70]:
# dropna() left gaps in the row labels; rebuild a contiguous 0..n-1 index.
# The old labels are kept as a regular column (no drop=True), as before.
df = df.reset_index()

In [285]:
# .loc slices by label and includes the end label, so on the 0..n-1 index
# produced by reset_index this keeps the rows labelled 0 through 99999.
coordinate_columns = ['latitude', 'longitude']
X = df.loc[:99999, coordinate_columns]

In [286]:
# The coordinate columns are strings at this point: strip the padding from
# each column, then convert the whole table to a float32 NumPy array.
# NOTE: X changes from a DataFrame to an ndarray here, so this cell cannot be
# re-run on its own without first re-running the cell that selects X.
stripped = X.apply(lambda column: column.str.strip())
X = stripped.values.astype(np.float32)

In [287]:
# Cluster the coordinates with mean shift. The bandwidth is expressed in the
# same units as X (degrees of latitude/longitude here); n_jobs=-1 asks
# scikit-learn to use all available processors.
mean_shift = MeanShift(bandwidth=0.1, n_jobs=-1)
model = mean_shift.fit(X)

In [288]:
# cluster_centers_ holds one row per cluster found; labels_ holds the cluster
# index assigned to each row of X. The cell displays the shape of the centres.
centers, labels_per_point = model.cluster_centers_, model.labels_
centers.shape

(3230, 2)

In [289]:
# Number of points assigned to each cluster label (position i = label i).
np.bincount(labels_per_point)

array([12506,  6572,  4408, ...,     1,     1,     1], dtype=int64)

In [290]:
# Keep only the centres of clusters that received more than 15 points.
# minlength pads the counts to one entry per centre: without it np.bincount
# stops at the highest label actually present, so if the trailing clusters
# received no points the boolean mask would be shorter than `centers` and the
# indexing would fail.
cluster_sizes = np.bincount(labels_per_point, minlength=len(centers))
centers_ref = centers[cluster_sizes > 15]

In [291]:
def distance_between(X, coordinate):
    """Return the straight-line distance from every row of ``X`` to ``coordinate``.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_points, n_dims)
        One point per row.
    coordinate : numpy.ndarray, shape (n_dims,)
        Reference point; it is broadcast against each row of ``X``.

    Returns
    -------
    numpy.ndarray, shape (n_points,)
        Euclidean (L2) norm of ``X[i] - coordinate`` for each row ``i``.

    Notes
    -----
    The norm is taken directly on the raw coordinate values, so for
    latitude/longitude input the result is in degrees rather than a
    geodesic distance.
    """
    offsets = X - coordinate
    return norm(offsets, axis=1)

In [292]:
# Six fixed reference points, given in the same (latitude, longitude) column
# order as the cluster centres they are compared against.
reference_points = [
    (33.751277, -118.188740),
    (25.867736, -80.324116),
    (51.503016, -0.075479),
    (52.378894, 4.885084),
    (39.366487, 117.036146),
    (-33.868457, 151.205134),
]
# One array per reference point: its distance to every retained centre.
l = [distance_between(centers_ref, np.array(point)) for point in reference_points]

In [294]:
# Sanity check: the stacked distances should have one row per reference point
# and one column per retained cluster centre.
np.array(l).shape, centers_ref.shape[0]

((6, 591), 591)

In [295]:
# Rank every (reference point, centre) pair by distance. Flattening the
# (n_points, n_centres) matrix row by row gives flat index
# row * n_centres + column, so the modulo recovers the centre index.
# The 20 closest pairs are kept; a centre that is near more than one
# reference point can therefore appear more than once.
# Despite its name, `mask` is an integer index array, not a boolean mask.
distances = np.array(l)
flat_order = distances.flatten().argsort()
n_centers = centers_ref.shape[0]
mask = (flat_order % n_centers)[:20]

In [296]:
# Look up the coordinates of the selected centres, preserving the
# closest-first order of `mask`.
output_array = centers_ref[mask]

In [297]:
# Write the selected centres as plain text, one centre per line, to
# result.txt in the current working directory (overwrites any existing file).
np.savetxt("result.txt", output_array)

In [298]:
# Display the selected centres (rows ordered from closest pair outward).
output_array

array([[-3.38606339e+01,  1.51204803e+02],
       [ 5.23729591e+01,  4.89231730e+00],
       [ 2.58456764e+01, -8.03188858e+01],
       [ 5.15029106e+01, -1.25537127e-01],
       [ 3.38098640e+01, -1.18149147e+02],
       [ 2.57859211e+01, -8.02178726e+01],
       [ 2.57053204e+01, -8.02834854e+01],
       [ 2.60101414e+01, -8.02000427e+01],
       [ 3.38883133e+01, -1.18049004e+02],
       [ 3.38730698e+01, -1.18361679e+02],
       [ 3.39725647e+01, -1.18168373e+02],
       [ 2.61388397e+01, -8.03343277e+01],
       [ 3.39839401e+01, -1.18007408e+02],
       [ 2.61207809e+01, -8.01589050e+01],
       [ 3.38171501e+01, -1.17890518e+02],
       [ 3.40602341e+01, -1.18247810e+02],
       [ 3.36741905e+01, -1.17857864e+02],
       [ 2.62005730e+01, -8.02506561e+01],
       [ 3.40355949e+01, -1.18438438e+02],
       [ 3.41314240e+01, -1.18118027e+02]], dtype=float32)

In [276]:
# Index of the closest retained centre for each reference point (one per row).
# argmin along axis=1 already returns a column index below
# centers_ref.shape[0], so the modulo leaves the values unchanged.
# NOTE(review): this cell's execution count (In [276]) is lower than that of
# the cells defining centers_ref and l above, so the recorded output may come
# from an earlier state of the notebook — re-run to confirm.
np.array(l).argmin(axis=1) % centers_ref.shape[0]

array([ 49, 400,  58, 374, 528, 391], dtype=int64)

In [277]:
# Display the selected centre indices.
# NOTE(review): the recorded output is elided with "..." and was produced at
# In [277], before the cell that now truncates `mask` to 20 entries
# (In [295]); it appears to be stale — re-run to confirm.
mask

array([391, 374, 400, ..., 121, 456, 475], dtype=int64)