In [3]:
import pandas as pd
import numpy as np
import csv
from sklearn.cluster import MeanShift


def conv_dat_to_csv_data_with_coords(filename):
    """Convert ``<filename>.dat`` to ``<filename>.csv`` and load the result.

    The ``.dat`` file is read as pipe-delimited text. A line is kept only if
    it splits into exactly six fields and, after stripping whitespace, its
    fields at positions 3 and 4 (latitude and longitude in the check-ins
    data) are non-empty. Separator or footer lines that do not split into six
    fields are therefore dropped, while the header line is kept.

    Parameters
    ----------
    filename : str
        Path of the input file *without* its ``.dat`` extension. The CSV is
        written next to it with the same base name.

    Returns
    -------
    pandas.DataFrame
        The content of the written CSV; its first kept line is used as the
        header row.
    """
    with open(filename + '.dat') as input_file:
        stripped_rows = ([x.strip() for x in line.split('|')] for line in input_file)
        newLines = [row for row in stripped_rows
                    if len(row) == 6 and row[3] and row[4]]
    # newline='' lets csv.writer control line endings itself; without it every
    # row ends in '\r\r\n' on platforms that translate '\n' to '\r\n'.
    with open(filename + '.csv', 'w', newline='') as output_file:
        csv.writer(output_file).writerows(newLines)
    return pd.read_csv(filename + '.csv', header=0)

In [4]:
# Read checkins.dat, write the filtered rows to checkins.csv and load them.
df = conv_dat_to_csv_data_with_coords('checkins')

In [8]:
# Sanity check: frame dimensions, then the first rows via rich display.
print('shape:', df.shape)
df.head()

shape: (396634, 6)


Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,984222,15824,5222,38.895112,-77.036366,2012-04-21 17:43:47
1,984234,44652,5222,33.800745,-84.41052,2012-04-21 17:43:43
2,984291,105054,5222,45.523452,-122.676207,2012-04-21 17:39:22
3,984318,2146539,5222,40.764462,-111.904565,2012-04-21 17:35:46
4,984232,93870,380645,33.448377,-112.074037,2012-04-21 17:38:18


In [48]:
# Feature matrix for clustering: one row per check-in,
# column 0 = latitude, column 1 = longitude.
lat, lon = df.latitude, df.longitude
X = np.array([lat, lon]).T
# Only the first 100000 rows are used.
X = X[:100000]
print(X.shape)

(100000, 2)


In [50]:
# Fit mean-shift clustering on the (latitude, longitude) rows of X.
# The bandwidth is given in the coordinate units of X (presumably decimal
# degrees, judging by the displayed values).
clusterer = MeanShift(bandwidth=0.1)
clusterer.fit(X)

MeanShift(bandwidth=0.1, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [58]:
# Centers found by the fitted model: one row of coordinates per cluster.
clusters = clusterer.cluster_centers_
# Cluster index assigned to each row of X (one entry per sample, not per cluster).
labels = clusterer.labels_

In [60]:
# Drop very small clusters: keep only the centers of clusters that contain
# at least 16 points.
# `labels` holds one cluster index per sample, so the size of cluster i is
# the number of samples labelled i. It must be counted, not read off by
# zipping labels against the centers.
MIN_CLUSTER_SIZE = 16
assigned_labels = labels[labels >= 0]  # a label of -1 marks an unassigned sample
cluster_sizes = np.bincount(assigned_labels, minlength=len(clusters))
cluster_centers = [
    center
    for center, size in zip(clusters, cluster_sizes)
    if size >= MIN_CLUSTER_SIZE
]

In [75]:
# Reference points as [latitude, longitude]; we look for the cluster center
# that lies closest to any one of them.
needed_centers = [[33.751277, -118.188740],
                 [25.867736, -80.324116],
                 [51.503016, -0.075479],
                 [52.378894, 4.885084],
                 [39.366487, 117.036146],
                 [-33.868457, 151.205134]]

In [84]:
def eucludean_distance(a, b):
    """Return the straight-line (Euclidean) distance between two 2-D points.

    Only the first two components of ``a`` and ``b`` are used.
    """
    delta_first = a[0] - b[0]
    delta_second = a[1] - b[1]
    squared_sum = delta_first**2 + delta_second**2
    return np.sqrt(squared_sum)

In [110]:
# Scan every (reference point, cluster center) pair and remember the pair
# with the smallest distance; `coords` ends up holding that cluster center.
min_distance = 9999999999
coords = None
for reference_point in needed_centers:
    for candidate in cluster_centers:
        candidate_distance = eucludean_distance(reference_point, candidate)
        if candidate_distance < min_distance:
            min_distance, coords = candidate_distance, candidate
print('min distance:', min_distance, '\n' + 'coords:', coords)

min distance: 0.007834758163107856 
coords: [-33.86063043 151.20477593]
