In [1]:
%pylab inline

%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib


## Libraries

In [2]:
import numpy as np
from numpy.random import randn

import pandas as pd

#time
from datetime import datetime
from datetime import timedelta

#counting
from collections import Counter

In [3]:
# good old matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt

#high-level based on matplotlib
import seaborn as sns

#dynamic and interactive
import highcharts
from highcharts.charts import chart

## Init

In [4]:
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 4)})
np.random.seed(1234)

In [5]:
#loading highcharts javascript
highcharts.init()

## Reading data

### Checkins

In [6]:
dfc = pd.read_csv('../../datasets/loc-gowalla_totalCheckins.txt', sep='\t', header=False)
dfc.columns = ['uid','utc','lat','lon','vid']

dfc['utc'] = dfc['utc'].astype('datetime64[ms]')

dfc.head()

Unnamed: 0,uid,utc,lat,lon,vid
0,0,2010-10-18 22:17:43,30.269103,-97.749395,420315
1,0,2010-10-17 23:42:03,30.255731,-97.763386,316637
2,0,2010-10-17 19:26:05,30.263418,-97.757597,16516
3,0,2010-10-16 18:50:42,30.274292,-97.740523,5535878
4,0,2010-10-12 23:58:03,30.261599,-97.758581,15372


### Data munging: cleanup, time parsing, etc.

New York, New York!
 - bounding box: (40.4774, -74.2589), (40.9176, -73.7004)

In [7]:
ny = (dfc['lat']>=40.4774) & (dfc['lat']<=40.9176) & (dfc['lon']>=-74.2589) & (dfc['lon']<=-73.7004)
dfc = dfc[ny]

Time columns

In [8]:
dfc['year']     = dfc['utc'].apply(lambda x: x.date().year)
dfc['month']    = dfc['utc'].apply(lambda x: x.date().month)
dfc['day']      = dfc['utc'].apply(lambda x: x.date().day)

dfc['date']     = dfc['utc'].apply(lambda x: x.date())

def time_in_seconds(x):
    return ((x.hour)*60+x.minute)*60+x.second

dfc['time']     = dfc['utc'].apply(lambda x: time_in_seconds(x.time()))

dfc['isotime']  = dfc['utc'].apply(lambda x: x.isoformat() +'Z')

dfc = dfc[dfc['date']<=datetime(2010,10,19).date()]
dfc[['uid','utc','lat','lon','vid', 'time', 'isotime']].head()

Unnamed: 0,uid,utc,lat,lon,vid,time,isotime
9,0,2010-10-12 00:21:28,40.643885,-73.782806,23261,1288,2010-10-12T00:21:28Z
10,0,2010-10-11 20:21:20,40.741374,-73.988105,16907,73280,2010-10-11T20:21:20Z
11,0,2010-10-11 20:20:42,40.741388,-73.989455,12973,73242,2010-10-11T20:20:42Z
12,0,2010-10-11 00:06:30,40.72491,-73.994621,341255,390,2010-10-11T00:06:30Z
13,0,2010-10-10 22:00:37,40.729768,-73.998535,260957,79237,2010-10-10T22:00:37Z


### Venues

In [9]:
dfv = pd.read_csv('../../datasets/spots.txt', sep='\t', header=False)
dfv.columns = ['vid','name','loc']

coords = dfv['loc'].replace('[^0-9. -]+', '',regex=True)
coords = coords.apply(lambda x: x.split())

dfv['v_lat'] = coords.apply(lambda x: float(x[1]))
dfv['v_lon'] = coords.apply(lambda x: float(x[0]))

dfv = dfv.drop('loc', 1)

ny = (dfv['v_lat']>=40.4774) & (dfv['v_lat']<=40.9176) & (dfv['v_lon']>=-74.2589) & (dfv['v_lon']<=-73.7004)
dfv = dfv[ny]

dfv.head()

Unnamed: 0,vid,name,v_lat,v_lon
0,1391604,Conference House Park,40.501759,-74.252343
1,1391611,Almer G. Russell Pavilion,40.502265,-74.254264
2,3612422,Conference House,40.500064,-74.249042
3,3612431,Billop House,40.500064,-74.249042
4,1391499,Biddle House,40.505483,-74.254146


### Merge venue names and checkins

In [10]:
df = pd.merge(dfc, dfv[['vid', 'name']], how='left', on='vid')

Missing venue names on the available checkins:

In [11]:
print "missing venue names: {}% of available checkins".format(len(df[pd.isnull(df['name'])])*100 / len(df))

missing venue names: 85% of available checkins


## Data Exploration

In [12]:
d = df.groupby('date').size()

In [13]:
chart({
            'chart': {
                'type': 'line',
                'marginRight': 30,
                'marginBottom': 50
            },
            'title': {
                'text':'#checkins per day in New York City'
            },
            'yAxis': {
                'type': 'linear',
                'title': {'text':'#checkins'},
                
            },
            'xAxis': {
                'categories': [str(x) for x in d.index.tolist()]
            },
            'series': [{
                'name': 'date',
                'data': d.tolist()
            }]
})

## Clustering

In [14]:
from sklearn.cluster import KMeans

In [15]:
start = datetime(2010,1,1,0,0) 
dfw = df[df['utc']>start]

In [16]:
cl = min(200, len(dfw)/8)
ml = KMeans(n_clusters=cl)
ml.fit(dfw[['lat', 'lon']])

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=156, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [17]:
labels = Counter(ml.labels_)
clusters = [ [ ml.cluster_centers_[x][1], ml.cluster_centers_[x][0], labels[x] ] for x in labels]
clusters

[[-73.992904478200003, 40.729185043581822, 11],
 [-73.783935537399998, 40.6468933369, 1],
 [-73.968482260738099, 40.759754040447618, 21],
 [-73.870768733328561, 40.775952500000002, 7],
 [-74.166031770000004, 40.69031828, 1],
 [-73.985920483117638, 40.747867204070587, 17],
 [-73.962442556623074, 40.71589402404615, 13],
 [-73.988431985000005, 40.769535115125002, 16],
 [-73.985187961199998, 40.686405421499998, 2],
 [-74.003994063563638, 40.742144789600005, 11],
 [-73.947201073163882, 40.765613718413888, 36],
 [-74.011051162281817, 40.711933426390907, 11],
 [-74.001801404357892, 40.728337633484209, 19],
 [-73.920618295699995, 40.878599408500001, 1],
 [-73.98535919246153, 40.758894936869233, 13],
 [-73.997493437286366, 40.72270479547273, 22],
 [-73.983080955774994, 40.722185406674996, 4],
 [-73.97643165256666, 40.780602128491665, 12],
 [-73.952958806744448, 40.743572528244442, 9],
 [-73.950730601000004, 40.723575655174997, 4],
 [-73.993437756742864, 40.751245765242857, 14],
 [-73.9908133680

In [18]:
delta=0.15
chart({
            'chart': {
                'type': 'bubble',
                'zoomType': 'xy',
                'width':800,
                'height':800,
                'plotBackgroundImage':'https://maps.googleapis.com/maps/api/staticmap?center=40.8,-74.0&zoom=11&size=800x800&maptype=roadmap'
            },
            'plotOptions': {
              'bubble': {
                'maxSize':'5%',
                'minSize':'1%'
               }
             },
            'legend': { 
              'enabled': False
            },
            'yAxis': {
              'min':40.8-delta,
              'max':40.8+delta
            },
            'xAxis': {
              'min':-74.0-delta,
              'max':-74.0+delta
            },
            'series': [{'data': clusters, 'color':"#FF0000"}]
}, '800px', '800px')

### Write data to csv file

####checkins

In [19]:
cols = ['year','month','day','time', 'isotime','uid','lat','lon','vid']
dfc[cols].to_csv('../../datasets/checkins.csv', header=False, index=False)

####venues

In [20]:
dfv[['vid','name','v_lat','v_lon']].to_csv('../../datasets/venues.csv', header=False, index=False)