In [1]:
import pandas as pd
import math
import numpy as np
from sklearn.cluster import DBSCAN

In [2]:
p1_list = []
delta = 60 * 60 * 24
duplicate_interval = 60 * 60


def read_txt(file_name):
    filtered = []
    file = open(file_name, 'r', encoding='utf-8')  #打开文件
    file_data = file.readlines()  #读取所有行
    for row in file_data:
        row_list = row.split('\t')
        if row_list[2] == '4bf58dd8d48988d16d941735':
            tmp_list = []
            for i, x in enumerate(row_list):
                if i != 3:
                    tmp_list.append(x)
                else:
                    _, latitude = x.split('?')
                    tmp_list.append('Cafe')
                    tmp_list.append(latitude)
            row_list = tmp_list
        row_list[-1] = row_list[-1].replace('\n', '')
        filtered.append(row_list)
    return filtered


def str2delta(t):
    t = int(t)
    abs_t = abs(t)
    d = pd.Timedelta(minutes=abs_t)
    if t > 0:
        return d
    else:
        return -d


# 这个方法已经要求了轨迹至少得有三个点
def trajectory_regulation(df):
    trajectory_id = 1
    traj_list = []
    retained_list = []
    duplicate_list = []
    df = df.sort_values(by=['local_time']).reset_index(drop=True)
    user_id = df.iloc[0, 0]
    last_date = None
    last_checkin_date = None
    last_checkin_id = None
    last_index = 0
    for i, x in df.iterrows():
        if i == 0:
            last_date = x['local_time']
            last_index = i
            last_checkin_date = x['local_time']
            last_checkin_id = x['business_id']
            continue
        current_date = x['local_time']
        current_checkin_id = x['business_id']
        if (current_date - last_date).total_seconds() < delta:  # 这个是用来取一条轨迹的始末
            if last_checkin_id == current_checkin_id:  # 有连续的两个相同POI的签到点，如果是短时间内的重复打卡那就只保留第一个点，重复这个过程
                if (current_date - last_checkin_date).total_seconds() < duplicate_interval:
                    duplicate_list.append(i)
                else:
                    # 如果是相同的签到点，由于时间间隔大于一小时，那就相当于这个是不同的签到点
                    last_checkin_date = current_date
                    last_checkin_id = current_checkin_id
            else:  # 如果不是相同的签到点，那肯定可以直接记录当前的签到点和时间
                last_checkin_date = current_date
                last_checkin_id = current_checkin_id
        else:
            # 当前这一项如果时间距离轨迹的第一项超过了24小时，那么应该是对上一项进行处理
            # +1是因为数量是减后加1，但还要-1，因为是要用上一项的索引，那么就是不加不减
            if i - last_index >= 3:
                for retain_row in range(last_index, i):  # 刚好range也不用加1了
                    if retain_row not in duplicate_list:
                        retained_list.append(retain_row)  # 索引是用来标记要留下哪些行
                        traj_list.append(trajectory_id)  # 轨迹序号是用来标记哪些行归属一条轨迹
                trajectory_id += 1

            last_date = x['local_time']  # 重新开始划分轨迹
            last_index = i  # 下一条轨迹的开始索引
    retained_df = df.iloc[retained_list, :].copy()
    traj_list = [user_id + '@' + str(x) for x in traj_list]
    retained_df['trajectory_id'] = traj_list
    p1_list.append(retained_df)


def retain_users_with_trajectories(df, min_len):
    trajectories = df['trajectory_id'].unique()
    if len(trajectories) >= min_len:
        return True
    else:
        return False


def check(df):
    n = df['trajectory_id'].nunique()
    if n < 5:
        return False
    else:
        return True


def haversine(pos1, pos2):
    lat1, lon1 = pos1
    lat2, lon2 = pos2
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(math.radians(dlat / 2)) ** 2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(
        math.radians(dlon / 2)) ** 2
    c = 2 * math.asin(math.sqrt(a))
    r = 6371
    h_dist = c * r

    return h_dist


def lon_lat_tuning(df):
    l = len(df)
    lon_lat = df['lat_lon'].unique()
    if len(lon_lat) != 1:
        most = df['lat_lon'].value_counts().index[0]
        df['latitude'] = [most.split(',')[0]] * l
        df['longitude'] = [most.split(',')[1]] * l
    return df


def convert_time(d):
    hour = d.hour
    minute = d.minute
    if minute < 30:
        minute = 30
    else:
        minute = 0
        hour += 1
    return (hour * 3600 + minute * 60) / 24 / 3600

In [3]:
data = read_txt('data/NYC/NYC.txt')
columns = ['user_id', 'business_id', 'cat_id', 'cat_name', 'latitude', 'longitude', 'zone_offset', 'utc_time']
nyc = pd.DataFrame(data, columns=columns)
nyc

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.719810375488535,-74.00258103213994,-240,Tue Apr 03 18:00:09 +0000 2012
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.60679958140643,-74.04416981025437,-240,Tue Apr 03 18:00:25 +0000 2012
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716161684843215,-73.88307005845945,-240,Tue Apr 03 18:02:24 +0000 2012
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.7451638,-73.982518775,-240,Tue Apr 03 18:02:41 +0000 2012
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.74010382743943,-73.98965835571289,-240,Tue Apr 03 18:03:00 +0000 2012
...,...,...,...,...,...,...,...,...
227423,688,3fd66200f964a52000e71ee3,4bf58dd8d48988d1e7931735,Music Venue,40.73359624463056,-74.00313913822174,-300,Sat Feb 16 02:29:11 +0000 2013
227424,560,4bca32ff0687ef3be789dbcc,4bf58dd8d48988d16c941735,Burger Joint,40.74571871633641,-73.99372049041526,-300,Sat Feb 16 02:31:35 +0000 2013
227425,945,50a77716e4b0b5a9492f6f56,4bf58dd8d48988d103941735,Home (private),40.85436449986449,-73.88307005845945,-300,Sat Feb 16 02:33:16 +0000 2013
227426,671,4514efe0f964a520e7391fe3,4bf58dd8d48988d11d941735,Bar,40.73598131890752,-74.0293093759129,-300,Sat Feb 16 02:34:31 +0000 2013


In [4]:
nyc['utc_time'] = pd.to_datetime(nyc['utc_time'])
zone_delta = nyc['zone_offset'].map(lambda x: str2delta(x))
nyc['local_time'] = nyc['utc_time'] + zone_delta
nyc.groupby('user_id').apply(lambda x: trajectory_regulation(x))

In [5]:
p1 = pd.concat(p1_list).reset_index(drop=True)
p1

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,1,4db44994cda1c57c82583709,4bf58dd8d48988d1f1931735,General Entertainment,40.73939794092962,-73.99320973426613,-240,2012-04-08 18:20:29+00:00,2012-04-08 14:20:29+00:00,1@1
1,1,4a541923f964a52008b31fe3,4bf58dd8d48988d14e941735,American Restaurant,40.785677,-73.976498,-240,2012-04-08 20:02:10+00:00,2012-04-08 16:02:10+00:00,1@1
2,1,40f1d480f964a5205b0a1fe3,4bf58dd8d48988d143941735,Breakfast Spot,40.71992902385343,-74.00853182642739,-240,2012-04-09 16:20:52+00:00,2012-04-09 12:20:52+00:00,1@1
3,1,3fd66200f964a52094e41ee3,4bf58dd8d48988d1cc941735,Steakhouse,40.7342762286207,-73.9935253702492,-240,2012-04-10 00:24:31+00:00,2012-04-09 20:24:31+00:00,1@2
4,1,42586c80f964a520db201fe3,4bf58dd8d48988d121941735,Bar,40.7759862279394,-73.97952785019343,-240,2012-04-10 03:36:56+00:00,2012-04-09 23:36:56+00:00,1@2
...,...,...,...,...,...,...,...,...,...,...
160638,999,4b777562f964a520b09b2ee3,4bf58dd8d48988d16e941735,Fast Food Restaurant,40.73087259189469,-74.00116934115263,-300,2013-01-11 06:59:57+00:00,2013-01-11 01:59:57+00:00,999@29
160639,999,423f6000f964a5205f201fe3,4bf58dd8d48988d10e941735,Greek Restaurant,40.730951,-74.004561,-300,2013-01-12 01:16:06+00:00,2013-01-11 20:16:06+00:00,999@29
160640,999,42717900f964a5206d211fe3,4bf58dd8d48988d1fa931735,Hotel,40.76150653058549,-73.9853129268425,-300,2013-01-31 02:15:13+00:00,2013-01-30 21:15:13+00:00,999@30
160641,999,4b1bf775f964a520d0ff23e3,4bf58dd8d48988d121941735,Bar,40.77143374050618,-73.98247003555298,-300,2013-01-31 02:39:26+00:00,2013-01-30 21:39:26+00:00,999@30


In [6]:
p2 = p1.groupby('user_id').filter(lambda x: retain_users_with_trajectories(x, 5)).reset_index(drop=True)
p2

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,1,4db44994cda1c57c82583709,4bf58dd8d48988d1f1931735,General Entertainment,40.73939794092962,-73.99320973426613,-240,2012-04-08 18:20:29+00:00,2012-04-08 14:20:29+00:00,1@1
1,1,4a541923f964a52008b31fe3,4bf58dd8d48988d14e941735,American Restaurant,40.785677,-73.976498,-240,2012-04-08 20:02:10+00:00,2012-04-08 16:02:10+00:00,1@1
2,1,40f1d480f964a5205b0a1fe3,4bf58dd8d48988d143941735,Breakfast Spot,40.71992902385343,-74.00853182642739,-240,2012-04-09 16:20:52+00:00,2012-04-09 12:20:52+00:00,1@1
3,1,3fd66200f964a52094e41ee3,4bf58dd8d48988d1cc941735,Steakhouse,40.7342762286207,-73.9935253702492,-240,2012-04-10 00:24:31+00:00,2012-04-09 20:24:31+00:00,1@2
4,1,42586c80f964a520db201fe3,4bf58dd8d48988d121941735,Bar,40.7759862279394,-73.97952785019343,-240,2012-04-10 03:36:56+00:00,2012-04-09 23:36:56+00:00,1@2
...,...,...,...,...,...,...,...,...,...,...
160598,999,4b777562f964a520b09b2ee3,4bf58dd8d48988d16e941735,Fast Food Restaurant,40.73087259189469,-74.00116934115263,-300,2013-01-11 06:59:57+00:00,2013-01-11 01:59:57+00:00,999@29
160599,999,423f6000f964a5205f201fe3,4bf58dd8d48988d10e941735,Greek Restaurant,40.730951,-74.004561,-300,2013-01-12 01:16:06+00:00,2013-01-11 20:16:06+00:00,999@29
160600,999,42717900f964a5206d211fe3,4bf58dd8d48988d1fa931735,Hotel,40.76150653058549,-73.9853129268425,-300,2013-01-31 02:15:13+00:00,2013-01-30 21:15:13+00:00,999@30
160601,999,4b1bf775f964a520d0ff23e3,4bf58dd8d48988d121941735,Bar,40.77143374050618,-73.98247003555298,-300,2013-01-31 02:39:26+00:00,2013-01-30 21:39:26+00:00,999@30


In [7]:
retain = []
for i, count in zip(p2['business_id'].value_counts().index, p2['business_id'].value_counts()):
    if count >= 5:
        retain.append(i)
    else:
        break
p3 = p2[p2['business_id'].isin(retain)]
p3

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,1,4db44994cda1c57c82583709,4bf58dd8d48988d1f1931735,General Entertainment,40.73939794092962,-73.99320973426613,-240,2012-04-08 18:20:29+00:00,2012-04-08 14:20:29+00:00,1@1
2,1,40f1d480f964a5205b0a1fe3,4bf58dd8d48988d143941735,Breakfast Spot,40.71992902385343,-74.00853182642739,-240,2012-04-09 16:20:52+00:00,2012-04-09 12:20:52+00:00,1@1
6,1,46ea2358f964a520cf4a1fe3,4bf58dd8d48988d11d941735,Bar,40.76066701799484,-73.99494767189026,-240,2012-04-14 01:11:20+00:00,2012-04-13 21:11:20+00:00,1@3
7,1,4d081fb700e6b1f7d4060cd7,4bf58dd8d48988d113941735,Korean Restaurant,40.7641035574988,-73.98672473457826,-240,2012-04-14 03:07:56+00:00,2012-04-13 23:07:56+00:00,1@3
8,1,40fb0f00f964a520d90a1fe3,4bf58dd8d48988d11b941735,Bar,40.76064468176889,-73.9860647444491,-240,2012-04-14 04:45:13+00:00,2012-04-14 00:45:13+00:00,1@3
...,...,...,...,...,...,...,...,...,...,...
160594,999,40bbc700f964a520a2001fe3,4bf58dd8d48988d113941735,Korean Restaurant,40.74790144316556,-73.98689116651354,-300,2013-01-09 17:43:19+00:00,2013-01-09 12:43:19+00:00,999@28
160595,999,4b3ff06cf964a52097b225e3,4bf58dd8d48988d1e0931735,Coffee Shop,40.777624339274105,-73.95491637454415,-300,2013-01-09 19:11:07+00:00,2013-01-09 14:11:07+00:00,999@28
160596,999,4b6daa1cf964a52072842ce3,4bf58dd8d48988d1e0931735,Coffee Shop,40.77707531682623,-73.95002010717978,-300,2013-01-10 15:29:53+00:00,2013-01-10 10:29:53+00:00,999@28
160601,999,4b1bf775f964a520d0ff23e3,4bf58dd8d48988d121941735,Bar,40.77143374050618,-73.98247003555298,-300,2013-01-31 02:39:26+00:00,2013-01-30 21:39:26+00:00,999@30


In [8]:
p3['business_id'].value_counts()

42911d00f964a520f5231fe3    754
42829c80f964a5206a221fe3    601
4a737bf8f964a52091dc1fe3    423
4840fe6bf964a52030501fe3    419
49b7ed6df964a52030531fe3    377
                           ... 
4e628a7ad22d509a39ad445b      5
4b9aac00f964a520abcb35e3      5
40fc6080f964a5201d0b1fe3      5
4ad74202f964a520220921e3      5
4db854784b22f2ddb6399a1b      5
Name: business_id, Length: 7275, dtype: int64

In [9]:
p4 = p3.groupby('trajectory_id').filter(lambda x: len(x) >= 3)
p4

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
6,1,46ea2358f964a520cf4a1fe3,4bf58dd8d48988d11d941735,Bar,40.76066701799484,-73.99494767189026,-240,2012-04-14 01:11:20+00:00,2012-04-13 21:11:20+00:00,1@3
7,1,4d081fb700e6b1f7d4060cd7,4bf58dd8d48988d113941735,Korean Restaurant,40.7641035574988,-73.98672473457826,-240,2012-04-14 03:07:56+00:00,2012-04-13 23:07:56+00:00,1@3
8,1,40fb0f00f964a520d90a1fe3,4bf58dd8d48988d11b941735,Bar,40.76064468176889,-73.9860647444491,-240,2012-04-14 04:45:13+00:00,2012-04-14 00:45:13+00:00,1@3
10,1,428d2880f964a520b5231fe3,4bf58dd8d48988d1fa931735,Hotel,40.75673068586622,-73.97406974625711,-240,2012-04-14 17:45:23+00:00,2012-04-14 13:45:23+00:00,1@3
13,1,4e742aabc65bb91db3cadb79,4bf58dd8d48988d116941735,Bar,40.75730501775622,-73.96864018539301,-240,2012-04-14 22:07:43+00:00,2012-04-14 18:07:43+00:00,1@3
...,...,...,...,...,...,...,...,...,...,...
160592,999,3fd66200f964a52067e91ee3,4bf58dd8d48988d116941735,Bar,40.737216,-73.989348,-300,2012-12-22 04:52:50+00:00,2012-12-21 23:52:50+00:00,999@27
160593,999,3fd66200f964a5203ee71ee3,4bf58dd8d48988d11e951735,Food & Drink Shop,40.73109392616922,-74.00284500572215,-300,2012-12-22 19:27:53+00:00,2012-12-22 14:27:53+00:00,999@27
160594,999,40bbc700f964a520a2001fe3,4bf58dd8d48988d113941735,Korean Restaurant,40.74790144316556,-73.98689116651354,-300,2013-01-09 17:43:19+00:00,2013-01-09 12:43:19+00:00,999@28
160595,999,4b3ff06cf964a52097b225e3,4bf58dd8d48988d1e0931735,Coffee Shop,40.777624339274105,-73.95491637454415,-300,2013-01-09 19:11:07+00:00,2013-01-09 14:11:07+00:00,999@28


In [10]:
p5 = p4.groupby('user_id').filter(lambda x: check(x)).reset_index(drop=True)
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,10,41181000f964a520060c1fe3,4bf58dd8d48988d162941735,Other Great Outdoors,40.74148072635098,-74.00943275021737,-240,2012-04-07 16:10:47+00:00,2012-04-07 12:10:47+00:00,10@2
1,10,41181000f964a520060c1fe3,4bf58dd8d48988d162941735,Other Great Outdoors,40.74148072635098,-74.00943275021737,-240,2012-04-07 17:51:43+00:00,2012-04-07 13:51:43+00:00,10@2
2,10,4a464a63f964a520a3a81fe3,4bf58dd8d48988d15d941735,Other Great Outdoors,40.85023766062983,-73.94696831703186,-240,2012-04-07 19:51:56+00:00,2012-04-07 15:51:56+00:00,10@2
3,10,4ae8906ef964a52071b021e3,4bf58dd8d48988d129951735,Train Station,40.71267092713338,-74.01193141937256,-240,2012-04-15 13:27:42+00:00,2012-04-15 09:27:42+00:00,10@6
4,10,49f76cc2f964a5209d6c1fe3,4bf58dd8d48988d129951735,Train Station,40.734200760779835,-74.16483829470191,-240,2012-04-15 14:31:45+00:00,2012-04-15 10:31:45+00:00,10@6
...,...,...,...,...,...,...,...,...,...,...
100864,999,3fd66200f964a52067e91ee3,4bf58dd8d48988d116941735,Bar,40.737216,-73.989348,-300,2012-12-22 04:52:50+00:00,2012-12-21 23:52:50+00:00,999@27
100865,999,3fd66200f964a5203ee71ee3,4bf58dd8d48988d11e951735,Food & Drink Shop,40.73109392616922,-74.00284500572215,-300,2012-12-22 19:27:53+00:00,2012-12-22 14:27:53+00:00,999@27
100866,999,40bbc700f964a520a2001fe3,4bf58dd8d48988d113941735,Korean Restaurant,40.74790144316556,-73.98689116651354,-300,2013-01-09 17:43:19+00:00,2013-01-09 12:43:19+00:00,999@28
100867,999,4b3ff06cf964a52097b225e3,4bf58dd8d48988d1e0931735,Coffee Shop,40.777624339274105,-73.95491637454415,-300,2013-01-09 19:11:07+00:00,2013-01-09 14:11:07+00:00,999@28


In [11]:
p5['lat_lon'] = p5['latitude'] + ',' + p5['longitude']
p5 = p5.groupby('business_id').apply(lambda x: lon_lat_tuning(x))
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,lat_lon
0,10,41181000f964a520060c1fe3,4bf58dd8d48988d162941735,Other Great Outdoors,40.74148072635098,-74.00943275021737,-240,2012-04-07 16:10:47+00:00,2012-04-07 12:10:47+00:00,10@2,"40.74148072635098,-74.00943275021737"
1,10,41181000f964a520060c1fe3,4bf58dd8d48988d162941735,Other Great Outdoors,40.74148072635098,-74.00943275021737,-240,2012-04-07 17:51:43+00:00,2012-04-07 13:51:43+00:00,10@2,"40.74148072635098,-74.00943275021737"
2,10,4a464a63f964a520a3a81fe3,4bf58dd8d48988d15d941735,Other Great Outdoors,40.85023766062983,-73.94696831703186,-240,2012-04-07 19:51:56+00:00,2012-04-07 15:51:56+00:00,10@2,"40.85023766062983,-73.94696831703186"
3,10,4ae8906ef964a52071b021e3,4bf58dd8d48988d129951735,Train Station,40.71267092713338,-74.01193141937256,-240,2012-04-15 13:27:42+00:00,2012-04-15 09:27:42+00:00,10@6,"40.71267092713338,-74.01193141937256"
4,10,49f76cc2f964a5209d6c1fe3,4bf58dd8d48988d129951735,Train Station,40.734200760779835,-74.16483829470191,-240,2012-04-15 14:31:45+00:00,2012-04-15 10:31:45+00:00,10@6,"40.734200760779835,-74.16483829470191"
...,...,...,...,...,...,...,...,...,...,...,...
100864,999,3fd66200f964a52067e91ee3,4bf58dd8d48988d116941735,Bar,40.737216,-73.989348,-300,2012-12-22 04:52:50+00:00,2012-12-21 23:52:50+00:00,999@27,"40.737216,-73.989348"
100865,999,3fd66200f964a5203ee71ee3,4bf58dd8d48988d11e951735,Food & Drink Shop,40.73109392616922,-74.00284500572215,-300,2012-12-22 19:27:53+00:00,2012-12-22 14:27:53+00:00,999@27,"40.73109392616922,-74.00284500572215"
100866,999,40bbc700f964a520a2001fe3,4bf58dd8d48988d113941735,Korean Restaurant,40.74790144316556,-73.98689116651354,-300,2013-01-09 17:43:19+00:00,2013-01-09 12:43:19+00:00,999@28,"40.74790144316556,-73.98689116651354"
100867,999,4b3ff06cf964a52097b225e3,4bf58dd8d48988d1e0931735,Coffee Shop,40.777624339274105,-73.95491637454415,-300,2013-01-09 19:11:07+00:00,2013-01-09 14:11:07+00:00,999@28,"40.777624339274105,-73.95491637454415"


In [12]:
all_poi = p5.drop_duplicates(subset=['business_id', 'latitude', 'longitude'])
all_business_ids = np.array(all_poi['business_id'])
all_lat_lon = np.array(
    [[float(x['latitude']), float(x['longitude'])] for _, x in all_poi[['latitude', 'longitude']].iterrows()])

In [13]:
eps = 0.05
min_samples = 6
clusters = DBSCAN(eps=eps, min_samples=min_samples, metric=haversine).fit(all_lat_lon)
np.unique(clusters.labels_)

array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
        12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
        25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
        38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
        51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
        77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115],
      dtype=int64)

In [15]:
individual_poi_map = {x: i for i, x in enumerate(all_business_ids[clusters.labels_ == -1])}  # 6014
individual_in_collective_poi_map = dict()
for cluster in np.unique(clusters.labels_):  # 给集合POI中的独立POI标POI编号
    if cluster == -1:
        continue
    for x in all_business_ids[clusters.labels_ == cluster]:
        individual_in_collective_poi_map[x] = len(individual_poi_map) + len(individual_in_collective_poi_map)

collective_poi_map = dict()
for cluster in np.unique(clusters.labels_):  # 给集合POI标POI编号
    if cluster == -1:
        continue
    for x in all_business_ids[clusters.labels_ == cluster]:
        collective_poi_map[x] = len(individual_poi_map) + len(individual_in_collective_poi_map) + cluster
actual_poi_reindex_map = dict(**individual_poi_map, **individual_in_collective_poi_map)
check_in_poi_reindex_map = dict(**individual_poi_map, **collective_poi_map)
len(individual_poi_map), len(individual_in_collective_poi_map), len(collective_poi_map)

(6014, 1160, 1160)

In [20]:
collective_poi_map

{'49e34b16f964a5206f621fe3': 7174,
 '4a523c9df964a5206cb11fe3': 7174,
 '4a5519d5f964a520adb31fe3': 7174,
 '4a3ffdc5f964a5202da41fe3': 7174,
 '3fd66200f964a52062e41ee3': 7174,
 '4e093bd38130709eafbd7030': 7174,
 '3fd66200f964a52061e41ee3': 7174,
 '49c27575f964a520f1551fe3': 7174,
 '3fd66200f964a52028e41ee3': 7174,
 '4a749103f964a520a2de1fe3': 7174,
 '3fd66200f964a5204ee41ee3': 7174,
 '43fe1ba4f964a520f82f1fe3': 7174,
 '4e6e440e3151d38bc2daf56c': 7174,
 '4a021207f964a52029711fe3': 7174,
 '4a65fe27f964a520d6c71fe3': 7174,
 '3fd66200f964a52015e91ee3': 7174,
 '4d9352219213b1f73b7fc546': 7174,
 '49be9addf964a520c4541fe3': 7174,
 '49c28a31f964a520fc551fe3': 7174,
 '3fd66200f964a52083e31ee3': 7174,
 '4a78718ff964a520a4e51fe3': 7174,
 '4d0781b830a58cfad6f0b0e7': 7174,
 '4f905671e4b05b1cb7740ec4': 7174,
 '444e0033f964a52090321fe3': 7175,
 '4a441b2bf964a52027a71fe3': 7175,
 '4a7c6d9bf964a52091ec1fe3': 7175,
 '4aa044d9f964a520ee3e20e3': 7175,
 '44d9e8dbf964a5208a361fe3': 7175,
 '40958b80f964a520e6

In [19]:
check_in_poi_reindex_map.values()

dict_values([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 21

In [16]:
import os
import pickle
with open(os.path.join('out/NYC', 'raw_i_p_reindex.pkl'), 'wb') as file:
        pickle.dump(individual_poi_map, file)
with open(os.path.join('out/NYC', 'raw_i_c_p_reindex.pkl'), 'wb') as file:
        pickle.dump(individual_in_collective_poi_map, file)
with open(os.path.join('out/NYC', 'raw_c_p_reindex.pkl'), 'wb') as file:
        pickle.dump(collective_poi_map, file)

In [17]:
len(actual_poi_reindex_map), len(check_in_poi_reindex_map)

(7174, 7174)

In [18]:
p5['actual_poi_id'] = p5['business_id'].map(actual_poi_reindex_map)
p5['check_in_poi_id'] = p5['business_id'].map(check_in_poi_reindex_map)
p5['poi_type'] = p5['actual_poi_id'].map(
    lambda x: 'individual' if x < len(individual_poi_map) else 'collective')
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,lat_lon,actual_poi_id,check_in_poi_id,poi_type
0,10,41181000f964a520060c1fe3,4bf58dd8d48988d162941735,Other Great Outdoors,40.74148072635098,-74.00943275021737,-240,2012-04-07 16:10:47+00:00,2012-04-07 12:10:47+00:00,10@2,"40.74148072635098,-74.00943275021737",0,0,individual
1,10,41181000f964a520060c1fe3,4bf58dd8d48988d162941735,Other Great Outdoors,40.74148072635098,-74.00943275021737,-240,2012-04-07 17:51:43+00:00,2012-04-07 13:51:43+00:00,10@2,"40.74148072635098,-74.00943275021737",0,0,individual
2,10,4a464a63f964a520a3a81fe3,4bf58dd8d48988d15d941735,Other Great Outdoors,40.85023766062983,-73.94696831703186,-240,2012-04-07 19:51:56+00:00,2012-04-07 15:51:56+00:00,10@2,"40.85023766062983,-73.94696831703186",1,1,individual
3,10,4ae8906ef964a52071b021e3,4bf58dd8d48988d129951735,Train Station,40.71267092713338,-74.01193141937256,-240,2012-04-15 13:27:42+00:00,2012-04-15 09:27:42+00:00,10@6,"40.71267092713338,-74.01193141937256",2,2,individual
4,10,49f76cc2f964a5209d6c1fe3,4bf58dd8d48988d129951735,Train Station,40.734200760779835,-74.16483829470191,-240,2012-04-15 14:31:45+00:00,2012-04-15 10:31:45+00:00,10@6,"40.734200760779835,-74.16483829470191",3,3,individual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100864,999,3fd66200f964a52067e91ee3,4bf58dd8d48988d116941735,Bar,40.737216,-73.989348,-300,2012-12-22 04:52:50+00:00,2012-12-21 23:52:50+00:00,999@27,"40.737216,-73.989348",3691,3691,individual
100865,999,3fd66200f964a5203ee71ee3,4bf58dd8d48988d11e951735,Food & Drink Shop,40.73109392616922,-74.00284500572215,-300,2012-12-22 19:27:53+00:00,2012-12-22 14:27:53+00:00,999@27,"40.73109392616922,-74.00284500572215",849,849,individual
100866,999,40bbc700f964a520a2001fe3,4bf58dd8d48988d113941735,Korean Restaurant,40.74790144316556,-73.98689116651354,-300,2013-01-09 17:43:19+00:00,2013-01-09 12:43:19+00:00,999@28,"40.74790144316556,-73.98689116651354",6390,7202,collective
100867,999,4b3ff06cf964a52097b225e3,4bf58dd8d48988d1e0931735,Coffee Shop,40.777624339274105,-73.95491637454415,-300,2013-01-09 19:11:07+00:00,2013-01-09 14:11:07+00:00,999@28,"40.777624339274105,-73.95491637454415",6723,7234,collective


In [19]:
p5[p5['poi_type']=='collective']

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,lat_lon,actual_poi_id,check_in_poi_id,poi_type
6,10,4c2bccc7d1a10f475a88f864,4bf58dd8d48988d16c941735,Burger Joint,40.75845677498381,-73.98914046612941,-240,2012-04-27 17:48:01+00:00,2012-04-27 13:48:01+00:00,10@13,"40.75845677498381,-73.98914046612941",7055,7272,collective
16,10,49e34b16f964a5206f621fe3,4bf58dd8d48988d1f2941735,Sporting Goods Shop,40.73753501383214,-73.99030208587646,-240,2012-07-04 19:51:07+00:00,2012-07-04 15:51:07+00:00,10@28,"40.73753501383214,-73.99030208587646",6019,7174,collective
17,10,4a523c9df964a5206cb11fe3,4bf58dd8d48988d1c5941735,Sandwich Place,40.737106477651764,-73.99032307563172,-240,2012-07-04 22:07:48+00:00,2012-07-04 18:07:48+00:00,10@28,"40.737106477651764,-73.99032307563172",6020,7174,collective
24,10,444e0033f964a52090321fe3,4bf58dd8d48988d10f941735,Indian Restaurant,40.723891,-73.996801,-240,2012-10-27 23:07:09+00:00,2012-10-27 19:07:09+00:00,10@33,"40.723891,-73.996801",6042,7175,collective
31,100,4b9fb437f964a520833637e3,4bf58dd8d48988d137941735,Theater,40.76274736599172,-73.98761987686157,-240,2012-04-07 17:52:02+00:00,2012-04-07 13:52:02+00:00,100@3,"40.76274736599172,-73.98761987686157",6064,7176,collective
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100846,999,4dd20d20e4cd7f7178c0d34d,4bf58dd8d48988d116941735,Bar,40.757078,-73.9671573,-240,2012-07-03 04:15:45+00:00,2012-07-03 00:15:45+00:00,999@15,"40.757078,-73.9671573",6533,7214,collective
100851,999,4ac52957f964a520b8b020e3,4bf58dd8d48988d103951735,Clothing Store,40.778152,-73.95438485,-240,2012-07-06 18:01:34+00:00,2012-07-06 14:01:34+00:00,999@17,"40.778152,-73.95438485",6719,7234,collective
100861,999,4c2bccc7d1a10f475a88f864,4bf58dd8d48988d16c941735,Burger Joint,40.75845677498381,-73.98914046612941,-300,2012-12-17 03:33:23+00:00,2012-12-16 22:33:23+00:00,999@26,"40.75845677498381,-73.98914046612941",7055,7272,collective
100866,999,40bbc700f964a520a2001fe3,4bf58dd8d48988d113941735,Korean Restaurant,40.74790144316556,-73.98689116651354,-300,2013-01-09 17:43:19+00:00,2013-01-09 12:43:19+00:00,999@28,"40.74790144316556,-73.98689116651354",6390,7202,collective


In [20]:
category_names = p5['cat_name'].unique()
category_reindex_map = {category_names[i]: i for i in range(len(category_names))}
p5['cat_code'] = p5['cat_name'].map(category_reindex_map)
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,lat_lon,actual_poi_id,check_in_poi_id,poi_type,cat_code
0,10,41181000f964a520060c1fe3,4bf58dd8d48988d162941735,Other Great Outdoors,40.74148072635098,-74.00943275021737,-240,2012-04-07 16:10:47+00:00,2012-04-07 12:10:47+00:00,10@2,"40.74148072635098,-74.00943275021737",0,0,individual,0
1,10,41181000f964a520060c1fe3,4bf58dd8d48988d162941735,Other Great Outdoors,40.74148072635098,-74.00943275021737,-240,2012-04-07 17:51:43+00:00,2012-04-07 13:51:43+00:00,10@2,"40.74148072635098,-74.00943275021737",0,0,individual,0
2,10,4a464a63f964a520a3a81fe3,4bf58dd8d48988d15d941735,Other Great Outdoors,40.85023766062983,-73.94696831703186,-240,2012-04-07 19:51:56+00:00,2012-04-07 15:51:56+00:00,10@2,"40.85023766062983,-73.94696831703186",1,1,individual,0
3,10,4ae8906ef964a52071b021e3,4bf58dd8d48988d129951735,Train Station,40.71267092713338,-74.01193141937256,-240,2012-04-15 13:27:42+00:00,2012-04-15 09:27:42+00:00,10@6,"40.71267092713338,-74.01193141937256",2,2,individual,1
4,10,49f76cc2f964a5209d6c1fe3,4bf58dd8d48988d129951735,Train Station,40.734200760779835,-74.16483829470191,-240,2012-04-15 14:31:45+00:00,2012-04-15 10:31:45+00:00,10@6,"40.734200760779835,-74.16483829470191",3,3,individual,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100864,999,3fd66200f964a52067e91ee3,4bf58dd8d48988d116941735,Bar,40.737216,-73.989348,-300,2012-12-22 04:52:50+00:00,2012-12-21 23:52:50+00:00,999@27,"40.737216,-73.989348",3691,3691,individual,3
100865,999,3fd66200f964a5203ee71ee3,4bf58dd8d48988d11e951735,Food & Drink Shop,40.73109392616922,-74.00284500572215,-300,2012-12-22 19:27:53+00:00,2012-12-22 14:27:53+00:00,999@27,"40.73109392616922,-74.00284500572215",849,849,individual,21
100866,999,40bbc700f964a520a2001fe3,4bf58dd8d48988d113941735,Korean Restaurant,40.74790144316556,-73.98689116651354,-300,2013-01-09 17:43:19+00:00,2013-01-09 12:43:19+00:00,999@28,"40.74790144316556,-73.98689116651354",6390,7202,collective,83
100867,999,4b3ff06cf964a52097b225e3,4bf58dd8d48988d1e0931735,Coffee Shop,40.777624339274105,-73.95491637454415,-300,2013-01-09 19:11:07+00:00,2013-01-09 14:11:07+00:00,999@28,"40.777624339274105,-73.95491637454415",6723,7234,collective,26


In [21]:
p5['cat_name'].unique()

array(['Other Great Outdoors', 'Train Station', 'Burger Joint', 'Bar',
       'Park', 'Bookstore', 'Ice Cream Shop', 'Sporting Goods Shop',
       'Home (private)', 'Sandwich Place', 'Mexican Restaurant',
       'Indian Restaurant', 'Diner', 'Hotel', 'Sculpture Garden',
       'Garden', 'Scenic Lookout', 'Bakery', 'Theater', 'Deli / Bodega',
       'Hardware Store', 'Food & Drink Shop', 'Neighborhood',
       'Flea Market', 'Outdoors & Recreation', 'Plaza', 'Coffee Shop',
       'Building', 'Movie Theater', 'Breakfast Spot', 'Museum',
       'Gym / Fitness Center', 'Stadium', 'Moving Target', 'Bridge',
       'Food Truck', 'Miscellaneous Shop', 'Tea Room',
       'Furniture / Home Store', 'Bus Station', 'Community College',
       'Cajun / Creole Restaurant', 'Seafood Restaurant', 'Event Space',
       'Drugstore / Pharmacy', 'City', 'Steakhouse', 'Office', 'Subway',
       'Paper / Office Supplies Store', 'Arcade', 'Clothing Store',
       'Mall', 'Airport', 'Electronics Store', 'Cosm

In [22]:
p5['cat_id'].unique()

array(['4bf58dd8d48988d162941735', '4bf58dd8d48988d15d941735',
       '4bf58dd8d48988d129951735', '4bf58dd8d48988d16c941735',
       '4bf58dd8d48988d116941735', '4bf58dd8d48988d163941735',
       '4bf58dd8d48988d114951735', '4bf58dd8d48988d1c9941735',
       '4bf58dd8d48988d1f2941735', '4bf58dd8d48988d103941735',
       '4bf58dd8d48988d1c5941735', '4bf58dd8d48988d123941735',
       '4bf58dd8d48988d1c1941735', '4bf58dd8d48988d1d4941735',
       '4bf58dd8d48988d10f941735', '4bf58dd8d48988d147941735',
       '4bf58dd8d48988d1fa931735', '4bf58dd8d48988d166941735',
       '4bf58dd8d48988d15a941735', '4bf58dd8d48988d165941735',
       '4bf58dd8d48988d16a941735', '4bf58dd8d48988d137941735',
       '4bf58dd8d48988d135941735', '4bf58dd8d48988d146941735',
       '4bf58dd8d48988d112951735', '4bf58dd8d48988d118951735',
       '4bf58dd8d48988d1e5941735', '4f2a25ac4b909258e854f55f',
       '4bf58dd8d48988d1f7941735', '4d4b7105d754a06377d81259',
       '4bf58dd8d48988d164941735', '4bf58dd8d48988d1e09

In [23]:
len(category_reindex_map)

222

In [24]:
p5['user_id'].nunique()

877

In [25]:
p5['norm_in_day_time']= p5['local_time'].map(lambda x: convert_time(x))
p5 = p5[['user_id', 'business_id',  'poi_type',  'actual_poi_id', 'check_in_poi_id', 'local_time', 'norm_in_day_time', 'cat_name','cat_code', 'trajectory_id','latitude','longitude']]
p5

Unnamed: 0,user_id,business_id,poi_type,actual_poi_id,check_in_poi_id,local_time,norm_in_day_time,cat_name,cat_code,trajectory_id,latitude,longitude
0,10,41181000f964a520060c1fe3,individual,0,0,2012-04-07 12:10:47+00:00,0.520833,Other Great Outdoors,0,10@2,40.74148072635098,-74.00943275021737
1,10,41181000f964a520060c1fe3,individual,0,0,2012-04-07 13:51:43+00:00,0.583333,Other Great Outdoors,0,10@2,40.74148072635098,-74.00943275021737
2,10,4a464a63f964a520a3a81fe3,individual,1,1,2012-04-07 15:51:56+00:00,0.666667,Other Great Outdoors,0,10@2,40.85023766062983,-73.94696831703186
3,10,4ae8906ef964a52071b021e3,individual,2,2,2012-04-15 09:27:42+00:00,0.395833,Train Station,1,10@6,40.71267092713338,-74.01193141937256
4,10,49f76cc2f964a5209d6c1fe3,individual,3,3,2012-04-15 10:31:45+00:00,0.458333,Train Station,1,10@6,40.734200760779835,-74.16483829470191
...,...,...,...,...,...,...,...,...,...,...,...,...
100864,999,3fd66200f964a52067e91ee3,individual,3691,3691,2012-12-21 23:52:50+00:00,1.000000,Bar,3,999@27,40.737216,-73.989348
100865,999,3fd66200f964a5203ee71ee3,individual,849,849,2012-12-22 14:27:53+00:00,0.604167,Food & Drink Shop,21,999@27,40.73109392616922,-74.00284500572215
100866,999,40bbc700f964a520a2001fe3,collective,6390,7202,2013-01-09 12:43:19+00:00,0.541667,Korean Restaurant,83,999@28,40.74790144316556,-73.98689116651354
100867,999,4b3ff06cf964a52097b225e3,collective,6723,7234,2013-01-09 14:11:07+00:00,0.604167,Coffee Shop,26,999@28,40.777624339274105,-73.95491637454415


In [26]:
p5.to_csv('out/NYC/checkins_v1.csv', index=False)