In [1]:
import pandas as pd
import math
import numpy as np
from sklearn.cluster import DBSCAN

In [2]:
p1_list = []
delta = 60 * 60 * 24
duplicate_interval = 60 * 60


def read_txt(file_name):
    filtered = []
    file = open(file_name, 'r', encoding='utf-8')  #打开文件
    file_data = file.readlines()  #读取所有行
    for row in file_data:
        row_list = row.split('\t')
        if row_list[2] == '4bf58dd8d48988d16d941735':
            tmp_list = []
            for i, x in enumerate(row_list):
                if i != 3:
                    tmp_list.append(x)
                else:
                    _, latitude = x.split('?')
                    tmp_list.append('Cafe')
                    tmp_list.append(latitude)
            row_list = tmp_list
        row_list[-1] = row_list[-1].replace('\n', '')
        filtered.append(row_list)
    return filtered


def str2delta(t):
    t = int(t)
    abs_t = abs(t)
    d = pd.Timedelta(minutes=abs_t)
    if t > 0:
        return d
    else:
        return -d


# 这个方法已经要求了轨迹至少得有三个点
def trajectory_regulation(df):
    trajectory_id = 1
    traj_list = []
    retained_list = []
    duplicate_list = []
    df = df.sort_values(by=['local_time']).reset_index(drop=True)
    user_id = df.iloc[0, 0]
    last_date = None
    last_checkin_date = None
    last_checkin_id = None
    last_index = 0
    for i, x in df.iterrows():
        if i == 0:
            last_date = x['local_time']
            last_index = i
            last_checkin_date = x['local_time']
            last_checkin_id = x['business_id']
            continue
        current_date = x['local_time']
        current_checkin_id = x['business_id']
        if (current_date - last_date).total_seconds() < delta:  # 这个是用来取一条轨迹的始末
            if last_checkin_id == current_checkin_id:  # 有连续的两个相同POI的签到点，如果是短时间内的重复打卡那就只保留第一个点，重复这个过程
                if (current_date - last_checkin_date).total_seconds() < duplicate_interval:
                    duplicate_list.append(i)
                else:
                    # 如果是相同的签到点，由于时间间隔大于一小时，那就相当于这个是不同的签到点
                    last_checkin_date = current_date
                    last_checkin_id = current_checkin_id
            else:  # 如果不是相同的签到点，那肯定可以直接记录当前的签到点和时间
                last_checkin_date = current_date
                last_checkin_id = current_checkin_id
        else:
            # 当前这一项如果时间距离轨迹的第一项超过了24小时，那么应该是对上一项进行处理
            # +1是因为数量是减后加1，但还要-1，因为是要用上一项的索引，那么就是不加不减
            if i - last_index >= 3:
                for retain_row in range(last_index, i):  # 刚好range也不用加1了
                    if retain_row not in duplicate_list:
                        retained_list.append(retain_row)  # 索引是用来标记要留下哪些行
                        traj_list.append(trajectory_id)  # 轨迹序号是用来标记哪些行归属一条轨迹
                trajectory_id += 1

            last_date = x['local_time']  # 重新开始划分轨迹
            last_index = i  # 下一条轨迹的开始索引
    retained_df = df.iloc[retained_list, :].copy()
    traj_list = [user_id + '@' + str(x) for x in traj_list]
    retained_df['trajectory_id'] = traj_list
    p1_list.append(retained_df)


def retain_users_with_trajectories(df, min_len):
    trajectories = df['trajectory_id'].unique()
    if len(trajectories) >= min_len:
        return True
    else:
        return False


def check(df):
    n = df['trajectory_id'].nunique()
    if n < 5:
        return False
    else:
        return True


def haversine(pos1, pos2):
    lat1, lon1 = pos1
    lat2, lon2 = pos2
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(math.radians(dlat / 2)) ** 2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(
        math.radians(dlon / 2)) ** 2
    c = 2 * math.asin(math.sqrt(a))
    r = 6371
    h_dist = c * r

    return h_dist


def lon_lat_tuning(df):
    l = len(df)
    lon_lat = df['lat_lon'].unique()
    if len(lon_lat) != 1:
        most = df['lat_lon'].value_counts().index[0]
        df['latitude'] = [most.split(',')[0]] * l
        df['longitude'] = [most.split(',')[1]] * l
    return df


def convert_time(d):
    hour = d.hour
    minute = d.minute
    if minute < 30:
        minute = 30
    else:
        minute = 0
        hour += 1
    return (hour * 3600 + minute * 60) / 24 / 3600

In [3]:
data = read_txt('data/TKY/TKY.txt')
columns = ['user_id', 'business_id', 'cat_id', 'cat_name', 'latitude', 'longitude', 'zone_offset', 'utc_time']
tky = pd.DataFrame(data, columns=columns)
tky

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time
0,1541,4f0fd5a8e4b03856eeb6c8cb,4bf58dd8d48988d10c951735,Cosmetics Shop,35.705101088587135,139.6195900440216,540,Tue Apr 03 18:17:18 +0000 2012
1,868,4b7b884ff964a5207d662fe3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,35.71558112039315,139.80031728744507,540,Tue Apr 03 18:22:04 +0000 2012
2,114,4c16fdda96040f477cc473a5,4d954b0ea243a5684a65b473,Convenience Store,35.71454217399564,139.4800649934587,540,Tue Apr 03 19:12:07 +0000 2012
3,868,4c178638c2dfc928651ea869,4bf58dd8d48988d118951735,Food & Drink Shop,35.72559198908874,139.77663259388527,540,Tue Apr 03 19:12:13 +0000 2012
4,1458,4f568309e4b071452e447afe,4f2a210c4b9023bd5841ed28,Housing Development,35.656083091901124,139.734045462721,540,Tue Apr 03 19:18:23 +0000 2012
...,...,...,...,...,...,...,...,...
573698,326,4bab3456f964a5204d993ae3,4bf58dd8d48988d1e9931735,Music Venue,35.65693905642321,139.7025178567526,540,Sat Feb 16 02:34:35 +0000 2013
573699,853,4b559c09f964a520efe827e3,4bf58dd8d48988d129951735,Train Station,35.858739601673754,139.6572858095169,540,Sat Feb 16 02:34:53 +0000 2013
573700,1502,5101e81ee4b020384100b0a5,4bf58dd8d48988d1dc931735,Tea Room,35.701748478737294,139.7712157996672,540,Sat Feb 16 02:34:55 +0000 2013
573701,408,4bbc5648afe1b7134743304b,4bf58dd8d48988d16e941735,Fast Food Restaurant,35.670464936192225,139.76834803819656,540,Sat Feb 16 02:35:17 +0000 2013


In [4]:
tky['utc_time'] = pd.to_datetime(tky['utc_time'])
zone_delta = tky['zone_offset'].map(lambda x: str2delta(x))
tky['local_time'] = tky['utc_time'] + zone_delta
tky.groupby('user_id').apply(lambda x: trajectory_regulation(x))

In [5]:
p1 = pd.concat(p1_list).reset_index(drop=True)
p1

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1
3,1,4bef4d2fb0b376b030d8dab3,4d954b0ea243a5684a65b473,Convenience Store,35.66833822,139.7667561,540,2012-04-08 01:18:00+00:00,2012-04-08 10:18:00+00:00,1@1
4,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1
...,...,...,...,...,...,...,...,...,...,...
443044,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20
443045,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21
443046,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.6895798775885,139.7000147227295,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21
443047,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21


In [6]:
p2 = p1.groupby('user_id').filter(lambda x: retain_users_with_trajectories(x, 5)).reset_index(drop=True)
p2

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1
3,1,4bef4d2fb0b376b030d8dab3,4d954b0ea243a5684a65b473,Convenience Store,35.66833822,139.7667561,540,2012-04-08 01:18:00+00:00,2012-04-08 10:18:00+00:00,1@1
4,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1
...,...,...,...,...,...,...,...,...,...,...
442954,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20
442955,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21
442956,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.6895798775885,139.7000147227295,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21
442957,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21


In [7]:
retain = []
for i, count in zip(p2['business_id'].value_counts().index, p2['business_id'].value_counts()):
    if count >= 5:
        retain.append(i)
    else:
        break
p3 = p2[p2['business_id'].isin(retain)]
p3

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1
4,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1
5,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,1@1
...,...,...,...,...,...,...,...,...,...,...
442954,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20
442955,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21
442956,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.6895798775885,139.7000147227295,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21
442957,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21


In [8]:
p3['business_id'].value_counts()

4b19f917f964a520abe623e3    10409
4b0587a6f964a5203d9e22e3     9594
4b243a7df964a520356424e3     5686
4b093eeff964a520e51423e3     5122
4b0587a6f964a5203e9e22e3     3667
                            ...  
4eca18de0aaf9c3ccf5cf881        5
4b5d26d1f964a520af5429e3        5
4b757532f964a520a60d2ee3        5
4b6426a8f964a5206ea12ae3        5
4f704414e4b02781d658807c        5
Name: business_id, Length: 11451, dtype: int64

In [9]:
p4 = p3.groupby('trajectory_id').filter(lambda x: len(x) >= 3)
p4

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1
4,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1
5,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,1@1
...,...,...,...,...,...,...,...,...,...,...
442954,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20
442955,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21
442956,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.6895798775885,139.7000147227295,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21
442957,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21


In [10]:
p5 = p4.groupby('user_id').filter(lambda x: check(x)).reset_index(drop=True)
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1
3,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1
4,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,1@1
...,...,...,...,...,...,...,...,...,...,...
351982,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20
351983,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21
351984,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.6895798775885,139.7000147227295,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21
351985,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21


In [11]:
category_ids = p5['cat_id'].unique()
category_reindex_map = {category_ids[i]: i for i in range(len(category_ids))}
p5['cat_code'] = p5['cat_id'].map(category_reindex_map)
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,cat_code
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1,0
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1,1
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1,2
3,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1,2
4,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,1@1,1
...,...,...,...,...,...,...,...,...,...,...,...
351982,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20,0
351983,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21,0
351984,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.6895798775885,139.7000147227295,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21,0
351985,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21,0


In [12]:
p5['lat_lon'] = p5['latitude'] + ',' + p5['longitude']
p5 = p5.groupby('business_id').apply(lambda x: lon_lat_tuning(x))
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,cat_code,lat_lon
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1,0,"35.74964694346383,139.80514526367188"
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1,1,"35.669687,139.767254"
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1,2,"35.668087628929534,139.76731538772583"
3,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1,2,"35.668087628929534,139.76731538772583"
4,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,1@1,1,"35.669687,139.767254"
...,...,...,...,...,...,...,...,...,...,...,...,...
351982,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20,0,"35.69137390901421,139.6993589401245"
351983,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21,0,"35.69080269423021,139.70027922656513"
351984,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.68896262757877,139.70000973651244,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21,0,"35.6895798775885,139.7000147227295"
351985,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21,0,"35.69080269423021,139.70027922656513"


In [13]:
all_poi = p5.drop_duplicates(subset=['business_id', 'latitude', 'longitude'])
all_business_ids = np.array(all_poi['business_id'])
all_lat_lon = np.array(
    [[float(x['latitude']), float(x['longitude'])] for _, x in all_poi[['latitude', 'longitude']].iterrows()])

In [14]:
eps = 0.05
min_samples = 5
clusters = DBSCAN(eps=eps, min_samples=min_samples, metric=haversine).fit(all_lat_lon)
np.unique(clusters.labels_)

array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
        12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
        25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
        38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
        51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
        77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
       129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
       142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
       155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
       168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 17

In [15]:
individual_poi_map = {x: i for i, x in enumerate(all_business_ids[clusters.labels_ == -1])}

In [16]:
individual_in_collective_poi_map = dict()
for cluster in np.unique(clusters.labels_):  # 给集合POI中的独立POI标POI编号
    if cluster == -1:
        continue
    for x in all_business_ids[clusters.labels_ == cluster]:
        individual_in_collective_poi_map[x] = len(individual_poi_map) + len(individual_in_collective_poi_map)

In [17]:
check_in_poi_map = dict()
for cluster in np.unique(clusters.labels_):  # 给集合POI中的独立POI标POI编号
    if cluster == -1:
        continue
    for x in all_business_ids[clusters.labels_ == cluster]:
        check_in_poi_map[x] = len(individual_poi_map) + len(individual_in_collective_poi_map) + cluster

In [18]:
actual_poi_reindex_map = dict(**individual_poi_map, **individual_in_collective_poi_map)
check_in_poi_reindex_map = dict(**individual_poi_map, **check_in_poi_map)

In [19]:
len(individual_poi_map), len(individual_in_collective_poi_map), len(check_in_poi_map)

(7311, 4110, 4110)

In [20]:
len(actual_poi_reindex_map), len(check_in_poi_reindex_map)

(11421, 11421)

In [21]:
p5['actual_poi_id'] = p5['business_id'].map(actual_poi_reindex_map)
p5['check_in_poi_id'] = p5['business_id'].map(check_in_poi_reindex_map)
p5['poi_type'] = p5['actual_poi_id'].map(
    lambda x: 'individual' if x < len(individual_poi_map) else 'collective')
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,cat_code,lat_lon,actual_poi_id,check_in_poi_id,poi_type
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1,0,"35.74964694346383,139.80514526367188",7311,11421,collective
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1,1,"35.669687,139.767254",9560,11486,collective
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1,2,"35.668087628929534,139.76731538772583",0,0,individual
3,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1,2,"35.668087628929534,139.76731538772583",0,0,individual
4,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,1@1,1,"35.669687,139.767254",9560,11486,collective
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351982,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20,0,"35.69137390901421,139.6993589401245",8295,11428,collective
351983,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21,0,"35.69080269423021,139.70027922656513",8024,11428,collective
351984,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.68896262757877,139.70000973651244,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21,0,"35.6895798775885,139.7000147227295",8029,11428,collective
351985,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21,0,"35.69080269423021,139.70027922656513",8024,11428,collective


In [22]:
p5['norm_in_day_time']= p5['local_time'].map(lambda x: convert_time(x))
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,cat_code,lat_lon,actual_poi_id,check_in_poi_id,poi_type,norm_in_day_time
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1,0,"35.74964694346383,139.80514526367188",7311,11421,collective,0.375000
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1,1,"35.669687,139.767254",9560,11486,collective,0.395833
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1,2,"35.668087628929534,139.76731538772583",0,0,individual,0.395833
3,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1,2,"35.668087628929534,139.76731538772583",0,0,individual,0.437500
4,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,1@1,1,"35.669687,139.767254",9560,11486,collective,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351982,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20,0,"35.69137390901421,139.6993589401245",8295,11428,collective,0.416667
351983,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21,0,"35.69080269423021,139.70027922656513",8024,11428,collective,0.729167
351984,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.68896262757877,139.70000973651244,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21,0,"35.6895798775885,139.7000147227295",8029,11428,collective,0.729167
351985,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21,0,"35.69080269423021,139.70027922656513",8024,11428,collective,0.416667


In [23]:
p5['user_id'].unique()

array(['1', '10', '100', ..., '997', '998', '999'], dtype=object)

In [24]:
p5['cat_code'].unique()

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [25]:
p5['user_id'] = p5['user_id'].map(lambda x:'user_'+str(x))
p5['actual_poi_id'] = p5['actual_poi_id'].map(lambda x: 'poi_' + str(x))
p5['check_in_poi_id'] = p5['check_in_poi_id'].map(lambda x: 'poi_' + str(x))
p5['trajectory_id'] = p5['trajectory_id'].map(lambda x: 'user_' + str(x))
p5['cat_code'] = p5['cat_code'].map(lambda x: 'cat_' + str(x))

In [26]:
p5[p5['poi_type']=='collective']

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,cat_code,lat_lon,actual_poi_id,check_in_poi_id,poi_type,norm_in_day_time
0,user_1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,user_1@1,cat_0,"35.74964694346383,139.80514526367188",poi_7311,poi_11421,collective,0.375000
1,user_1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,user_1@1,cat_1,"35.669687,139.767254",poi_9560,poi_11486,collective,0.395833
4,user_1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,user_1@1,cat_1,"35.669687,139.767254",poi_9560,poi_11486,collective,0.666667
5,user_1,4b19f917f964a520abe623e3,4bf58dd8d48988d129951735,Train Station,35.698596195264464,139.77301824864617,540,2012-04-08 06:55:32+00:00,2012-04-08 15:55:32+00:00,user_1@1,cat_0,"35.698596195264464,139.77301824864617",poi_7322,poi_11422,collective,0.666667
6,user_1,4b73cca1f964a5203fbc2de3,4bf58dd8d48988d111941735,Japanese Restaurant,35.700941,139.770555,540,2012-04-08 07:46:06+00:00,2012-04-08 16:46:06+00:00,user_1@1,cat_3,"35.700941,139.770555",poi_7323,poi_11422,collective,0.708333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351982,user_999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,user_999@20,cat_0,"35.69137390901421,139.6993589401245",poi_8295,poi_11428,collective,0.416667
351983,user_999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,user_999@21,cat_0,"35.69080269423021,139.70027922656513",poi_8024,poi_11428,collective,0.729167
351984,user_999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.68896262757877,139.70000973651244,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,user_999@21,cat_0,"35.6895798775885,139.7000147227295",poi_8029,poi_11428,collective,0.729167
351985,user_999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,user_999@21,cat_0,"35.69080269423021,139.70027922656513",poi_8024,poi_11428,collective,0.416667


In [27]:
p5[p5['poi_type']=='individual']

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,cat_code,lat_lon,actual_poi_id,check_in_poi_id,poi_type,norm_in_day_time
2,user_1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,user_1@1,cat_2,"35.668087628929534,139.76731538772583",poi_0,poi_0,individual,0.395833
3,user_1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,user_1@1,cat_2,"35.668087628929534,139.76731538772583",poi_0,poi_0,individual,0.437500
14,user_1,4f6d5c5be4b04cb293be0d42,4d954b0ea243a5684a65b473,Convenience Store,35.634079,139.791346,540,2012-05-05 21:23:08+00:00,2012-05-06 06:23:08+00:00,user_1@2,cat_7,"35.63379646661188,139.79151759062583",poi_1,poi_1,individual,0.270833
15,user_1,4b5653a9f964a520810b28e3,4bf58dd8d48988d129951735,Train Station,35.63440986098353,139.79161351919174,540,2012-05-05 21:23:21+00:00,2012-05-06 06:23:21+00:00,user_1@2,cat_0,"35.63440986098353,139.79161351919174",poi_2,poi_2,individual,0.270833
16,user_1,4b6a4f2df964a520b0d12be3,4bf58dd8d48988d1ff931735,Convention Center,35.62986667837494,139.79426622390747,540,2012-05-05 21:36:21+00:00,2012-05-06 06:36:21+00:00,user_1@2,cat_2,"35.62986667837494,139.79426622390747",poi_3,poi_3,individual,0.291667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351926,user_999,4b17d66cf964a520b7c823e3,4bf58dd8d48988d129951735,Train Station,35.66158508135192,139.66747283935547,540,2012-12-08 04:43:32+00:00,2012-12-08 13:43:32+00:00,user_999@9,cat_0,"35.66158508135192,139.66747283935547",poi_107,poi_107,individual,0.583333
351940,user_999,4b24e5a2f964a520636a24e3,4bf58dd8d48988d129951735,Train Station,35.68417610648677,139.70206260681152,540,2013-01-17 06:26:01+00:00,2013-01-17 15:26:01+00:00,user_999@12,cat_0,"35.68417610648677,139.70206260681152",poi_23,poi_23,individual,0.645833
351952,user_999,4b21c0def964a520104124e3,4bf58dd8d48988d129951735,Train Station,35.6513682132017,139.63663816452026,540,2013-01-22 06:43:11+00:00,2013-01-22 15:43:11+00:00,user_999@14,cat_0,"35.6513682132017,139.63663816452026",poi_685,poi_685,individual,0.666667
351954,user_999,4b89fd41f964a520685a32e3,4bf58dd8d48988d1ed931735,Airport,35.55059416176674,139.78411674499512,540,2013-01-27 10:20:14+00:00,2013-01-27 19:20:14+00:00,user_999@15,cat_172,"35.550227542635206,139.78566169738767",poi_853,poi_853,individual,0.812500


In [28]:
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,cat_code,lat_lon,actual_poi_id,check_in_poi_id,poi_type,norm_in_day_time
0,user_1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,user_1@1,cat_0,"35.74964694346383,139.80514526367188",poi_7311,poi_11421,collective,0.375000
1,user_1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,user_1@1,cat_1,"35.669687,139.767254",poi_9560,poi_11486,collective,0.395833
2,user_1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,user_1@1,cat_2,"35.668087628929534,139.76731538772583",poi_0,poi_0,individual,0.395833
3,user_1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,user_1@1,cat_2,"35.668087628929534,139.76731538772583",poi_0,poi_0,individual,0.437500
4,user_1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,user_1@1,cat_1,"35.669687,139.767254",poi_9560,poi_11486,collective,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351982,user_999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,user_999@20,cat_0,"35.69137390901421,139.6993589401245",poi_8295,poi_11428,collective,0.416667
351983,user_999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,user_999@21,cat_0,"35.69080269423021,139.70027922656513",poi_8024,poi_11428,collective,0.729167
351984,user_999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.68896262757877,139.70000973651244,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,user_999@21,cat_0,"35.6895798775885,139.7000147227295",poi_8029,poi_11428,collective,0.729167
351985,user_999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,user_999@21,cat_0,"35.69080269423021,139.70027922656513",poi_8024,poi_11428,collective,0.416667


In [29]:
p5 = p5[['user_id', 'poi_type',  'actual_poi_id', 'check_in_poi_id', 'local_time', 'norm_in_day_time', 'cat_name','cat_code', 'trajectory_id','latitude','longitude']]
p5

Unnamed: 0,user_id,poi_type,actual_poi_id,check_in_poi_id,local_time,norm_in_day_time,cat_name,cat_code,trajectory_id,latitude,longitude
0,user_1,collective,poi_7311,poi_11421,2012-04-08 08:33:08+00:00,0.375000,Train Station,cat_0,user_1@1,35.74964694346383,139.80514526367188
1,user_1,collective,poi_9560,poi_11486,2012-04-08 09:03:35+00:00,0.395833,Subway,cat_1,user_1@1,35.669687,139.767254
2,user_1,individual,poi_0,poi_0,2012-04-08 09:10:48+00:00,0.395833,Convention Center,cat_2,user_1@1,35.668087628929534,139.76731538772583
3,user_1,individual,poi_0,poi_0,2012-04-08 10:18:21+00:00,0.437500,Convention Center,cat_2,user_1@1,35.668087628929534,139.76731538772583
4,user_1,collective,poi_9560,poi_11486,2012-04-08 15:41:07+00:00,0.666667,Subway,cat_1,user_1@1,35.669687,139.767254
...,...,...,...,...,...,...,...,...,...,...,...
351982,user_999,collective,poi_8295,poi_11428,2013-02-08 09:31:55+00:00,0.416667,Train Station,cat_0,user_999@20,35.69137390901421,139.6993589401245
351983,user_999,collective,poi_8024,poi_11428,2013-02-12 17:17:09+00:00,0.729167,Train Station,cat_0,user_999@21,35.69080269423021,139.70027922656513
351984,user_999,collective,poi_8029,poi_11428,2013-02-12 17:19:24+00:00,0.729167,Train Station,cat_0,user_999@21,35.68896262757877,139.70000973651244
351985,user_999,collective,poi_8024,poi_11428,2013-02-13 09:35:26+00:00,0.416667,Train Station,cat_0,user_999@21,35.69080269423021,139.70027922656513


In [30]:
p5.to_csv('out/TKY/checkins_v1.csv', index=False)

In [37]:
# import collections
#
# cat_distribution = {}
#
#
# def cal_cat_distribution(df):
#     global cat_distribution
#     all_categories = df.drop_duplicates(subset=['actual_poi_id', 'cat_code'])  # 该集合POI中的各个POI的类别
#     counter = collections.Counter(all_categories['cat_code'])
#     cat_distribution[df['actual_poi_id'].iloc[0]] = counter
#
#
# collective = p5[p5['poi_type'] == 'collective']  # 所有的集合POI签到记录
# collective.groupby('check_in_poi_id', group_keys=False).apply(lambda x: cal_cat_distribution(x))

In [38]:
cat_distribution

{6884: Counter({8: 1,
          10: 1,
          89: 1,
          31: 1,
          118: 1,
          151: 1,
          107: 1,
          36: 1,
          170: 1,
          12: 1,
          15: 1,
          96: 1,
          53: 1,
          106: 1,
          163: 1,
          176: 1}),
 6900: Counter({40: 1,
          61: 1,
          33: 1,
          12: 1,
          16: 1,
          35: 1,
          81: 1,
          133: 1,
          7: 1,
          158: 1,
          31: 1,
          3: 1,
          8: 1,
          118: 1}),
 6914: Counter({82: 2,
          58: 4,
          172: 1,
          146: 2,
          24: 1,
          4: 1,
          27: 1,
          0: 1,
          131: 2,
          295: 4,
          205: 1,
          291: 1,
          157: 1,
          32: 1,
          54: 1,
          3: 1,
          128: 1}),
 6938: Counter({7: 1,
          143: 1,
          25: 1,
          107: 2,
          185: 1,
          151: 1,
          228: 1,
          264: 1,
          243: 1,
 

In [34]:
p5[p5['poi_type'] == 'collective']

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,cat_code,lat_lon,actual_poi_id,check_in_poi_id,poi_type
16,10,49e34b16f964a5206f621fe3,4bf58dd8d48988d1f2941735,Sporting Goods Shop,40.73753501383214,-73.99030208587646,-240,2012-07-04 19:51:07+00:00,2012-07-04 15:51:07+00:00,10@28,8,"40.73753501383214,-73.99030208587646",6884,7174,collective
17,10,4a523c9df964a5206cb11fe3,4bf58dd8d48988d1c5941735,Sandwich Place,40.737106477651764,-73.99032307563172,-240,2012-07-04 22:07:48+00:00,2012-07-04 18:07:48+00:00,10@28,10,"40.737106477651764,-73.99032307563172",6885,7174,collective
168,100,49b79f54f964a5202c531fe3,4bf58dd8d48988d164941735,Plaza,40.75868265304899,-73.97867843935826,-240,2012-05-05 01:59:46+00:00,2012-05-04 21:59:46+00:00,100@22,30,"40.758669195135035,-73.97859808949039",7110,7190,collective
183,100,41390580f964a520dc1a1fe3,4bf58dd8d48988d1fa941735,Food & Drink Shop,40.74193085348367,-74.00498547371578,-240,2012-05-09 15:44:07+00:00,2012-05-09 11:44:07+00:00,100@25,36,"40.7420803919755,-74.00510598192578",7098,7189,collective
215,1000,40afe980f964a5203bf31ee3,4bf58dd8d48988d180941735,Movie Theater,40.757031390907855,-73.98902607371383,-240,2012-05-03 14:45:21+00:00,2012-05-03 10:45:21+00:00,1000@6,40,"40.7569743825982,-73.98877355904663",6900,7175,collective
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100823,998,49b79f54f964a5202c531fe3,4bf58dd8d48988d164941735,Plaza,40.75868265304899,-73.97867843935826,-300,2013-02-01 20:31:19+00:00,2013-02-01 15:31:19+00:00,998@20,30,"40.758732719752736,-73.97859808949039",7110,7190,collective
100825,998,49b79f54f964a5202c531fe3,4bf58dd8d48988d164941735,Plaza,40.75868265304899,-73.97867843935826,-300,2013-02-07 15:04:15+00:00,2013-02-07 10:04:15+00:00,998@22,30,"40.758732719752736,-73.97859808949039",7110,7190,collective
100839,999,43c922c9f964a520972d1fe3,4bf58dd8d48988d1fd941735,Mall,40.76792357432163,-73.98194432258606,-240,2012-05-16 21:14:00+00:00,2012-05-16 17:14:00+00:00,999@12,63,"40.76792357432163,-73.98194432258606",6954,7178,collective
100844,999,49ff53e6f964a5200c701fe3,4bf58dd8d48988d176941735,Gym / Fitness Center,40.77920900596156,-73.9551287537907,-240,2012-07-02 23:34:50+00:00,2012-07-02 19:34:50+00:00,999@15,37,"40.77904848612385,-73.9548647146921",6964,7179,collective
