In [1]:
import pandas as pd
import math
import numpy as np
from sklearn.cluster import DBSCAN

In [2]:
p1_list = []
delta = 60 * 60 * 24
duplicate_interval = 60 * 60


def read_txt(file_name):
    filtered = []
    file = open(file_name, 'r', encoding='utf-8')  #打开文件
    file_data = file.readlines()  #读取所有行
    for row in file_data:
        row_list = row.split('\t')
        if row_list[2] == '4bf58dd8d48988d16d941735':
            tmp_list = []
            for i, x in enumerate(row_list):
                if i != 3:
                    tmp_list.append(x)
                else:
                    _, latitude = x.split('?')
                    tmp_list.append('Cafe')
                    tmp_list.append(latitude)
            row_list = tmp_list
        row_list[-1] = row_list[-1].replace('\n', '')
        filtered.append(row_list)
    return filtered


def str2delta(t):
    t = int(t)
    abs_t = abs(t)
    d = pd.Timedelta(minutes=abs_t)
    if t > 0:
        return d
    else:
        return -d


# 这个方法已经要求了轨迹至少得有三个点
def trajectory_regulation(df):
    trajectory_id = 1
    traj_list = []
    retained_list = []
    duplicate_list = []
    df = df.sort_values(by=['local_time']).reset_index(drop=True)
    user_id = df.iloc[0, 0]
    last_date = None
    last_checkin_date = None
    last_checkin_id = None
    last_index = 0
    for i, x in df.iterrows():
        if i == 0:
            last_date = x['local_time']
            last_index = i
            last_checkin_date = x['local_time']
            last_checkin_id = x['business_id']
            continue
        current_date = x['local_time']
        current_checkin_id = x['business_id']
        if (current_date - last_date).total_seconds() < delta:  # 这个是用来取一条轨迹的始末
            if last_checkin_id == current_checkin_id:  # 有连续的两个相同POI的签到点，如果是短时间内的重复打卡那就只保留第一个点，重复这个过程
                if (current_date - last_checkin_date).total_seconds() < duplicate_interval:
                    duplicate_list.append(i)
                else:
                    # 如果是相同的签到点，由于时间间隔大于一小时，那就相当于这个是不同的签到点
                    last_checkin_date = current_date
                    last_checkin_id = current_checkin_id
            else:  # 如果不是相同的签到点，那肯定可以直接记录当前的签到点和时间
                last_checkin_date = current_date
                last_checkin_id = current_checkin_id
        else:
            # 当前这一项如果时间距离轨迹的第一项超过了24小时，那么应该是对上一项进行处理
            # +1是因为数量是减后加1，但还要-1，因为是要用上一项的索引，那么就是不加不减
            if i - last_index >= 3:
                for retain_row in range(last_index, i):  # 刚好range也不用加1了
                    if retain_row not in duplicate_list:
                        retained_list.append(retain_row)  # 索引是用来标记要留下哪些行
                        traj_list.append(trajectory_id)  # 轨迹序号是用来标记哪些行归属一条轨迹
                trajectory_id += 1

            last_date = x['local_time']  # 重新开始划分轨迹
            last_index = i  # 下一条轨迹的开始索引
    retained_df = df.iloc[retained_list, :].copy()
    traj_list = [user_id + '@' + str(x) for x in traj_list]
    retained_df['trajectory_id'] = traj_list
    p1_list.append(retained_df)


def retain_users_with_trajectories(df, min_len):
    trajectories = df['trajectory_id'].unique()
    if len(trajectories) >= min_len:
        return True
    else:
        return False


def check(df):
    n = df['trajectory_id'].nunique()
    if n < 5:
        return False
    else:
        return True


def haversine(pos1, pos2):
    lat1, lon1 = pos1
    lat2, lon2 = pos2
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(math.radians(dlat / 2)) ** 2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(
        math.radians(dlon / 2)) ** 2
    c = 2 * math.asin(math.sqrt(a))
    r = 6371
    h_dist = c * r

    return h_dist


def lon_lat_tuning(df):
    l = len(df)
    lon_lat = df['lat_lon'].unique()
    if len(lon_lat) != 1:
        most = df['lat_lon'].value_counts().index[0]
        df['latitude'] = [most.split(',')[0]] * l
        df['longitude'] = [most.split(',')[1]] * l
    return df


def convert_time(d):
    hour = d.hour
    minute = d.minute
    if minute < 30:
        minute = 30
    else:
        minute = 0
        hour += 1
    return (hour * 3600 + minute * 60) / 24 / 3600

In [4]:
data = read_txt('data/TKY/TKY.txt')
columns = ['user_id', 'business_id', 'cat_id', 'cat_name', 'latitude', 'longitude', 'zone_offset', 'utc_time']
tky = pd.DataFrame(data, columns=columns)
tky

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time
0,1541,4f0fd5a8e4b03856eeb6c8cb,4bf58dd8d48988d10c951735,Cosmetics Shop,35.705101088587135,139.6195900440216,540,Tue Apr 03 18:17:18 +0000 2012
1,868,4b7b884ff964a5207d662fe3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,35.71558112039315,139.80031728744507,540,Tue Apr 03 18:22:04 +0000 2012
2,114,4c16fdda96040f477cc473a5,4d954b0ea243a5684a65b473,Convenience Store,35.71454217399564,139.4800649934587,540,Tue Apr 03 19:12:07 +0000 2012
3,868,4c178638c2dfc928651ea869,4bf58dd8d48988d118951735,Food & Drink Shop,35.72559198908874,139.77663259388527,540,Tue Apr 03 19:12:13 +0000 2012
4,1458,4f568309e4b071452e447afe,4f2a210c4b9023bd5841ed28,Housing Development,35.656083091901124,139.734045462721,540,Tue Apr 03 19:18:23 +0000 2012
...,...,...,...,...,...,...,...,...
573698,326,4bab3456f964a5204d993ae3,4bf58dd8d48988d1e9931735,Music Venue,35.65693905642321,139.7025178567526,540,Sat Feb 16 02:34:35 +0000 2013
573699,853,4b559c09f964a520efe827e3,4bf58dd8d48988d129951735,Train Station,35.858739601673754,139.6572858095169,540,Sat Feb 16 02:34:53 +0000 2013
573700,1502,5101e81ee4b020384100b0a5,4bf58dd8d48988d1dc931735,Tea Room,35.701748478737294,139.7712157996672,540,Sat Feb 16 02:34:55 +0000 2013
573701,408,4bbc5648afe1b7134743304b,4bf58dd8d48988d16e941735,Fast Food Restaurant,35.670464936192225,139.76834803819656,540,Sat Feb 16 02:35:17 +0000 2013


In [5]:
tky['utc_time'] = pd.to_datetime(tky['utc_time'])
zone_delta = tky['zone_offset'].map(lambda x: str2delta(x))
tky['local_time'] = tky['utc_time'] + zone_delta
tky.groupby('user_id').apply(lambda x: trajectory_regulation(x))

In [6]:
p1 = pd.concat(p1_list).reset_index(drop=True)
p1

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1
3,1,4bef4d2fb0b376b030d8dab3,4d954b0ea243a5684a65b473,Convenience Store,35.66833822,139.7667561,540,2012-04-08 01:18:00+00:00,2012-04-08 10:18:00+00:00,1@1
4,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1
...,...,...,...,...,...,...,...,...,...,...
443044,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20
443045,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21
443046,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.6895798775885,139.7000147227295,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21
443047,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21


In [7]:
p2 = p1.groupby('user_id').filter(lambda x: retain_users_with_trajectories(x, 5)).reset_index(drop=True)
p2

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1
3,1,4bef4d2fb0b376b030d8dab3,4d954b0ea243a5684a65b473,Convenience Store,35.66833822,139.7667561,540,2012-04-08 01:18:00+00:00,2012-04-08 10:18:00+00:00,1@1
4,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1
...,...,...,...,...,...,...,...,...,...,...
442954,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20
442955,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21
442956,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.6895798775885,139.7000147227295,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21
442957,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21


In [8]:
retain = []
for i, count in zip(p2['business_id'].value_counts().index, p2['business_id'].value_counts()):
    if count >= 5:
        retain.append(i)
    else:
        break
p3 = p2[p2['business_id'].isin(retain)]
p3

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1
4,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1
5,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,1@1
...,...,...,...,...,...,...,...,...,...,...
442954,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20
442955,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21
442956,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.6895798775885,139.7000147227295,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21
442957,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21


In [9]:
p3['business_id'].value_counts()

4b19f917f964a520abe623e3    10409
4b0587a6f964a5203d9e22e3     9594
4b243a7df964a520356424e3     5686
4b093eeff964a520e51423e3     5122
4b0587a6f964a5203e9e22e3     3667
                            ...  
5006318ce4b044d21405085d        5
4c2844f49fb5d13a0b5f9957        5
4bf217f570779521aca73e7c        5
4b8ce9b6f964a520a4e032e3        5
4d27fcdfb818a35db621918a        5
Name: business_id, Length: 11451, dtype: int64

In [10]:
p4 = p3.groupby('trajectory_id').filter(lambda x: len(x) >= 3)
p4

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1
4,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1
5,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,1@1
...,...,...,...,...,...,...,...,...,...,...
442954,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20
442955,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21
442956,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.6895798775885,139.7000147227295,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21
442957,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21


In [11]:
p5 = p4.groupby('user_id').filter(lambda x: check(x)).reset_index(drop=True)
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1
3,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1
4,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,1@1
...,...,...,...,...,...,...,...,...,...,...
351982,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20
351983,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21
351984,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.6895798775885,139.7000147227295,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21
351985,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21


In [12]:
p5['lat_lon'] = p5['latitude'] + ',' + p5['longitude']
p5 = p5.groupby('business_id').apply(lambda x: lon_lat_tuning(x))
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,lat_lon
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1,"35.74964694346383,139.80514526367188"
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1,"35.669687,139.767254"
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1,"35.668087628929534,139.76731538772583"
3,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1,"35.668087628929534,139.76731538772583"
4,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,1@1,"35.669687,139.767254"
...,...,...,...,...,...,...,...,...,...,...,...
351982,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20,"35.69137390901421,139.6993589401245"
351983,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21,"35.69080269423021,139.70027922656513"
351984,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.68896262757877,139.70000973651244,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21,"35.6895798775885,139.7000147227295"
351985,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21,"35.69080269423021,139.70027922656513"


In [13]:
all_poi = p5.drop_duplicates(subset=['business_id', 'latitude', 'longitude'])
all_business_ids = np.array(all_poi['business_id'])
all_lat_lon = np.array(
    [[float(x['latitude']), float(x['longitude'])] for _, x in all_poi[['latitude', 'longitude']].iterrows()])

In [21]:
eps = 0.05
min_samples = 9
clusters = DBSCAN(eps=eps, min_samples=min_samples, metric=haversine).fit(all_lat_lon)
np.unique(clusters.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
       67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79], dtype=int64)

In [22]:
individual_poi_map = {x: i for i, x in enumerate(all_business_ids[clusters.labels_ == -1])}  # 6014
individual_in_collective_poi_map = dict()
for cluster in np.unique(clusters.labels_):  # 给集合POI中的独立POI标POI编号
    if cluster == -1:
        continue
    for x in all_business_ids[clusters.labels_ == cluster]:
        individual_in_collective_poi_map[x] = len(individual_poi_map) + len(individual_in_collective_poi_map)

collective_poi_map = dict()
for cluster in np.unique(clusters.labels_):  # 给集合POI标POI编号
    if cluster == -1:
        continue
    for x in all_business_ids[clusters.labels_ == cluster]:
        collective_poi_map[x] = len(individual_poi_map) + len(individual_in_collective_poi_map) + cluster
actual_poi_reindex_map = dict(**individual_poi_map, **individual_in_collective_poi_map)
check_in_poi_reindex_map = dict(**individual_poi_map, **collective_poi_map)
len(individual_poi_map), len(individual_in_collective_poi_map), len(collective_poi_map)

(9483, 1938, 1938)

In [23]:
collective_poi_map

{'4b19f917f964a520abe623e3': 11421,
 '4b73cca1f964a5203fbc2de3': 11421,
 '4b9dd2aaf964a52020bf36e3': 11421,
 '4b55670ff964a52071e327e3': 11421,
 '4dd86e71e4cd37c893c5c0b2': 11421,
 '4b593c7ef964a520828228e3': 11421,
 '4b5982cbf964a520a58a28e3': 11421,
 '4b5dcff1f964a520196d29e3': 11421,
 '4bbac8b753649c742f7249fb': 11421,
 '4b0bc75ff964a520923323e3': 11421,
 '4b5ab7bff964a520eed128e3': 11421,
 '4bd4edd15631c9b6cf73a330': 11421,
 '4b713b79f964a5207e3d2de3': 11421,
 '4de74b34e4cdfedb8a9a1b0c': 11421,
 '4e9f55d99adfb559cc1964f3': 11421,
 '4f35f3bce4b0d26b9f73e176': 11421,
 '4b7782e8f964a520f49e2ee3': 11421,
 '4b62b1dbf964a5204f4f2ae3': 11421,
 '4b73785bf964a52022b02de3': 11421,
 '4e7eb53aa17c96ac9dc0d484': 11421,
 '4b5bf176f964a520471e29e3': 11421,
 '4b5b278df964a5200fe728e3': 11421,
 '4c81d7d8e602b1f7b536977a': 11421,
 '4b6a97d2f964a520a6d92be3': 11421,
 '4b5fccd2f964a52037cd29e3': 11421,
 '4c3bd1425810a5932d3fbb3c': 11421,
 '4c37fe3f2c8020a11d2d8a00': 11421,
 '5003c391e4b0a7c21378d0ad':

In [24]:
check_in_poi_reindex_map.values()

dict_values([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 21

In [25]:
import os
import pickle
with open(os.path.join('out/TKY', 'raw_i_p_reindex.pkl'), 'wb') as file:
        pickle.dump(individual_poi_map, file)
with open(os.path.join('out/TKY', 'raw_i_c_p_reindex.pkl'), 'wb') as file:
        pickle.dump(individual_in_collective_poi_map, file)
with open(os.path.join('out/TKY', 'raw_c_p_reindex.pkl'), 'wb') as file:
        pickle.dump(collective_poi_map, file)

In [26]:
len(actual_poi_reindex_map), len(check_in_poi_reindex_map)

(11421, 11421)

In [27]:
p5['actual_poi_id'] = p5['business_id'].map(actual_poi_reindex_map)
p5['check_in_poi_id'] = p5['business_id'].map(check_in_poi_reindex_map)
p5['poi_type'] = p5['actual_poi_id'].map(
    lambda x: 'individual' if x < len(individual_poi_map) else 'collective')
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,lat_lon,actual_poi_id,check_in_poi_id,poi_type
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1,"35.74964694346383,139.80514526367188",0,0,individual
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1,"35.669687,139.767254",1,1,individual
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1,"35.668087628929534,139.76731538772583",2,2,individual
3,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1,"35.668087628929534,139.76731538772583",2,2,individual
4,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,1@1,"35.669687,139.767254",1,1,individual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351982,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20,"35.69137390901421,139.6993589401245",10142,11423,collective
351983,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21,"35.69080269423021,139.70027922656513",10099,11423,collective
351984,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.68896262757877,139.70000973651244,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21,"35.6895798775885,139.7000147227295",45,45,individual
351985,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21,"35.69080269423021,139.70027922656513",10099,11423,collective


In [28]:
p5[p5['poi_type']=='collective']

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,lat_lon,actual_poi_id,check_in_poi_id,poi_type
5,1,4b19f917f964a520abe623e3,4bf58dd8d48988d129951735,Train Station,35.698596195264464,139.77301824864617,540,2012-04-08 06:55:32+00:00,2012-04-08 15:55:32+00:00,1@1,"35.698596195264464,139.77301824864617",9483,11421,collective
6,1,4b73cca1f964a5203fbc2de3,4bf58dd8d48988d111941735,Japanese Restaurant,35.700941,139.770555,540,2012-04-08 07:46:06+00:00,2012-04-08 16:46:06+00:00,1@1,"35.700941,139.770555",9484,11421,collective
7,1,4b9dd2aaf964a52020bf36e3,4bf58dd8d48988d122951735,Electronics Store,35.70128071770129,139.77129310369492,540,2012-04-08 08:58:02+00:00,2012-04-08 17:58:02+00:00,1@1,"35.70128071770129,139.77129310369492",9485,11421,collective
8,1,4b55670ff964a52071e327e3,4bf58dd8d48988d122951735,Electronics Store,35.69901541704127,139.7746217250824,540,2012-04-08 09:21:33+00:00,2012-04-08 18:21:33+00:00,1@1,"35.69901541704127,139.7746217250824",9486,11421,collective
9,1,4dd86e71e4cd37c893c5c0b2,4bf58dd8d48988d16d941735,Cafe,35.70093221409643,139.77065205574036,540,2012-04-08 10:05:55+00:00,2012-04-08 19:05:55+00:00,1@1,"35.70093221409643,139.77065205574036",9487,11421,collective
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351978,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-07 00:38:42+00:00,2013-02-07 09:38:42+00:00,999@20,"35.69080269423021,139.70027922656513",10099,11423,collective
351979,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-07 06:56:10+00:00,2013-02-07 15:56:10+00:00,999@20,"35.69080269423021,139.70027922656513",10099,11423,collective
351982,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20,"35.69137390901421,139.6993589401245",10142,11423,collective
351983,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21,"35.69080269423021,139.70027922656513",10099,11423,collective


In [29]:
category_names = p5['cat_name'].unique()
category_reindex_map = {category_names[i]: i for i in range(len(category_names))}
p5['cat_code'] = p5['cat_name'].map(category_reindex_map)
p5

Unnamed: 0,user_id,business_id,cat_id,cat_name,latitude,longitude,zone_offset,utc_time,local_time,trajectory_id,lat_lon,actual_poi_id,check_in_poi_id,poi_type,cat_code
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.74964694346383,139.80514526367188,540,2012-04-07 23:33:08+00:00,2012-04-08 08:33:08+00:00,1@1,"35.74964694346383,139.80514526367188",0,0,individual,0
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35+00:00,2012-04-08 09:03:35+00:00,1@1,"35.669687,139.767254",1,1,individual,1
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 00:10:48+00:00,2012-04-08 09:10:48+00:00,1@1,"35.668087628929534,139.76731538772583",2,2,individual,2
3,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668087628929534,139.76731538772583,540,2012-04-08 01:18:21+00:00,2012-04-08 10:18:21+00:00,1@1,"35.668087628929534,139.76731538772583",2,2,individual,2
4,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 06:41:07+00:00,2012-04-08 15:41:07+00:00,1@1,"35.669687,139.767254",1,1,individual,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351982,999,4b9779f5f964a5203e0535e3,4bf58dd8d48988d129951735,Train Station,35.69137390901421,139.6993589401245,540,2013-02-08 00:31:55+00:00,2013-02-08 09:31:55+00:00,999@20,"35.69137390901421,139.6993589401245",10142,11423,collective,0
351983,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-12 08:17:09+00:00,2013-02-12 17:17:09+00:00,999@21,"35.69080269423021,139.70027922656513",10099,11423,collective,0
351984,999,4b5ecbf5f964a520fd9829e3,4bf58dd8d48988d129951735,Train Station,35.68896262757877,139.70000973651244,540,2013-02-12 08:19:24+00:00,2013-02-12 17:19:24+00:00,999@21,"35.6895798775885,139.7000147227295",45,45,individual,0
351985,999,4b0587a6f964a5203d9e22e3,4bf58dd8d48988d129951735,Train Station,35.69080269423021,139.70027922656513,540,2013-02-13 00:35:26+00:00,2013-02-13 09:35:26+00:00,999@21,"35.69080269423021,139.70027922656513",10099,11423,collective,0


In [30]:
p5['cat_name'].unique()

array(['Train Station', 'Subway', 'Convention Center',
       'Japanese Restaurant', 'Electronics Store', 'Cafe',
       'Fast Food Restaurant', 'Convenience Store', 'Event Space',
       'Paper / Office Supplies Store', 'Chinese Restaurant', 'Office',
       'Bookstore', 'Hobby Shop', 'Bus Station', 'Spiritual Center',
       'Temple', 'Bar', 'Miscellaneous Shop', 'Toy / Game Store',
       'University', 'Concert Hall', 'Italian Restaurant', 'Stadium',
       'Burger Joint', 'Racetrack', 'Mall', 'Department Store',
       'General Entertainment', 'Park', 'Steakhouse', 'Soup Place',
       'Ramen /  Noodle House', 'Hardware Store', 'Bridge', 'Airport',
       'Food & Drink Shop', 'Light Rail', 'Building', 'Mobile Phone Shop',
       'Diner', 'Gym / Fitness Center', 'Restaurant', 'Arcade',
       'Coffee Shop', 'Shrine', 'Bowling Alley', 'Drugstore / Pharmacy',
       'Home (private)', 'Hotel', 'Plaza', 'Clothing Store',
       'Record Shop', 'Music Store', 'Government Building', 'Gift 

In [31]:
p5['cat_id'].unique()

array(['4bf58dd8d48988d129951735', '4bf58dd8d48988d1fd931735',
       '4bf58dd8d48988d1ff931735', '4bf58dd8d48988d111941735',
       '4bf58dd8d48988d122951735', '4bf58dd8d48988d16d941735',
       '4bf58dd8d48988d16e941735', '4d954b0ea243a5684a65b473',
       '4bf58dd8d48988d171941735', '4bf58dd8d48988d121951735',
       '4bf58dd8d48988d145941735', '4bf58dd8d48988d125941735',
       '4bf58dd8d48988d114951735', '4bf58dd8d48988d1fb941735',
       '4bf58dd8d48988d1fe931735', '4bf58dd8d48988d131941735',
       '4bf58dd8d48988d13a941735', '4bf58dd8d48988d120941735',
       '4bf58dd8d48988d1ff941735', '4bf58dd8d48988d1f3941735',
       '4bf58dd8d48988d1ae941735', '5032792091d4c4b30a586d5c',
       '4bf58dd8d48988d110941735', '4bf58dd8d48988d18c941735',
       '4bf58dd8d48988d16c941735', '4bf58dd8d48988d1f4931735',
       '4f4531504b9074f6e4fb0102', '4bf58dd8d48988d1fd941735',
       '4bf58dd8d48988d1f6941735', '4bf58dd8d48988d1f1931735',
       '4bf58dd8d48988d163941735', '4bf58dd8d48988d1cc9

In [32]:
len(category_reindex_map)

206

In [33]:
p5['user_id'].nunique()

2109

In [34]:
p5['norm_in_day_time']= p5['local_time'].map(lambda x: convert_time(x))
p5 = p5[['user_id', 'business_id',  'poi_type',  'actual_poi_id', 'check_in_poi_id', 'local_time', 'norm_in_day_time', 'cat_name','cat_code', 'trajectory_id','latitude','longitude']]
p5

Unnamed: 0,user_id,business_id,poi_type,actual_poi_id,check_in_poi_id,local_time,norm_in_day_time,cat_name,cat_code,trajectory_id,latitude,longitude
0,1,4b396b34f964a5204f5c25e3,individual,0,0,2012-04-08 08:33:08+00:00,0.375000,Train Station,0,1@1,35.74964694346383,139.80514526367188
1,1,4b305a74f964a5201ef924e3,individual,1,1,2012-04-08 09:03:35+00:00,0.395833,Subway,1,1@1,35.669687,139.767254
2,1,4b835f06f964a520330431e3,individual,2,2,2012-04-08 09:10:48+00:00,0.395833,Convention Center,2,1@1,35.668087628929534,139.76731538772583
3,1,4b835f06f964a520330431e3,individual,2,2,2012-04-08 10:18:21+00:00,0.437500,Convention Center,2,1@1,35.668087628929534,139.76731538772583
4,1,4b305a74f964a5201ef924e3,individual,1,1,2012-04-08 15:41:07+00:00,0.666667,Subway,1,1@1,35.669687,139.767254
...,...,...,...,...,...,...,...,...,...,...,...,...
351982,999,4b9779f5f964a5203e0535e3,collective,10142,11423,2013-02-08 09:31:55+00:00,0.416667,Train Station,0,999@20,35.69137390901421,139.6993589401245
351983,999,4b0587a6f964a5203d9e22e3,collective,10099,11423,2013-02-12 17:17:09+00:00,0.729167,Train Station,0,999@21,35.69080269423021,139.70027922656513
351984,999,4b5ecbf5f964a520fd9829e3,individual,45,45,2013-02-12 17:19:24+00:00,0.729167,Train Station,0,999@21,35.68896262757877,139.70000973651244
351985,999,4b0587a6f964a5203d9e22e3,collective,10099,11423,2013-02-13 09:35:26+00:00,0.416667,Train Station,0,999@21,35.69080269423021,139.70027922656513


In [35]:
p5.to_csv('out/TKY/checkins_v1.csv', index=False)