In [1]:
import pandas as pd
import math
import numpy as np
from sklearn.cluster import DBSCAN

In [2]:
pois = pd.read_csv('data/SIN/dataset_TIST2015_POIs.txt', sep='\t', header=None, names=['business_id', 'latitude', 'longitude', 'cat_name', 'country'])
pois

Unnamed: 0,business_id,latitude,longitude,cat_name,country
0,3fd66200f964a52000e71ee3,40.733596,-74.003139,Jazz Club,US
1,3fd66200f964a52000e81ee3,40.758102,-73.975734,Gym,US
2,3fd66200f964a52000ea1ee3,40.732456,-74.003755,Indian Restaurant,US
3,3fd66200f964a52000ec1ee3,42.345907,-71.087001,Indian Restaurant,US
4,3fd66200f964a52000ee1ee3,39.933178,-75.159262,Sandwich Place,US
...,...,...,...,...,...
3680121,5237865c498e89110c1d03e7,40.154444,26.410847,Comedy Club,TR
3680122,5237867411d2a1e910744c81,35.340099,33.309328,Home (private),CY
3680123,5237879111d216bab10e9e09,-1.404065,-48.453742,Home (private),BR
3680124,52378c24498ea9502baf2716,3.425155,-76.545010,Sandwich Place,CO


In [3]:
pois = pois[pois['country']=='SG']
cafe_pois = pois[(pois['cat_name'].str[:3]=='Caf') & (pois['cat_name']!='Cafeteria')]['business_id'].unique()
pois.loc[pois['business_id'].isin(cafe_pois), 'cat_name'] ='Cafe'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [4]:
checkins = pd.read_csv('data/SIN/dataset_TIST2015_Checkins.txt', sep='\t', header=None, names=['user_id', 'business_id', 'utc_time', 'zone_offset'])

In [8]:
sin = pd.merge(pois, checkins,how='inner', on='business_id')
sin

Unnamed: 0,business_id,latitude,longitude,cat_name,country,user_id,utc_time,zone_offset
0,4a5eb95bf964a52019bf1fe3,1.335005,103.964682,Office,SG,52438,Tue Jan 08 01:36:49 +0000 2013,480
1,4a5eb95bf964a52019bf1fe3,1.335005,103.964682,Office,SG,52438,Thu Jan 10 03:30:59 +0000 2013,480
2,4a5eb95bf964a52019bf1fe3,1.335005,103.964682,Office,SG,52438,Thu Jan 10 05:47:22 +0000 2013,480
3,4a5eb95bf964a52019bf1fe3,1.335005,103.964682,Office,SG,52438,Mon Jan 14 05:16:49 +0000 2013,480
4,4a5eb95bf964a52019bf1fe3,1.335005,103.964682,Office,SG,52438,Mon Jan 28 09:12:10 +0000 2013,480
...,...,...,...,...,...,...,...,...
356376,5232de3311d2849884484960,1.323134,103.873704,Housing Development,SG,133548,Fri Sep 13 09:43:21 +0000 2013,480
356377,5232de3311d2849884484960,1.323134,103.873704,Housing Development,SG,133548,Mon Sep 16 10:30:52 +0000 2013,480
356378,52348bac11d2f5222d094397,1.329494,103.889506,Bar,SG,156472,Sat Sep 14 16:15:43 +0000 2013,480
356379,52348ec311d2a8886f1f3ccd,1.329494,103.889506,Nightclub,SG,156472,Sat Sep 14 16:28:54 +0000 2013,480


In [9]:
p1_list = []
delta = 60 * 60 * 24
duplicate_interval = 60 * 60


def read_txt(file_name):
    filtered = []
    file = open(file_name, 'r', encoding='utf-8')  #打开文件
    file_data = file.readlines()  #读取所有行
    for row in file_data:
        row_list = row.split('\t')
        if row_list[2] == '4bf58dd8d48988d16d941735':
            tmp_list = []
            for i, x in enumerate(row_list):
                if i != 3:
                    tmp_list.append(x)
                else:
                    _, latitude = x.split('?')
                    tmp_list.append('Cafe')
                    tmp_list.append(latitude)
            row_list = tmp_list
        row_list[-1] = row_list[-1].replace('\n', '')
        filtered.append(row_list)
    return filtered


def str2delta(t):
    t = int(t)
    abs_t = abs(t)
    d = pd.Timedelta(minutes=abs_t)
    if t > 0:
        return d
    else:
        return -d


# 这个方法已经要求了轨迹至少得有三个点
def trajectory_regulation(df):
    trajectory_id = 1
    traj_list = []
    retained_list = []
    duplicate_list = []
    df = df.sort_values(by=['local_time']).reset_index(drop=True)
    user_id = str(df.iloc[0, 5])
    last_date = None
    last_checkin_date = None
    last_checkin_id = None
    last_index = 0
    for i, x in df.iterrows():
        if i == 0:
            last_date = x['local_time']
            last_index = i
            last_checkin_date = x['local_time']
            last_checkin_id = x['business_id']
            continue
        current_date = x['local_time']
        current_checkin_id = x['business_id']
        if (current_date - last_date).total_seconds() < delta:  # 这个是用来取一条轨迹的始末
            if last_checkin_id == current_checkin_id:  # 有连续的两个相同POI的签到点，如果是短时间内的重复打卡那就只保留第一个点，重复这个过程
                if (current_date - last_checkin_date).total_seconds() < duplicate_interval:
                    duplicate_list.append(i)
                else:
                    # 如果是相同的签到点，由于时间间隔大于一小时，那就相当于这个是不同的签到点
                    last_checkin_date = current_date
                    last_checkin_id = current_checkin_id
            else:  # 如果不是相同的签到点，那肯定可以直接记录当前的签到点和时间
                last_checkin_date = current_date
                last_checkin_id = current_checkin_id
        else:
            # 当前这一项如果时间距离轨迹的第一项超过了24小时，那么应该是对上一项进行处理
            # +1是因为数量是减后加1，但还要-1，因为是要用上一项的索引，那么就是不加不减
            if i - last_index >= 3:
                for retain_row in range(last_index, i):  # 刚好range也不用加1了
                    if retain_row not in duplicate_list:
                        retained_list.append(retain_row)  # 索引是用来标记要留下哪些行
                        traj_list.append(trajectory_id)  # 轨迹序号是用来标记哪些行归属一条轨迹
                trajectory_id += 1

            last_date = x['local_time']  # 重新开始划分轨迹
            last_index = i  # 下一条轨迹的开始索引
    retained_df = df.iloc[retained_list, :].copy()
    traj_list = [user_id + '@' + str(x) for x in traj_list]
    retained_df['trajectory_id'] = traj_list
    p1_list.append(retained_df)


def retain_users_with_trajectories(df, min_len):
    trajectories = df['trajectory_id'].unique()
    if len(trajectories) >= min_len:
        return True
    else:
        return False


def check(df):
    n = df['trajectory_id'].nunique()
    if n < 5:
        return False
    else:
        return True


def haversine(pos1, pos2):
    lat1, lon1 = pos1
    lat2, lon2 = pos2
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(math.radians(dlat / 2)) ** 2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(
        math.radians(dlon / 2)) ** 2
    c = 2 * math.asin(math.sqrt(a))
    r = 6371
    h_dist = c * r

    return h_dist


def lon_lat_tuning(df):
    l = len(df)
    lon_lat = df['lat_lon'].unique()
    if len(lon_lat) != 1:
        most = df['lat_lon'].value_counts().index[0]
        df['latitude'] = [most.split(',')[0]] * l
        df['longitude'] = [most.split(',')[1]] * l
    return df


def convert_time(d):
    hour = d.hour
    minute = d.minute
    if minute < 30:
        minute = 30
    else:
        minute = 0
        hour += 1
    return (hour * 3600 + minute * 60) / 24 / 3600

In [10]:
sin['utc_time'] = pd.to_datetime(sin['utc_time'])
zone_delta = sin['zone_offset'].map(lambda x: str2delta(x))
sin['local_time'] = sin['utc_time'] + zone_delta

In [11]:
sin.groupby('user_id').apply(lambda x: trajectory_regulation(x))

In [12]:
p1 = pd.concat(p1_list).reset_index(drop=True)
p1

Unnamed: 0,business_id,latitude,longitude,cat_name,country,user_id,utc_time,zone_offset,local_time,trajectory_id
0,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-04 09:10:49+00:00,480,2012-04-04 17:10:49+00:00,16@1
1,4c84c1f4d92ea093aec25f72,1.296405,103.850241,University,SG,16,2012-04-04 10:14:32+00:00,480,2012-04-04 18:14:32+00:00,16@1
2,4c775cda93faa093a2c5f0fb,1.301253,103.837023,Mall,SG,16,2012-04-04 13:13:21+00:00,480,2012-04-04 21:13:21+00:00,16@1
3,4b058814f964a52080b022e3,1.300642,103.844898,Multiplex,SG,16,2012-04-04 13:31:37+00:00,480,2012-04-04 21:31:37+00:00,16@1
4,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-05 09:07:23+00:00,480,2012-04-05 17:07:23+00:00,16@1
...,...,...,...,...,...,...,...,...,...,...
201908,4b932271f964a520073734e3,1.372859,103.949332,Train Station,SG,266723,2013-08-02 00:25:41+00:00,480,2013-08-02 08:25:41+00:00,266723@8
201909,4b05880bf964a520b7ad22e3,1.307516,103.827931,Hotel,SG,266872,2013-07-22 16:17:55+00:00,480,2013-07-23 00:17:55+00:00,266872@1
201910,4b05880bf964a520b7ad22e3,1.307516,103.827931,Hotel,SG,266872,2013-07-23 07:48:26+00:00,480,2013-07-23 15:48:26+00:00,266872@1
201911,4b08e700f964a520421323e3,1.304783,103.831938,Mall,SG,266872,2013-07-23 11:00:04+00:00,480,2013-07-23 19:00:04+00:00,266872@1


In [13]:
p2 = p1.groupby('user_id').filter(lambda x: retain_users_with_trajectories(x, 5)).reset_index(drop=True)
p2

Unnamed: 0,business_id,latitude,longitude,cat_name,country,user_id,utc_time,zone_offset,local_time,trajectory_id
0,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-04 09:10:49+00:00,480,2012-04-04 17:10:49+00:00,16@1
1,4c84c1f4d92ea093aec25f72,1.296405,103.850241,University,SG,16,2012-04-04 10:14:32+00:00,480,2012-04-04 18:14:32+00:00,16@1
2,4c775cda93faa093a2c5f0fb,1.301253,103.837023,Mall,SG,16,2012-04-04 13:13:21+00:00,480,2012-04-04 21:13:21+00:00,16@1
3,4b058814f964a52080b022e3,1.300642,103.844898,Multiplex,SG,16,2012-04-04 13:31:37+00:00,480,2012-04-04 21:31:37+00:00,16@1
4,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-05 09:07:23+00:00,480,2012-04-05 17:07:23+00:00,16@1
...,...,...,...,...,...,...,...,...,...,...
182056,4b058815f964a520a9b022e3,1.310145,103.855216,Mall,SG,266723,2013-04-09 03:13:21+00:00,480,2013-04-09 11:13:21+00:00,266723@7
182057,4b127444f964a520fd8923e3,1.370918,103.892459,Bus Station,SG,266723,2013-04-09 08:05:25+00:00,480,2013-04-09 16:05:25+00:00,266723@7
182058,4ba0ddaaf964a520298337e3,1.368064,103.955007,Home (private),SG,266723,2013-08-01 23:46:35+00:00,480,2013-08-02 07:46:35+00:00,266723@8
182059,4b87b5a6f964a5202ac831e3,1.373546,103.949536,Bus Station,SG,266723,2013-08-02 00:20:48+00:00,480,2013-08-02 08:20:48+00:00,266723@8


In [14]:
retain = []
for i, count in zip(p2['business_id'].value_counts().index, p2['business_id'].value_counts()):
    if count >= 5:
        retain.append(i)
    else:
        break
p3 = p2[p2['business_id'].isin(retain)]
p3

Unnamed: 0,business_id,latitude,longitude,cat_name,country,user_id,utc_time,zone_offset,local_time,trajectory_id
0,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-04 09:10:49+00:00,480,2012-04-04 17:10:49+00:00,16@1
1,4c84c1f4d92ea093aec25f72,1.296405,103.850241,University,SG,16,2012-04-04 10:14:32+00:00,480,2012-04-04 18:14:32+00:00,16@1
2,4c775cda93faa093a2c5f0fb,1.301253,103.837023,Mall,SG,16,2012-04-04 13:13:21+00:00,480,2012-04-04 21:13:21+00:00,16@1
3,4b058814f964a52080b022e3,1.300642,103.844898,Multiplex,SG,16,2012-04-04 13:31:37+00:00,480,2012-04-04 21:31:37+00:00,16@1
4,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-05 09:07:23+00:00,480,2012-04-05 17:07:23+00:00,16@1
...,...,...,...,...,...,...,...,...,...,...
182056,4b058815f964a520a9b022e3,1.310145,103.855216,Mall,SG,266723,2013-04-09 03:13:21+00:00,480,2013-04-09 11:13:21+00:00,266723@7
182057,4b127444f964a520fd8923e3,1.370918,103.892459,Bus Station,SG,266723,2013-04-09 08:05:25+00:00,480,2013-04-09 16:05:25+00:00,266723@7
182058,4ba0ddaaf964a520298337e3,1.368064,103.955007,Home (private),SG,266723,2013-08-01 23:46:35+00:00,480,2013-08-02 07:46:35+00:00,266723@8
182059,4b87b5a6f964a5202ac831e3,1.373546,103.949536,Bus Station,SG,266723,2013-08-02 00:20:48+00:00,480,2013-08-02 08:20:48+00:00,266723@8


In [15]:
p3['business_id'].value_counts()

4c775cda93faa093a2c5f0fb    1881
4b5da988f964a520ae6529e3    1575
4b08e700f964a520421323e3    1402
4b44cd7af964a520a3fc25e3    1330
4b058818f964a520a7b122e3    1309
                            ... 
4bd3da2641b9ef3b273001e6       5
4c0701c52e80a593e31175f9       5
4b2b6a7af964a52042b624e3       5
4d16f5ab401db60c0ddde8a4       5
50713a80e4b04113b2c81b26       5
Name: business_id, Length: 6074, dtype: int64

In [16]:
p4 = p3.groupby('trajectory_id').filter(lambda x: len(x) >= 3)
p4

Unnamed: 0,business_id,latitude,longitude,cat_name,country,user_id,utc_time,zone_offset,local_time,trajectory_id
0,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-04 09:10:49+00:00,480,2012-04-04 17:10:49+00:00,16@1
1,4c84c1f4d92ea093aec25f72,1.296405,103.850241,University,SG,16,2012-04-04 10:14:32+00:00,480,2012-04-04 18:14:32+00:00,16@1
2,4c775cda93faa093a2c5f0fb,1.301253,103.837023,Mall,SG,16,2012-04-04 13:13:21+00:00,480,2012-04-04 21:13:21+00:00,16@1
3,4b058814f964a52080b022e3,1.300642,103.844898,Multiplex,SG,16,2012-04-04 13:31:37+00:00,480,2012-04-04 21:31:37+00:00,16@1
4,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-05 09:07:23+00:00,480,2012-04-05 17:07:23+00:00,16@1
...,...,...,...,...,...,...,...,...,...,...
182056,4b058815f964a520a9b022e3,1.310145,103.855216,Mall,SG,266723,2013-04-09 03:13:21+00:00,480,2013-04-09 11:13:21+00:00,266723@7
182057,4b127444f964a520fd8923e3,1.370918,103.892459,Bus Station,SG,266723,2013-04-09 08:05:25+00:00,480,2013-04-09 16:05:25+00:00,266723@7
182058,4ba0ddaaf964a520298337e3,1.368064,103.955007,Home (private),SG,266723,2013-08-01 23:46:35+00:00,480,2013-08-02 07:46:35+00:00,266723@8
182059,4b87b5a6f964a5202ac831e3,1.373546,103.949536,Bus Station,SG,266723,2013-08-02 00:20:48+00:00,480,2013-08-02 08:20:48+00:00,266723@8


In [17]:
p5 = p4.groupby('user_id').filter(lambda x: check(x)).reset_index(drop=True)
p5

Unnamed: 0,business_id,latitude,longitude,cat_name,country,user_id,utc_time,zone_offset,local_time,trajectory_id
0,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-04 09:10:49+00:00,480,2012-04-04 17:10:49+00:00,16@1
1,4c84c1f4d92ea093aec25f72,1.296405,103.850241,University,SG,16,2012-04-04 10:14:32+00:00,480,2012-04-04 18:14:32+00:00,16@1
2,4c775cda93faa093a2c5f0fb,1.301253,103.837023,Mall,SG,16,2012-04-04 13:13:21+00:00,480,2012-04-04 21:13:21+00:00,16@1
3,4b058814f964a52080b022e3,1.300642,103.844898,Multiplex,SG,16,2012-04-04 13:31:37+00:00,480,2012-04-04 21:31:37+00:00,16@1
4,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-05 09:07:23+00:00,480,2012-04-05 17:07:23+00:00,16@1
...,...,...,...,...,...,...,...,...,...,...
135781,4b058815f964a520a9b022e3,1.310145,103.855216,Mall,SG,266723,2013-04-09 03:13:21+00:00,480,2013-04-09 11:13:21+00:00,266723@7
135782,4b127444f964a520fd8923e3,1.370918,103.892459,Bus Station,SG,266723,2013-04-09 08:05:25+00:00,480,2013-04-09 16:05:25+00:00,266723@7
135783,4ba0ddaaf964a520298337e3,1.368064,103.955007,Home (private),SG,266723,2013-08-01 23:46:35+00:00,480,2013-08-02 07:46:35+00:00,266723@8
135784,4b87b5a6f964a5202ac831e3,1.373546,103.949536,Bus Station,SG,266723,2013-08-02 00:20:48+00:00,480,2013-08-02 08:20:48+00:00,266723@8


In [18]:
p5['lat_lon'] = p5['latitude'].map(str) + ',' + p5['longitude'].map(str)
p5 = p5.groupby('business_id').apply(lambda x: lon_lat_tuning(x))
p5

Unnamed: 0,business_id,latitude,longitude,cat_name,country,user_id,utc_time,zone_offset,local_time,trajectory_id,lat_lon
0,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-04 09:10:49+00:00,480,2012-04-04 17:10:49+00:00,16@1,"1.31079,103.72052"
1,4c84c1f4d92ea093aec25f72,1.296405,103.850241,University,SG,16,2012-04-04 10:14:32+00:00,480,2012-04-04 18:14:32+00:00,16@1,"1.296405,103.850241"
2,4c775cda93faa093a2c5f0fb,1.301253,103.837023,Mall,SG,16,2012-04-04 13:13:21+00:00,480,2012-04-04 21:13:21+00:00,16@1,"1.301253,103.837023"
3,4b058814f964a52080b022e3,1.300642,103.844898,Multiplex,SG,16,2012-04-04 13:31:37+00:00,480,2012-04-04 21:31:37+00:00,16@1,"1.300642,103.844898"
4,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-05 09:07:23+00:00,480,2012-04-05 17:07:23+00:00,16@1,"1.31079,103.72052"
...,...,...,...,...,...,...,...,...,...,...,...
135781,4b058815f964a520a9b022e3,1.310145,103.855216,Mall,SG,266723,2013-04-09 03:13:21+00:00,480,2013-04-09 11:13:21+00:00,266723@7,"1.310145,103.855216"
135782,4b127444f964a520fd8923e3,1.370918,103.892459,Bus Station,SG,266723,2013-04-09 08:05:25+00:00,480,2013-04-09 16:05:25+00:00,266723@7,"1.370918,103.892459"
135783,4ba0ddaaf964a520298337e3,1.368064,103.955007,Home (private),SG,266723,2013-08-01 23:46:35+00:00,480,2013-08-02 07:46:35+00:00,266723@8,"1.368064,103.955007"
135784,4b87b5a6f964a5202ac831e3,1.373546,103.949536,Bus Station,SG,266723,2013-08-02 00:20:48+00:00,480,2013-08-02 08:20:48+00:00,266723@8,"1.373546,103.949536"


In [19]:
all_poi = p5.drop_duplicates(subset=['business_id', 'latitude', 'longitude'])
all_business_ids = np.array(all_poi['business_id'])
all_lat_lon = np.array(
    [[float(x['latitude']), float(x['longitude'])] for _, x in all_poi[['latitude', 'longitude']].iterrows()])

In [20]:
eps = 0.05
min_samples = 12
clusters = DBSCAN(eps=eps, min_samples=min_samples, metric=haversine).fit(all_lat_lon)
np.unique(clusters.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64],
      dtype=int64)

In [21]:
individual_poi_map = {x: i for i, x in enumerate(all_business_ids[clusters.labels_ == -1])}  # 6014
individual_in_collective_poi_map = dict()
for cluster in np.unique(clusters.labels_):  # 给集合POI中的独立POI标POI编号
    if cluster == -1:
        continue
    for x in all_business_ids[clusters.labels_ == cluster]:
        individual_in_collective_poi_map[x] = len(individual_poi_map) + len(individual_in_collective_poi_map)

collective_poi_map = dict()
for cluster in np.unique(clusters.labels_):  # 给集合POI标POI编号
    if cluster == -1:
        continue
    for x in all_business_ids[clusters.labels_ == cluster]:
        collective_poi_map[x] = len(individual_poi_map) + len(individual_in_collective_poi_map) + cluster
actual_poi_reindex_map = dict(**individual_poi_map, **individual_in_collective_poi_map)
check_in_poi_reindex_map = dict(**individual_poi_map, **collective_poi_map)
len(individual_poi_map), len(individual_in_collective_poi_map), len(collective_poi_map)

(4465, 1553, 1553)

In [22]:
collective_poi_map

{'4c775cda93faa093a2c5f0fb': 6018,
 '4b0a71d7f964a5201a2423e3': 6018,
 '4b4e7e0ff964a520a7ef26e3': 6018,
 '4b7553ecf964a52088062ee3': 6018,
 '4bc81ea50050b71399e4b93b': 6018,
 '4c6f2b6a34443704d462205f': 6018,
 '4b603ebaf964a520a9db29e3': 6018,
 '4bd50a026f649521249d6eec': 6018,
 '4e8d6d7061af7424da973a15': 6018,
 '4c02034cf815a593103c0a96': 6018,
 '50697249e4b0cdb23126577a': 6018,
 '4c3dd44a980320a13f838ce4': 6018,
 '4b9c715af964a520fb6836e3': 6018,
 '4bb71a2c941ad13a084a20e3': 6018,
 '4d19b4c9cc216ea879337fd3': 6018,
 '50bcad6ae4b07b04b525e195': 6018,
 '4ba7967df964a520fe9f39e3': 6018,
 '4c750a63604a37049c728149': 6018,
 '4c0a29fd6071a593a818df32': 6018,
 '4c2b5f5d77cfe21e3a59b4f1': 6018,
 '4d788aed109eb1f7c7225106': 6018,
 '4f6bf0dfe4b0541c6001a1b5': 6018,
 '4d42b4f432d5236a0225baa5': 6018,
 '4c949b1b72dd224bf4629b91': 6018,
 '4d4d116cf8292d43506969d6': 6018,
 '4d4be85a9f79f04df97c1e2b': 6018,
 '4b666248f964a520bb1f2be3': 6018,
 '4c0b19d7340720a1fab68793': 6018,
 '4b5edeeff964a520c4

In [23]:
check_in_poi_reindex_map.values()

dict_values([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 21

In [24]:
import os
import pickle
with open(os.path.join('out/SIN', 'raw_i_p_reindex.pkl'), 'wb') as file:
        pickle.dump(individual_poi_map, file)
with open(os.path.join('out/SIN', 'raw_i_c_p_reindex.pkl'), 'wb') as file:
        pickle.dump(individual_in_collective_poi_map, file)
with open(os.path.join('out/SIN', 'raw_c_p_reindex.pkl'), 'wb') as file:
        pickle.dump(collective_poi_map, file)

In [25]:
len(actual_poi_reindex_map), len(check_in_poi_reindex_map)

(6018, 6018)

In [26]:
p5['actual_poi_id'] = p5['business_id'].map(actual_poi_reindex_map)
p5['check_in_poi_id'] = p5['business_id'].map(check_in_poi_reindex_map)
p5['poi_type'] = p5['actual_poi_id'].map(
    lambda x: 'individual' if x < len(individual_poi_map) else 'collective')
p5

Unnamed: 0,business_id,latitude,longitude,cat_name,country,user_id,utc_time,zone_offset,local_time,trajectory_id,lat_lon,actual_poi_id,check_in_poi_id,poi_type
0,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-04 09:10:49+00:00,480,2012-04-04 17:10:49+00:00,16@1,"1.31079,103.72052",0,0,individual
1,4c84c1f4d92ea093aec25f72,1.296405,103.850241,University,SG,16,2012-04-04 10:14:32+00:00,480,2012-04-04 18:14:32+00:00,16@1,"1.296405,103.850241",1,1,individual
2,4c775cda93faa093a2c5f0fb,1.301253,103.837023,Mall,SG,16,2012-04-04 13:13:21+00:00,480,2012-04-04 21:13:21+00:00,16@1,"1.301253,103.837023",4465,6018,collective
3,4b058814f964a52080b022e3,1.300642,103.844898,Multiplex,SG,16,2012-04-04 13:31:37+00:00,480,2012-04-04 21:31:37+00:00,16@1,"1.300642,103.844898",4515,6019,collective
4,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-05 09:07:23+00:00,480,2012-04-05 17:07:23+00:00,16@1,"1.31079,103.72052",0,0,individual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135781,4b058815f964a520a9b022e3,1.310145,103.855216,Mall,SG,266723,2013-04-09 03:13:21+00:00,480,2013-04-09 11:13:21+00:00,266723@7,"1.310145,103.855216",210,210,individual
135782,4b127444f964a520fd8923e3,1.370918,103.892459,Bus Station,SG,266723,2013-04-09 08:05:25+00:00,480,2013-04-09 16:05:25+00:00,266723@7,"1.370918,103.892459",57,57,individual
135783,4ba0ddaaf964a520298337e3,1.368064,103.955007,Home (private),SG,266723,2013-08-01 23:46:35+00:00,480,2013-08-02 07:46:35+00:00,266723@8,"1.368064,103.955007",5781,6067,collective
135784,4b87b5a6f964a5202ac831e3,1.373546,103.949536,Bus Station,SG,266723,2013-08-02 00:20:48+00:00,480,2013-08-02 08:20:48+00:00,266723@8,"1.373546,103.949536",411,411,individual


In [27]:
p5[p5['poi_type']=='collective']

Unnamed: 0,business_id,latitude,longitude,cat_name,country,user_id,utc_time,zone_offset,local_time,trajectory_id,lat_lon,actual_poi_id,check_in_poi_id,poi_type
2,4c775cda93faa093a2c5f0fb,1.301253,103.837023,Mall,SG,16,2012-04-04 13:13:21+00:00,480,2012-04-04 21:13:21+00:00,16@1,"1.301253,103.837023",4465,6018,collective
3,4b058814f964a52080b022e3,1.300642,103.844898,Multiplex,SG,16,2012-04-04 13:31:37+00:00,480,2012-04-04 21:31:37+00:00,16@1,"1.300642,103.844898",4515,6019,collective
6,4b058815f964a520afb022e3,1.300304,103.844887,Mall,SG,16,2012-04-11 10:36:57+00:00,480,2012-04-11 18:36:57+00:00,16@2,"1.300304,103.844887",4516,6019,collective
7,4b058814f964a52087b022e3,1.298915,103.847644,Multiplex,SG,16,2012-04-11 10:54:56+00:00,480,2012-04-11 18:54:56+00:00,16@2,"1.298915,103.847644",5705,6064,collective
8,4b058814f964a52080b022e3,1.300642,103.844898,Multiplex,SG,16,2012-04-11 13:18:37+00:00,480,2012-04-11 21:18:37+00:00,16@2,"1.300642,103.844898",4515,6019,collective
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135771,4b932271f964a520073734e3,1.372859,103.949332,Train Station,SG,266723,2013-03-18 23:50:34+00:00,480,2013-03-19 07:50:34+00:00,266723@2,"1.372859,103.949332",5149,6036,collective
135778,4ba0ddaaf964a520298337e3,1.368064,103.955007,Home (private),SG,266723,2013-04-08 01:01:12+00:00,480,2013-04-08 09:01:12+00:00,266723@6,"1.368064,103.955007",5781,6067,collective
135780,4b932271f964a520073734e3,1.372859,103.949332,Train Station,SG,266723,2013-04-09 01:47:40+00:00,480,2013-04-09 09:47:40+00:00,266723@7,"1.372859,103.949332",5149,6036,collective
135783,4ba0ddaaf964a520298337e3,1.368064,103.955007,Home (private),SG,266723,2013-08-01 23:46:35+00:00,480,2013-08-02 07:46:35+00:00,266723@8,"1.368064,103.955007",5781,6067,collective


In [28]:
category_names = p5['cat_name'].unique()
category_reindex_map = {category_names[i]: i for i in range(len(category_names))}
p5['cat_code'] = p5['cat_name'].map(category_reindex_map)
p5

Unnamed: 0,business_id,latitude,longitude,cat_name,country,user_id,utc_time,zone_offset,local_time,trajectory_id,lat_lon,actual_poi_id,check_in_poi_id,poi_type,cat_code
0,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-04 09:10:49+00:00,480,2012-04-04 17:10:49+00:00,16@1,"1.31079,103.72052",0,0,individual,0
1,4c84c1f4d92ea093aec25f72,1.296405,103.850241,University,SG,16,2012-04-04 10:14:32+00:00,480,2012-04-04 18:14:32+00:00,16@1,"1.296405,103.850241",1,1,individual,1
2,4c775cda93faa093a2c5f0fb,1.301253,103.837023,Mall,SG,16,2012-04-04 13:13:21+00:00,480,2012-04-04 21:13:21+00:00,16@1,"1.301253,103.837023",4465,6018,collective,2
3,4b058814f964a52080b022e3,1.300642,103.844898,Multiplex,SG,16,2012-04-04 13:31:37+00:00,480,2012-04-04 21:31:37+00:00,16@1,"1.300642,103.844898",4515,6019,collective,3
4,4e94f6845c5c3201e8ea90f0,1.310790,103.720520,Office,SG,16,2012-04-05 09:07:23+00:00,480,2012-04-05 17:07:23+00:00,16@1,"1.31079,103.72052",0,0,individual,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135781,4b058815f964a520a9b022e3,1.310145,103.855216,Mall,SG,266723,2013-04-09 03:13:21+00:00,480,2013-04-09 11:13:21+00:00,266723@7,"1.310145,103.855216",210,210,individual,2
135782,4b127444f964a520fd8923e3,1.370918,103.892459,Bus Station,SG,266723,2013-04-09 08:05:25+00:00,480,2013-04-09 16:05:25+00:00,266723@7,"1.370918,103.892459",57,57,individual,9
135783,4ba0ddaaf964a520298337e3,1.368064,103.955007,Home (private),SG,266723,2013-08-01 23:46:35+00:00,480,2013-08-02 07:46:35+00:00,266723@8,"1.368064,103.955007",5781,6067,collective,81
135784,4b87b5a6f964a5202ac831e3,1.373546,103.949536,Bus Station,SG,266723,2013-08-02 00:20:48+00:00,480,2013-08-02 08:20:48+00:00,266723@8,"1.373546,103.949536",411,411,individual,9


In [29]:
p5['cat_name'].unique()

array(['Office', 'University', 'Mall', 'Multiplex', 'Other Nightlife',
       'Building', 'Library', 'Airport', 'Airport Terminal',
       'Bus Station', 'Post Office', 'Animal Shelter', 'Food Court',
       'Hotel', 'Plaza', 'Museum', 'Fish Market', 'Nightclub', 'Hospital',
       'Train Station', 'Asian Restaurant', 'College Library',
       'High School', 'Temple', 'General College & University', 'Subway',
       'Fire Station', 'Mobile Phone Shop', 'Grocery Store',
       'Government Building', 'Pier', 'Convention Center', 'Factory',
       'Coffee Shop', 'Light Rail', 'Medical Center',
       'Scandinavian Restaurant', 'Breakfast Spot', 'Bakery',
       'Fast Food Restaurant', 'Trade School', 'Japanese Restaurant',
       'Police Station', 'Event Space', 'Vegetarian / Vegan Restaurant',
       'Park', 'Flea Market', 'Concert Hall', 'Stadium',
       'Sushi Restaurant', 'Seafood Restaurant', 'African Restaurant',
       'Cafe', 'Garden', 'Scenic Lookout', 'Garden Center',
       'C

In [30]:
len(category_reindex_map)

312

In [31]:
p5['user_id'].nunique()

1536

In [32]:
p5['norm_in_day_time']= p5['local_time'].map(lambda x: convert_time(x))
p5 = p5[['user_id', 'business_id',  'poi_type',  'actual_poi_id', 'check_in_poi_id', 'local_time', 'norm_in_day_time', 'cat_name','cat_code', 'trajectory_id','latitude','longitude']]
p5

Unnamed: 0,user_id,business_id,poi_type,actual_poi_id,check_in_poi_id,local_time,norm_in_day_time,cat_name,cat_code,trajectory_id,latitude,longitude
0,16,4e94f6845c5c3201e8ea90f0,individual,0,0,2012-04-04 17:10:49+00:00,0.729167,Office,0,16@1,1.310790,103.720520
1,16,4c84c1f4d92ea093aec25f72,individual,1,1,2012-04-04 18:14:32+00:00,0.770833,University,1,16@1,1.296405,103.850241
2,16,4c775cda93faa093a2c5f0fb,collective,4465,6018,2012-04-04 21:13:21+00:00,0.895833,Mall,2,16@1,1.301253,103.837023
3,16,4b058814f964a52080b022e3,collective,4515,6019,2012-04-04 21:31:37+00:00,0.916667,Multiplex,3,16@1,1.300642,103.844898
4,16,4e94f6845c5c3201e8ea90f0,individual,0,0,2012-04-05 17:07:23+00:00,0.729167,Office,0,16@1,1.310790,103.720520
...,...,...,...,...,...,...,...,...,...,...,...,...
135781,266723,4b058815f964a520a9b022e3,individual,210,210,2013-04-09 11:13:21+00:00,0.479167,Mall,2,266723@7,1.310145,103.855216
135782,266723,4b127444f964a520fd8923e3,individual,57,57,2013-04-09 16:05:25+00:00,0.687500,Bus Station,9,266723@7,1.370918,103.892459
135783,266723,4ba0ddaaf964a520298337e3,collective,5781,6067,2013-08-02 07:46:35+00:00,0.333333,Home (private),81,266723@8,1.368064,103.955007
135784,266723,4b87b5a6f964a5202ac831e3,individual,411,411,2013-08-02 08:20:48+00:00,0.354167,Bus Station,9,266723@8,1.373546,103.949536


In [33]:
p5.to_csv('out/SIN/checkins_v1.csv', index=False)