In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the raw check-in table.
foursquare = pd.read_csv('data/foursquare_complete.csv')

# Replace every venue category name with an integer code (fit and encode in one call).
enc = LabelEncoder()
foursquare['venue_category_name'] = enc.fit_transform(foursquare['venue_category_name'])

# Persist the code -> category-name lookup so the integer codes can be decoded later.
mapping_name = dict(enumerate(enc.classes_))
with open('name_category.pkl', 'wb') as f:
    pickle.dump(mapping_name, f)

# Equal length for every trajectory: truncate each user's history to the
# length of the shortest one in the dataset.
set_uid = set(foursquare['uid'])
# A single grouped count replaces filtering the whole table once per user.
min_len = int(foursquare.groupby(by=['uid']).size().min())

# Keep only the last `min_len` rows of each user; rows stay in their original order.
red_df = foursquare.groupby(by=['uid']).tail(min_len).copy()
# timestamp: 0-based position of each row inside its own user's trajectory.
# cumcount() numbers rows per user, so the values are correct even when a
# user's rows are not stored contiguously (a positionally tiled 0..min_len-1
# list is only right if every user occupies one contiguous run of rows).
red_df['timestamp'] = red_df.groupby(by=['uid']).cumcount()

# Re-index venue ids as integer codes. The encoder is fitted on the reduced
# frame only, so codes cover exactly the venues that survived the truncation.
enc = LabelEncoder()
red_df['venue_id'] = enc.fit_transform(red_df['venue_id'])

# Persist the code -> original venue id lookup.
mapping_id = dict(enumerate(enc.classes_))
with open('id_category.pkl', 'wb') as f:
    pickle.dump(mapping_id, f)

### split train/test dataset
# Distinct user ids, captured before the columns are renamed below.
users = set(red_df['uid'])

# Positional rename to "<name>:<type>" headers; the list must match the frame's
# current column order and count exactly.
# NOTE(review): presumably the header convention of the downstream recommender
# library that reads the exported files - confirm.
typed_columns = [
    'uid:token',
    'venue_id:token',
    'venue_category_name:token',
    'lat:float',
    'lon:float',
    'timestamp:token',
]
red_df.columns = typed_columns

In [None]:

# Hold out 20% of the users (sampled without replacement under a fixed seed)
# as the test users; every remaining user goes to training.
np.random.seed(1234)
n_test_users = int(len(users) * 0.2)
test_ind = np.random.choice(list(users), n_test_users, replace=False)
train_ind = users - set(test_ind)

# Select the rows of each split by user id; copies keep the splits independent of red_df.
is_test_row = red_df['uid:token'].isin(test_ind)
is_train_row = red_df['uid:token'].isin(train_ind)
test_set = red_df.loc[is_test_row].copy()
train_set = red_df.loc[is_train_row].copy()

In [82]:
# training files
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('foursquare_train', exist_ok=True)
#interaction
train_set[['uid:token', 'venue_id:token', 'timestamp:token']].to_csv('foursquare_train/foursquare_train.inter', index=False, sep='\t')
#users
# One row per distinct user. drop_duplicates keeps first-appearance order, so
# the file is written in a deterministic order rather than set-iteration order.
train_set[['uid:token']].drop_duplicates().to_csv('foursquare_train/foursquare_train.user', index=False, sep='\t')
#items
# One row per distinct (venue, category) pair, ordered by venue id.
items = (
    train_set[['venue_id:token', 'venue_category_name:token']]
    .drop_duplicates()
    .sort_values(by='venue_id:token')
)
items.to_csv('foursquare_train/foursquare_train.item', index=False, sep='\t')

In [83]:
# test files
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('foursquare_test', exist_ok=True)
#interaction
test_set[['uid:token', 'venue_id:token', 'timestamp:token']].to_csv('foursquare_test/foursquare_test.inter', index=False, sep='\t')
#users
# One row per distinct user. drop_duplicates keeps first-appearance order, so
# the file is written in a deterministic order rather than set-iteration order.
test_set[['uid:token']].drop_duplicates().to_csv('foursquare_test/foursquare_test.user', index=False, sep='\t')
#items
# One row per distinct (venue, category) pair, ordered by venue id.
items = (
    test_set[['venue_id:token', 'venue_category_name:token']]
    .drop_duplicates()
    .sort_values(by='venue_id:token')
)
items.to_csv('foursquare_test/foursquare_test.item', index=False, sep='\t')

In [4]:
# full (unsplit) dataset files
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('foursquare', exist_ok=True)
#interaction
red_df[['uid:token', 'venue_id:token', 'timestamp:token']].to_csv('foursquare/foursquare.inter', index=False, sep='\t')
#users
# One row per distinct user. drop_duplicates keeps first-appearance order, so
# the file is written in a deterministic order rather than set-iteration order.
red_df[['uid:token']].drop_duplicates().to_csv('foursquare/foursquare.user', index=False, sep='\t')
#items
# One row per distinct (venue, category) pair, ordered by venue id.
items = (
    red_df[['venue_id:token', 'venue_category_name:token']]
    .drop_duplicates()
    .sort_values(by='venue_id:token')
)
items.to_csv('foursquare/foursquare.item', index=False, sep='\t')

In [22]:
# Category distribution of a single user (uid 2): the distinct category codes
# that user visited (`it`) and how many times each occurs (`co`).
user_categories = red_df.loc[red_df['uid:token'] == 2, 'venue_category_name:token']
it, co = np.unique(user_categories, return_counts=True)

In [24]:
# Display the distinct category codes (`it`) next to their occurrence counts (`co`).
it, co

(array([  3,  12,  19,  22,  23,  31,  33,  34,  36,  38,  53,  54,  64,
         68,  69,  70,  87,  94, 114, 121, 124, 128, 141, 161, 165, 166,
        170, 180, 192, 196, 199, 201, 212, 220, 223, 234, 239]),
 array([ 4,  3,  1,  3,  1,  1,  1,  4,  1,  3,  2, 14,  1,  8,  1,  1,  1,
         5, 11,  2,  1,  3,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,
         8,  2,  2]))

In [25]:
import matplotlib.pyplot as plt

In [27]:
# Normalise the counts so they sum to 1 (relative frequency per category).
co / co.sum()

array([0.04, 0.03, 0.01, 0.03, 0.01, 0.01, 0.01, 0.04, 0.01, 0.03, 0.02,
       0.14, 0.01, 0.08, 0.01, 0.01, 0.01, 0.05, 0.11, 0.02, 0.01, 0.03,
       0.02, 0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.01, 0.08, 0.02, 0.02])

In [50]:
# Read the exported full-dataset files back from disk and show the size of
# the interaction table.
inter_path = 'foursquare/foursquare.inter'
item_path = 'foursquare/foursquare.item'
inter = pd.read_csv(inter_path, sep='\t')
items = pd.read_csv(item_path, sep='\t')
inter.shape

(108300, 3)

In [69]:
# NOTE(review): `lista` is not referenced anywhere else in the visible cells.
# Its values are a consecutive run of the category codes listed for user 2 in
# the np.unique output above - confirm the intended use or remove.
lista = [94, 114, 121, 124, 128, 141, 161]

In [74]:
# Category code of every retained row belonging to user 2, in row order.
visit = red_df.loc[red_df['uid:token'] == 2, 'venue_category_name:token']

In [77]:
# Display the category codes selected for user 2 (pandas truncates long Series).
visit

100    223
101    124
102    128
103     22
104     12
      ... 
195    114
196     68
197     68
198     36
199     53
Name: venue_category_name:token, Length: 100, dtype: int64

In [86]:
# Print the relative frequency of each distinct category in `visit`, one per
# line, while `rf` accumulates the frequencies as a sum-to-one check.
n_visits = len(visit)
rf = 0
for category in set(visit):
    share = np.count_nonzero(visit == category) / n_visits
    print(category, share)
    rf += share

128 0.03
3 0.04
12 0.03
141 0.02
19 0.01
22 0.03
23 0.01
31 0.01
161 0.02
34 0.04
33 0.01
36 0.01
165 0.02
38 0.03
166 0.02
170 0.01
180 0.01
53 0.02
54 0.14
192 0.01
64 0.01
196 0.01
68 0.08
70 0.01
69 0.01
199 0.01
201 0.01
212 0.01
87 0.01
220 0.01
94 0.05
223 0.08
234 0.02
239 0.02
114 0.11
121 0.02
124 0.01


In [87]:
# Sanity check: the accumulated relative frequencies should total 1, up to
# floating-point rounding.
rf

1.0000000000000002