In [4]:
dataset_path = "../data/MP"

In [5]:
import os
from tqdm import tqdm
import numpy as np
import datetime

In [6]:
user2id = np.load(os.path.join(dataset_path, 'user_mapper.npy'), allow_pickle=True).item()
location2id = np.load(os.path.join(dataset_path, 'location_mapper.npy'), allow_pickle=True).item()

In [7]:
def datetime_to_features(timestamp):
    dt = datetime.datetime.fromtimestamp(int(timestamp) // 1000)
    weekday = dt.weekday()
    hour = dt.hour
    return weekday, hour

In [8]:
trans_time_individual = []
# occur_time_individual: (user_num, 24)
occur_time_individual = np.zeros(shape=(len(user2id), 24), dtype=np.float32)

diff_data = []
with open(os.path.join(dataset_path, f'train.csv'), 'r', encoding='utf8') as file:
    lines = file.readlines()
    for line in tqdm(lines, desc=f'Preprocess data'):
        # trans_matrix_time: (24, 24)
        trans_matrix_time = np.ones((24, 24))
        stay_points = line.strip().split(',')[1:]
        user = line.strip().split(',')[0]
        # i:0 - len(stay_points) - 2
        # len(stay_points) - 1没有下一个时刻，所以不包括在循环里，
        # 且只更新了occur_time_individual和user_loc_matrix，没有更新trans_matrix_time
        for i in range(len(stay_points) - 1):
            location, timestamp = stay_points[i].split('@')
            next_location, next_timestamp = stay_points[i + 1].split('@')
            weekday, hour = datetime_to_features(timestamp)
            next_weekday, next_hour = datetime_to_features(next_timestamp)
            diff_data.append(abs(next_hour - hour))
            i, j = hour, next_hour
            # 时间点之间的转移次数
            trans_matrix_time[i, j] += 1
            # 在某个小时出现的次数
            occur_time_individual[user2id[user]][hour] += 1
            # 下面这个i是hour啊
            if i == len(stay_points) - 2:
                occur_time_individual[user2id[user]][next_hour] += 1

        time_row_sums = trans_matrix_time.sum(axis=1)
        trans_matrix_time = trans_matrix_time / time_row_sums[:, np.newaxis]
        # 所有用户的转移矩阵存成一个list
        trans_time_individual.append(trans_matrix_time)

trans_time_individual = np.array(trans_time_individual)

np.save(os.path.join(dataset_path, f'prob_matrix_time_individual.npy'),
        np.array(trans_time_individual))
np.save(os.path.join(dataset_path, f'occur_time_individual.npy'),
        np.array(occur_time_individual))

Preprocess data: 100%|██████████| 10000/10000 [00:08<00:00, 1235.85it/s]


In [9]:
occur_time_individual = np.load(os.path.join(dataset_path, f'occur_time_individual.npy'),allow_pickle=True)

In [10]:
occur_time_individual.shape

(10000, 24)

In [11]:
res = []

In [12]:
line = "27978,1625@1672567200000,1625@1672623000000,991@1672642800000,936@1672653600000,1625@1672747200000,1620@1672831800000,288@1672840800000,1625@1672921800000,7375@1673011800000,1625@1673024400000,10496@1673060400000,1625@1673078400000,1625@1673087400000,1108@1673152200000,1625@1673177400000,10496@1673236800000,1618@1673249400000,1625@1673352000000,10496@1673445600000,1625@1673528400000,1625@1673613000000,10496@1673668800000,288@1673677800000,1625@1673694000000,1047@1673762400000,2850@1673784000000,1015@1673879400000,2896@1673958600000,4402@1674050400000,799@1674111600000,785@1674129600000,799@1674145800000,4402@1674208800000,3268@1674289800000,4402@1674381600000,1620@1674480600000,3146@1674558000000,1625@1674646200000,1640@1674738000000,1625@1674822600000,1625@1674975600000,1623@1675065600000,1625@1675168200000,1623@1675254600000,966@1675312200000,288@1675337400000,6482@1675422000000,1625@1675443600000,899@1675504800000,10496@1675580400000,288@1675591200000,5723@1675645200000,1015@1675688400000,10496@1675744200000,2601@1675755000000,1625@1675855800000,1625@1675947600000,3645@1676025000000,974@1676034000000,5711@1676043000000,12479@1676073600000,1620@1676106000000,1625@1676183400000,1625@1676197800000,1625@1676295000000,1625@1676376000000,2548@1676466000000,899@1676552400000,3183@1676653200000,5676@1676712600000,1625@1676723400000,10496@1676781000000,1625@1676791800000,1625@1676881800000,1620@1676986200000,10496@1677063600000,1623@1677074400000,1625@1677153600000,1015@1677198600000,1625@1677236400000,921@1677317400000,7392@1677328200000,1233@1677396600000,2829@1677409200000,2547@1677501000000,1623@1677587400000,1015@1677646800000"

In [13]:
user = line.strip().split(',')[0]
user

'27978'

In [14]:
occur_time_user = occur_time_individual[user2id[user]]
occur_time_user

array([ 1.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,  2.,  0.,  1.,  6.,
        0.,  3.,  8.,  4.,  3.,  9., 11., 16., 11.,  5.,  1.],
      dtype=float32)

In [15]:
stay_points = line.strip().split(',')[1:]

In [16]:
count=0
for item in stay_points:
    count+=1
count

87

In [17]:
sequence_count, left = divmod(len(stay_points), 20)
print(sequence_count, left)

4 7


In [18]:
split_start = 0
split_end = 20
location_x = [location2id[item.split('@')[0]] for item in stay_points[split_start:split_end]]
location_x

[15741,
 15741,
 10432,
 11426,
 15741,
 9115,
 7729,
 15741,
 12672,
 15741,
 7971,
 15741,
 15741,
 13646,
 15741,
 7971,
 19345,
 15741,
 7971,
 15741]

In [19]:
timestamp_x = [item.split('@')[1] for item in stay_points[split_start:split_end]]
for x, t in zip(location_x, timestamp_x):
    print(f'{x}:{t}')

15741:1672567200000
15741:1672623000000
10432:1672642800000
11426:1672653600000
15741:1672747200000
9115:1672831800000
7729:1672840800000
15741:1672921800000
12672:1673011800000
15741:1673024400000
7971:1673060400000
15741:1673078400000
15741:1673087400000
13646:1673152200000
15741:1673177400000
7971:1673236800000
19345:1673249400000
15741:1673352000000
7971:1673445600000
15741:1673528400000


In [20]:
location_y = [location2id[item.split('@')[0]] for item in
                stay_points[split_start + 1:split_end + 1]]
timestamp_y = [item.split('@')[1] for item in stay_points[split_start + 1:split_end + 1]]
for y, t in zip(location_y, timestamp_y):
    print(f'{y}:{t}')

15741:1672623000000
10432:1672642800000
11426:1672653600000
15741:1672747200000
9115:1672831800000
7729:1672840800000
15741:1672921800000
12672:1673011800000
15741:1673024400000
7971:1673060400000
15741:1673078400000
15741:1673087400000
13646:1673152200000
15741:1673177400000
7971:1673236800000
19345:1673249400000
15741:1673352000000
7971:1673445600000
15741:1673528400000
15741:1673613000000


In [21]:
timeslot_y = []
for item in timestamp_y:
    weekday, hour = datetime_to_features(item)
    timeslot_y.append(hour)
for y, t in zip(location_y, timeslot_y):
    print(f'{y}:{t}')

15741:9
10432:15
11426:18
15741:20
9115:19
7729:22
15741:20
12672:21
15741:1
7971:11
15741:16
15741:18
13646:12
15741:19
7971:12
19345:15
15741:20
7971:22
15741:21
15741:20


In [22]:
hour_x = []
hour_mask = []
for item in timestamp_x:
    weekday, hour = datetime_to_features(item)
    hour_x.append(hour)
    mask = np.zeros(24, dtype=np.int32)
    mask[occur_time_user == 0] = 1
    if mask.sum() == 24:
        exit()
    hour_mask.append(mask)
print(hour_x)
print(hour_mask)

[18, 9, 15, 18, 20, 19, 22, 20, 21, 1, 11, 16, 18, 12, 19, 12, 15, 20, 22, 21]
[array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0]), array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0]), array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0]), array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0]), array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0]), array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0]), array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0]), array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0]), array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0]), array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0]), array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 

In [23]:
res.append(
    {
        'user': user2id[user],
        'location_x': location_x,
        'hour': hour_x,
        'hour_mask': np.array(hour_mask),
        'location_y': location_y,
        'timeslot_y': timeslot_y,
    }
)

In [24]:
res

[{'user': 1417,
  'location_x': [15741,
   15741,
   10432,
   11426,
   15741,
   9115,
   7729,
   15741,
   12672,
   15741,
   7971,
   15741,
   15741,
   13646,
   15741,
   7971,
   19345,
   15741,
   7971,
   15741],
  'hour': [18,
   9,
   15,
   18,
   20,
   19,
   22,
   20,
   21,
   1,
   11,
   16,
   18,
   12,
   19,
   12,
   15,
   20,
   22,
   21],
  'hour_mask': array([[0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0],
         [0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0],
         [0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0],
         [0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0],
         [0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0],
         [0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0],
         [0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0

In [None]:
with open(os.path.join(dataset_path, 'train.csv'), 'r', encoding='utf8') as file:
    lines = file.readlines()
    for line_i, line in enumerate(tqdm(lines, desc=f'Initial train data')):
        user = line.strip().split(',')[0]
        occur_time_user = occur_time_individual[user2id[user]]
        stay_points = line.strip().split(',')[1:]
        # divmod: 计算商和余数，返回一个元组
        # sequence_count: 停留点序列总长度除以20，left: 剩余停留点个数
        sequence_count, left = divmod(len(stay_points), 20)
        assert sequence_count > 0, f"{user}'s does not have enough data."
        sequence_count -= 1 if left == 0 else 0
        for i in range(sequence_count):
            split_start = i * 20
            split_end = (i + 1) * 20
            location_x = [location2id[item.split('@')[0]] for item in stay_points[split_start:split_end]]
            timestamp_x = [item.split('@')[1] for item in stay_points[split_start:split_end]]
            location_y = [location2id[item.split('@')[0]] for item in
                            stay_points[split_start + 1:split_end + 1]]
            timestamp_y = [item.split('@')[1] for item in stay_points[split_start + 1:split_end + 1]]
            timeslot_y = []
            
            hour_x = []
            hour_mask = []
            for item in timestamp_x:
                weekday, hour = datetime_to_features(item)
                hour_x.append(hour)
                mask = np.zeros(24, dtype=np.int32)
                mask[occur_time_user == 0] = 1
                if mask.sum() == 24:
                    exit()
                hour_mask.append(mask)
            
            for item in timestamp_y:
                weekday, hour = datetime_to_features(item)
                timeslot_y.append(hour)

            res.append(
                {
                    'user': user2id[user],
                    'location_x': location_x,
                    'hour': hour_x,
                    'hour_mask': np.array(hour_mask),
                    'location_y': location_y,
                    'timeslot_y': timeslot_y,
                }
            )

In [1]:
from tools import get_config
save_dir = f"../saved_models/MP"
config_path = f"{save_dir}/settings.yml"
config = get_config(config_path, easy=True)

In [27]:
import torch.nn as nn

In [28]:
location_embedding = nn.Embedding(20607, 8)

In [29]:
location_x = res[0]["location_x"]
loc_embedded = location_embedding(location_x)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list