In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import defaultdict, Counter

%matplotlib inline

In [2]:
path_to_checkins = '../data/Gowalla/Gowalla_totalCheckins.txt'
checkins_df = pd.read_csv(path_to_checkins, sep='\t', names=['user_id', 'datetime', 'lat', 'lon', 'place_id'])

In [3]:
checkins_df.head()

Unnamed: 0,user_id,datetime,lat,lon,place_id
0,0,2010-10-19T23:55:27Z,30.235909,-97.79514,22847
1,0,2010-10-18T22:17:43Z,30.269103,-97.749395,420315
2,0,2010-10-17T23:42:03Z,30.255731,-97.763386,316637
3,0,2010-10-17T19:26:05Z,30.263418,-97.757597,16516
4,0,2010-10-16T18:50:42Z,30.274292,-97.740523,5535878


In [9]:
checkins_df['timestamp'] = checkins_df.apply(lambda row: int(datetime.strptime(row['datetime'], "%Y-%m-%dT%H:%M:%SZ").timestamp()), axis=1)
checkins_df.head()

Unnamed: 0,user_id,datetime,lat,lon,place_id,timestamp
0,0,2010-10-19T23:55:27Z,30.235909,-97.79514,22847,1287514527
1,0,2010-10-18T22:17:43Z,30.269103,-97.749395,420315,1287422263
2,0,2010-10-17T23:42:03Z,30.255731,-97.763386,316637,1287340923
3,0,2010-10-17T19:26:05Z,30.263418,-97.757597,16516,1287325565
4,0,2010-10-16T18:50:42Z,30.274292,-97.740523,5535878,1287237042


In [10]:
checkins_df.isnull().sum()

user_id      0
datetime     0
lat          0
lon          0
place_id     0
timestamp    0
dtype: int64

In [11]:
checkins_df.user_id.max(), checkins_df.user_id.unique().shape

(196585, (107092,))

In [12]:
checkins_df.user_id = pd.factorize(checkins_df.user_id)[0] + 1
checkins_df.user_id.min(), checkins_df.user_id.max(), checkins_df.user_id.unique().shape

(1, 107092, (107092,))

In [13]:
checkins_df.place_id = pd.factorize(checkins_df.place_id)[0] + 1
checkins_df.place_id.min(), checkins_df.place_id.max(), checkins_df.place_id.unique().shape

(1, 1280969, (1280969,))

In [14]:
checkins_df.head()

Unnamed: 0,user_id,datetime,lat,lon,place_id,timestamp
0,1,2010-10-19T23:55:27Z,30.235909,-97.79514,1,1287514527
1,1,2010-10-18T22:17:43Z,30.269103,-97.749395,2,1287422263
2,1,2010-10-17T23:42:03Z,30.255731,-97.763386,3,1287340923
3,1,2010-10-17T19:26:05Z,30.263418,-97.757597,4,1287325565
4,1,2010-10-16T18:50:42Z,30.274292,-97.740523,5,1287237042


In [15]:
sorted_timestamps = sorted(checkins_df.timestamp)
len(sorted_timestamps)

6442892

In [16]:
threshold_timestamp = sorted_timestamps[int(len(sorted_timestamps) * (1.0 - 0.2))]
threshold_timestamp

1284456277

In [17]:
train_data = checkins_df[checkins_df.timestamp < threshold_timestamp]
test_data = checkins_df[checkins_df.timestamp >= threshold_timestamp]

train_data.shape, test_data.shape

((5154313, 6), (1288579, 6))

In [19]:
data = []

for _, row in tqdm(train_data.iterrows()):
    data.append({
        'user_id': int(row.user_id),
        'item_id': int(row.place_id),
        'timestamp': int(row.timestamp)
    })

print(len(data))

5154313it [04:50, 17725.41it/s]

5154313





In [21]:
user_history = defaultdict(list)
item_history = defaultdict(list)

for row in tqdm(data):
    user_raw_id = row['user_id']
    item_raw_id = row['item_id']
    interaction_timestamp = row['timestamp']
    
    user_history[user_raw_id].append({'item_id': item_raw_id, 'timestamp': interaction_timestamp})
    item_history[item_raw_id].append({'user_id': user_raw_id, 'timestamp': interaction_timestamp})


is_changed = True
threshold = 20
good_users = set()
good_items = set()


while is_changed:
    old_state = (len(good_users), len(good_items))
    
    good_users = set()
    good_items = set()

    for user_id, history in user_history.items():
        if len(history) >= threshold:
            good_users.add(user_id)

    for item_id, history in item_history.items():
        if len(history) >= threshold:
            good_items.add(item_id)
    
    user_history = {
        user_id: list(filter(lambda x: x['item_id'] in good_items, history))
        for user_id, history in user_history.items()
    }
    
    item_history = {
        item_id: list(filter(lambda x: x['user_id'] in good_users, history))
        for item_id, history in item_history.items()
    }
    
    new_state = (len(good_users), len(good_items))
    is_changed = (old_state != new_state)
    print(old_state, new_state)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5154313/5154313 [00:09<00:00, 547966.19it/s]


(0, 0) (47677, 38368)
(47677, 38368) (20167, 35906)
(20167, 35906) (19655, 29504)
(19655, 29504) (18084, 29267)
(18084, 29267) (18021, 28404)
(18021, 28404) (17770, 28368)
(17770, 28368) (17756, 28220)
(17756, 28220) (17708, 28212)
(17708, 28212) (17707, 28173)
(17707, 28173) (17697, 28172)
(17697, 28172) (17696, 28161)
(17696, 28161) (17688, 28161)
(17688, 28161) (17688, 28156)
(17688, 28156) (17688, 28156)


In [22]:
user_mapping = {}
item_mapping = {}
tmp_user_history = defaultdict(list)
tmp_item_history = defaultdict(list)

for user_id, history in tqdm(user_history.items()):
    processed_history = []

    for filtered_item in history:
        item_id = filtered_item['item_id']
        item_timestamp = filtered_item['timestamp']

        processed_item_id = item_mapping.get(item_id, len(item_mapping) + 1)
        item_mapping[item_id] = processed_item_id

        processed_history.append({'item_id': processed_item_id, 'timestamp': item_timestamp})
        
    if len(processed_history) >= threshold:
        processed_user_id = user_mapping.get(user_id, len(user_mapping) + 1)
        user_mapping[user_id] = processed_user_id

        tmp_user_history[processed_user_id] = sorted(processed_history, key=lambda x: x['timestamp'])

    
for item_id, history in tqdm(item_history.items()):
    processed_history = []

    for filtered_user in history:
        user_id = filtered_user['user_id']
        user_timestamp = filtered_user['timestamp']

        processed_user_id = user_mapping.get(user_id, len(user_mapping) + 1)
        user_mapping[user_id] = processed_user_id

        processed_history.append({'user_id': processed_user_id, 'timestamp': user_timestamp})

    if len(processed_history) >= threshold:
        processed_item_id = item_mapping.get(item_id, len(item_mapping) + 1)
        item_mapping[item_id] = processed_item_id

        tmp_item_history[processed_item_id] = sorted(processed_history, key=lambda x: x['timestamp'])

user_history = tmp_user_history
item_history = tmp_item_history

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 94019/94019 [00:01<00:00, 57563.62it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1096278/1096278 [00:04<00:00, 260344.29it/s]


In [23]:
print('Users count:', len(user_mapping))
print('Items count:', len(item_mapping))
print('Actions count:', sum(list(map(lambda x: len(x), user_history.values()))))
print('Avg user history len:', np.mean(list(map(lambda x: len(x), user_history.values()))))
print('Avg item history len:', np.mean(list(map(lambda x: len(x), item_history.values()))))

Users count: 17688
Items count: 28156
Actions count: 1359055
Avg user history len: 76.83485979194934
Avg item history len: 48.268752663730645


In [34]:
test_data_filtered = test_data[test_data.user_id.isin(user_mapping) & test_data.place_id.isin(item_mapping)]
test_data_filtered.shape, test_data.shape

((154313, 6), (1288579, 6))

In [35]:
test_list = []

for _, row in tqdm(test_data_filtered.iterrows()):
    test_list.append({
        'user_id': int(row.user_id),
        'item_id': int(row.place_id),
        'timestamp': int(row.timestamp)
    })

print(len(test_list))


test_user_history = defaultdict(list)

for row in tqdm(test_list):
    user_raw_id = row['user_id']
    item_raw_id = row['item_id']
    interaction_timestamp = row['timestamp']
    
    test_user_history[user_raw_id].append({'item_id': item_raw_id, 'timestamp': interaction_timestamp})
    
tmp_user_history = defaultdict(list)


for user_id, history in tqdm(test_user_history.items()):
    processed_history = []

    for filtered_item in history:
        item_id = filtered_item['item_id']
        item_timestamp = filtered_item['timestamp']

        processed_item_id = item_mapping[item_id]
        processed_history.append({'item_id': processed_item_id, 'timestamp': item_timestamp})
        
    processed_user_id = user_mapping[user_id]
    tmp_user_history[processed_user_id] = sorted(processed_history, key=lambda x: x['timestamp'])

    
test_user_history = tmp_user_history

154313it [00:08, 17667.03it/s]


154313


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 154313/154313 [00:00<00:00, 962572.44it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10792/10792 [00:00<00:00, 78439.91it/s]


In [36]:
with open('../data/Gowalla/train_new.txt', 'w') as f:
    for user_id, item_history in user_history.items():
        f.write(' '.join([str(user_id)] + [
            str(item_event['item_id']) for item_event in sorted(item_history, key=lambda x: x['timestamp'])
        ]))
        f.write('\n')

In [37]:
with open('../data/Gowalla/test_new.txt', 'w') as f:
    for user_id, item_history in test_user_history.items():
        f.write(' '.join([str(user_id)] + [
            str(item_event['item_id']) for item_event in sorted(item_history, key=lambda x: x['timestamp'])
        ]))
        f.write('\n')