In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import defaultdict, Counter

%matplotlib inline

In [2]:
path_to_checkins = '../data/Gowalla/Gowalla_totalCheckins.txt'
checkins_df = pd.read_csv(path_to_checkins, sep='\t', names=['user_id', 'datetime', 'lat', 'lon', 'place_id'])

In [3]:
checkins_df.head()

Unnamed: 0,user_id,datetime,lat,lon,place_id
0,0,2010-10-19T23:55:27Z,30.235909,-97.79514,22847
1,0,2010-10-18T22:17:43Z,30.269103,-97.749395,420315
2,0,2010-10-17T23:42:03Z,30.255731,-97.763386,316637
3,0,2010-10-17T19:26:05Z,30.263418,-97.757597,16516
4,0,2010-10-16T18:50:42Z,30.274292,-97.740523,5535878


In [4]:
checkins_df['timestamp'] = checkins_df.apply(
    lambda row: int(datetime.strptime(row['datetime'], "%Y-%m-%dT%H:%M:%SZ").timestamp()),
    axis=1
)

checkins_df = checkins_df.drop(['datetime'], axis=1)
checkins_df.head()

Unnamed: 0,user_id,lat,lon,place_id,timestamp
0,0,30.235909,-97.79514,22847,1287525327
1,0,30.269103,-97.749395,420315,1287433063
2,0,30.255731,-97.763386,316637,1287351723
3,0,30.263418,-97.757597,16516,1287336365
4,0,30.274292,-97.740523,5535878,1287247842


In [5]:
checkins_df.isnull().sum()

user_id      0
lat          0
lon          0
place_id     0
timestamp    0
dtype: int64

In [6]:
checkins_df.user_id.max(), checkins_df.user_id.unique().shape

(196585, (107092,))

In [7]:
checkins_df.user_id = pd.factorize(checkins_df.user_id)[0] + 1
checkins_df.user_id.min(), checkins_df.user_id.max(), checkins_df.user_id.unique().shape

(1, 107092, (107092,))

In [8]:
checkins_df.place_id = pd.factorize(checkins_df.place_id)[0] + 1
checkins_df.place_id.min(), checkins_df.place_id.max(), checkins_df.place_id.unique().shape

(1, 1280969, (1280969,))

In [9]:
checkins_df.head()

Unnamed: 0,user_id,lat,lon,place_id,timestamp
0,1,30.235909,-97.79514,1,1287525327
1,1,30.269103,-97.749395,2,1287433063
2,1,30.255731,-97.763386,3,1287351723
3,1,30.263418,-97.757597,4,1287336365
4,1,30.274292,-97.740523,5,1287247842


In [10]:
sorted_timestamps = sorted(checkins_df.timestamp)
len(sorted_timestamps)

6442892

In [11]:
threshold_timestamp = sorted_timestamps[int(len(sorted_timestamps) * (1.0 - 0.2))]
test_threshold_timestamp = sorted_timestamps[int(len(sorted_timestamps) * (1.0 - 0.1))]

print(threshold_timestamp, test_threshold_timestamp)

1284467077 1286110393


In [12]:
train_data = checkins_df[checkins_df.timestamp < threshold_timestamp]
validation_data = checkins_df[(threshold_timestamp <= checkins_df.timestamp) & (checkins_df.timestamp < test_threshold_timestamp)]
test_data = checkins_df[test_threshold_timestamp <= checkins_df.timestamp]

train_data.shape, validation_data.shape, test_data.shape

((5154313, 5), (644287, 5), (644292, 5))

In [13]:
data = []

for _, row in tqdm(train_data.iterrows()):
    data.append({
        'user_id': int(row.user_id),
        'item_id': int(row.place_id),
        'timestamp': int(row.timestamp)
    })

print(len(data))

5154313it [03:52, 22123.15it/s]

5154313





In [23]:
user_history = defaultdict(list)
item_history = defaultdict(list)

for row in tqdm(data):
    user_raw_id = row['user_id']
    item_raw_id = row['item_id']
    interaction_timestamp = row['timestamp']
    
    user_history[user_raw_id].append({'item_id': item_raw_id, 'timestamp': interaction_timestamp})
    item_history[item_raw_id].append({'user_id': user_raw_id, 'timestamp': interaction_timestamp})


is_changed = True
threshold = 20
good_users = set()
good_items = set()


while is_changed:
    old_state = (len(good_users), len(good_items))
    
    good_users = set()
    good_items = set()

    for user_id, history in user_history.items():
        if len(history) >= threshold:
            good_users.add(user_id)

    for item_id, history in item_history.items():
        if len(history) >= threshold:
            good_items.add(item_id)
    
    user_history = {
        user_id: list(filter(lambda x: x['item_id'] in good_items, history))
        for user_id, history in user_history.items()
    }
    
    item_history = {
        item_id: list(filter(lambda x: x['user_id'] in good_users, history))
        for item_id, history in item_history.items()
    }
    
    new_state = (len(good_users), len(good_items))
    is_changed = (old_state != new_state)
    print(old_state, new_state)

100%|████████████████████████████████████████████████████████████████████| 5154313/5154313 [00:17<00:00, 296492.47it/s]


(0, 0) (47677, 38368)
(47677, 38368) (20167, 35906)
(20167, 35906) (19655, 29504)
(19655, 29504) (18084, 29267)
(18084, 29267) (18021, 28404)
(18021, 28404) (17770, 28368)
(17770, 28368) (17756, 28220)
(17756, 28220) (17708, 28212)
(17708, 28212) (17707, 28173)
(17707, 28173) (17697, 28172)
(17697, 28172) (17696, 28161)
(17696, 28161) (17688, 28161)
(17688, 28161) (17688, 28156)
(17688, 28156) (17688, 28156)


In [24]:
user_mapping = {}
item_mapping = {}
tmp_user_history = defaultdict(list)
tmp_item_history = defaultdict(list)

for raw_user_id, history in tqdm(user_history.items()):
    if len(history) >= threshold:
        processed_history = []

        for filtered_item_event in history:
            raw_item_id = filtered_item_event['item_id']
            item_timestamp = filtered_item_event['timestamp']

            processed_item_id = item_mapping.get(raw_item_id, len(item_mapping) + 1)
            item_mapping[raw_item_id] = processed_item_id

            processed_history.append({'item_id': processed_item_id, 'timestamp': item_timestamp})
        
        processed_user_id = user_mapping.get(raw_user_id, len(user_mapping) + 1)
        user_mapping[raw_user_id] = processed_user_id

        tmp_user_history[processed_user_id] = sorted(processed_history, key=lambda x: x['timestamp'])

    
for raw_item_id, history in tqdm(item_history.items()):
    if len(history) >= threshold:
        processed_history = []

        for filtered_user_event in history:
            raw_user_id = filtered_user_event['user_id']
            user_timestamp = filtered_user_event['timestamp']

            processed_user_id = user_mapping.get(raw_user_id, len(user_mapping) + 1)
            user_mapping[raw_user_id] = processed_user_id

            processed_history.append({'user_id': processed_user_id, 'timestamp': user_timestamp})


        processed_item_id = item_mapping.get(raw_item_id, len(item_mapping) + 1)
        item_mapping[raw_item_id] = processed_item_id

        tmp_item_history[processed_item_id] = sorted(processed_history, key=lambda x: x['timestamp'])

train_user_history = tmp_user_history
train_item_history = tmp_item_history

100%|█████████████████████████████████████████████████████████████████████████| 94019/94019 [00:01<00:00, 68727.49it/s]
100%|████████████████████████████████████████████████████████████████████| 1096278/1096278 [00:02<00:00, 506195.13it/s]


In [25]:
print('Users count:', len(user_mapping))
print('Items count:', len(item_mapping))
print('Actions count:', sum(list(map(lambda x: len(x), user_history.values()))))
print('Avg train user history len:', np.mean(list(map(lambda x: len(x), train_user_history.values()))))
print('Avg train item history len:', np.mean(list(map(lambda x: len(x), train_item_history.values()))))

Users count: 17688
Items count: 28156
Actions count: 1606516
Avg train user history len: 76.83485979194934
Avg train item history len: 48.268752663730645


In [26]:
train_data_filtered = train_data[train_data.user_id.isin(user_mapping) & train_data.place_id.isin(item_mapping)]
validation_data_filtered = validation_data[validation_data.user_id.isin(user_mapping) & validation_data.place_id.isin(item_mapping)]
test_data_filtered = test_data[test_data.user_id.isin(user_mapping) & test_data.place_id.isin(item_mapping)]

print(f'Train data. Before: {train_data.shape}. After: {train_data_filtered.shape}')
print(f'Validation data. Before: {validation_data.shape}. After: {validation_data_filtered.shape}')
print(f'Test data. Before: {test_data.shape}. After: {test_data_filtered.shape}')

Train data. Before: (5154313, 5). After: (1359055, 5)
Validation data. Before: (644287, 5). After: (87235, 5)
Test data. Before: (644292, 5). After: (67078, 5)


In [27]:
# Save train data (TODO do we need to duplicate: I think yes)
with open('../data/Gowalla/train_new.txt', 'w') as f:
    cnt = 0
    cnt_added = 0
    for user_id, history in train_user_history.items():
        previous_history = []
        
        for item_event in sorted(history, key=lambda x: x['timestamp']):
            if len(previous_history) + 1 >= threshold:
                f.write(' '.join([str(user_id)] + previous_history + [str(item_event['item_id'])]))
                f.write('\n')
                cnt_added += 1
            
            previous_history.append(str(item_event['item_id']))
            cnt += 1
            
    assert cnt == train_data_filtered.shape[0]
    assert cnt == cnt_added + (len(user_mapping) * (threshold - 1))
print(cnt, train_data_filtered.shape, len(train_user_history))

1359055 (1359055, 5) 17688


In [28]:
validation_list = []

for _, row in tqdm(validation_data_filtered.iterrows()):
    validation_list.append({
        'user_id': int(row.user_id),
        'item_id': int(row.place_id),
        'timestamp': int(row.timestamp)
    })
validation_list = sorted(validation_list, key=lambda x: x['timestamp'])
print(len(validation_list))


# Add events to `validation_user_history`
validation_user_history = defaultdict(list)
for row in tqdm(validation_list):
    user_raw_id = row['user_id']
    item_raw_id = row['item_id']
    interaction_timestamp = row['timestamp']
    validation_user_history[user_raw_id].append({'item_id': item_raw_id, 'timestamp': interaction_timestamp})

# Re-number user and item IDs
tmp_user_history = defaultdict(list)
for user_id, history in tqdm(validation_user_history.items()):
    processed_user_id = user_mapping[user_id]
    
    processed_history = []
    for item_event in history:
        item_id = item_event['item_id']
        item_timestamp = item_event['timestamp']
        processed_item_id = item_mapping[item_id]
        processed_history.append({'item_id': processed_item_id, 'timestamp': item_timestamp})
        
    tmp_user_history[processed_user_id] = sorted(processed_history, key=lambda x: x['timestamp'])
validation_user_history = tmp_user_history

87235it [00:03, 22316.50it/s]
100%|████████████████████████████████████████████████████████████████████████| 87235/87235 [00:00<00:00, 890154.78it/s]
  0%|                                                                                         | 0/9669 [00:00<?, ?it/s]

87235


100%|██████████████████████████████████████████████████████████████████████████| 9669/9669 [00:00<00:00, 100720.55it/s]


In [29]:
# Save validation data
with open('../data/Gowalla/validation_new.txt', 'w') as f:
    cnt = 0
    for user_id, validation_history in validation_user_history.items():
        previous_history = [
            str(event['item_id']) for event in sorted(train_user_history[user_id], key=lambda x: x['timestamp'])
        ]
        assert len(previous_history) > 0
        
        for validation_item_event in  sorted(validation_history, key=lambda x: x['timestamp']):
            assert len(previous_history) + 1 >= threshold
            
            f.write(' '.join([str(user_id)] + previous_history + [str(validation_item_event['item_id'])]))
            f.write('\n')
            
            previous_history.append(str(validation_item_event['item_id']))
            cnt += 1
            
    assert cnt == validation_data_filtered.shape[0]

In [30]:
test_list = []

for _, row in tqdm(test_data_filtered.iterrows()):
    test_list.append({
        'user_id': int(row.user_id),
        'item_id': int(row.place_id),
        'timestamp': int(row.timestamp)
    })
test_list = sorted(test_list, key=lambda x: x['timestamp'])
print(len(test_list))

# Add events to `test_user_history`
test_user_history = defaultdict(list)
for row in tqdm(test_list):
    user_raw_id = row['user_id']
    item_raw_id = row['item_id']
    interaction_timestamp = row['timestamp']
    test_user_history[user_raw_id].append({'item_id': item_raw_id, 'timestamp': interaction_timestamp})

# Re-number user and item IDs
tmp_user_history = defaultdict(list)
for user_id, history in tqdm(test_user_history.items()):
    processed_user_id = user_mapping[user_id]
    
    processed_history = []
    for item_event in history:
        item_id = item_event['item_id']
        item_timestamp = item_event['timestamp']
        processed_item_id = item_mapping[item_id]
        processed_history.append({'item_id': processed_item_id, 'timestamp': item_timestamp})
        
    tmp_user_history[processed_user_id] = sorted(processed_history, key=lambda x: x['timestamp'])
test_user_history = tmp_user_history

67078it [00:03, 22336.93it/s]
100%|████████████████████████████████████████████████████████████████████████| 67078/67078 [00:00<00:00, 931647.80it/s]
100%|██████████████████████████████████████████████████████████████████████████| 8750/8750 [00:00<00:00, 104169.19it/s]

67078





In [31]:
# Save test data
with open('../data/Gowalla/test_new.txt', 'w') as f:
    cnt = 0
    for user_id, test_history in test_user_history.items():
        train_history = [
            str(event['item_id']) for event in sorted(train_user_history[user_id], key=lambda x: x['timestamp'])
        ]
        validation_history = [
            str(event['item_id']) for event in sorted(validation_user_history[user_id], key=lambda x: x['timestamp'])
        ]
        previous_history = train_history + validation_history
        assert len(train_history) > 0
        
        for test_item_event in sorted(test_history, key=lambda x: x['timestamp']):
            f.write(' '.join([str(user_id)] + previous_history + [str(test_item_event['item_id'])]))
            f.write('\n')
            
            previous_history.append(str(test_item_event['item_id']))
            cnt += 1
            
    assert cnt == test_data_filtered.shape[0]

## All data

In [26]:
data = []

for _, row in tqdm(checkins_df.iterrows()):
    data.append({
        'user_id': int(row.user_id),
        'item_id': int(row.place_id),
        'timestamp': int(row.timestamp)
    })

print(len(data))

6442892it [05:49, 18441.06it/s]

6442892





In [27]:
user_history = defaultdict(list)
item_history = defaultdict(list)

for row in tqdm(data):
    user_raw_id = row['user_id']
    item_raw_id = row['item_id']
    interaction_timestamp = row['timestamp']
    
    user_history[user_raw_id].append({'item_id': item_raw_id, 'timestamp': interaction_timestamp})
    item_history[item_raw_id].append({'user_id': user_raw_id, 'timestamp': interaction_timestamp})


is_changed = True
threshold = 20
good_users = set()
good_items = set()


while is_changed:
    old_state = (len(good_users), len(good_items))
    
    good_users = set()
    good_items = set()

    for user_id, history in user_history.items():
        if len(history) >= threshold:
            good_users.add(user_id)

    for item_id, history in item_history.items():
        if len(history) >= threshold:
            good_items.add(item_id)
    
    user_history = {
        user_id: list(filter(lambda x: x['item_id'] in good_items, history))
        for user_id, history in user_history.items()
    }
    
    item_history = {
        item_id: list(filter(lambda x: x['user_id'] in good_users, history))
        for item_id, history in item_history.items()
    }
    
    new_state = (len(good_users), len(good_items))
    is_changed = (old_state != new_state)
    print(old_state, new_state)

100%|█████████████████████████████████████████████████████████████████████| 6442892/6442892 [01:21<00:00, 79494.54it/s]


(0, 0) (60562, 49750)
(60562, 49750) (26336, 47198)
(26336, 47198) (25811, 38979)
(25811, 38979) (23856, 38730)
(23856, 38730) (23797, 37646)
(23797, 37646) (23509, 37610)
(23509, 37610) (23499, 37452)
(23499, 37452) (23443, 37451)
(23443, 37451) (23442, 37414)
(23442, 37414) (23431, 37414)
(23431, 37414) (23431, 37404)
(23431, 37404) (23429, 37404)
(23429, 37404) (23429, 37402)
(23429, 37402) (23427, 37402)
(23427, 37402) (23427, 37398)
(23427, 37398) (23427, 37398)


In [28]:
user_mapping = {}
item_mapping = {}
tmp_user_history = defaultdict(list)
tmp_item_history = defaultdict(list)

for user_id, history in tqdm(user_history.items()):
    processed_history = []

    for filtered_item in history:
        item_id = filtered_item['item_id']
        item_timestamp = filtered_item['timestamp']

        processed_item_id = item_mapping.get(item_id, len(item_mapping) + 1)
        item_mapping[item_id] = processed_item_id

        processed_history.append({'item_id': processed_item_id, 'timestamp': item_timestamp})
        
    if len(processed_history) >= threshold:
        processed_user_id = user_mapping.get(user_id, len(user_mapping) + 1)
        user_mapping[user_id] = processed_user_id

        tmp_user_history[processed_user_id] = sorted(processed_history, key=lambda x: x['timestamp'])

    
for item_id, history in tqdm(item_history.items()):
    processed_history = []

    for filtered_user in history:
        user_id = filtered_user['user_id']
        user_timestamp = filtered_user['timestamp']

        processed_user_id = user_mapping.get(user_id, len(user_mapping) + 1)
        user_mapping[user_id] = processed_user_id

        processed_history.append({'user_id': processed_user_id, 'timestamp': user_timestamp})

    if len(processed_history) >= threshold:
        processed_item_id = item_mapping.get(item_id, len(item_mapping) + 1)
        item_mapping[item_id] = processed_item_id

        tmp_item_history[processed_item_id] = sorted(processed_history, key=lambda x: x['timestamp'])

user_history = tmp_user_history
item_history = tmp_item_history

100%|███████████████████████████████████████████████████████████████████████| 107092/107092 [00:03<00:00, 35007.23it/s]
100%|████████████████████████████████████████████████████████████████████| 1280969/1280969 [00:06<00:00, 183525.04it/s]


In [29]:
print('Users count:', len(user_mapping))
print('Items count:', len(item_mapping))
print('Actions count:', sum(list(map(lambda x: len(x), user_history.values()))))
print('Avg user history len:', np.mean(list(map(lambda x: len(x), user_history.values()))))
print('Avg item history len:', np.mean(list(map(lambda x: len(x), item_history.values()))))

Users count: 23427
Items count: 37398
Actions count: 1849431
Avg user history len: 78.94442310154949
Avg item history len: 49.452671265843094


In [30]:
with open('../data/Gowalla/all_data.txt', 'w') as f:
    for user_id, item_history in user_history.items():
        f.write(' '.join([str(user_id)] + [
            str(item_event['item_id']) for item_event in sorted(item_history, key=lambda x: x['timestamp'])
        ]))
        f.write('\n')