In [1]:
import os
import sys
import json
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import defaultdict, Counter

%matplotlib inline


project = 'Douban'
path_to_raw = '../data/'+project+'/raw/douban_dataset(text information)/'
path_to_processed = '../data/'+project+'/'

In [2]:
# this cell is required for colab only

from google.colab import drive
drive.mount('/content/drive')

path_to_raw = './drive/MyDrive/Science/Multidomain RecSys/' + path_to_raw[3:]
path_to_processed = './drive/MyDrive/Science/Multidomain RecSys/source/iz-dev/' + path_to_processed[3:]

Mounted at /content/drive


In [3]:
def create_all_data(df, output_path, output_name='all_data.txt', convert_dtypes=False):
    # part 1
    data = []
    for _, row in tqdm(df.iterrows()):
        data.append({
            'user_id': int(row.user_id) if convert_dtypes else row.user_id,
            'item_id':  int(row.item_id) if convert_dtypes else row.item_id,
            'timestamp': int(row.timestamp) if convert_dtypes else row.timestamp
        })


    # part 2
    user_history = defaultdict(list)
    item_history = defaultdict(list)

    for row in tqdm(data):
        user_raw_id = row['user_id']
        item_raw_id = row['item_id']
        interaction_timestamp = row['timestamp']

        user_history[user_raw_id].append({'item_id': item_raw_id, 'timestamp': interaction_timestamp})
        item_history[item_raw_id].append({'user_id': user_raw_id, 'timestamp': interaction_timestamp})


    # part 3
    is_changed = True
    threshold = 5
    good_users = set()
    good_items = set()

    while is_changed:
        old_state = (len(good_users), len(good_items))

        good_users = set()
        good_items = set()

        for user_id, history in user_history.items():
            if len(history) >= threshold:
                good_users.add(user_id)

        for item_id, history in item_history.items():
            if len(history) >= threshold:
                good_items.add(item_id)

        user_history = {
            user_id: list(filter(lambda x: x['item_id'] in good_items, history))
            for user_id, history in user_history.items()
        }

        item_history = {
            item_id: list(filter(lambda x: x['user_id'] in good_users, history))
            for item_id, history in item_history.items()
        }

        new_state = (len(good_users), len(good_items))
        is_changed = (old_state != new_state)
        print(old_state, new_state)


    # part 4
    user_mapping = {}
    item_mapping = {}
    tmp_user_history = defaultdict(list)
    tmp_item_history = defaultdict(list)

    for user_id, history in tqdm(user_history.items()):
        processed_history = []

        for filtered_item in history:
            item_id = filtered_item['item_id']
            item_timestamp = filtered_item['timestamp']

            processed_item_id = item_mapping.get(item_id, len(item_mapping) + 1)
            item_mapping[item_id] = processed_item_id

            processed_history.append({'item_id': processed_item_id, 'timestamp': item_timestamp})

        if len(processed_history) >= threshold:
            processed_user_id = user_mapping.get(user_id, len(user_mapping) + 1)
            user_mapping[user_id] = processed_user_id

            tmp_user_history[processed_user_id] = sorted(processed_history, key=lambda x: x['timestamp'])


    for item_id, history in tqdm(item_history.items()):
        processed_history = []

        for filtered_user in history:
            user_id = filtered_user['user_id']
            user_timestamp = filtered_user['timestamp']

            processed_user_id = user_mapping.get(user_id, len(user_mapping) + 1)
            user_mapping[user_id] = processed_user_id

            processed_history.append({'user_id': processed_user_id, 'timestamp': user_timestamp})

        if len(processed_history) >= threshold:
            processed_item_id = item_mapping.get(item_id, len(item_mapping) + 1)
            item_mapping[item_id] = processed_item_id

            tmp_item_history[processed_item_id] = sorted(processed_history, key=lambda x: x['timestamp'])

    user_history = tmp_user_history
    item_history = tmp_item_history

    # part 5
    with open(output_path+'/'+output_name, 'w') as f:
        for user_id, item_history in user_history.items():
            f.write(' '.join([str(user_id)] + [
                str(item_event['item_id']) for item_event in sorted(item_history, key=lambda x: x['timestamp'])
            ]))
            f.write('\n')

In [4]:
sep = '\t'
domains = ['books','movies','music']
files = ['bookreviews_cleaned.txt','moviereviews_cleaned.txt','musicreviews_cleaned.txt']

for d,f in tqdm(zip(domains,files)):
    cols = ['user_id', d.strip('s')+'_id', 'time']
    df = pd.read_csv(path_to_raw+f, sep=sep)[cols]
    df.columns = ['user_id','item_id','timestamp']

    new_dir = path_to_processed + d #.capitalize()
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)

    output_path = path_to_processed + d
    _ = create_all_data(df, output_path)

0it [00:00, ?it/s]
0it [00:00, ?it/s][A
1188it [00:00, 11874.04it/s][A
2514it [00:00, 12686.43it/s][A
3899it [00:00, 13216.89it/s][A
5420it [00:00, 14002.69it/s][A
6821it [00:00, 13957.83it/s][A
8217it [00:00, 13746.15it/s][A
9643it [00:00, 13911.14it/s][A
11146it [00:00, 14262.25it/s][A
12573it [00:00, 14208.45it/s][A
14055it [00:01, 14393.81it/s][A
15528it [00:01, 14495.34it/s][A
16978it [00:01, 14115.04it/s][A
18393it [00:01, 14122.83it/s][A
19895it [00:01, 14389.15it/s][A
21336it [00:01, 14359.82it/s][A
22786it [00:01, 14399.70it/s][A
24227it [00:01, 14248.53it/s][A
25677it [00:01, 14320.76it/s][A
27110it [00:01, 14076.56it/s][A
28598it [00:02, 14311.55it/s][A
30031it [00:02, 14248.30it/s][A
31518it [00:02, 14430.33it/s][A
32962it [00:02, 14178.76it/s][A
34400it [00:02, 14235.62it/s][A
35825it [00:02, 13740.71it/s][A
37204it [00:02, 11903.20it/s][A
38438it [00:02, 10920.94it/s][A
39570it [00:02, 10450.96it/s][A
40642it [00:03, 9827.60it/s] [A
41645it [

(0, 0) (1888, 8660)
(1888, 8660) (1715, 8643)
(1715, 8643) (1714, 8601)
(1714, 8601) (1713, 8601)
(1713, 8601) (1713, 8601)



  0%|          | 0/2212 [00:00<?, ?it/s][A
 13%|█▎        | 290/2212 [00:00<00:01, 1547.99it/s][A
 51%|█████     | 1129/2212 [00:00<00:00, 4507.63it/s][A
100%|██████████| 2212/2212 [00:00<00:00, 5355.01it/s]

  0%|          | 0/95872 [00:00<?, ?it/s][A
  6%|▌         | 5381/95872 [00:00<00:01, 53804.54it/s][A
 13%|█▎        | 12339/95872 [00:00<00:01, 63080.95it/s][A
 26%|██▌       | 24986/95872 [00:00<00:00, 92019.88it/s][A
 48%|████▊     | 45929/95872 [00:00<00:00, 138364.35it/s][A
100%|██████████| 95872/95872 [00:00<00:00, 173602.94it/s]
1it [00:27, 27.44s/it]
0it [00:00, ?it/s][A
304it [00:00, 3039.14it/s][A
1745it [00:00, 9723.49it/s][A
3160it [00:00, 11741.73it/s][A
4407it [00:00, 12028.43it/s][A
5814it [00:00, 12762.73it/s][A
7123it [00:00, 12872.07it/s][A
8565it [00:00, 13377.81it/s][A
10012it [00:00, 13725.07it/s][A
11452it [00:00, 13933.37it/s][A
12928it [00:01, 14187.05it/s][A
14432it [00:01, 14447.60it/s][A
15931it [00:01, 14611.34it/s][A
17393it [00:0

(0, 0) (2630, 20964)
(2630, 20964) (2628, 20964)
(2628, 20964) (2628, 20964)



  0%|          | 0/2712 [00:00<?, ?it/s][A
  1%|▏         | 37/2712 [00:00<00:27, 97.50it/s][A
  4%|▍         | 115/2712 [00:00<00:09, 284.15it/s][A
  7%|▋         | 203/2712 [00:00<00:05, 450.14it/s][A
 12%|█▏        | 320/2712 [00:00<00:03, 654.00it/s][A
 20%|██        | 543/2712 [00:00<00:01, 1109.23it/s][A
 25%|██▌       | 679/2712 [00:00<00:01, 1150.06it/s][A
 30%|███       | 827/2712 [00:00<00:01, 1243.36it/s][A
 37%|███▋      | 1006/2712 [00:01<00:01, 1398.46it/s][A
 44%|████▍     | 1192/2712 [00:01<00:00, 1533.76it/s][A
 51%|█████     | 1383/2712 [00:01<00:00, 1643.95it/s][A
 62%|██████▏   | 1676/2712 [00:01<00:00, 2021.30it/s][A
 69%|██████▉   | 1884/2712 [00:01<00:00, 1923.12it/s][A
 77%|███████▋  | 2081/2712 [00:01<00:00, 1693.99it/s][A
 83%|████████▎ | 2258/2712 [00:01<00:00, 1579.07it/s][A
100%|██████████| 2712/2712 [00:01<00:00, 1357.85it/s]

  0%|          | 0/34893 [00:00<?, ?it/s][A
  0%|          | 159/34893 [00:00<00:22, 1567.33it/s][A
  1%|        

(0, 0) (1358, 7146)
(1358, 7146) (1193, 7107)
(1193, 7107) (1192, 7066)
(1192, 7066) (1190, 7065)
(1190, 7065) (1190, 7063)
(1190, 7063) (1190, 7063)



  0%|          | 0/1820 [00:00<?, ?it/s][A
100%|██████████| 1820/1820 [00:00<00:00, 8135.57it/s]

  0%|          | 0/79878 [00:00<?, ?it/s][A
 15%|█▍        | 11855/79878 [00:00<00:00, 118544.74it/s][A
100%|██████████| 79878/79878 [00:00<00:00, 297928.32it/s]
3it [03:02, 60.91s/it]
