In [16]:
import os
import sys
import json
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import defaultdict, Counter

%matplotlib inline

project = 'AliCCP'
path_to_raw = '../data/'+project+'/raw/'
path_to_processed = '../data/'+project+'/'

In [17]:
# this cell is required for colab only

from google.colab import drive
drive.mount('/content/drive')

path_to_raw = './drive/MyDrive/Science/Multidomain RecSys/' + path_to_raw[3:]
path_to_processed = './drive/MyDrive/Science/Multidomain RecSys/source/' + path_to_processed[3:]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# version 3

def create_all_data(df, output_path, output_name='all_data.txt', convert_dtypes=False):
    # part 1
    data = []
    for _, row in tqdm(df.iterrows()):
        data.append({
            'user_id': int(row.user_id) if convert_dtypes else row.user_id,
            'item_id':  int(row.item_id) if convert_dtypes else row.item_id,
            'timestamp': int(row.timestamp) if convert_dtypes else row.timestamp
        })


    # part 2
    user_history = defaultdict(list)
    item_history = defaultdict(list)

    for row in tqdm(data):
        user_raw_id = row['user_id']
        item_raw_id = row['item_id']
        interaction_timestamp = row['timestamp']

        user_history[user_raw_id].append({'item_id': item_raw_id, 'timestamp': interaction_timestamp})
        item_history[item_raw_id].append({'user_id': user_raw_id, 'timestamp': interaction_timestamp})


    # part 3
    is_changed = True
    threshold = 5
    good_users = set()
    good_items = set()

    while is_changed:
        old_state = (len(good_users), len(good_items))

        good_users = set()
        good_items = set()

        for user_id, history in user_history.items():
            if len(history) >= threshold:
                good_users.add(user_id)

        for item_id, history in item_history.items():
            if len(history) >= threshold:
                good_items.add(item_id)

        user_history = {
            user_id: list(filter(lambda x: x['item_id'] in good_items, history))
            for user_id, history in user_history.items()
        }

        item_history = {
            item_id: list(filter(lambda x: x['user_id'] in good_users, history))
            for item_id, history in item_history.items()
        }

        new_state = (len(good_users), len(good_items))
        is_changed = (old_state != new_state)
        print(old_state, new_state)


    # part 4
    user_mapping = {}
    item_mapping = {}
    tmp_user_history = defaultdict(list)
    tmp_item_history = defaultdict(list)

    for user_id, history in tqdm(user_history.items()):
        processed_history = []

        for filtered_item in history:
            item_id = filtered_item['item_id']
            item_timestamp = filtered_item['timestamp']

            processed_item_id = item_mapping.get(item_id, len(item_mapping) + 1)
            item_mapping[item_id] = processed_item_id

            processed_history.append({'item_id': processed_item_id, 'timestamp': item_timestamp})

        if len(processed_history) >= threshold:
            processed_user_id = user_mapping.get(user_id, len(user_mapping) + 1)
            user_mapping[user_id] = processed_user_id

            tmp_user_history[processed_user_id] = sorted(processed_history, key=lambda x: x['timestamp'])


    for item_id, history in tqdm(item_history.items()):
        processed_history = []

        for filtered_user in history:
            user_id = filtered_user['user_id']
            user_timestamp = filtered_user['timestamp']

            processed_user_id = user_mapping.get(user_id, len(user_mapping) + 1)
            user_mapping[user_id] = processed_user_id

            processed_history.append({'user_id': processed_user_id, 'timestamp': user_timestamp})

        if len(processed_history) >= threshold:
            processed_item_id = item_mapping.get(item_id, len(item_mapping) + 1)
            item_mapping[item_id] = processed_item_id

            tmp_item_history[processed_item_id] = sorted(processed_history, key=lambda x: x['timestamp'])

    user_history = tmp_user_history
    item_history = tmp_item_history

    # part 5
    with open(output_path+'/'+output_name, 'w') as f:
        for user_id, item_history in user_history.items():
            f.write(' '.join([str(user_id)] + [
                str(item_event['item_id']) for item_event in sorted(item_history, key=lambda x: x['timestamp'])
            ]))
            f.write('\n')

In [23]:
files = [f for f in os.listdir(path_to_raw) if '.csv' in f]
domains = dict()
for f in files:
    k = f.split('_')[0]
    domains[k] = domains.get(k, []) + [f]

for d in tqdm(domains):
    new_dir = path_to_processed + d
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)

    files = []
    for f in domains[d]:
        df = pd.read_csv(path_to_raw+f, names = ['user_id', 'item_id', 'rating']).drop('rating', axis=1)
        files.append(df)

    df = pd.concat(files, axis=0)
    df['timestamp'] = pd.Series(range(1,df.shape[0]+1))

    _ = create_all_data(df, new_dir)

  0%|          | 0/3 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
2065it [00:00, 15664.34it/s]

100%|██████████| 2065/2065 [00:00<00:00, 247761.25it/s]


(0, 0) (0, 163)
(0, 163) (0, 0)
(0, 0) (0, 0)



100%|██████████| 1833/1833 [00:00<00:00, 853102.44it/s]

100%|██████████| 450/450 [00:00<00:00, 911805.22it/s]
 33%|███▎      | 1/3 [00:01<00:03,  1.79s/it]
0it [00:00, ?it/s][A
1651it [00:00, 16499.91it/s][A
3301it [00:00, 16135.15it/s][A
4916it [00:00, 15827.30it/s][A
6500it [00:00, 15370.57it/s][A
8103it [00:00, 15600.41it/s][A
9665it [00:00, 15126.95it/s][A
11190it [00:00, 15163.85it/s][A
12832it [00:00, 15551.85it/s][A
14478it [00:00, 15829.83it/s][A
16064it [00:01, 15627.73it/s][A
17629it [00:01, 15600.69it/s][A
19191it [00:01, 15393.21it/s][A
20742it [00:01, 15425.75it/s][A
22286it [00:01, 15295.90it/s][A
23880it [00:01, 15484.86it/s][A
25430it [00:01, 14854.19it/s][A
26998it [00:01, 15091.59it/s][A
28608it [00:01, 15385.60it/s][A
30151it [00:02, 10073.06it/s][A
31645it [00:02, 11124.09it/s][A
33144it [00:02, 12034.70it/s][A
34740it [00:02, 13023.06it/s][A
36367it [00:02, 13881.02it/s][A
37864it [00:02, 14103.14it/s][A
39352it [00:02, 10533.32it/s][A
4

(0, 0) (11658, 12203)
(11658, 12203) (9818, 9219)
(9818, 9219) (8131, 8386)
(8131, 8386) (7618, 7443)
(7618, 7443) (6940, 7138)
(6940, 7138) (6709, 6681)
(6709, 6681) (6353, 6523)
(6353, 6523) (6230, 6276)
(6230, 6276) (6038, 6177)
(6038, 6177) (5949, 6042)
(5949, 6042) (5834, 5983)
(5834, 5983) (5794, 5899)
(5794, 5899) (5718, 5876)
(5718, 5876) (5695, 5819)
(5695, 5819) (5658, 5807)
(5658, 5807) (5646, 5781)
(5646, 5781) (5621, 5766)
(5621, 5766) (5605, 5751)
(5605, 5751) (5597, 5741)
(5597, 5741) (5586, 5735)
(5586, 5735) (5578, 5724)
(5578, 5724) (5566, 5715)
(5566, 5715) (5557, 5708)
(5557, 5708) (5549, 5697)
(5549, 5697) (5544, 5692)
(5544, 5692) (5539, 5690)
(5539, 5690) (5538, 5685)
(5538, 5685) (5535, 5684)
(5535, 5684) (5535, 5679)
(5535, 5679) (5530, 5679)
(5530, 5679) (5530, 5676)
(5530, 5676) (5529, 5676)
(5529, 5676) (5529, 5675)
(5529, 5675) (5529, 5675)



  0%|          | 0/23492 [00:00<?, ?it/s][A
100%|██████████| 23492/23492 [00:00<00:00, 94987.83it/s]

  0%|          | 0/18149 [00:00<?, ?it/s][A
100%|██████████| 18149/18149 [00:00<00:00, 144699.90it/s]
 67%|██████▋   | 2/3 [00:25<00:14, 14.56s/it]
0it [00:00, ?it/s][A
811it [00:00, 8108.42it/s][A
1699it [00:00, 8557.42it/s][A
2589it [00:00, 8710.43it/s][A
3461it [00:00, 8656.50it/s][A
4351it [00:00, 8742.80it/s][A
5302it [00:00, 9001.34it/s][A
6203it [00:00, 9002.61it/s][A
7212it [00:00, 9346.78it/s][A
8187it [00:00, 9470.03it/s][A
9184it [00:01, 9623.79it/s][A
10190it [00:01, 9755.63it/s][A
11166it [00:01, 9734.89it/s][A
12174it [00:01, 9837.13it/s][A
13158it [00:01, 9772.29it/s][A
14136it [00:01, 9695.88it/s][A
15169it [00:01, 9884.67it/s][A
16158it [00:01, 9764.03it/s][A
17163it [00:01, 9848.01it/s][A
18149it [00:01, 9654.03it/s][A
19116it [00:02, 9571.90it/s][A
20074it [00:02, 9437.74it/s][A
21019it [00:02, 9415.54it/s][A
21961it [00:02, 9367.96it/s][A


(0, 0) (19304, 19038)
(19304, 19038) (17189, 17426)
(17189, 17426) (16511, 16558)
(16511, 16558) (16089, 16243)
(16089, 16243) (15939, 16030)
(15939, 16030) (15801, 15940)
(15801, 15940) (15750, 15861)
(15750, 15861) (15704, 15830)
(15704, 15830) (15681, 15805)
(15681, 15805) (15660, 15791)
(15660, 15791) (15645, 15777)
(15645, 15777) (15635, 15771)
(15635, 15771) (15633, 15765)
(15633, 15765) (15629, 15763)
(15629, 15763) (15628, 15757)
(15628, 15757) (15623, 15755)
(15623, 15755) (15622, 15751)
(15622, 15751) (15621, 15750)
(15621, 15750) (15619, 15748)
(15619, 15748) (15618, 15747)
(15618, 15747) (15618, 15747)



  0%|          | 0/25464 [00:00<?, ?it/s][A
 20%|██        | 5199/25464 [00:00<00:00, 51856.03it/s][A
 44%|████▍     | 11315/25464 [00:00<00:00, 57316.10it/s][A
100%|██████████| 25464/25464 [00:00<00:00, 73792.47it/s]

  0%|          | 0/27122 [00:00<?, ?it/s][A
 18%|█▊        | 4956/27122 [00:00<00:00, 49543.98it/s][A
 41%|████      | 11023/27122 [00:00<00:00, 56083.53it/s][A
100%|██████████| 27122/27122 [00:00<00:00, 73432.52it/s]
100%|██████████| 3/3 [00:57<00:00, 19.07s/it]


In [22]:
domains

{'d1': ['d1_val.csv', 'd1_train.csv', 'd1_test.csv'],
 'd0': ['d0_val.csv', 'd0_test.csv', 'd0_train.csv'],
 'd2': ['d2_val.csv', 'd2_test.csv', 'd2_train.csv']}