In [None]:
import os
import sys
import json
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import defaultdict, Counter

%matplotlib inline

# Dataset name; it selects the sub-folder of ../data/ that the notebook works on.
project = 'AliAd'

# Raw input files live under <project>/raw/, processed outputs directly under <project>/.
path_to_processed = f'../data/{project}/'
path_to_raw = f'../data/{project}/raw/'

In [None]:
# this cell is required for colab only
#
# Mounts Google Drive and re-points path_to_raw / path_to_processed (defined in
# the configuration cell) at the project folder on Drive.

from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): the paths below are relative ('./drive/...') while the mount
# point is '/content/drive', so this relies on the working directory being
# /content -- confirm.
colab_root = './drive/MyDrive/Science/Multidomain RecSys/'

# Only rewrite a path while it still has its local '../' prefix. Slicing
# unconditionally would corrupt the value if this cell were executed twice.
if path_to_raw.startswith('../'):
    path_to_raw = colab_root + path_to_raw[3:]

# NOTE(review): processed data goes under '<root>/source/' while raw data is
# read from '<root>/' directly; kept exactly as in the original -- confirm this
# asymmetry is intended.
if path_to_processed.startswith('../'):
    path_to_processed = colab_root + 'source/' + path_to_processed[3:]

Mounted at /content/drive


In [None]:
def create_all_data(df, output_path, output_name='all_data.txt', convert_dtypes=False, threshold=5):
    """Filter sparse users/items out of an interaction log and write user sequences.

    The function
      1. groups the rows of ``df`` into per-user and per-item histories;
      2. repeatedly drops users and items that have fewer than ``threshold``
         remaining interactions, until the number of surviving users and items
         stops changing (the before/after counts are printed on every pass);
      3. renumbers the surviving users and items with consecutive integers
         starting at 1, in order of first appearance;
      4. writes one line per surviving user to ``output_path/output_name``:
         the new user id followed by that user's new item ids ordered by
         timestamp, separated by single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``user_id``, ``item_id`` and ``timestamp``.
    output_path : str
        Directory in which the output file is created (it must already exist).
    output_name : str, default 'all_data.txt'
        Name of the output file.
    convert_dtypes : bool, default False
        If True, every user id, item id and timestamp is passed through
        ``int()`` before use.
    threshold : int, default 5
        Minimum number of interactions a user / an item must keep to survive.

    Returns
    -------
    None
        The result is only written to disk.
    """
    cast = int if convert_dtypes else (lambda value: value)

    # part 1: build both histories in a single pass over the three columns.
    # Iterating the columns directly avoids DataFrame.iterrows(), which builds
    # a Series per row and is far slower.
    user_history = defaultdict(list)
    item_history = defaultdict(list)

    records = zip(df['user_id'], df['item_id'], df['timestamp'])
    for raw_user, raw_item, raw_timestamp in tqdm(records, total=len(df)):
        user_raw_id = cast(raw_user)
        item_raw_id = cast(raw_item)
        interaction_timestamp = cast(raw_timestamp)

        user_history[user_raw_id].append({'item_id': item_raw_id, 'timestamp': interaction_timestamp})
        item_history[item_raw_id].append({'user_id': user_raw_id, 'timestamp': interaction_timestamp})

    # part 2: iterative filtering. Histories only ever shrink, so equal counts
    # on two consecutive passes mean the surviving sets did not change.
    good_users = set()
    good_items = set()
    is_changed = True

    while is_changed:
        old_state = (len(good_users), len(good_items))

        good_users = {
            user_id for user_id, history in user_history.items()
            if len(history) >= threshold
        }
        good_items = {
            item_id for item_id, history in item_history.items()
            if len(history) >= threshold
        }

        user_history = {
            user_id: [event for event in history if event['item_id'] in good_items]
            for user_id, history in user_history.items()
        }
        item_history = {
            item_id: [event for event in history if event['user_id'] in good_users]
            for item_id, history in item_history.items()
        }

        new_state = (len(good_users), len(good_items))
        is_changed = (old_state != new_state)
        print(old_state, new_state)

    # part 3: renumber items and users and order every kept history by time.
    # Item ids are handed out while walking *all* users (including the ones
    # that end up being dropped) so that the numbering stays identical to what
    # this function has always produced.
    item_mapping = {}
    sequences = {}

    for user_id, history in tqdm(user_history.items()):
        processed_history = []

        for event in history:
            # The default is evaluated before insertion, so a new item gets
            # the next free id: len(item_mapping) + 1.
            processed_item_id = item_mapping.setdefault(event['item_id'], len(item_mapping) + 1)
            processed_history.append((event['timestamp'], processed_item_id))

        if len(processed_history) >= threshold:
            # Sort on the timestamp only; the stable sort keeps the input
            # order for events that share a timestamp.
            processed_history.sort(key=lambda pair: pair[0])
            sequences[len(sequences) + 1] = [item for _, item in processed_history]

    # part 4: one line per user: "<user> <item> <item> ..."
    with open(os.path.join(output_path, output_name), 'w') as f:
        for processed_user_id, items in sequences.items():
            f.write(' '.join([str(processed_user_id)] + [str(item) for item in items]))
            f.write('\n')

In [None]:
# Group the raw CSV files by domain; the domain key is the part of the file
# name before the first underscore. os.listdir() returns entries in arbitrary
# order, so the names are sorted to make the synthetic timestamps assigned
# below reproducible between runs.
files = sorted(f for f in os.listdir(path_to_raw) if '.csv' in f)
domains = dict()
for f in files:
    domains.setdefault(f.split('_')[0], []).append(f)

for d in tqdm(domains):
    new_dir = path_to_processed + d
    os.makedirs(new_dir, exist_ok=True)

    # The files are read without a header row as (user_id, item_id, rating);
    # the rating column is not used downstream and is dropped.
    frames = [
        pd.read_csv(path_to_raw + f, names=['user_id', 'item_id', 'rating']).drop('rating', axis=1)
        for f in domains[d]
    ]

    # ignore_index=True gives the concatenated frame a fresh 0..N-1 index.
    # Without it every file keeps its own 0..n-1 labels, and a Series assigned
    # as a column is aligned on those labels, so each file would restart its
    # timestamps at 1 instead of continuing one global sequence.
    df = pd.concat(frames, axis=0, ignore_index=True)

    # Synthetic timestamp: the 1-based position of the row in the domain.
    # Assigning an array is positional and cannot be affected by the index.
    df['timestamp'] = np.arange(1, len(df) + 1, dtype=np.int64)

    create_all_data(df, new_dir)

  0%|          | 0/8 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
2273it [00:00, 22721.52it/s][A
4546it [00:00, 18164.02it/s][A
6419it [00:00, 13658.38it/s][A
7896it [00:00, 12356.32it/s][A
9194it [00:00, 11086.56it/s][A
10404it [00:00, 11345.28it/s][A
11582it [00:00, 11430.64it/s][A
12750it [00:01, 10480.11it/s][A
13822it [00:01, 9091.12it/s] [A
15001it [00:01, 9753.75it/s][A
16228it [00:01, 10406.71it/s][A
17379it [00:01, 10704.29it/s][A
18482it [00:01, 10394.74it/s][A
20492it [00:01, 11157.40it/s]

100%|██████████| 20492/20492 [00:00<00:00, 431214.52it/s]


(0, 0) (647, 846)
(647, 846) (361, 713)
(361, 713) (321, 566)
(321, 566) (269, 530)
(269, 530) (259, 493)
(259, 493) (251, 478)
(251, 478) (247, 473)
(247, 473) (245, 467)
(245, 467) (242, 466)
(242, 466) (242, 462)
(242, 462) (238, 462)
(238, 462) (238, 455)
(238, 455) (234, 455)
(234, 455) (234, 453)
(234, 453) (232, 453)
(232, 453) (232, 451)
(232, 451) (229, 451)
(229, 451) (229, 447)
(229, 447) (225, 447)
(225, 447) (225, 441)
(225, 441) (221, 441)
(221, 441) (221, 435)
(221, 435) (219, 435)
(219, 435) (219, 433)
(219, 433) (219, 433)



100%|██████████| 1702/1702 [00:00<00:00, 344931.65it/s]

100%|██████████| 9245/9245 [00:00<00:00, 592774.45it/s]
 12%|█▎        | 1/8 [00:04<00:29,  4.16s/it]
0it [00:00, ?it/s][A
2164it [00:00, 21638.22it/s][A
4328it [00:00, 21582.53it/s][A
6487it [00:00, 21438.67it/s][A
8631it [00:00, 21156.79it/s][A
10748it [00:00, 20712.00it/s][A
12821it [00:00, 20168.71it/s][A
14988it [00:00, 20641.05it/s][A
17272it [00:00, 21324.80it/s][A
19431it [00:00, 21405.10it/s][A
21626it [00:01, 21569.92it/s][A
23786it [00:01, 21063.15it/s][A
25912it [00:01, 21120.66it/s][A
28027it [00:01, 21036.25it/s][A
30133it [00:01, 20171.15it/s][A
32158it [00:01, 19998.39it/s][A
34282it [00:01, 20357.98it/s][A
36323it [00:01, 18056.78it/s][A
38413it [00:01, 18824.63it/s][A
40567it [00:01, 19581.85it/s][A
42768it [00:02, 20273.09it/s][A
44851it [00:02, 20432.80it/s][A
47078it [00:02, 20968.11it/s][A
49191it [00:02, 20215.84it/s][A
51383it [00:02, 20705.16it/s][A
53633it [00:02, 21227.52it/s]

(0, 0) (1031, 5365)
(1031, 5365) (958, 5300)
(958, 5300) (956, 5281)
(956, 5281) (956, 5280)
(956, 5280) (955, 5280)
(955, 5280) (955, 5280)



  0%|          | 0/1768 [00:00<?, ?it/s][A
100%|██████████| 1768/1768 [00:00<00:00, 8616.20it/s]

  0%|          | 0/16421 [00:00<?, ?it/s][A
100%|██████████| 16421/16421 [00:00<00:00, 122687.48it/s]
 25%|██▌       | 2/8 [00:10<00:33,  5.59s/it]
0it [00:00, ?it/s][A
1743it [00:00, 17427.27it/s][A
3486it [00:00, 17175.12it/s][A
5204it [00:00, 16055.37it/s][A
6818it [00:00, 15745.96it/s][A
8397it [00:00, 15546.20it/s][A
9954it [00:00, 15234.50it/s][A
11480it [00:00, 15071.59it/s][A
12988it [00:00, 14457.86it/s][A
15034it [00:00, 16235.51it/s][A
17262it [00:01, 18031.57it/s][A
19382it [00:01, 18975.72it/s][A
21451it [00:01, 19486.39it/s][A
23600it [00:01, 20083.94it/s][A
25707it [00:01, 20378.32it/s][A
27761it [00:01, 20425.55it/s][A
29956it [00:01, 20880.47it/s][A
32150it [00:01, 21197.21it/s][A
34389it [00:01, 21553.96it/s][A
36547it [00:01, 21096.22it/s][A
38660it [00:02, 20988.39it/s][A
40780it [00:02, 21049.19it/s][A
43025it [00:02, 21462.85it/s][A
45290it [

(0, 0) (1265, 5511)
(1265, 5511) (1053, 5404)
(1053, 5404) (1045, 5323)
(1045, 5323) (1037, 5322)
(1037, 5322) (1037, 5317)
(1037, 5317) (1036, 5317)
(1036, 5317) (1036, 5315)
(1036, 5315) (1035, 5315)
(1035, 5315) (1035, 5315)



100%|██████████| 2106/2106 [00:00<00:00, 42400.26it/s]

  0%|          | 0/44101 [00:00<?, ?it/s][A
100%|██████████| 44101/44101 [00:00<00:00, 389404.55it/s]
 38%|███▊      | 3/8 [00:19<00:34,  6.96s/it]
0it [00:00, ?it/s][A
2332it [00:00, 23317.80it/s][A
4664it [00:00, 21865.09it/s][A
6896it [00:00, 22062.65it/s][A
9107it [00:00, 21979.37it/s][A
11308it [00:00, 21573.58it/s][A
13468it [00:00, 21066.54it/s][A
15621it [00:00, 21211.60it/s][A
17745it [00:00, 20686.24it/s][A
19962it [00:00, 21132.56it/s][A
22079it [00:01, 21033.31it/s][A
24185it [00:01, 20846.08it/s][A
26272it [00:01, 20279.29it/s][A
29203it [00:01, 20920.74it/s]

100%|██████████| 29203/29203 [00:00<00:00, 838654.03it/s]


(0, 0) (983, 1766)
(983, 1766) (815, 1661)
(815, 1661) (797, 1594)
(797, 1594) (791, 1587)
(791, 1587) (789, 1584)
(789, 1584) (787, 1583)
(787, 1583) (787, 1582)
(787, 1582) (787, 1582)



100%|██████████| 2084/2084 [00:00<00:00, 96246.66it/s]

100%|██████████| 7647/7647 [00:00<00:00, 261825.15it/s]
 50%|█████     | 4/8 [00:21<00:20,  5.21s/it]
0it [00:00, ?it/s][A
1559it [00:00, 15587.19it/s][A
3118it [00:00, 15216.36it/s][A
4641it [00:00, 14305.48it/s][A
6125it [00:00, 14505.18it/s][A
7700it [00:00, 14939.82it/s][A
9299it [00:00, 15288.95it/s][A
10839it [00:00, 15324.38it/s][A
12546it [00:00, 15873.67it/s][A
14229it [00:00, 16168.96it/s][A
15848it [00:01, 15970.08it/s][A
17447it [00:01, 15689.42it/s][A
19018it [00:01, 15204.51it/s][A
20543it [00:01, 15180.48it/s][A
22129it [00:01, 15379.55it/s][A
23670it [00:01, 15214.47it/s][A
25322it [00:01, 15597.92it/s][A
27003it [00:01, 15955.87it/s][A
30031it [00:01, 15447.20it/s]

  0%|          | 0/30031 [00:00<?, ?it/s][A
100%|██████████| 30031/30031 [00:00<00:00, 223227.33it/s]


(0, 0) (861, 1321)
(861, 1321) (571, 1192)
(571, 1192) (547, 1077)
(547, 1077) (525, 1055)
(525, 1055) (524, 1037)
(524, 1037) (519, 1036)
(519, 1036) (518, 1034)
(518, 1034) (518, 1034)



100%|██████████| 1818/1818 [00:00<00:00, 107743.81it/s]

100%|██████████| 12900/12900 [00:00<00:00, 297081.28it/s]
 62%|██████▎   | 5/8 [00:25<00:13,  4.61s/it]
0it [00:00, ?it/s][A
2348it [00:00, 23475.71it/s][A
4696it [00:00, 21999.32it/s][A
6903it [00:00, 20737.84it/s][A
9049it [00:00, 21005.38it/s][A
11156it [00:00, 18095.09it/s][A
13316it [00:00, 19140.66it/s][A
15459it [00:00, 19824.23it/s][A
17788it [00:00, 20861.75it/s][A
19955it [00:00, 21103.05it/s][A
22090it [00:01, 21100.50it/s][A
24241it [00:01, 21220.13it/s][A
26379it [00:01, 21266.94it/s][A
28515it [00:01, 20902.60it/s][A
30613it [00:01, 20800.81it/s][A
32791it [00:01, 21088.86it/s][A
34904it [00:01, 20994.89it/s][A
37104it [00:01, 21291.71it/s][A
39370it [00:01, 21698.06it/s][A
41542it [00:01, 21552.77it/s][A
43699it [00:02, 21432.54it/s][A
45844it [00:02, 21408.60it/s][A
47986it [00:02, 21198.51it/s][A
50107it [00:02, 20433.68it/s][A
52409it [00:02, 21178.81it/s][A
54534it [00:02, 21044.50it/s

(0, 0) (1124, 3681)
(1124, 3681) (928, 3564)
(928, 3564) (920, 3481)
(920, 3481) (910, 3477)
(910, 3477) (910, 3466)
(910, 3466) (909, 3466)
(909, 3466) (909, 3465)
(909, 3465) (909, 3465)



  0%|          | 0/1901 [00:00<?, ?it/s][A
100%|██████████| 1901/1901 [00:00<00:00, 14700.33it/s]

100%|██████████| 22780/22780 [00:00<00:00, 310177.82it/s]
 75%|███████▌  | 6/8 [00:30<00:09,  4.92s/it]
0it [00:00, ?it/s][A
2349it [00:00, 23485.04it/s][A
4698it [00:00, 21231.48it/s][A
6851it [00:00, 21357.93it/s][A
9075it [00:00, 21691.89it/s][A
12565it [00:00, 21749.59it/s]

100%|██████████| 12565/12565 [00:00<00:00, 547513.19it/s]


(0, 0) (608, 548)
(608, 548) (288, 404)
(288, 404) (239, 260)
(239, 260) (178, 229)
(178, 229) (164, 177)
(164, 177) (126, 163)
(126, 163) (116, 128)
(116, 128) (90, 117)
(90, 117) (87, 95)
(87, 95) (67, 93)
(67, 93) (65, 76)
(65, 76) (54, 75)
(54, 75) (52, 63)
(52, 63) (48, 62)
(48, 62) (48, 58)
(48, 58) (45, 58)
(45, 58) (45, 57)
(45, 57) (45, 57)



100%|██████████| 1745/1745 [00:00<00:00, 507739.19it/s]

100%|██████████| 5265/5265 [00:00<00:00, 934493.27it/s]
 88%|████████▊ | 7/8 [00:32<00:03,  3.85s/it]
0it [00:00, ?it/s][A
2362it [00:00, 23613.38it/s][A
4724it [00:00, 22691.63it/s][A
6996it [00:00, 22042.68it/s][A
9203it [00:00, 21410.48it/s][A
11347it [00:00, 21331.44it/s][A
13482it [00:00, 21167.65it/s][A
15711it [00:00, 21522.83it/s][A
19986it [00:00, 20466.72it/s]

100%|██████████| 19986/19986 [00:00<00:00, 796330.85it/s]


(0, 0) (687, 865)
(687, 865) (396, 717)
(396, 717) (357, 582)
(357, 582) (315, 548)
(315, 548) (306, 513)
(306, 513) (292, 508)
(292, 508) (289, 498)
(289, 498) (286, 496)
(286, 496) (285, 490)
(285, 490) (285, 490)



100%|██████████| 1679/1679 [00:00<00:00, 175915.18it/s]

100%|██████████| 8799/8799 [00:00<00:00, 501926.90it/s]
100%|██████████| 8/8 [00:34<00:00,  4.35s/it]
