In [3]:
import os
import sys
import pandas as pd
from tqdm import tqdm
from collections import defaultdict, Counter

if sys.version_info[0] < 3:
    from StringIO import StringIO
else:
    from io import StringIO

# Dataset name; both data directories below are derived from it.
project = 'HVIDEO'
# Processed files live directly under the dataset directory, raw files in its 'raw/' subfolder.
path_to_processed = '../data/{}/'.format(project)
path_to_raw = path_to_processed + 'raw/'

In [4]:
# this cell is required for colab only
# It mounts Google Drive and re-roots the two data paths onto it. Outside Colab
# the import fails, the cell does nothing and the local relative paths are kept.

try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

    # Re-root only while a path still carries its local '../' prefix: re-running the
    # cell would otherwise slice the already re-rooted path and prepend the Drive
    # root a second time.
    if path_to_raw.startswith('../'):
        path_to_raw = './drive/MyDrive/Science/Multidomain RecSys/' + path_to_raw[3:]
    if path_to_processed.startswith('../'):
        path_to_processed = './drive/MyDrive/Science/Multidomain RecSys/source/iz-dev/' + path_to_processed[3:]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
files = ['traindata.txt','validdata.txt','testdata.txt']
domains = {'E':[],
           'V':[]
          }

# Each raw line is rewritten into several "<user_id>\t<event>" rows: the text before
# the first tab is taken as the user id, every '||\t' separator becomes a line break
# followed by that user id, and any remaining '||' is removed.
# (presumably one raw line holds one user's whole event log -- confirm against the data)
frames = []
for f in files:

    string_input = []
    with open(path_to_raw+f, 'r') as myfile:
        print('loading file: %s...'%(path_to_raw+f))

        for line in tqdm(myfile.readlines()):
            user_id = line.split('\t')[0]
            line = line.replace('||\t', '\n'+user_id+'\t').replace('||','')
            string_input.append(line)

    string_input = ''.join(string_input)
    df = pd.read_csv(StringIO(string_input), sep='\t', header=None, names=['user_id','history'])
    frames.append(df)

# Concatenate once instead of growing a frame inside the loop: this avoids repeated
# copying and avoids concatenating with an empty object-dtype frame, which recent
# pandas versions deprecate and which can change the resulting column dtypes.
all_data = pd.concat(frames, axis=0, ignore_index=True)

# Split every history string a single time; the first field is "<domain letter><item id>"
# and the fourth field is used as the timestamp (kept as a string here).
history_fields = all_data['history'].apply(lambda x: x.split('|'))
all_data['domain'] = history_fields.apply(lambda fields: fields[0][0])
all_data['item_id'] = history_fields.apply(lambda fields: fields[0][1:])
all_data['timestamp'] = history_fields.apply(lambda fields: fields[3])

all_data = all_data.drop('history', axis=1).drop_duplicates()

loading file: ./drive/MyDrive/Science/Multidomain RecSys/data/HVIDEO/raw/traindata.txt...


100%|██████████| 102182/102182 [00:01<00:00, 86556.40it/s]


creating dataframe...
loading file: ./drive/MyDrive/Science/Multidomain RecSys/data/HVIDEO/raw/validdata.txt...


100%|██████████| 18966/18966 [00:00<00:00, 68487.78it/s]


creating dataframe...
loading file: ./drive/MyDrive/Science/Multidomain RecSys/data/HVIDEO/raw/testdata.txt...


100%|██████████| 13201/13201 [00:00<00:00, 41108.69it/s]


creating dataframe...


In [None]:
def create_all_data(
    df,
    output_path,
    output_name='all_data.txt',
    convert_dtypes=False,
    save_result=False,
    return_result=True,
    threshold=5,
):
    """Filter sparse users/items, re-index ids from 1 and build time-sorted histories.

    Steps:
      1. Group the interactions in ``df`` per user and per item.
      2. Repeatedly keep only users and items with at least ``threshold``
         interactions, dropping events that point to a discarded counterpart,
         until the number of kept users and items stops changing.
      3. Map the surviving raw ids to consecutive integers starting at 1
         (in order of first appearance) and sort every history by timestamp.
      4. Optionally write one line per user: ``<user_id> <item_id> <item_id> ...``.

    Args:
        df: DataFrame with ``user_id``, ``item_id`` and ``timestamp`` columns.
        output_path: Directory the result file is written to (used only when
            ``save_result`` is true).
        output_name: File name of the result file inside ``output_path``.
        convert_dtypes: If true, ``user_id``, ``item_id`` and ``timestamp`` are
            converted with ``int()``; otherwise the values are used as stored.
        save_result: If true, write the user sequences to the output file.
        return_result: If true, return the histories; otherwise ``(None, None)``.
        threshold: Minimum number of interactions a user or an item must have
            to be kept.

    Returns:
        ``(user_history, item_history)``: two dicts keyed by the re-indexed id.
        ``user_history[u]`` is a list of ``{'item_id', 'timestamp'}`` dicts and
        ``item_history[i]`` a list of ``{'user_id', 'timestamp'}`` dicts, each
        sorted by timestamp. ``(None, None)`` when ``return_result`` is false.

    Note:
        Histories are ordered by comparing the timestamp values as they are.
        With ``convert_dtypes=False`` and string timestamps the order is
        lexicographic, which matches chronological order only if all timestamps
        have the same number of digits -- TODO confirm for the data in use.
    """
    # parts 1-2: single pass over the three columns (no per-row Series objects,
    # no intermediate list of records)
    cast = int if convert_dtypes else (lambda value: value)

    user_history = defaultdict(list)
    item_history = defaultdict(list)

    interactions = zip(df['user_id'], df['item_id'], df['timestamp'])
    for raw_user, raw_item, raw_timestamp in tqdm(interactions, total=len(df)):
        user_raw_id = cast(raw_user)
        item_raw_id = cast(raw_item)
        interaction_timestamp = cast(raw_timestamp)

        user_history[user_raw_id].append({'item_id': item_raw_id, 'timestamp': interaction_timestamp})
        item_history[item_raw_id].append({'user_id': user_raw_id, 'timestamp': interaction_timestamp})

    # part 3: iterate until the kept sets are stable. The kept sets can only shrink
    # after the first pass, so unchanged sizes mean unchanged sets.
    is_changed = True
    good_users = set()
    good_items = set()

    while is_changed:
        old_state = (len(good_users), len(good_items))

        good_users = {
            user_id for user_id, history in user_history.items()
            if len(history) >= threshold
        }
        good_items = {
            item_id for item_id, history in item_history.items()
            if len(history) >= threshold
        }

        user_history = {
            user_id: [event for event in history if event['item_id'] in good_items]
            for user_id, history in user_history.items()
        }
        item_history = {
            item_id: [event for event in history if event['user_id'] in good_users]
            for item_id, history in item_history.items()
        }

        new_state = (len(good_users), len(good_items))
        is_changed = (old_state != new_state)
        print(old_state, new_state)

    # part 4: assign consecutive ids (starting at 1) in order of first appearance
    user_mapping = {}
    item_mapping = {}
    tmp_user_history = defaultdict(list)
    tmp_item_history = defaultdict(list)

    for user_id, history in tqdm(user_history.items()):
        processed_history = []

        for filtered_item in history:
            # setdefault evaluates the candidate id before inserting, so a new
            # item receives len(item_mapping) + 1
            processed_item_id = item_mapping.setdefault(filtered_item['item_id'], len(item_mapping) + 1)
            processed_history.append({'item_id': processed_item_id, 'timestamp': filtered_item['timestamp']})

        if len(processed_history) >= threshold:
            processed_user_id = user_mapping.setdefault(user_id, len(user_mapping) + 1)
            tmp_user_history[processed_user_id] = sorted(processed_history, key=lambda x: x['timestamp'])

    for item_id, history in tqdm(item_history.items()):
        processed_history = []

        for filtered_user in history:
            processed_user_id = user_mapping.setdefault(filtered_user['user_id'], len(user_mapping) + 1)
            processed_history.append({'user_id': processed_user_id, 'timestamp': filtered_user['timestamp']})

        if len(processed_history) >= threshold:
            processed_item_id = item_mapping.setdefault(item_id, len(item_mapping) + 1)
            tmp_item_history[processed_item_id] = sorted(processed_history, key=lambda x: x['timestamp'])

    user_history = tmp_user_history
    item_history = tmp_item_history

    # part 5
    if save_result:
        with open(os.path.join(output_path, output_name), 'w') as f:
            # The loop variable must not be called `item_history`: rebinding that
            # name here would make the function return one user's event list
            # instead of the item-history dict.
            for user_id, user_events in user_history.items():
                f.write(' '.join([str(user_id)] + [
                    str(item_event['item_id']) for item_event in sorted(user_events, key=lambda x: x['timestamp'])
                ]))
                f.write('\n')

    if return_result:
        return user_history, item_history
    return None, None

In [None]:
# For every domain: filter and re-index its interactions, then write the per-user
# item sequences ("<user_id> <item_id> <item_id> ...", ordered by timestamp).
for d in tqdm(domains):
    new_dir = path_to_processed + d
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)

    df = all_data[all_data['domain']==d].drop('domain', axis=1).reset_index(drop=True)

    user_history, item_history = create_all_data(df, new_dir)

    # Format the lines once; the content does not depend on the output file.
    # The loop variable is deliberately not named `item_history`, so the dict
    # returned by create_all_data stays intact after this cell.
    sequence_lines = [
        ' '.join([str(user_id)] + [
            str(item_event['item_id']) for item_event in sorted(user_events, key=lambda x: x['timestamp'])
        ]) + '\n'
        for user_id, user_events in user_history.items()
    ]

    # NOTE(review): all four files receive the same, complete sequences -- no
    # train/validation/test split is made here. Presumably the split happens
    # downstream; confirm.
    for output_name in ['all_data.txt','train_new.txt','validation_new.txt','test_new.txt']:
        with open(new_dir+'/'+output_name, 'w') as f:
            f.writelines(sequence_lines)

  0%|          | 0/2 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1328it [00:00, 12245.25it/s][A
2673it [00:00, 12927.51it/s][A
4055it [00:00, 13325.55it/s][A
5528it [00:00, 13870.16it/s][A
6917it [00:00, 13157.70it/s][A
8342it [00:00, 13511.99it/s][A
9725it [00:00, 13612.05it/s][A
11222it [00:00, 14034.18it/s][A
12659it [00:00, 14137.17it/s][A
14126it [00:01, 14298.20it/s][A
15558it [00:01, 14184.95it/s][A
16979it [00:01, 14072.83it/s][A
18393it [00:01, 14092.04it/s][A
19886it [00:01, 14341.83it/s][A
21321it [00:01, 13915.72it/s][A
22716it [00:01, 13894.34it/s][A
24200it [00:01, 14171.07it/s][A
25624it [00:01, 14188.96it/s][A
27046it [00:01, 14196.89it/s][A
28545it [00:02, 14430.86it/s][A
30052it [00:02, 14619.51it/s][A
31527it [00:02, 14657.35it/s][A
33028it [00:02, 14761.04it/s][A
34505it [00:02, 14666.38it/s][A
35973it [00:02, 14575.21it/s][A
37431it [00:02, 13843.58it/s][A
38859it [00:02, 13967.90it/s][A
40357it [00:02, 14262.15it/s][A
41843it [00:02, 14437.3

(0, 0) (13713, 3306)
(13713, 3306) (13710, 3306)
(13710, 3306) (13710, 3306)



  0%|          | 0/13714 [00:00<?, ?it/s][A
 19%|█▉        | 2572/13714 [00:00<00:00, 25706.60it/s][A
 38%|███▊      | 5170/13714 [00:00<00:00, 25864.90it/s][A
 60%|█████▉    | 8171/13714 [00:00<00:00, 27754.86it/s][A
100%|██████████| 13714/13714 [00:00<00:00, 29333.91it/s]

  0%|          | 0/8367 [00:00<?, ?it/s][A
  1%|          | 42/8367 [00:00<00:20, 413.05it/s][A
  1%|▏         | 116/8367 [00:00<00:13, 600.17it/s][A
  3%|▎         | 232/8367 [00:00<00:09, 837.70it/s][A
  5%|▌         | 435/8367 [00:00<00:06, 1297.04it/s][A
  9%|▊         | 719/8367 [00:00<00:04, 1845.18it/s][A
 18%|█▊        | 1470/8367 [00:00<00:01, 3754.31it/s][A
100%|██████████| 8367/8367 [00:00<00:00, 11403.97it/s]
 50%|█████     | 1/2 [00:31<00:31, 32.00s/it]
0it [00:00, ?it/s][A
1534it [00:00, 15336.87it/s][A
3068it [00:00, 13857.16it/s][A
4483it [00:00, 13982.17it/s][A
5953it [00:00, 14252.75it/s][A
7383it [00:00, 13658.63it/s][A
8755it [00:00, 13626.32it/s][A
10176it [00:00, 13809.93it/

(0, 0) (13689, 3540)
(13689, 3540) (13649, 3538)
(13649, 3538) (13649, 3534)
(13649, 3534) (13649, 3534)



  0%|          | 0/13714 [00:00<?, ?it/s][A
 10%|▉         | 1325/13714 [00:00<00:02, 4488.02it/s][A
 27%|██▋       | 3635/13714 [00:00<00:00, 10556.88it/s][A
 45%|████▍     | 6126/13714 [00:00<00:00, 15117.05it/s][A
 63%|██████▎   | 8581/13714 [00:00<00:00, 18060.93it/s][A
100%|██████████| 13714/13714 [00:00<00:00, 17545.92it/s]

  0%|          | 0/11404 [00:00<?, ?it/s][A
  0%|          | 20/11404 [00:00<00:57, 199.10it/s][A
  1%|          | 77/11404 [00:00<00:28, 401.23it/s][A
  2%|▏         | 224/11404 [00:00<00:12, 876.01it/s][A
  4%|▍         | 439/11404 [00:00<00:08, 1370.37it/s][A
  6%|▌         | 646/11404 [00:00<00:06, 1609.57it/s][A
 11%|█         | 1239/11404 [00:00<00:03, 3061.38it/s][A
 23%|██▎       | 2665/11404 [00:00<00:01, 6693.01it/s][A
100%|██████████| 11404/11404 [00:00<00:00, 13929.53it/s]
100%|██████████| 2/2 [00:58<00:00, 29.11s/it]
