In [1]:
import argparse
from logging import getLogger
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd
import pickle

from recbole.quick_start import load_data_and_model
from recbole.utils import init_logger, get_model, get_trainer, init_seed, set_color
from recbole.data import create_dataset
from recbole.utils.case_study import full_sort_topk, full_sort_scores

import logging
from logging import getLogger

import torch
import pickle

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, save_split_dataloaders, load_split_dataloaders
from recbole.utils import init_logger, get_model, get_trainer, init_seed, set_color

import copy
from recbole.data.interaction import Interaction, cat_interactions
from collections import OrderedDict

# Show every row when a DataFrame is displayed (no truncation).
pd.options.display.max_rows = None


In [2]:
# Load the tab-separated pre-training interaction log and preview it.
df = pd.read_csv('dataset/pretrain/pretrain.inter', delimiter='\t')
df.head(5)

Unnamed: 0,sessionid:token,cityid:token,user_id:token,item_id:token,item_type_recommend:token,exposure_time:float,inter_type:token,user_loc:float_seq
0,23b17f8d-6c9a-4f80-9a8f-beb6b8ff753d1622217513...,482,1148278752,625526893_DEAL_GROUP,DEAL_GROUP,1622217600,click,98.4845 25.0468
1,44db1150-c92f-4739-be97-6484484a276b1622215430...,554,3025141532,28142906_DEAL_GROUP,DEAL_GROUP,1622217600,click,119.7374 29.8086
2,169021b5-a95e-4fd4-b6b9-2e2784458a6d1622217473...,324,871599891,659897130_DEAL_GROUP,DEAL_GROUP,1622217601,click,105.8715 26.2293
3,5b34a233-9c96-4e33-b46e-0f3f086cac4b1622216948...,432,208173370,693739980_DEAL_GROUP,DEAL_GROUP,1622217601,click,121.1979 31.5911
4,dc30f9fc-6ccd-45b1-bbba-a04a019afd611622216208...,30,918204676,690308484_DEAL_GROUP,DEAL_GROUP,1622217601,click,114.0401 22.5183


In [3]:
def split_df(df, city_id, ratio=0.8, split_time=None):
    """Split one city's interactions into an early ("up") and a late ("down") part.

    Every interaction row of the city is tagged with the time of its user's
    earliest interaction in that city (column ``first_inter_time:float``).

    * ``up_df``   -- all interactions with ``exposure_time:float <= split_time``.
    * ``down_df`` -- all interactions of users whose earliest interaction is
      strictly later than ``split_time``.

    Interactions made after ``split_time`` by users who first appeared at or
    before it end up in neither part.

    Args:
        df: Interaction frame containing at least the columns
            ``cityid:token``, ``user_id:token`` and ``exposure_time:float``.
        city_id: Value of ``cityid:token`` selecting the city to split.
        ratio: Only used when ``split_time`` is None. The cut-off is the
            earliest-interaction time of the user at position
            ``int(n_users * ratio)`` when users are ordered by that time.
            The position is clamped to ``[0, n_users - 1]`` so that
            ``ratio=1.0`` selects the last user instead of raising.
        split_time: Explicit cut-off on ``exposure_time:float``; takes
            precedence over ``ratio`` when given.

    Returns:
        Tuple ``(up_df, down_df)``; both carry the extra
        ``first_inter_time:float`` column. If the city has no interactions and
        no ``split_time`` was given, two empty frames are returned.
    """
    city_df = df[df['cityid:token'] == city_id]

    # Order by time before de-duplicating so that keep='first' really picks each
    # user's earliest interaction even when `df` is not already time-sorted.
    # mergesort is stable, so input that is already ordered is left unchanged.
    user_df = (
        city_df[['user_id:token', 'exposure_time:float']]
        .sort_values('exposure_time:float', kind='mergesort')
        .drop_duplicates('user_id:token', keep='first')
    )
    user_df.columns = ['user_id:token', 'first_inter_time:float']

    # The left merge keeps the city's row order and resets the index.
    city_df = pd.merge(city_df, user_df, how='left', on='user_id:token')

    if split_time is None:
        # Ascending because user_df was built from time-ordered rows.
        time_list = user_df['first_inter_time:float'].values
        if len(time_list) == 0:
            # Nothing to split for this city: hand back two empty frames
            # instead of failing with an IndexError.
            return city_df, city_df.copy()
        position = int(len(time_list) * ratio)
        position = min(max(position, 0), len(time_list) - 1)
        split_time = time_list[position]

    up_df = city_df[city_df['exposure_time:float'] <= split_time]
    down_df = city_df[city_df['first_inter_time:float'] > split_time]
    return up_df, down_df

In [4]:
def filter_by_inter_num(df, user_inter_num):
    """Keep only the rows of users having at least ``user_inter_num`` interactions.

    A user's interaction count is the number of non-null ``item_id:token``
    values among that user's rows in ``df``.

    Args:
        df: Interaction frame with ``user_id:token`` and ``item_id:token`` columns.
        user_inter_num: Minimum number of interactions a user must have.

    Returns:
        The rows of ``df`` belonging to the qualifying users, produced by a
        right merge against the table of qualifying user ids.
    """
    per_user = df.groupby('user_id:token')['item_id:token'].count()
    kept_users = per_user[per_user >= user_inter_num].reset_index()[['user_id:token']]
    return df.merge(kept_users, on='user_id:token', how='right')

In [44]:
# Split city 10 at a fixed timestamp. Passing `ratio=0.8` instead of
# `split_time` would derive the cut-off from the users' first-interaction times.
up_df, down_df = split_df(df=df, city_id=10, split_time=1631289599)

In [45]:
# Number of distinct users in the early ("up") partition.
up_df.loc[:, 'user_id:token'].nunique()

33114

In [46]:
# Number of distinct users in the late ("down") partition.
down_df.loc[:, 'user_id:token'].nunique()

5290

In [47]:
# Drop users with fewer than 4 interactions from both partitions.
up_df, down_df = (filter_by_inter_num(part, 4) for part in (up_df, down_df))

33114
18055
5290
1448


In [48]:
# Distinct users left in the "up" partition after filtering.
up_df.loc[:, 'user_id:token'].nunique()

18055

In [49]:
# Distinct users left in the "down" partition after filtering.
down_df.loc[:, 'user_id:token'].nunique()

1448

In [50]:
# Share of the retained users that belong to the "down" partition.
# Derived from the filtered frames instead of numbers copied from earlier
# outputs, so the value stays correct when the split or the filter changes.
down_df['user_id:token'].nunique() / (
    down_df['user_id:token'].nunique() + up_df['user_id:token'].nunique()
)

0.0742449879505717

In [5]:
# City 10: split at the fixed cut-off, then compare the user counts before and
# after dropping users with fewer than 4 interactions.
up_df, down_df = split_df(df, 10, split_time=1631289599)
origin_up_user_num, origin_down_user_num = (
    part['user_id:token'].nunique() for part in (up_df, down_df)
)
origin_user_num = origin_up_user_num + origin_down_user_num
up_df, down_df = (filter_by_inter_num(part, 4) for part in (up_df, down_df))
up_user_num, down_user_num = (
    part['user_id:token'].nunique() for part in (up_df, down_df)
)
user_num = up_user_num + down_user_num
# Each line: users in "up", users in "down", fraction of users in "up".
print(origin_up_user_num, origin_down_user_num, origin_up_user_num / origin_user_num)
print(up_user_num, down_user_num, up_user_num / user_num)

33114 5290 0.8622539318820956
18055 1448 0.9257550120494283


In [6]:
# City 70: split at the fixed cut-off, then compare the user counts before and
# after dropping users with fewer than 4 interactions.
up_df, down_df = split_df(df, 70, split_time=1631289599)
origin_up_user_num, origin_down_user_num = (
    part['user_id:token'].nunique() for part in (up_df, down_df)
)
origin_user_num = origin_up_user_num + origin_down_user_num
up_df, down_df = (filter_by_inter_num(part, 4) for part in (up_df, down_df))
up_user_num, down_user_num = (
    part['user_id:token'].nunique() for part in (up_df, down_df)
)
user_num = up_user_num + down_user_num
# Each line: users in "up", users in "down", fraction of users in "up".
print(origin_up_user_num, origin_down_user_num, origin_up_user_num / origin_user_num)
print(up_user_num, down_user_num, up_user_num / user_num)

15678 2874 0.8450840879689522
9209 943 0.9071118991331757


In [7]:
# City 361: split at the fixed cut-off, then compare the user counts before and
# after dropping users with fewer than 4 interactions.
up_df, down_df = split_df(df, 361, split_time=1631289599)
origin_up_user_num, origin_down_user_num = (
    part['user_id:token'].nunique() for part in (up_df, down_df)
)
origin_user_num = origin_up_user_num + origin_down_user_num
up_df, down_df = (filter_by_inter_num(part, 4) for part in (up_df, down_df))
up_user_num, down_user_num = (
    part['user_id:token'].nunique() for part in (up_df, down_df)
)
user_num = up_user_num + down_user_num
# Each line: users in "up", users in "down", fraction of users in "up".
print(origin_up_user_num, origin_down_user_num, origin_up_user_num / origin_user_num)
print(up_user_num, down_user_num, up_user_num / user_num)

4385 679 0.865916271721959
2518 189 0.9301810121906169


In [8]:
# City 65: split at the fixed cut-off, then compare the user counts before and
# after dropping users with fewer than 4 interactions.
up_df, down_df = split_df(df, 65, split_time=1631289599)
origin_up_user_num, origin_down_user_num = (
    part['user_id:token'].nunique() for part in (up_df, down_df)
)
origin_user_num = origin_up_user_num + origin_down_user_num
up_df, down_df = (filter_by_inter_num(part, 4) for part in (up_df, down_df))
up_user_num, down_user_num = (
    part['user_id:token'].nunique() for part in (up_df, down_df)
)
user_num = up_user_num + down_user_num
# Each line: users in "up", users in "down", fraction of users in "up".
print(origin_up_user_num, origin_down_user_num, origin_up_user_num / origin_user_num)
print(up_user_num, down_user_num, up_user_num / user_num)

5882 907 0.8664015318898217
3427 267 0.9277206280454792


In [9]:
# City 20: split at the fixed cut-off, then compare the user counts before and
# after dropping users with fewer than 4 interactions.
up_df, down_df = split_df(df, 20, split_time=1631289599)
origin_up_user_num, origin_down_user_num = (
    part['user_id:token'].nunique() for part in (up_df, down_df)
)
origin_user_num = origin_up_user_num + origin_down_user_num
up_df, down_df = (filter_by_inter_num(part, 4) for part in (up_df, down_df))
up_user_num, down_user_num = (
    part['user_id:token'].nunique() for part in (up_df, down_df)
)
user_num = up_user_num + down_user_num
# Each line: users in "up", users in "down", fraction of users in "up".
print(origin_up_user_num, origin_down_user_num, origin_up_user_num / origin_user_num)
print(up_user_num, down_user_num, up_user_num / user_num)

29880 4880 0.859608745684695
18295 1341 0.9317070686494194


In [10]:
# City 50: split at the fixed cut-off, then compare the user counts before and
# after dropping users with fewer than 4 interactions.
up_df, down_df = split_df(df, 50, split_time=1631289599)
origin_up_user_num, origin_down_user_num = (
    part['user_id:token'].nunique() for part in (up_df, down_df)
)
origin_user_num = origin_up_user_num + origin_down_user_num
up_df, down_df = (filter_by_inter_num(part, 4) for part in (up_df, down_df))
up_user_num, down_user_num = (
    part['user_id:token'].nunique() for part in (up_df, down_df)
)
user_num = up_user_num + down_user_num
# Each line: users in "up", users in "down", fraction of users in "up".
print(origin_up_user_num, origin_down_user_num, origin_up_user_num / origin_user_num)
print(up_user_num, down_user_num, up_user_num / user_num)

19410 3373 0.851951016108502
10861 972 0.9178568410377758


In [11]:
# Distinct items of city 50 in the raw log versus those still present in the
# filtered up/down partitions (which hold city 50 after the previous cell).
city_50_items = df.loc[df['cityid:token'] == 50, 'item_id:token']
all_item = np.unique(city_50_items.values)
# union1d returns the sorted unique values of the two concatenated arrays.
seen_item = np.union1d(up_df['item_id:token'].values, down_df['item_id:token'].values)

In [12]:
# (items kept after filtering, items in the raw city log, coverage ratio)
seen_item.shape[0], all_item.shape[0], seen_item.shape[0] / all_item.shape[0]

(50776, 60981, 0.8326527934930552)

In [13]:
# Preview the first rows of the filtered "up" partition.
up_df.head(5)

Unnamed: 0,sessionid:token,cityid:token,user_id:token,item_id:token,item_type_recommend:token,exposure_time:float,inter_type:token,user_loc:float_seq,first_inter_time:float
0,5c34b9ae-b03e-474d-bdbd-829af8852e8d1622297372...,50,113930990,666407514_DEAL_GROUP,DEAL_GROUP,1622217605,click,120.1791 30.2416,1622217605
1,5c34b9ae-b03e-474d-bdbd-829af8852e8d1622270237...,50,113930990,700334231_DEAL_GROUP,DEAL_GROUP,1622270523,click,120.1791 30.2416,1622217605
2,5c34b9ae-b03e-474d-bdbd-829af8852e8d1622300855...,50,113930990,666407514_DEAL_GROUP,DEAL_GROUP,1622301745,click,120.1791 30.2417,1622217605
3,5c34b9ae-b03e-474d-bdbd-829af8852e8d1622300855...,50,113930990,579809618_POI_WAIMAI,POI_WAIMAI,1622302284,click,120.1791 30.2417,1622217605
4,5c34b9ae-b03e-474d-bdbd-829af8852e8d1622386246...,50,113930990,666407514_DEAL_GROUP,DEAL_GROUP,1622305711,click,120.1791 30.2416,1622217605
