In [1]:
import pandas as pd
import numpy as np
import tqdm
import pickle

In [2]:
data_path = 'mathorcup_recom_listwise/data/'

In [3]:
info_df = pd.read_csv(data_path + 'doc_info.csv', dtype=str)
info_df

Unnamed: 0,contentID,contentType,contentCategory
0,124564892986,video,历史/中国史
1,124564906548,video,科学/动植物与微生物
2,124564909185,video,科学/动植物与微生物
3,124564912145,video,科学/动植物与微生物
4,124564932495,video,科学/动植物与微生物
...,...,...,...
2510698,509523681,news,情感/婚姻与家庭
2510699,509523744,news,科技/互联网
2510700,509524347,news,科技/数码产品
2510701,509524365,news,国际/国际趣闻


In [4]:
sorted_info_df = info_df.sort_values(by=['contentCategory', 'contentType'])
sorted_info_df

Unnamed: 0,contentID,contentType,contentCategory
785196,133657464327,video,"A_0_24:0.653325,A_25_29:0.170250,A_30_39:0.134..."
535841,133658459211,video,"A_0_24:0.947994,A_25_29:0.003303,A_30_39:0.032..."
185162,505670245,news,两性/两性健康
186246,505857707,news,两性/两性健康
187525,506010721,news,两性/两性健康
...,...,...,...
2438682,133687111738,video,
2439143,133687211179,video,
2439454,133687293980,video,
2443290,133688111043,video,


In [5]:
sorted_info_df.to_csv(data_path + 'sorted_doc_info.csv', encoding='utf_8_sig', index=False)

# cID to cType & cCategory

In [6]:
# 2510703it [02:04, 20093.00it/s]
contentTC2ID = dict()
contentID2idx = dict()
key2idx = dict()
i = 0

for _, row in tqdm.tqdm(sorted_info_df.iterrows()):
    cID, cT, cC = row

    if isinstance(cC, float):  # cC is nan
        key = cT[0] + 'nan'
    else:
        key = cT[0] + cC
    if key in contentTC2ID.keys():
        contentTC2ID[key].append(cID)
    else:
        contentTC2ID[key] = [cID]
        key2idx[key] = i
        i += 1
    contentID2idx[cID] = key2idx[key]
    # contentTC2ID.setdefault(key,[]).append(cID)

2510703it [02:04, 20093.00it/s]


In [7]:
len(info_df) / len(contentTC2ID)  # item compression rate

1690.709090909091

In [8]:
len(contentTC2ID)

1485

In [9]:
len(contentTC2ID['vnan'])  # 947 video items have no category

947

In [10]:
len(contentTC2ID['nnan'])  # 34 news items have no category

34

In [11]:
with open(data_path + 'contentTC2ID.pickle', 'wb') as handle:
    pickle.dump(contentTC2ID, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
with open(data_path + 'contentID2idx.pickle', 'wb') as handle:
    pickle.dump(contentID2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
with open(data_path + 'key2idx.pickle', 'wb') as handle:
    pickle.dump(key2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Create Dataset

In [14]:
import torch

In [15]:
train_df = pd.read_csv(data_path + 'train_data.csv', dtype=str, nrows=2)
train_df

Unnamed: 0,userID,requestID,date,time,sequence
0,1000014754,500007377_1635422685108_3822,20211028,20,133669542676:1:148;133658378700:1:16;133650937...
1,1000019906,500009953_1635375063077_3893,20211028,6,133679233276:0:0;133658338671:0:0;133677846615...


In [16]:
from torch.utils.data import Dataset

def icd2i(icd):
    i, c, d = icd.split(':')
    return i


def seq2itemID(sequence):
    """
    :param sequence: {str}, e.g. '133679233276:0:0;133658338671:0:0;133677846615:0:0'
    :return:
    """
    return {icd2i(icd) for icd in sequence.split(';')}


def icd2dict(icd):
    i, c, d = icd.split(':')
    return {'itemID': i, 'clicked': bool(eval(c)), 'duration': eval(d)}


class Sequence:
    def __init__(self, sequence):
        """
        :param sequence: {str}, e.g. '133679233276:0:0;133658338671:0:0;133677846615:0:0'
        :return:
        """
        self.sequence = [icd2dict(icd) for icd in sequence.split(';')]
        self.length = len(self.sequence)
        self.avg_clicked = np.mean([_['clicked'] for _ in self.sequence])
        self.sum_duration = np.sum([_['duration'] for _ in self.sequence])
        self.avg_duration = self.sum_duration / self.length
    def __len__(self):
        return self.length


class TrainDataset(Dataset):
    def __init__(self, df, userID2idx, itemID2idx):
        self.length = len(df)

        self.userID2idx = userID2idx
        self.itemID2idx = itemID2idx

        self.userLen = len(userID2idx)
        self.itemLen = len(itemID2idx)

        self.userID, self.requestID = df['userID'], df['requestID']  # string
        self.userIdx = torch.tensor([userID2idx[_] for _ in self.userID], dtype=torch.int32)  # {Tensor: (len(df),)}

        # self.date = torch.tensor(df.astype({'date': 'int32'})['date'])  # e.g. 20220106
        # self.time = torch.tensor(df.astype({'time': 'int8'})['time'])  # range in [00, 23]

        self.date = torch.zeros([len(df), 6], dtype=torch.int16)
        for _ in range(len(df)):
            self.date[_, 0] = int(df.loc[_, 'time'])  # hour
            date = df.loc[_, 'date']
            self.date[_, 3] = int(date[:4])  # year
            self.date[_, 4] = int(date[4:6])  # month
            self.date[_, 5] = int(date[6:8])  # day

        self.sequence = [Sequence(_) for _ in df['sequence']]
        self.max_sum_duration = max([_.sum_duration for _ in self.sequence])

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        # udt = torch.tensor([self.userIdx[idx], self.date[idx], self.time[idx]], dtype=torch.int32)
        userIdx = self.userIdx[idx]
        date = self.date[idx]
        sequence = self.sequence[idx]
        itemID = torch.tensor([self.itemID2idx[_['itemID']] for _ in self.sequence[idx].sequence], dtype=torch.int32)
        duration = torch.tensor([_['duration'] for _ in self.sequence[idx].sequence], dtype=torch.int32)
        return userIdx, date, itemID, duration, torch.tensor(len(sequence))

In [17]:
train_userID = set(train_df['userID'])
userID2idx = {_:i for i, _ in enumerate(train_userID)}
userID2idx

{'1000019906': 0, '1000014754': 1}

In [18]:
train_dataset = TrainDataset(train_df, userID2idx, contentID2idx)

In [19]:
train_dataset[0]

(tensor(1, dtype=torch.int32),
 tensor([  20,    0,    0, 2021,   10,   28], dtype=torch.int16),
 tensor([ 302, 1176,  274,  717,  316, 1103, 1220,  271,  654,  348,  348],
        dtype=torch.int32),
 tensor([148,  16,  85, 221,   0, 101,  60,   0, 102,   0, 120],
        dtype=torch.int32),
 tensor(11))

In [20]:
train_dataset[1]

(tensor(0, dtype=torch.int32),
 tensor([   6,    0,    0, 2021,   10,   28], dtype=torch.int16),
 tensor([1202,  509,  126, 1288,  301,  348,  224,  570,  689,  863],
        dtype=torch.int32),
 tensor([  0,   0,   0,   0, 113,   0, 251,   0,   0,   0], dtype=torch.int32),
 tensor(10))