## Amazon Books dataset preprocessing

download link: http://jmcauley.ucsd.edu/data/amazon/

In [None]:
import os
import numpy as np
import pandas as pd

# Location of the raw Amazon Books files and the list of files this notebook needs.
data_dir = '../data/amazonbooks'
files = ['ratings_Books.csv']

# Fail fast with a clear error if any required input file is missing.
for name in files:
    path = os.path.join(data_dir, name)
    if os.path.exists(path):
        continue
    raise FileNotFoundError(f'Not found file: {name} in directory {os.path.abspath(data_dir)}')

print('Check files over.')

In [None]:
# Column names passed as `names` when the ratings CSV is read.
columns = ['user', 'item', 'rating', 'timestamp']

In [None]:
def _assign_sequential_ids(values, mapping: dict, next_id: int) -> int:
    """Give every value not yet in ``mapping`` the next free integer id.

    ``mapping`` is updated in place. Returns the first id that is still unused
    after all ``values`` have been processed.
    """
    for value in values:
        if value not in mapping:
            mapping[value] = next_id
            next_id += 1
    return next_id


def mapped2sequential(df: pd.DataFrame, columns: list, start_from_1=True):
    """Re-encode the values of the given columns as consecutive integers, in place.

    Args:
        df: Frame whose columns are overwritten with their integer codes.
        columns: Each entry is either a column name (encoded with its own id
            space) or a list of column names (encoded with one shared id
            space, so equal values in different columns get the same id).
            Names that are not columns of ``df`` are skipped; entries that are
            neither ``str`` nor ``list`` are ignored.
        start_from_1: If True ids start at 1, otherwise at 0.

    Returns:
        None. ``df`` is modified in place.

    Raises:
        ValueError: If a nested list contains something that is not a string
            (only two levels of nesting are supported).
    """
    first_id = 1 if start_from_1 else 0
    for entry in columns:
        # A plain name is handled as a one-column group so that both cases
        # share a single code path.
        if isinstance(entry, str):
            group = [entry]
        elif isinstance(entry, list):
            group = entry
        else:
            continue

        mapping, next_id = {}, first_id
        for col in group:
            if not isinstance(col, str):
                # Message means: "at most a two-level list is supported".
                raise ValueError('最多支持二级list')
            if col in df.columns:
                next_id = _assign_sequential_ids(df[col].unique().tolist(), mapping, next_id)
                df[col] = df[col].map(mapping)


def min_max_normalize(df: pd.DataFrame, columns: list):
    """Rescale each listed column of ``df`` to the range [0, 1], in place.

    Args:
        df: Frame whose columns are overwritten with their scaled values.
        columns: Column names to scale; names not present in ``df`` are skipped.

    Returns:
        None. ``df`` is modified in place.

    A constant column (max == min) is mapped to 0.0 instead of the NaN that a
    0/0 division would produce.
    """
    for c in columns:
        if c not in df.columns:
            continue
        lo, hi = df[c].min(), df[c].max()
        span = hi - lo
        if span == 0:
            # Avoid 0/0 -> NaN for constant columns; every value becomes 0.0.
            span = 1
        df[c] = (df[c] - lo) / span


def std_normalize(df: pd.DataFrame, columns: list):
    """Standardize each listed column of ``df`` to zero mean and unit std, in place.

    Args:
        df: Frame whose columns are overwritten with their z-scores.
        columns: Column names to standardize; names not present in ``df`` are
            skipped.

    Returns:
        None. ``df`` is modified in place.

    A column whose standard deviation is exactly 0 is mapped to 0.0 instead of
    the NaN that a 0/0 division would produce.
    """
    for c in columns:
        if c not in df.columns:
            continue
        mean, std = df[c].mean(), df[c].std()
        if std == 0:
            # Avoid 0/0 -> NaN for zero-variance columns.
            std = 1
        df[c] = (df[c] - mean) / std


def multi_value_process(df: pd.DataFrame, column: str, sep: str):
    """Encode a delimiter-separated multi-value column as lists of integer ids.

    Each cell of ``df[column]`` is split on ``sep``; every distinct token gets
    an id in order of first appearance, starting at 1.

    Returns:
        A tuple ``(encoded, size)`` where ``encoded`` holds one list of ids
        per row and ``size`` is the number of distinct tokens plus one.
    """
    vocab = {}
    encoded = []
    for cell in df[column]:
        # setdefault evaluates len(vocab) + 1 before inserting, so a new token
        # receives the next unused id and a known token keeps its old one.
        encoded.append([vocab.setdefault(token, len(vocab) + 1) for token in cell.split(sep)])
    return encoded, len(vocab) + 1

In [None]:
def read_data(file: str, sample_size, sep, names=None, dtype=None):
    """Read a delimited text file into a DataFrame.

    Args:
        file: Path of the file to read.
        sample_size: Number of rows to read from the top of the file; any
            value <= 0 reads the whole file.
        sep: Field delimiter passed to ``pd.read_csv``.
        names: Optional column names passed to ``pd.read_csv``.
        dtype: Optional dtype specification passed to ``pd.read_csv``.

    Returns:
        The loaded ``pd.DataFrame``.

    Raises:
        FileNotFoundError: If ``file`` does not exist.
    """
    if not os.path.exists(file):
        raise FileNotFoundError(f'The file: {file} not exists.')
    # Reading with nrows (instead of an iterator whose reader is never closed)
    # lets pandas open and close the file handle itself.
    nrows = sample_size if sample_size > 0 else None
    return pd.read_csv(file, names=names, sep=sep, dtype=dtype, nrows=nrows)

Amazon Books 数据集只由 userid, item_id, ratings, timestamp 四个字段构成

In [None]:
# Load the ratings file; a non-positive sample_size makes read_data read every row.
ratings = read_data(
    file=f'{data_dir}/{files[0]}',
    sample_size=-1,
    sep=',',
    names=columns,
)
ratings.head()

In [None]:
# Summary statistics of the raw ratings frame.
ratings.describe()

和 MIND 中的处理方式一样，去除掉交互较少的用户数据和 item 数据

In [None]:
# Distinct (user, item) pairs, so repeated ratings of one item count only once.
sub = ratings[['user', 'item']].drop_duplicates().reset_index(drop=True)

找到交互多的用户

In [None]:
# Flag users whose number of distinct items lies strictly between 5 and 500.
gu = sub.groupby(['user'])
counts_per_user = gu.count()
utl = (
    ((counts_per_user > 5) & (counts_per_user < 500))
    .reset_index()
    .rename(columns={'item': 'usuit'})
)
utl.shape

单个用户看过的书最多可达 40000 条；为了防止内存爆掉，应该去除多余的部分，比如说只保留前 500 条记录（上面的筛选采用的做法是直接去掉交互数达到 500 的用户）。

In [None]:
# How many users pass the lower bound alone, and how many pass both bounds.
# The second condition must use `&`: with `|` every user satisfies it, so the
# printed number would simply be the total user count.
items_per_user = gu.count()['item']
print(np.sum(items_per_user > 5))
print(np.sum((items_per_user > 5) & (items_per_user < 500)))

找到交互多的item

In [None]:
# Flag items that were interacted with by more than 200 distinct users.
iu = sub.groupby(['item'])
itl = (
    (iu.count() > 200)
    .reset_index()
    .rename(columns={'user': 'isuit'})
)
itl.shape

In [None]:
# Attach the per-user flag (joined on 'user') and the per-item flag (joined on 'item').
sub_ = ratings.merge(utl, on='user').merge(itl, on='item')
sub_

In [None]:
# Keep only rows whose user flag is set, then only rows whose item flag is set.
sub_1 = sub_.loc[sub_['usuit']]
tmp = sub_1.loc[sub_1['isuit']]
tmp.describe()

In [None]:
# Number of rows and columns left after both filters.
tmp.shape

In [None]:
# Binary label: 1 when rating >= 3, else 0. `assign` returns a new frame, so
# no column is written onto a filtered slice (avoids SettingWithCopyWarning).
tmp = tmp.assign(like_type=np.where(tmp['rating'] >= 3, 1, 0))

负样本在训练的时候进行拿取，在这里删除

In [None]:
# Keep the positive interactions only.
tmp = tmp.loc[tmp['like_type'].eq(1)]
tmp.shape

In [None]:
# Only the (user, item) pairs are needed from here on.
tmp = tmp.drop(
    columns=[
        'usuit',
        'isuit',
        'rating',
        'timestamp',
        'like_type',
    ]
)

In [None]:
# Re-encode user and item as consecutive integers starting at 1, each column
# with its own id space; tmp is modified in place.
mapped2sequential(tmp, ['user', 'item'])

为每个用户生成购买记录

In [None]:
# One row per user, with all of that user's items collected into a Python list.
item_list_per_user = (
    tmp.groupby(['user'])['item']
    .apply(list)
    .reset_index()
    .rename(columns={'item': 'list'})
)

In [None]:
# Flag users whose item list has more than 3 entries.
item_list_per_user['true_list'] = item_list_per_user['list'].map(len) > 3

In [None]:
# Keep the flagged users and discard the helper flag column.
item_list_per_user = (
    item_list_per_user
    .loc[item_list_per_user['true_list']]
    .drop(columns=['true_list'])
)
item_list_per_user

In [None]:
# Inner join: every remaining row gets its user's full item list, and rows of
# users that were filtered out above are dropped.
tmp = pd.merge(tmp, item_list_per_user, on='user')
tmp

In [None]:
# Summary statistics after attaching the per-user item lists.
tmp.describe()

生成用户测试数据和 item 数据集

In [None]:
# Shuffle all rows. The seed is fixed so that Restart & Run All reproduces the
# same row order (and therefore the same downstream split) every time.
SHUFFLE_SEED = 42
tmp = tmp.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
tmp

In [None]:
# Single-column frame holding each distinct item id once, in order of first appearance.
item_data = tmp[['item']].drop_duplicates().reset_index(drop=True)