## Amazon books dataset preprocess

download link: http://jmcauley.ucsd.edu/data/amazon/

In [1]:
import os
import numpy as np
import pandas as pd

data_dir = '../data/amazonbooks'

files = ['ratings_Books.csv']

for f in files:
    if not os.path.exists(os.path.join(data_dir, f)):
        raise FileNotFoundError(f'Not found file: {f} in directory {os.path.abspath(data_dir)}')

print('Check files over.')

Check files over.


In [2]:
columns = ['user', 'item', 'rating', 'timestamp']

In [3]:
def mapped2sequential(df: pd.DataFrame, columns: list, start_from_1=True):
    for c in columns:
        m = {}
        sign = 1 if start_from_1 else 0
        if isinstance(c, str):
            if c in df.columns:
                l = df[c].unique().tolist()
                for v in l:
                    if m.setdefault(v, sign) == sign:
                        sign += 1
                df[c] = df[c].map(m)
        elif isinstance(c, list):
            for sub_c in c:
                if isinstance(sub_c, str):
                    if sub_c in df.columns:
                        l = df[sub_c].unique().tolist()
                        for v in l:
                            if m.setdefault(v, sign) == sign:
                                sign += 1
                        df[sub_c] = df[sub_c].map(m)
                else:
                    raise ValueError('最多支持二级list')


def min_max_normalize(df: pd.DataFrame, columns: list):
    for c in columns:
        if c in df.columns:
            df[c] = (df[c] - df[c].min()) / (df[c].max() - df[c].min())


def std_normalize(df: pd.DataFrame, columns: list):
    for c in columns:
        if c in df.columns:
            df[c] = (df[c] - df[c].mean()) / df[c].std()


def multi_value_process(df: pd.DataFrame, column: str, sep: str):
    uMap, ans = {}, []
    for arr in df[column]:
        que = []
        for i in arr.split(sep):
            if uMap.get(i) is None:
                uMap[i] = len(uMap) + 1
            que.append(uMap[i])
        ans.append(que)
    return ans, len(uMap) + 1

In [4]:
def read_data(file: str, sample_size, sep, names=None, dtype=None):
    if not os.path.exists(file):
        e = f'The file: {file} not exists.'
        raise FileNotFoundError(e)
    df = pd.read_csv(file, iterator=True, names=names, sep=sep, dtype=dtype)
    if sample_size > 0:
        df = df.get_chunk(sample_size)
    else:
        df = df.get_chunk()
    return df

amazon book 数据集中只有 userid, item_id, ratings, timestamp 构成

In [5]:
ratings = read_data(f'{data_dir}/{files[0]}', -1, ',', columns)
ratings.head()

Unnamed: 0,user,item,rating,timestamp
0,AH2L9G3DQHHAJ,116,4.0,1019865600
1,A2IIIDRK3PRRZY,116,1.0,1395619200
2,A1TADCM7YWPQ8M,868,4.0,1031702400
3,AWGH7V0BDOJKB,13714,4.0,1383177600
4,A3UTQPQPM4TQO0,13714,5.0,1374883200


In [6]:
ratings.describe()

Unnamed: 0,rating,timestamp
count,22507160.0,22507160.0
mean,4.295759,1310415000.0
std,1.111507,110615400.0
min,1.0,832550400.0
25%,4.0,1274573000.0
50%,5.0,1358813000.0
75%,5.0,1384560000.0
max,5.0,1406074000.0


和 MIND 中的处理方式一样，去除掉交互较少的用户数据和 item 数据

In [7]:
sub = ratings[['user', 'item']]
sub = sub.drop_duplicates().reset_index(drop=True)

找到交互多的用户

In [8]:
gu = sub.groupby(['user'])
utl = ((gu.count() > 5) & (gu.count() < 500)).reset_index()
utl.columns = ['user', 'usuit']
utl.shape

(8026324, 2)

用户看过的书最大为 40000 条，为了防止内存爆掉，应该去除多余的部分，比如说只保留前 500 条记录

In [9]:
print(np.sum(gu.count()['item']>5))
print(np.sum((((gu.count() > 5) | (gu.count() < 500))['item']) == True))

624295
8026324


找到交互多的item

In [10]:
iu = sub.groupby(['item'])
itl = (iu.count() > 200).reset_index()
itl.columns = ['item', 'isuit']
itl.shape

(2330066, 2)

In [11]:
sub_ = ratings.merge(utl).merge(itl)
sub_

Unnamed: 0,user,item,rating,timestamp,usuit,isuit
0,AH2L9G3DQHHAJ,0000000116,4.0,1019865600,False,False
1,A2IIIDRK3PRRZY,0000000116,1.0,1395619200,False,False
2,AH2L9G3DQHHAJ,0553107755,5.0,1019952000,False,True
3,ADDB0Y73L2CHU,0553107755,5.0,1056067200,True,True
4,A3Z4Y7K8YJK1F,0553107755,4.0,1094601600,True,True
...,...,...,...,...,...,...
22507150,ARCJK1T6IQLGE,B00LXJVSRQ,5.0,1405728000,False,False
22507151,AE8J4KWX7PL3R,B00LY7P80K,4.0,1405987200,False,False
22507152,A354637YD3MCTH,B00LZVRA5Q,5.0,1406073600,False,False
22507153,A1VIVS7VU2U7WM,B00M01MFQ4,4.0,1405900800,False,False


In [12]:
sub_1 = sub_[sub_['usuit']]
tmp = sub_1[sub_1['isuit']]
tmp.describe()

Unnamed: 0,rating,timestamp
count,2529902.0,2529902.0
mean,4.252771,1333670000.0
std,1.084229,92424480.0
min,1.0,835660800.0
25%,4.0,1330560000.0
50%,5.0,1367194000.0
75%,5.0,1387670000.0
max,5.0,1406074000.0


In [13]:
tmp.shape

(2529902, 6)

In [14]:
tmp['like_type']=np.where(tmp['rating']>=3, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


负样本在训练的时候进行拿取，在这里删除

In [15]:
tmp = tmp[tmp['like_type'] == 1]
tmp.shape

(2305196, 7)

In [16]:
tmp=tmp.drop(columns=['usuit', 'isuit', 'rating', 'timestamp', 'like_type'])

In [17]:
mapped2sequential(tmp, ['user', 'item'])

为每个用户生成购买记录

In [18]:
item_list_per_user = tmp.groupby(['user'])['item'].apply(list).reset_index()
item_list_per_user.columns = ['user', 'list']

In [19]:
item_list_per_user['true_list'] = item_list_per_user['list'].apply(lambda x: True if len(x) > 3 else False)

In [20]:
item_list_per_user = item_list_per_user[item_list_per_user['true_list']]
item_list_per_user = item_list_per_user.drop(columns=['true_list'])
item_list_per_user

Unnamed: 0,user,list
0,1,"[1, 11, 527, 540, 591, 745, 2492, 2714, 2738, ..."
1,2,"[1, 822, 1155, 1162, 2618, 3774, 3777, 8254]"
2,3,"[1, 2, 11, 85, 103, 161, 526, 540, 549, 569, 6..."
3,4,"[1, 7, 10, 37, 51, 95, 98, 164, 167, 256, 423,..."
4,5,"[1, 284, 654, 815, 1034, 1647, 1692, 2244, 2510]"
...,...,...
475472,475473,"[10988, 11016, 11060, 11081]"
475474,475475,"[10988, 11050, 11059, 11066, 11079]"
475476,475477,"[10988, 11008, 11088, 11092]"
475947,475948,"[11046, 11070, 11080, 11085]"


In [21]:
tmp = tmp.merge(item_list_per_user, on='user')
tmp

Unnamed: 0,user,item,list
0,1,1,"[1, 11, 527, 540, 591, 745, 2492, 2714, 2738, ..."
1,1,11,"[1, 11, 527, 540, 591, 745, 2492, 2714, 2738, ..."
2,1,527,"[1, 11, 527, 540, 591, 745, 2492, 2714, 2738, ..."
3,1,540,"[1, 11, 527, 540, 591, 745, 2492, 2714, 2738, ..."
4,1,591,"[1, 11, 527, 540, 591, 745, 2492, 2714, 2738, ..."
...,...,...,...
1802997,476001,11050,"[11050, 11059, 11077, 11078, 11079]"
1802998,476001,11059,"[11050, 11059, 11077, 11078, 11079]"
1802999,476001,11077,"[11050, 11059, 11077, 11078, 11079]"
1803000,476001,11078,"[11050, 11059, 11077, 11078, 11079]"


In [22]:
tmp.describe()

Unnamed: 0,user,item
count,1803002.0,1803002.0
mean,170863.0,4895.508
std,126471.5,3061.903
min,1.0,1.0
25%,56387.0,2298.0
50%,144363.0,4579.0
75%,284065.0,7286.0
max,476001.0,11093.0


生成用户测试数据和 item数据集

In [23]:
tmp = tmp.sample(frac=1).reset_index(drop=True)
tmp

Unnamed: 0,user,item,list
0,66832,4332,"[244, 303, 312, 313, 429, 473, 568, 1385, 2071..."
1,47275,1203,"[170, 746, 1197, 1201, 1203, 1249, 1292, 1392,..."
2,42510,1281,"[158, 544, 1281, 2582, 6333]"
3,5476,545,"[16, 85, 94, 529, 545, 772, 1606, 2561, 3662, ..."
4,284468,6408,"[2823, 3636, 4871, 4936, 6267, 6408, 6939, 7311]"
...,...,...,...
1802997,9061,190,"[27, 166, 169, 174, 190, 198, 5423, 6542, 7216..."
1802998,28940,6753,"[113, 825, 973, 1127, 3636, 4242, 4575, 6753, ..."
1802999,101911,1290,"[534, 621, 1263, 1265, 1267, 1290, 2009, 2494,..."
1803000,49646,391,"[185, 290, 391, 5443]"


In [24]:
item_data = tmp['item'].drop_duplicates().reset_index(drop=True)
item_data = pd.DataFrame(item_data)

In [29]:
tmp['item'].max(), tmp['user'].max()

(11093, 476001)

In [32]:
mapped2sequential(sub_, ['user', 'item'])

In [33]:
sub_['item'].max(), sub_['user'].max()

(2330066, 8026324)