In [1]:
import os
import random
import pickle
import numpy as np

# Seed Python's built-in `random` module so that values drawn from it repeat across runs.
# NOTE(review): only `random` is seeded here; NumPy's generator is not — confirm that is intended.
random.seed(1234)

In [2]:
# Load the datasets and keep only the columns that are used downstream.
# NOTE(security): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../raw_data/reviews.pkl', 'rb') as f:
  reviews_df = pickle.load(f)
with open('../raw_data/meta.pkl', 'rb') as f:
  meta_df = pickle.load(f)

# Take explicit copies of the column selections: columns of both frames are
# reassigned below and in later cells, and assigning into a bare selection
# triggers pandas' SettingWithCopyWarning.
reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime', 'reviewText']].copy()
meta_df = meta_df[['asin', 'categories', 'imUrl']].copy()

# Reduce each 'categories' value to the last element of its last element.
meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1])
# Convert each URL into the key used to look up its image embedding: the
# basename of the URL, or 'not_available' when the value is not a string.
meta_df['imUrl'] = meta_df['imUrl'].map(lambda url: os.path.basename(url) if isinstance(url, str) else 'not_available')

In [3]:
def build_map(df, col_name):
  """Replace the values of ``df[col_name]`` with dense integer IDs, in place.

  The distinct values of the column are sorted, and each one receives its
  position in that sorted order as its ID.

  Args:
    df: DataFrame whose column is rewritten in place.
    col_name: Name of the column to encode.

  Returns:
    A pair ``(m, key)``: ``m`` is a dict from original value to integer ID,
    and ``key`` is the sorted list of original values, so ``key[i]`` is the
    value that was assigned ID ``i``.
  """
  key = df[col_name].unique().tolist()
  key.sort()
  m = {value: idx for idx, value in enumerate(key)}
  df[col_name] = df[col_name].map(lambda value: m[value])
  return m, key

In [4]:
# 商品、カテゴリ、レビュアーのIDを整数へ変換
asin_map, asin_key = build_map(meta_df, 'asin')
cate_map, cate_key = build_map(meta_df, 'categories')
revi_map, revi_key = build_map(reviews_df, 'reviewerID')
img_map, img_key = build_map(meta_df, 'imUrl')

In [5]:
# Inspect reviews_df after ID encoding: reviewerID is now an integer ID,
# while asin still holds the original string identifier.
reviews_df

Unnamed: 0,reviewerID,asin,unixReviewTime,reviewText
0,176008,0528881469,1370131200,We got this GPS for my husband who is an (OTR)...
1,173739,0528881469,1290643200,"I'm a professional OTR truck driver, and I bou..."
2,134504,0528881469,1283990400,"Well, what can I say. I've had this unit in m..."
3,24476,0528881469,1290556800,"Not going to write a long review, even thought..."
4,57419,0528881469,1317254400,I've had mine for a year and here's what we go...
...,...,...,...,...
1689183,107787,B00LGQ6HL8,1405555200,Burned these in before listening to them for a...
1689184,22987,B00LGQ6HL8,1405382400,Some people like DJ style headphones or earbud...
1689185,58729,B00LGQ6HL8,1405555200,I&#8217;m a big fan of the Brainwavz S1 (actua...
1689186,19901,B00LGQ6HL8,1405641600,"I've used theBrainwavz S1 In Ear Headphones, a..."


In [7]:
# Inspect meta_df after ID encoding: asin, categories and imUrl are integer IDs.
# NOTE(review): this cell's execution count (7) is higher than that of the
# cell that follows it (6), so the notebook was not executed top to bottom.
meta_df

Unnamed: 0,asin,categories,imUrl
0,0,738,50992
1,1,157,53416
2,2,571,26870
3,3,707,38759
4,7,799,49743
...,...,...,...
62996,62997,368,52493
62997,62998,63,16217
62998,62996,475,48578
62999,62999,674,31126


In [6]:
# Dataset sizes: distinct reviewers, items and categories (sizes of the
# encoding dicts) and the number of review rows.
user_count = len(revi_map)
item_count = len(asin_map)
cate_count = len(cate_map)
example_count = reviews_df.shape[0]

print('user_count: %d\titem_count: %d\tcate_count: %d\texample_count: %d' %
      (user_count, item_count, cate_count, example_count))
# Number of distinct image keys (including the 'not_available' placeholder, if any).
print('image_count: %d' % len(img_map))

user_count: 192403	item_count: 63001	cate_count: 801	example_count: 1689188
image_count: 59881


In [8]:
# Order the items by their integer ID and renumber the rows from 0,
# discarding the previous index.
meta_df = meta_df.sort_values('asin').reset_index(drop=True)

In [9]:
# Inspect meta_df after sorting by asin and resetting the index.
meta_df

Unnamed: 0,asin,categories,imUrl
0,0,738,50992
1,1,157,53416
2,2,571,26870
3,3,707,38759
4,4,714,31480
...,...,...,...
62996,62996,475,48578
62997,62997,368,52493
62998,62998,63,16217
62999,62999,674,31126


In [10]:
# Encode each review's item identifier with the mapping built from meta_df.
# A review whose asin is not a key of asin_map raises KeyError.
# NOTE(review): the column is overwritten in place, so re-running this cell
# applies the mapping to values that are already encoded.
reviews_df['asin'] = reviews_df['asin'].map(lambda raw_asin: asin_map[raw_asin])

# Order the reviews by reviewer and then by review time, and renumber the rows.
reviews_df = (
    reviews_df
    .sort_values(['reviewerID', 'unixReviewTime'])
    .reset_index(drop=True)
)

In [11]:
# Inspect reviews_df after encoding asin and sorting by reviewer and review time.
reviews_df

Unnamed: 0,reviewerID,asin,unixReviewTime,reviewText
0,0,13179,1400457600,So the screen itself is OK. it is an actual sc...
1,0,17993,1400457600,I had a complicated set up for my screen. I ne...
2,0,28326,1400457600,The mount is good if you account for the play ...
3,0,29247,1400457600,For some reason this product doesnt work that ...
4,0,62275,1400457600,Great box Exactly what i needed. it isnt water...
...,...,...,...,...
1689183,192402,57576,1389744000,It fits so so bit it doesn't really support th...
1689184,192402,22519,1396396800,"So I carry my iPad mini, phone and wallet in i..."
1689185,192402,20977,1404172800,Its a cable. It works well and it is great for...
1689186,192402,60283,1404172800,I would give these 4.5 stars if possible. The...


In [14]:
# Show all review rows that belong to the reviewer encoded as ID 0.
# NOTE(review): the recorded output has no reviewText column and this cell's
# execution count (14) is higher than that of the cell further down that drops
# the column (12), so this cell was run after it.
reviews_df[reviews_df['reviewerID'] == 0]

Unnamed: 0,reviewerID,asin,unixReviewTime
0,0,13179,1400457600
1,0,17993,1400457600
2,0,28326,1400457600
3,0,29247,1400457600
4,0,62275,1400457600


In [16]:
# Item IDs reviewed by reviewer 0 as a plain Python list, in the frame's row order.
reviews_df[reviews_df['reviewerID'] == 0]['asin'].tolist()

[13179, 17993, 28326, 29247, 62275]

In [12]:
# Set the review texts aside as a NumPy object array, in the frame's current row order.
texts = np.array(reviews_df['reviewText'], dtype=object)
# Keep only the ID and timestamp columns; the text column is dropped from the frame.
reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']]

In [17]:
# NOTE(review): this rebinds reviews_df, replacing the frame built in the
# cells above with one loaded from disk.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open('../raw_data/remap.pkl', 'rb') as f:
  # Review data in which asin has already been converted to an integer
  reviews_df = pickle.load(f)

In [18]:
reviews_df

Unnamed: 0,reviewerID,asin,unixReviewTime
0,0,13179,1400457600
1,0,17993,1400457600
2,0,28326,1400457600
3,0,29247,1400457600
4,0,62275,1400457600
...,...,...,...
131173,14999,54333,1392422400
131174,14999,33077,1392940800
131175,14999,60974,1397174400
131176,14999,16776,1398384000


In [19]:
# Load the pickled object stored in text_embeddings.pkl (presumably the
# review-text embeddings — the container type and layout are not visible here;
# TODO confirm).
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open('../raw_data/text_embeddings.pkl', 'rb') as f:
  r = pickle.load(f)

In [20]:
# Number of entries in the object loaded from text_embeddings.pkl.
len(r)

50360