In [24]:
import os
import gzip
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [25]:
def parse(path):
    """Yield one record per line of a gzip-compressed file.

    Every line of the file is evaluated as a Python expression and the
    resulting object is yielded.

    Args:
        path: Path of the gzip-compressed file to read.

    Yields:
        The object obtained by evaluating each line, in file order.
    """
    # The context manager closes the gzip handle once the generator is
    # exhausted (or closed early) instead of leaking it.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY NOTE(review): eval() executes whatever the line
            # contains. This is only acceptable for files from a trusted
            # source; consider ast.literal_eval if every record is a plain
            # Python literal -- TODO confirm against the data.
            yield eval(l)

def get_df(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, so the result
    has one row per record with that position as its index.

    Args:
        path: Path of the gzip-compressed file to read.

    Returns:
        A ``pandas.DataFrame`` built from the records with ``orient='index'``.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

In [26]:
# Dataset name; it is substituted into the directory and file names below.
DATASET = 'Toys_and_Games'
# Working directory (relative to the current directory) that holds the raw
# downloads and every CSV written by this notebook.
RAW_PATH = os.path.join('./', DATASET)
# File name of the review (interaction) records.
DATA_FILE = 'reviews_{}_5.json.gz'.format(DATASET)
# File name of the item metadata records.
META_FILE = 'meta_{}.json.gz'.format(DATASET)

# Seed passed to np.random.seed before the dataset is built.
RANDOM_SEED = 0
# Number of negative items sampled for every dev/test row.
NEG_ITEMS = 99

# Load Data

1. Load interaction data and item metadata
2. Filter out metadata for items that do not appear in the interaction data
3. Calculate basic statistics

In [27]:
# Download the raw files unless they are already present.

def _download_if_missing(file_name, description):
    """Fetch ``file_name`` into RAW_PATH with curl unless it already exists.

    Args:
        file_name: Name of the remote file; also used as the local file name.
        description: Human-readable label used in the progress message.

    Raises:
        subprocess.CalledProcessError: If curl exits with a non-zero status.
        OSError: If curl cannot be started.
    """
    target = os.path.join(RAW_PATH, file_name)
    if os.path.exists(target):
        return
    print('Downloading {} into {}'.format(description, RAW_PATH))
    url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/' + file_name
    try:
        # Argument list + cwd instead of a shell string: nothing is
        # interpreted by a shell. check_call (and curl's -f) surface a failed
        # download instead of silently ignoring the exit status.
        subprocess.check_call(['curl', '-f', '-O', url], cwd=RAW_PATH)
    except (subprocess.CalledProcessError, OSError):
        # Remove a partial file so the exists() guard above does not mistake
        # it for a complete download on the next run.
        if os.path.exists(target):
            os.remove(target)
        raise


# Portable replacement for shelling out to `mkdir`.
os.makedirs(RAW_PATH, exist_ok=True)
_download_if_missing(DATA_FILE, 'interaction data')
_download_if_missing(META_FILE, 'item metadata')

Downloading interaction data into ./Toys_and_Games
Downloading item metadata into ./Toys_and_Games


In [28]:
# Read the interaction (review) file into a DataFrame, one row per record,
# and preview the first rows.
data_df = get_df(os.path.join(RAW_PATH, DATA_FILE))
data_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1VXOAVRGKGEAK,439893577,Angie,"[0, 0]",I like the item pricing. My granddaughter want...,5.0,Magnetic board,1390953600,"01 29, 2014"
1,A8R62G708TSCM,439893577,Candace,"[1, 1]",Love the magnet easel... great for moving to d...,4.0,it works pretty good for moving to different a...,1395964800,"03 28, 2014"
2,A21KH420DK0ICA,439893577,capemaychristy,"[1, 1]",Both sides are magnetic. A real plus when you...,5.0,love this!,1359331200,"01 28, 2013"
3,AR29QK6HPFYZ4,439893577,dcrm,"[0, 0]",Bought one a few years ago for my daughter and...,5.0,Daughters love it,1391817600,"02 8, 2014"
4,ACCH8EOML6FN5,439893577,DoyZ,"[1, 1]",I have a stainless steel refrigerator therefor...,4.0,Great to have so he can play with his alphabet...,1399248000,"05 5, 2014"


In [29]:
# Read the item metadata file into a DataFrame, one row per record,
# and preview the first rows.
meta_df = get_df(os.path.join(RAW_PATH, META_FILE))
meta_df.head()

Unnamed: 0,asin,description,title,price,salesRank,imUrl,brand,categories,related
0,191639,"Three Dr. Suess' Puzzles: Green Eggs and Ham, ...",Dr. Suess 19163 Dr. Seuss Puzzle 3 Pack Bundle,37.12,{'Toys & Games': 612379},http://ecx.images-amazon.com/images/I/414PLROX...,Dr. Seuss,"[[Toys & Games, Puzzles, Jigsaw Puzzles]]",
1,5069491,,Nursery Rhymes Felt Book,,{'Toys & Games': 576683},http://ecx.images-amazon.com/images/I/51z4JDBC...,,[[Toys & Games]],
2,76561046,Learn Fractions Decimals Percents using flash ...,Fraction Decimal Percent Card Deck,,{'Toys & Games': 564211},http://ecx.images-amazon.com/images/I/51ObabPu...,,"[[Toys & Games, Learning & Education, Flash Ca...",{'also_viewed': ['0075728680']}
3,131358936,"New, Sealed. Fast Shipping with tracking, buy ...",,36.22,{'Software': 8080},http://ecx.images-amazon.com/images/I/51%2B7Ej...,,"[[Toys & Games, Learning & Education, Mathemat...","{'also_bought': ['0321845536', '0078787572'], ..."
4,133642984,,Algebra 2 California Teacher Center,731.93,{'Toys & Games': 1150291},http://ecx.images-amazon.com/images/I/51VK%2BL...,Prentice Hall,"[[Toys & Games, Learning & Education, Mathemat...",


In [30]:
# Keep only the metadata rows whose item also occurs in the interaction data,
# and remember the set of those item identifiers.
appears_in_interactions = meta_df['asin'].isin(data_df['asin'])
useful_meta_df = meta_df.loc[appears_in_interactions].reset_index(drop=True)
all_items = set(useful_meta_df['asin'])

def related_filter(related_dict, valid_items=None):
    """Restrict every relation list of an item to the retained items.

    Args:
        related_dict: Mapping from relation name to a list of item
            identifiers, or a missing value (NaN / None) when the item has no
            related information.
        valid_items: Collection of item identifiers to keep. Defaults to the
            module-level ``all_items`` set.

    Returns:
        A dict with the same relation names whose lists contain only the
        identifiers found in ``valid_items``, de-duplicated and in their
        original order. An empty dict for a missing ``related_dict``.
    """
    if valid_items is None:
        valid_items = all_items
    # A type check replaces `is not np.nan`: the identity test only recognises
    # the np.nan singleton and lets any other missing value (another NaN
    # object, None) fall through to the loop, where it raises.
    if not isinstance(related_dict, dict):
        return dict()
    # dict.fromkeys de-duplicates while preserving order, so the output is
    # reproducible across runs (a set intersection has no stable order).
    return {
        relation: [item for item in dict.fromkeys(items) if item in valid_items]
        for relation, items in related_dict.items()
    }

# Replace each item's `related` entry with the dict returned by related_filter.
useful_meta_df['related'] = useful_meta_df['related'].apply(related_filter)

### Statistics

In [31]:
# Basic statistics of the interaction data: distinct users, distinct items,
# number of rows, and the earliest/latest review timestamp.
n_users = data_df['reviewerID'].nunique()
n_items = data_df['asin'].nunique()
n_clicks = data_df.shape[0]
review_times = data_df['unixReviewTime']
min_time = review_times.min()
max_time = review_times.max()

In [32]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
# pd.to_datetime(..., unit='s') converts epoch seconds to a (UTC-based, naive)
# timestamp. It replaces datetime.utcfromtimestamp, which is deprecated since
# Python 3.12, without requiring a new import.
print('Time Span: {}/{}'.format(
    pd.to_datetime(min_time, unit='s').strftime(time_format),
    pd.to_datetime(max_time, unit='s').strftime(time_format))
)

# Users: 19412
# Items: 11924
# Interactions: 167597
Time Span: 2000-07-28/2014-07-23


# Build Dataset

### Interaction data

In [33]:
# Seed NumPy's global random generator; the negative sampling below draws
# from np.random.
np.random.seed(RANDOM_SEED)

In [34]:
# Reduce the reviews to (user_id, item_id, time) triples, drop exact
# duplicates, and order them by time (ties broken by user_id).
column_names = {'reviewerID': 'user_id', 'asin': 'item_id', 'unixReviewTime': 'time'}
out_df = (
    data_df[list(column_names)]
    .rename(columns=column_names)
    .drop_duplicates(['user_id', 'item_id', 'time'])
    .sort_values(by=['time', 'user_id'], kind='mergesort')
    .reset_index(drop=True)
)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,A2B8GXSCB1R05T,B00004SDAP,964742400
1,A32JIQG3B1XX5I,B00000IW2S,966297600
2,A1M2T0J45TTE64,B00000IZXG,967680000
3,ADX9NTN40T1HN,B00000IZOU,968025600
4,ADX9NTN40T1HN,B00000IZQQ,968025600


In [35]:
# Re-index users and items with consecutive integers starting at 1,
# assigned in sorted order of the original identifiers.

uids = sorted(out_df['user_id'].unique())
iids = sorted(out_df['item_id'].unique())
user2id = {raw_id: new_id for new_id, raw_id in enumerate(uids, start=1)}
item2id = {raw_id: new_id for new_id, raw_id in enumerate(iids, start=1)}

# Every identifier in out_df is a key of the corresponding mapping, so a
# plain dictionary lookup via map() is sufficient.
out_df['user_id'] = out_df['user_id'].map(user2id)
out_df['item_id'] = out_df['item_id'].map(item2id)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,6699,413,964742400
1,10631,242,966297600
2,3119,294,967680000
3,16241,286,968025600
4,16241,291,968025600


### Find the cutoff between advantaged and disadvantaged users

In [36]:
# Count the number of interactions of every user.
#
# groupby(...).size() replaces a row-by-row iterrows() loop that looked each
# row up again with out_df.iloc[idx]: that was only correct because the index
# had been reset to 0..n-1, and it built a full row object per iteration.
# sort=False keeps users in order of first appearance, as the loop did.
interaction_counts = out_df.groupby('user_id', sort=False).size()
user_dict = interaction_counts.to_dict()

user_interact_df = pd.DataFrame(
    list(user_dict.items()), columns=['user_id', '#_of_interactions'])

# Cutoff separating the two user groups: the 90th percentile of the per-user
# interaction counts. Users are later compared against it with `>=`.
cutoff = user_interact_df['#_of_interactions'].quantile(0.9)

In [37]:
# Leave-one-out splitting: first record, for every user, the set of all items
# that user interacted with (used to reject sampled negatives).

clicked_item_set = {
    uid: set(user_rows['item_id'].values.tolist())
    for uid, user_rows in out_df.groupby('user_id')
}

def generate_dev_test(data_df):
    """Hold out the last two interactions of every user, with sampled negatives.

    Two passes are made. Each pass takes the last remaining row of every user,
    removes those rows from ``data_df`` and attaches a ``neg_items`` column
    holding ``NEG_ITEMS`` randomly drawn item ids per row. A drawn id is
    redrawn until it is not in that user's ``clicked_item_set`` entry.

    Args:
        data_df: Interaction frame with ``user_id`` and ``item_id`` columns.

    Returns:
        A tuple ``(held_out, remaining)`` where ``held_out`` is a list with
        the frame of the first pass followed by the frame of the second pass,
        and ``remaining`` is ``data_df`` without the held-out rows.
    """
    held_out = []
    # NOTE(review): this is the number of distinct items in the frame passed
    # in, used as the upper bound of the sampled id range. If item ids extend
    # beyond this count, those ids are never drawn -- confirm this is intended.
    n_items = data_df['item_id'].nunique()
    for _ in range(2):
        last_rows = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(last_rows.index)
        candidates = np.random.randint(1, n_items + 1, (len(last_rows), NEG_ITEMS))
        for row, uid in enumerate(last_rows['user_id'].values):
            seen_items = clicked_item_set[uid]
            for col in range(candidates.shape[1]):
                # Redraw until the candidate is an item the user never clicked.
                while candidates[row, col] in seen_items:
                    candidates[row, col] = np.random.randint(1, n_items + 1)
        last_rows['neg_items'] = candidates.tolist()
        held_out.append(last_rows)
    return held_out, data_df

In [38]:
# The first interaction of every user always stays in the training set.
leave_df = out_df.groupby('user_id').head(1)
# A dedicated name is used for the remaining interactions so the raw review
# frame `data_df` is not overwritten; the earlier cells that read the raw
# columns from `data_df` therefore stay re-runnable.
remaining_df = out_df.drop(leave_df.index)

# First held-out pass -> test (last interaction), second pass -> dev.
[test_df, dev_df], remaining_df = generate_dev_test(remaining_df)
train_df = pd.concat([leave_df, remaining_df]).sort_index()

# The three splits must partition the interaction data.
assert len(train_df) + len(dev_df) + len(test_df) == len(out_df)

len(train_df), len(dev_df), len(test_df)

(128773, 19412, 19412)

### Output.csv 

In [39]:
#output_df = dev_df.append(test_df).sort_index()
#output_df['#_of_transactions'] = output_df['neg_items'].str.len()
#output_df["group"] = ""

#cutoff = output_df['#_of_transactions'].quantile(0.95)

#output_df.head()

#for idx, row in output_df.iterrows():
    #group_val = 1
    #if(int(output_df.loc[idx]['#_of_transactions']) < cutoff):
        #group_val = 0
    #output_df.at[idx, 'group'] = group_val

#print(output_df['#_of_transactions'].min())


In [40]:
# Preview the held-out test rows together with their sampled negative items.
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
18,14194,412,973814400,"[2733, 10800, 9846, 3265, 4860, 9226, 7892, 43..."
20,6232,188,974678400,"[3763, 10147, 9009, 2436, 1635, 974, 4465, 102..."
45,15758,376,980640000,"[11053, 8409, 8217, 11219, 5104, 7940, 2283, 1..."
51,10631,529,985737600,"[3613, 8402, 6762, 3969, 8151, 1041, 6251, 835..."
60,9177,396,990230400,"[8473, 1029, 2661, 2354, 10584, 5663, 7735, 83..."


### Separate dev and test sets into advantaged vs. disadvantaged user groups

In [41]:
def _split_by_user_activity(eval_df):
    """Split an evaluation frame into advantaged and disadvantaged users.

    A row is advantaged when its user's interaction count (looked up in
    ``user_dict``) is at least ``cutoff``. A user without an entry in
    ``user_dict`` is treated as disadvantaged.

    A boolean mask replaces the previous row-by-row ``DataFrame.append``
    (removed in pandas 2.0 and quadratic in the number of rows) and the empty
    frames whose columns were given as a set (no defined order, rejected by
    recent pandas versions). Column order and dtypes of ``eval_df`` are kept.

    Args:
        eval_df: Frame with a ``user_id`` column.

    Returns:
        A tuple ``(advantaged_df, disadvantaged_df)``.
    """
    n_interactions = eval_df['user_id'].map(user_dict)
    is_advantaged = n_interactions >= cutoff
    return eval_df[is_advantaged].copy(), eval_df[~is_advantaged].copy()


adv_dev_df, disadv_dev_df = _split_by_user_activity(dev_df)
adv_test_df, disadv_test_df = _split_by_user_activity(test_df)

In [42]:
# Write every split to RAW_PATH as a tab-separated file without the index.
frames_to_save = {
    'adv_dev.csv': adv_dev_df,
    'disadv_dev.csv': disadv_dev_df,
    'adv_test.csv': adv_test_df,
    'disadv_test.csv': disadv_test_df,
    'train.csv': train_df,
    'dev.csv': dev_df,
    'test.csv': test_df,
}
for file_name, frame in frames_to_save.items():
    frame.to_csv(os.path.join(RAW_PATH, file_name), sep='\t', index=False)

### Item Metadata

In [43]:
# level-2 category: the third entry of each item's first category path
# (NaN when that path has fewer than three entries), encoded as an integer.

def _encode_l2_category(name):
    """Map a category name to its 1-based code; NaN (missing) maps to 0."""
    if name != name:  # only NaN is unequal to itself
        return 0
    return l2_dict[name]

l2_cate_lst = [
    paths[0][2] if len(paths[0]) > 2 else np.nan
    for paths in useful_meta_df['categories']
]
useful_meta_df['l2_category'] = l2_cate_lst
l2_cates = sorted(useful_meta_df['l2_category'].dropna().unique())
l2_dict = {name: code for code, name in enumerate(l2_cates, start=1)}
useful_meta_df['l2_category'] = useful_meta_df['l2_category'].apply(_encode_l2_category)

In [44]:
# Build one metadata record per retained item: its re-indexed id, its encoded
# category, and its 'also_bought' / 'also_viewed' lists translated to the new
# item ids (an empty list when the relation is absent).

def _to_item_ids(asins):
    """Translate a list of original item identifiers with item2id."""
    return [item2id[asin] for asin in asins]

item_meta_data = dict()
meta_columns = zip(
    useful_meta_df['asin'],
    useful_meta_df['l2_category'],
    useful_meta_df['related'],
)
for position, (asin, category, related) in enumerate(meta_columns):
    item_meta_data[position] = {
        'item_id': item2id[asin],
        'i_category': category,
        'r_complement': _to_item_ids(related['also_bought']) if 'also_bought' in related else [],
        'r_substitute': _to_item_ids(related['also_viewed']) if 'also_viewed' in related else [],
    }

item_meta_df = pd.DataFrame.from_dict(item_meta_data, orient='index')
item_meta_df = item_meta_df[['item_id', 'i_category', 'r_complement', 'r_substitute']]
item_meta_df.head()

Unnamed: 0,item_id,i_category,r_complement,r_substitute
0,1,119,"[4529, 2296, 4624, 5147, 1982, 1375, 3746, 488...","[488, 2148, 2296, 1993, 3587, 4624, 2139, 2532..."
1,2,45,"[5927, 1712, 306]",[1712]
2,3,81,"[3040, 1363, 127, 10535, 10078, 87, 1768, 5585...","[87, 54, 3724]"
3,4,109,"[10526, 8152, 8788, 7118, 6827, 6988, 2784, 32...","[10526, 8152, 6827, 6959, 6992, 2784, 7123, 11..."
4,5,6,"[6856, 6427, 3662, 3987, 56, 3652, 3727, 3518,...","[6535, 3987, 3556]"


### Separate advantaged vs. disadvantaged item metadata within category (to gauge item fairness)

In [45]:
# Advantaged items are those whose number of complement links
# (length of `r_complement`) is at or above the 95th percentile.
#
# The counts live in a separate Series, so item_meta_df is never given (and
# then stripped of) a temporary 'length' column. The threshold has its own
# name so the user-level `cutoff` computed earlier is not overwritten.
# Boolean masks replace the row-by-row DataFrame.append, which was removed in
# pandas 2.0.
item_meta_columns = ['item_id', 'i_category', 'r_complement', 'r_substitute']
complement_counts = item_meta_df['r_complement'].str.len()
item_cutoff = complement_counts.quantile(0.95)

is_advantaged_item = complement_counts >= item_cutoff
adv_item_meta_df = item_meta_df.loc[is_advantaged_item, item_meta_columns].copy()
disadv_item_meta_df = item_meta_df.loc[~is_advantaged_item, item_meta_columns].copy()

In [None]:
# save results
# Only the full item metadata table is written (tab-separated, no index);
# the two exports of the advantaged/disadvantaged item splits are disabled.
#adv_item_meta_df.to_csv(os.path.join(RAW_PATH, 'adv_item_meta.csv'), sep='\t', index=False)
#disadv_item_meta_df.to_csv(os.path.join(RAW_PATH, 'disadv_item_meta.csv'), sep='\t', index=False)
item_meta_df.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), sep='\t', index=False)