In [1]:
import pickle as pkl
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import time
import datetime

In [2]:
# some helper functions to load raw data

def read_user_rating_records(dir_path, rating_file):
    """Read a header-less, comma-separated rating file into a DataFrame.

    Args:
        dir_path: Directory prefix; it is concatenated with ``rating_file``
            as plain strings, so it must already end with a path separator.
        rating_file: File name of the ratings CSV inside ``dir_path``.

    Returns:
        A DataFrame whose columns are named ``user_id``, ``item_id``,
        ``rating`` and ``timestamp`` (in file column order).
    """
    return pd.read_csv(
        dir_path + rating_file,
        sep=',',
        names=['user_id', 'item_id', 'rating', 'timestamp'],
        engine='python',
    )

def remove_infrequent_node(df, node_type, min_counts=5):
    """Drop every row whose ``node_type`` value occurs fewer than ``min_counts`` times.

    Args:
        df: Interaction records; must contain the column ``node_type``.
        node_type: Name of the column identifying the node (e.g. ``'user_id'``).
        min_counts: Minimum number of rows a node needs in order to be kept.

    Returns:
        A ``(kept_rows, n_removed)`` tuple: the filtered frame (a new object,
        the input is not modified) and the number of distinct nodes dropped.
    """
    occurrences = df[node_type].value_counts()
    frequent_nodes = occurrences.loc[occurrences >= min_counts].index

    n_before = df[node_type].nunique(dropna=False)
    kept_rows = df[df[node_type].isin(frequent_nodes)]
    n_after = kept_rows[node_type].nunique(dropna=False)

    return kept_rows, n_before - n_after

def save_obj(obj, name):
    """Pickle ``obj`` to the file ``name + '.pkl'``.

    Args:
        obj: Any picklable object.
        name: Target path without extension; ``'.pkl'`` is appended.
    """
    target_path = name + '.pkl'
    with open(target_path, mode='wb') as sink:
        pkl.dump(obj, sink)
        
def load_obj(path, name):
    """Load a pickled object from the file ``path + name``.

    Args:
        path: Directory prefix; concatenated with ``name`` as plain strings.
        name: File name including its extension (nothing is appended here).

    Returns:
        The unpickled object.
    """
    source_path = path + name
    # NOTE: unpickling executes arbitrary code -- only load files you trust.
    with open(source_path, mode='rb') as source:
        loaded = pkl.load(source, encoding='latin1')
    return loaded

In [30]:
def _binarize_ratings(data_records, rating_col='rating'):
    """Turn explicit ratings into positive-only implicit feedback.

    Ratings <= 3 are overwritten with 0 and ratings > 3 with 1 (in place on
    ``data_records``); only the rows whose value is then > 0 are returned.
    Rows with a missing rating fail every comparison and are dropped as well.
    """
    data_records.loc[data_records[rating_col] <= 3, rating_col] = 0
    data_records.loc[data_records[rating_col] > 3, rating_col] = 1
    return data_records[data_records[rating_col] > 0]


def load_dataset(dataset_name):
    """Load the raw interaction records of one of the supported datasets.

    File locations are hard-coded relative to the current working directory.
    Datasets that carry explicit ratings (Amazon_*, movielens, yelp) are
    reduced to their positive interactions (rating > 3).

    Args:
        dataset_name: One of ``'Amazon_Books'``, ``'Amazon_CDs'``,
            ``'Amazon_Electronics'``, ``'Gowalla'``, ``'movielens'`` (the
            historical misspelling ``'moivelens'`` is still accepted),
            ``'lastfm-2k'``, ``'tb2014'``, ``'tb2015'``, ``'alimama'`` or
            ``'yelp'``.

    Returns:
        A DataFrame with at least ``user_id``, ``item_id`` and ``timestamp``
        columns.  The rating column is called ``rating`` for the Amazon and
        yelp datasets but ``ratings`` for movielens.

    Raises:
        ValueError: If ``dataset_name`` is not a supported dataset.
    """
    # Amazon_Books / Amazon_CDs
    if dataset_name in ['Amazon_Books', 'Amazon_CDs']:
        dir_path = 'Amazon_Books/' if dataset_name == 'Amazon_Books' else 'Amazon_CDs/'
        rating_file = 'ratings_Books.csv' if dataset_name == 'Amazon_Books' else 'ratings_CDs_and_Vinyl.csv'

        data_records = pd.read_csv(dir_path + rating_file, sep=',', engine='python',
                                   names=['user_id', 'item_id', 'rating', 'timestamp'])
        data_records = _binarize_ratings(data_records)

    # Amazon_Electronics
    elif dataset_name == 'Amazon_Electronics':
        data_records = pd.read_csv('Amazon_Electronics/' + 'ratings_Electronics-original.csv',
                                   sep=',', engine='python',
                                   names=['user_id', 'item_id', 'rating', 'timestamp'])
        data_records = _binarize_ratings(data_records)

    elif dataset_name == 'Gowalla':
        rating_file = './gowalla/Gowalla_totalCheckins.txt'

        # NOTE(review): these keys ('0', '4', '1') do not match any of the column
        # names assigned below, so this mapping presumably has no effect -- confirm
        # and either key it by column name or drop it.
        dtypes = {'0': np.int64, '4': np.int64, '1': np.float64}
        data_records = pd.read_csv(rating_file, sep=r'\t', engine='python', encoding='latin-1',
                                   names=['user_id','timestamp','item_id'], usecols=[0,1,4], parse_dates=[1], dtype=dtypes)

    # 'moivelens' is the original (misspelled) key; keep it working for existing callers.
    elif dataset_name in ('moivelens', 'movielens'):
        rating_file = './ml-10M/ratings.dat'

        dtypes = {'user_id': np.int64, 'item_id': np.int64, 'ratings': np.float32, 'timestamp': np.float64}
        data_records = pd.read_csv(rating_file, sep=r'\:\:', engine='python', encoding='latin-1',
                           header=None, names=['user_id', 'item_id', 'ratings', 'timestamp'], dtype=dtypes)

        # This dataset names its rating column 'ratings' (plural).
        data_records = _binarize_ratings(data_records, 'ratings')

    elif dataset_name == 'lastfm-2k':
        rating_file = './lastfm-2k/user_taggedartists-timestamps.dat'

        dtypes = {'user_id': np.int64, 'item_id': np.int64}
        # header=0 together with names= replaces the file's own header row.
        data_records = pd.read_csv(rating_file, sep=r'\t', engine='python', encoding='latin-1',
                           header=0, names=['user_id', 'item_id', 'tag_id', 'timestamp'], dtype=dtypes)

    elif dataset_name == 'tb2014':
        rating_file = './tb2014/tianchi_mobile_recommend_train_user.csv'

        col_names = ['user_id', 'item_id', 'behavior_type', 'time']
        data_records = pd.read_csv(rating_file, sep=',', usecols=col_names, engine='python')
        data_records = data_records.rename(columns={"time": "timestamp"})

    elif dataset_name == 'tb2015':
        rating_file = './tb2015/ijcai2016_taobao.csv'

        col_names = ['use_ID', 'sel_ID', 'act_ID', 'time']
        # Read in chunks and concatenate; chunksize is given as an int rather than
        # the float literal 1e7.
        chunks = pd.read_csv(rating_file, sep=',', usecols=col_names, engine='python', chunksize=10000000)
        data_records = pd.concat(chunks)
        data_records = data_records.rename(columns={"use_ID": "user_id",
                                                    "sel_ID": "item_id",
                                                    "act_ID" : "behavior_type",
                                                    "time": "timestamp"})
    elif dataset_name == 'alimama':
        rating_file = './alimama/UserBehavior/UserBehavior.csv'

        col_names = ['user_id', 'item_id', 'behavior_type', 'timestamp']
        data_records = pd.read_csv(rating_file, sep=',', usecols=[0,1,3,4], names=col_names, engine='python')

    elif dataset_name == 'yelp':
        rating_file = './yelp_reviews_RV.csv'
        # All columns of the file are kept; only two are renamed.
        data_records = pd.read_csv(rating_file, engine='python')
        data_records = data_records.rename(columns={"business_id": "item_id",
                                                    "date": "timestamp"})

        data_records = _binarize_ratings(data_records)
    else:
        # Previously this branch only printed "Error!!!!" and then failed with an
        # UnboundLocalError on the return statement below.
        raise ValueError("Unknown dataset name: {!r}".format(dataset_name))

    return data_records

In [19]:
# filter out user node with degree less than u_thre
# filter out item node with degree less than i_thre
# sort filtered data by timestamp

def filter_dataset(u_thre, i_thre, data):
    """Iteratively prune low-degree users and items, then sort by timestamp.

    Removing infrequent items can push users below their threshold (and vice
    versa), so the user/item pruning pass is repeated until a full pass
    removes nothing.

    Args:
        u_thre: Minimum number of interactions a user must have to be kept.
        i_thre: Minimum number of interactions an item must have to be kept.
        data: Interaction records with ``user_id``, ``item_id`` and
            ``timestamp`` columns.  It is not modified.

    Returns:
        The filtered records as a new DataFrame, sorted by ``timestamp``.
    """
    # No defensive copy is needed: each pruning pass returns a new frame built
    # by boolean-mask selection and never writes to its input.
    filtered_data = data

    while True:
        filtered_data, u_removed = remove_infrequent_node(filtered_data, 'user_id', u_thre)
        filtered_data, i_removed = remove_infrequent_node(filtered_data, 'item_id', i_thre)
        if u_removed == 0 and i_removed == 0:
            break

    print('users with < {} and items with < {} interactions are removed'.format(u_thre, i_thre))
    print('num of users:{}, num of items:{}'.format(len(filtered_data['user_id'].unique()),
                                                   len(filtered_data['item_id'].unique())))

    return filtered_data.sort_values('timestamp')

In [None]:
# Example configuration: pre-process the yelp dataset.
# U_FILTER / I_FILTER are the minimum interaction counts for users / items;
# SAVEFILE is the output name stem built from the dataset name and both thresholds.

DATASET = "yelp"
U_FILTER, I_FILTER = 10, 10
SAVEFILE = '-'.join([DATASET, 'nodup', str(U_FILTER), str(I_FILTER)])

In [31]:
# Load the raw interaction records of the configured dataset and report how many there are.
print("=========================================================")
print("Dataset: ", DATASET)

data = load_dataset(dataset_name=DATASET)
print("Raw data length: ", data.shape[0])

In [33]:
# For the Taobao-style datasets only (those with a behavior_type column):
# uncomment the matching line below to drop the unwanted edges.

# data = data.loc[data['behavior_type'] != 'pv'] # alimama
# data = data.loc[data['behavior_type'] == 'buy'] # tb2015

No dup data length:  5116476


In [34]:
# Remove duplicates: keep one row per (user_id, item_id) pair -- the last value
# seen in the current row order for each column -- then order by timestamp.

data = (
    data
    .groupby(['user_id', 'item_id'], sort=False)
    .last()
    .reset_index()
    .sort_values('timestamp')
)
print("No dup data length: ", len(data))

# Filter out low-degree user and item nodes.

filtered_data = filter_dataset(U_FILTER, I_FILTER, data)
n_data = filtered_data.shape[0]
print("Filtered data length: ", n_data)

user with < 10 and and items with < 10 interactions are removed
Filtered data length:  1712236
First and last timestamps:  2004-10-19 02:46:40 2019-12-13 15:45:49
#users, #items :  68423 39327


In [41]:
# Print the time range and the node counts after pre-processing.
# filter_dataset returns rows sorted by timestamp, so the first and last rows
# bound the covered time range.

print("First and last timestamps: ", filtered_data.iloc[0]['timestamp'], filtered_data.iloc[-1]['timestamp'])
n_user = filtered_data['user_id'].nunique(dropna=False)
n_item = filtered_data['item_id'].nunique(dropna=False)
print("#users, #items : ", n_user, n_item)

1712236

In [None]:
# Persist the pre-processed records; save_obj appends '.pkl' to SAVEFILE.

save_obj(obj=filtered_data, name=SAVEFILE)

In [None]:
# Below are some functions that give insight into the data blocks

In [1]:
def _elapsed_days(start, end):
    """Return the whole number of days between two timestamps.

    Date strings are parsed with ``pd.to_datetime`` first; datetime-like values
    are subtracted directly.
    """
    if isinstance(start, str) or isinstance(end, str):
        start, end = pd.to_datetime(start), pd.to_datetime(end)
    delta = end - start
    if not hasattr(delta, 'days'):
        raise TypeError(
            'timestamps must be datetime-like or date strings, got {} and {}; '
            'convert numeric timestamps to datetimes before calling'
            .format(type(start).__name__, type(end).__name__))
    return delta.days


def get_inc_blocks_info(filtered_data, base_user_set, base_item_set, n_split=20, n_base_split=10):
    """Collect per-block statistics for the incremental blocks of a dataset.

    ``filtered_data`` is cut into ``n_split`` equally sized, consecutive blocks
    by row position.  Blocks ``n_base_split`` .. ``n_split - 1`` are analysed
    one by one; earlier blocks are never analysed.  The accumulated node sets
    are rebuilt from row 0 for every block, so ``filtered_data`` is expected to
    start with the base block and ``base_user_set`` / ``base_item_set`` to be
    the nodes of that base block.

    Args:
        filtered_data: Records ordered by time with ``user_id``, ``item_id``
            and ``timestamp`` columns.  ``timestamp`` must hold datetime-like
            values or date strings.  Numeric encodings have to be converted
            beforehand (the original notebook mentions epoch seconds, epoch
            milliseconds and ``%Y%m%d`` integers for other datasets).
        base_user_set: Set of user ids regarded as already known.
        base_item_set: Set of item ids regarded as already known.
        n_split: Total number of blocks the data is divided into.
        n_base_split: Index of the first block to analyse.

    Returns:
        Six lists with one entry per analysed block, in this order:
        number of edges touching a new user or a new item, number of edges
        between a new user and a new item, time span of the block in days,
        number of new users, number of new items, and the overlap ratio
        between the block's nodes and the base nodes
        (``|block & base| / |block | base|``, users and items summed).

    Raises:
        TypeError: If the timestamps are neither datetime-like nor strings.
    """
    n_records = len(filtered_data)

    n_edge_on_new_node_list = []
    n_edge_btw_new_node_list = []
    n_day_list = []
    n_new_user_list, n_new_item_list = [], []
    node_base_overlap_list = []

    # Nodes seen so far; these names are only rebound, the caller's sets are never mutated.
    acc_user = base_user_set
    acc_item = base_item_set

    for i in range(n_base_split, n_split):
        block_start = int(n_records * i / n_split)
        block_end = int(n_records * (i + 1) / n_split)
        sub_data = filtered_data.iloc[block_start:block_end]
        acc_data = filtered_data.iloc[:block_end]

        # A node is "new" if it appears up to the end of this block but was not
        # known after the previous one.
        cur_acc_user = set(acc_data['user_id'].unique())
        new_user = cur_acc_user - acc_user
        acc_user = cur_acc_user
        cur_user = set(sub_data['user_id'].unique())

        cur_acc_item = set(acc_data['item_id'].unique())
        new_item = cur_acc_item - acc_item
        acc_item = cur_acc_item
        cur_item = set(sub_data['item_id'].unique())

        on_new_user = sub_data['user_id'].isin(new_user)
        on_new_item = sub_data['item_id'].isin(new_item)
        n_edge_on_new_node_list.append(len(sub_data[on_new_user | on_new_item]))
        n_edge_btw_new_node_list.append(len(sub_data[on_new_user & on_new_item]))

        # An empty block (fewer rows than blocks) spans zero days.
        if len(sub_data) > 0:
            diff_day = _elapsed_days(sub_data.iloc[0]['timestamp'], sub_data.iloc[-1]['timestamp'])
        else:
            diff_day = 0
        n_day_list.append(diff_day)

        n_shared = len(cur_user & base_user_set) + len(cur_item & base_item_set)
        n_union = len(cur_user | base_user_set) + len(cur_item | base_item_set)
        # Guard against an empty block combined with empty base sets.
        node_base_overlap = n_shared / n_union if n_union else 0.0

        n_new_user_list.append(len(new_user))
        n_new_item_list.append(len(new_item))
        node_base_overlap_list.append(node_base_overlap)

    return n_edge_on_new_node_list, n_edge_btw_new_node_list, n_day_list, n_new_user_list, n_new_item_list, node_base_overlap_list

In [None]:
# The first half of the timestamp-sorted data is the base block; the second
# half is the incremental part, analysed as 10 equally sized blocks.
base_block = filtered_data[:int(n_data * 0.5)]
inc_data = filtered_data[int(n_data * 0.5):]

base_user_set = set(base_block['user_id'].unique())
base_item_set = set(base_block['item_id'].unique())

print("% of user in base block: ", len(base_user_set) / n_user)
print("% of item in base block: ", len(base_item_set) / n_item)

# Pass the FULL filtered_data, not inc_data: get_inc_blocks_info splits its
# input into 20 blocks, analyses blocks 10..19 and rebuilds the accumulated
# node sets from row 0.  Given only inc_data it would analyse just the last
# quarter of the data, in blocks half the size reported below.
n_edge_on_new_node_list, \
n_edge_btw_new_node_list, \
n_day_list, \
n_new_user_list, \
n_new_item_list, \
node_base_overlap_list = get_inc_blocks_info(filtered_data, base_user_set, base_item_set)

print("#edge for 1 inc block: ", int(len(inc_data)/10))
print("edge_on_new_node_list: ", n_edge_on_new_node_list)
print("edge_btw_new_node_list: ", n_edge_btw_new_node_list)
print("timespan(mean, std): ", np.mean(n_day_list), np.std(n_day_list))
print("new_user_node(mean, std): ", np.mean(n_new_user_list), np.std(n_new_user_list))
print("new_item_node(mean, std): ", np.mean(n_new_item_list), np.std(n_new_item_list))
print("node_base_overlap_list: ", node_base_overlap_list)

print("Done.")