In [1]:
import pickle as pkl
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import time
import datetime
#Recommended datasets: Alimama, Foursquare, Netflix, Taobao2014, Taobao2015

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [27]:
# some helper functions to load raw data

def read_user_rating_records(dir_path, rating_file):
    col_names = ['user_id', 'item_id', 'rating', 'timestamp']
    data_records = pd.read_csv(dir_path + rating_file, sep=',', names=col_names, engine='python')
    return data_records

def remove_infrequent_node(df, node_type, min_counts=5):
    n_node_type = len(df[node_type].unique())
    counts = df[node_type].value_counts()
    df = df[df[node_type].isin(counts[counts >= min_counts].index)]
    n_removed = n_node_type - len(df[node_type].unique())
    return df, n_removed

def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pkl.dump(obj, f)
        
def load_obj(path, name):
    with open(path + name, 'rb') as f:
        return pkl.load(f, encoding='latin1')

In [28]:
def load_dataset(dataset_name):
    # Amazon_CDs
    if dataset_name in ['Amazon_Books', 'Amazon_CDs']:
        dir_path = '../data/Amazon_Books/' if dataset_name == 'Amazon_Books' else '../data/Amazon_CDs/'
        rating_file = 'ratings_Books.csv' if dataset_name == 'Amazon_Books' else 'ratings_CDs_and_Vinyl.csv'

        data_records = read_user_rating_records(dir_path, rating_file)
        data_records.loc[data_records.rating <= 3, 'rating'] = 0
        data_records.loc[data_records.rating > 3, 'rating'] = 1
        data_records = data_records[data_records.rating > 0]
        
    # Amazon_Electrionics
    elif dataset_name == 'Amazon_Electronics':
        data_records = read_user_rating_records('../data/Amazon_Electronics/', 'ratings_Electronics.csv')
        data_records.loc[data_records.rating <= 3, 'rating'] = 0
        data_records.loc[data_records.rating > 3, 'rating'] = 1
        data_records = data_records[data_records.rating > 0]
        
    elif dataset_name == 'Gowalla':
        rating_file = '../data/Gowalla/Gowalla_totalCheckins.txt'
        
        dtypes = {'0': np.int64, '4': np.int64, '1': np.float64}
        data_records = pd.read_csv(rating_file, sep=r'\t', engine='python', encoding='latin-1',
                                   names=['user_id','timestamp','item_id'], usecols=[0,1,4], parse_dates=[1], dtype=dtypes)
    
    elif dataset_name == 'ml-10m':
        rating_file = '../data/ml-10m/ratings.dat'

        dtypes = {'user_id': np.int64, 'item_id': np.int64, 'ratings': np.float32, 'timestamp': np.float64}
        data_records = pd.read_csv(rating_file, sep=r'\:\:', engine='python', encoding='latin-1',
                           header=None, names=['user_id', 'item_id', 'ratings', 'timestamp'], dtype=dtypes)
        
        data_records.loc[data_records.ratings <= 3, 'ratings'] = 0
        data_records.loc[data_records.ratings > 3, 'ratings'] = 1
        data_records = data_records[data_records.ratings > 0]

    elif dataset_name == 'ml-20m':
        data_records = read_user_rating_records('../data/ml-20m/', 'ratings.csv')
        data_records.loc[data_records.ratings <= 3, 'ratings'] = 0
        data_records.loc[data_records.ratings > 3, 'ratings'] = 1
        data_records = data_records[data_records.ratings > 0]

    elif dataset_name == 'ml-25m':
        data_records = read_user_rating_records('../data/ml-25m/', 'ratings.csv')
        data_records.loc[data_records.ratings <= 3, 'ratings'] = 0
        data_records.loc[data_records.ratings > 3, 'ratings'] = 1
        data_records = data_records[data_records.ratings > 0]

    elif dataset_name == 'lastfm-2k':
        rating_file = './lastfm-2k/user_taggedartists-timestamps.dat'

        dtypes = {'user_id': np.int64, 'item_id': np.int64}
        data_records = pd.read_csv(rating_file, sep=r'\t', engine='python', encoding='latin-1', 
                           header=0, names=['user_id', 'item_id', 'tag_id', 'timestamp'], dtype=dtypes)
    
    elif dataset_name == 'Taobao2014':
        rating_file = '../data/Taobao2014/tianchi_mobile_recommend_train_user.csv'
        
        col_names = ['user_id', 'item_id', 'behavior_type', 'time']
        data_records = pd.read_csv(rating_file, sep=',', usecols=col_names, engine='python')
        data_records = data_records.rename(columns={"time": "timestamp"})
        
    elif dataset_name == 'Taobao2015':
        rating_file = '../data/Taobao2015/ijcai2016_taobao.csv'
        
        col_names = ['use_ID', 'sel_ID', 'act_ID', 'time']
        data_records = pd.read_csv(rating_file, sep=',', usecols=col_names, engine='python', chunksize=1e7)
        data_records = pd.concat(data_records)
        data_records = data_records.rename(columns={"use_ID": "user_id",
                                                    "sel_ID": "item_id",
                                                    "act_ID" : "behavior_type",
                                                    "time": "timestamp"})
    elif dataset_name == 'Alimama':
        rating_file = '../data//Alimama/UserBehavior.csv'
        
        col_names = ['user_id', 'item_id', 'behavior_type', 'timestamp']
        data_records = pd.read_csv(rating_file, sep=',', usecols=[0,1,3,4], names=col_names, engine='python')
        data_records = data_records[(data_records.timestamp > time.mktime(time.strptime(str('2017-11-25 00:00:00'), "%Y-%m-%d %H:%M:%S"))) & (data_records.timestamp < time.mktime(time.strptime(str('2017-12-3 23:59:59'), "%Y-%m-%d %H:%M:%S")))]

    elif dataset_name == 'Netflix':
        rating_file = '../data/Netflix/data.csv'
        data_records = pd.read_csv(rating_file, engine='python')
        data_records = data_records.rename(columns={"1":"item_id", "1488844":"user_id", "3":"ratings", "2005-09-06":"timestamp"})
    
    elif dataset_name == 'Foursquare':
        rating_file = '../data/Foursquare/processed_data.csv' 
        data_records = pd.read_csv(rating_file, engine='python')
        data_records = data_records.rename(columns={"date":"timestamp"})
           
    elif dataset_name == 'yelp':
        rating_file = '../data/yelp/yelp_review.csv'
        col_names = ['user_id', 'business_id', 'stars', 'date']
        data_records = pd.read_csv(rating_file, engine='python')
        data_records = data_records.rename(columns={"business_id": "item_id",
                                                    "date": "timestamp"})
        
        data_records.loc[data_records.stars <= 3, 'stars'] = 0
        data_records.loc[data_records.stars > 3, 'stars'] = 1
        data_records = data_records[data_records.stars > 0]
    else:
        print("Error!!!!")

    return data_records

In [29]:
# filter out user node with degree less than u_thre
# filter out item node with degree less than i_thre
# sort filtered data by timestamp

def filter_dataset(u_thre, i_thre, data):

    filtered_data = data.copy()
    filtered_data, u_removed = remove_infrequent_node(filtered_data, 'user_id', u_thre)
    filtered_data, i_removed = remove_infrequent_node(filtered_data, 'item_id', i_thre)

    while(u_removed != 0 or i_removed != 0):
        filtered_data, u_removed = remove_infrequent_node(filtered_data, 'user_id', u_thre)
        filtered_data, i_removed = remove_infrequent_node(filtered_data, 'item_id', i_thre)

    print('user with < {} and and items with < {} interactions are removed'.format(u_thre, i_thre))
    print('num of users:{}, num of items:{}'.format(len(filtered_data['user_id'].unique()), len(filtered_data['item_id'].unique())))

    filtered_data = filtered_data.sort_values('timestamp')
    return filtered_data

In [33]:
# example to pre-process yelp dataset

#DATASET = "yelp"
#U_FILTER, I_FILTER = 10,10
#SAVEFILE = DATASET+'-nodup-'+str(U_FILTER)+'-'+str(I_FILTER)

#DATASET = "Amazon_Books"
#U_FILTER, I_FILTER = 20,20
#SAVEFILE = DATASET+'-nodup-'+str(U_FILTER)+'-'+str(I_FILTER)

#DATASET = "Taobao2014"
#U_FILTER, I_FILTER = 10,10
#SAVEFILE = '../data/Taobao2014/' + DATASET+'-nodup-'+str(U_FILTER)+'-'+str(I_FILTER)

#DATASET = "Taobao2015"
#U_FILTER, I_FILTER = 10,10
#SAVEFILE = '../data/Taobao2015/' + DATASET+'-nodup-'+str(U_FILTER)+'-'+str(I_FILTER)

#DATASET = "Alimama"
#U_FILTER, I_FILTER = 50,50
#SAVEFILE = '../data/Alimama/' + DATASET+'-nodup-'+str(U_FILTER)+'-'+str(I_FILTER)

#DATASET = "Gowalla"
#U_FILTER, I_FILTER = 10,10
#SAVEFILE = DATASET+'-nodup-'+str(U_FILTER)+'-'+str(I_FILTER)

#DATASET = "ml-10m"
#U_FILTER, I_FILTER = 10,10
#SAVEFILE = DATASET+'-nodup-'+str(U_FILTER)+'-'+str(I_FILTER)

DATASET = "Netflix"
U_FILTER, I_FILTER = 30,30
SAVEFILE = '../data/Netflix/' + DATASET+'-nodup-'+str(U_FILTER)+'-'+str(I_FILTER)

#DATASET = "Foursquare"
#U_FILTER, I_FILTER = 20,20
#SAVEFILE = '../data/Foursquare/' + DATASET+'-nodup-'+str(U_FILTER)+'-'+str(I_FILTER)

In [31]:
print("=========================================================")
print("Dataset: ", DATASET)

data = load_dataset(DATASET)
print("Raw data length: ", len(data))

Dataset:  Netflix
Raw data length:  51031354


In [32]:
# for taobao datasets only
# to remove unwanted edges

# data = data.loc[data['behavior_type'] != 'pv'] # alimama
# data = data.loc[data['behavior_type'] == 'buy'] # tb2015

#print(data['timestamp'].max())
#print(data['timestamp'].min())

print(len(data))
print(data.columns.values.tolist())
data.head()

51031354
['item_id', 'user_id', 'ratings', 'timestamp']


Unnamed: 0,item_id,user_id,ratings,timestamp
0,1,822109,5,2005-05-13
1,1,885013,4,2005-10-19
2,1,30878,4,2005-12-26
3,1,823519,3,2004-05-03
4,1,893988,3,2005-11-17


In [34]:
# remove duplicates

data = data.groupby(['user_id', 'item_id'], sort=False).last().reset_index().sort_values('timestamp')
print("No dup data length: ", len(data))

# filter low degree nodes

filtered_data = filter_dataset(U_FILTER, I_FILTER, data)
n_data = len(filtered_data)
print("Filtered data length: ", n_data)

No dup data length:  51031354
user with < 30 and and items with < 30 interactions are removed
num of users:300836, num of items:9204
Filtered data length:  48500583


In [35]:
# print time range and dataset stats after pre-processing

print("First and last timestamps: ", filtered_data.iloc[0]['timestamp'], filtered_data.iloc[-1]['timestamp'])
#print("First and last localtime: ", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(filtered_data.iloc[0]['timestamp'])), time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(filtered_data.iloc[-1]['timestamp'])))
#n_user, n_item = len(filtered_data['user_id'].unique()), len(filtered_data['item_id'].unique())
#print('users, items : ', n_user, n_item)
#print('start local time', time.localtime(filtered_data.iloc[0]['timestamp']))
#print('start year', time.localtime(filtered_data.iloc[0]['timestamp']).tm_year)
#print('start month', time.localtime(filtered_data.iloc[0]['timestamp']).tm_mon)

#print('start year', time.strptime(filtered_data.iloc[0]['timestamp'], '%Y-%m-%d %H').tm_year)
#print('start month', time.strptime(filtered_data.iloc[0]['timestamp'], '%Y-%m-%d %H').tm_mon)
print('the shortest item list length corresponding to each user', filtered_data['user_id'].value_counts().min())
print('the longest item list length corresponding to each user', filtered_data['user_id'].value_counts().max())
print('the shortest user list length corresponding to each item', filtered_data['item_id'].value_counts().min())
print('the longest user list length corresponding to each item', filtered_data['item_id'].value_counts().max())

First and last timestamps:  1999-11-11 2005-12-31
the shortest item list length corresponding to each user 30
the longest item list length corresponding to each user 9144
the shortest user list length corresponding to each item 30
the longest user list length corresponding to each item 181210


In [23]:
# save pre-processed data

save_obj(filtered_data, SAVEFILE)

In [73]:
#Below is some functions to get some insights of data segments with chronological order # Taobao2014
def get_data_segements_info_Taobao2014(filtered_data):
    n_records = len(filtered_data)
    n_edge_on_new_node_list = []
    n_edge_btw_new_node_list = []
    n_new_user_list, n_new_item_list = [], []
    node_first_segment_overlap_list = []
    node_acc_segments_overlap_list = []
    node_pre_segment_overlap_list = []
    
    first_day = time.strptime(filtered_data.iloc[0]['timestamp'], '%Y-%m-%d %H').tm_mday
    first_month = time.strptime(filtered_data.iloc[0]['timestamp'], '%Y-%m-%d %H').tm_mon

    last_day = time.strptime(filtered_data.iloc[-1]['timestamp'], '%Y-%m-%d %H').tm_mday + 30
    last_month = time.strptime(filtered_data.iloc[-1]['timestamp'], '%Y-%m-%d %H').tm_mon

    filtered_data['month'] = filtered_data.timestamp.apply(lambda x: int(x[5:7]))
    filtered_data['day'] = filtered_data.timestamp.apply(lambda x: int(x[8:10]))

    first_segment_data = filtered_data[(filtered_data.month == first_month) & (filtered_data.day == first_day)] 
    first_segment_user_set = set(first_segment_data['user_id'].unique())
    first_segment_item_set = set(first_segment_data['item_id'].unique())
    acc_user = first_segment_user_set
    acc_item = first_segment_item_set
    pre_user = first_segment_user_set
    pre_item = first_segment_item_set

    for i in range(first_day + 1, last_day + 1):
        if i <= 30:
            print('month', first_month)
            print('day', i)
            sub_data = filtered_data[(filtered_data.day == i) & (filtered_data.month == first_month)]
            acc_data = filtered_data[(filtered_data.month == first_month) & (filtered_data.day <= i)]
        else:
            month = (i - 1) // 30 + first_month
            print('month', month)
            day = (i - 1) % 30 +1
            print('day', day)
          #  sub_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[5:7])) == month) & (filtered_data.timestamp.apply(lambda x: int(x[:4])) == year)]
          #  acc_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[:4])) <= year) | \
          #                           ((filtered_data.timestamp.apply(lambda x: int(x[:4])) == year) & (filtered_data.timestamp.apply(lambda x: int(x[5:7])) <= month))]  
            sub_data = filtered_data[(filtered_data.month == month) & (filtered_data.day == day)]
            acc_data = filtered_data[(filtered_data.month < month) | \
                                     ((filtered_data.month == month) & (filtered_data.day <= day))]          

        cur_acc_user = set(acc_data['user_id'].unique())
        new_user = cur_acc_user - acc_user
        acc_user = cur_acc_user
        cur_user = set(sub_data['user_id'].unique())

        cur_acc_item = set(acc_data['item_id'].unique())
        new_item = cur_acc_item - acc_item
        acc_item = cur_acc_item
        cur_item = set(sub_data['item_id'].unique())   

        n_edge_on_new_node = len(sub_data[sub_data['user_id'].isin(new_user) | sub_data['item_id'].isin(new_item)])
        n_edge_btw_new_node = len(sub_data[sub_data['user_id'].isin(new_user) & sub_data['item_id'].isin(new_item)])
        n_edge_on_new_node_list.append(n_edge_on_new_node)
        n_edge_btw_new_node_list.append(n_edge_btw_new_node)    

        node_first_segment_overlap = (len((cur_user & first_segment_user_set)) + len((cur_item & first_segment_item_set))) \
                            / (len((cur_user | first_segment_user_set)) + len((cur_item | first_segment_item_set)))   
        n_new_user, n_new_item = len(new_user), len(new_item)
        n_new_user_list.append(n_new_user)
        n_new_item_list.append(n_new_item)
        node_first_segment_overlap_list.append(node_first_segment_overlap)  
        node_acc_segments_overlap = (len((cur_user & acc_user)) + len((cur_item & acc_item))) \
                            / (len((cur_user | acc_user)) + len((cur_item | acc_item)))  
        node_acc_segments_overlap_list.append(node_acc_segments_overlap)
        node_pre_segment_overlap = (len((cur_user & pre_user)) + len((cur_item & pre_item))) \
                            / (len((cur_user | pre_user)) + len((cur_item | pre_item)))
        node_pre_segment_overlap_list.append(node_pre_segment_overlap)

        pre_user = cur_user
        pre_item = cur_item
        
    return n_edge_on_new_node_list, n_edge_btw_new_node_list, n_new_user_list, n_new_item_list, node_first_segment_overlap_list, node_acc_segments_overlap_list, node_pre_segment_overlap_list


In [74]:
n_edge_on_new_node_list, \
n_edge_btw_new_node_list, \
n_new_user_list, \
n_new_item_list, \
node_first_segment_overlap_list, \
node_acc_segments_overlap_list, \
node_pre_segment_overlap_list = get_data_segements_info_Taobao2014(filtered_data)

print("edge_on_new_node_list: ", n_edge_on_new_node_list)
print("edge_btw_new_node_list: ", n_edge_btw_new_node_list)
print("new_user_node_list", n_new_user_list)
print("new_item_node_list", n_new_item_list)
print("new_user_node(mean, std): ", np.mean(n_new_user_list), np.std(n_new_user_list))
print("new_item_node(mean, std): ", np.mean(n_new_item_list), np.std(n_new_item_list))
print("node_first_segment_overlap_list: ", node_first_segment_overlap_list)
print("mean_node_first_segment_overlap", np.mean(node_first_segment_overlap_list))
print("node_acc_segments_overlap_list: ", node_acc_segments_overlap_list)
print("mean_node_acc_segments_overlap", np.mean(node_acc_segments_overlap_list))
print("node_pre_segment_overlap_list: ", node_pre_segment_overlap_list)
print("mean_node_pre_segment_overlap: ", np.mean(node_pre_segment_overlap_list))

print("Done.")

month 11
day 19
month 11
day 20
month 11
day 21
month 11
day 22
month 11
day 23
month 11
day 24
month 11
day 25
month 11
day 26
month 11
day 27
month 11
day 28
month 11
day 29
month 11
day 30
month 12
day 1
month 12
day 2
month 12
day 3
month 12
day 4
month 12
day 5
month 12
day 6
month 12
day 7
month 12
day 8
month 12
day 9
month 12
day 10
month 12
day 11
month 12
day 12
month 12
day 13
month 12
day 14
month 12
day 15
month 12
day 16
month 12
day 17
month 12
day 18
edge_on_new_node_list:  [13572, 8719, 6003, 5261, 3635, 3998, 3165, 2293, 2227, 1550, 1605, 1590, 2676, 1790, 1758, 1498, 1093, 1090, 1065, 867, 803, 1035, 829, 1760, 1461, 764, 799, 744, 723, 419]
edge_btw_new_node_list:  [2801, 855, 341, 277, 115, 75, 53, 24, 26, 9, 8, 10, 14, 9, 15, 1, 8, 4, 1, 1, 1, 1, 6, 7, 7, 0, 2, 0, 0, 0]
new_user_node_list [1488, 794, 468, 344, 279, 193, 156, 112, 108, 73, 69, 67, 65, 50, 51, 32, 38, 32, 24, 21, 19, 22, 20, 17, 15, 5, 8, 3, 2, 2]
new_item_node_list [6425, 3964, 2713, 2159, 1510, 15

In [99]:
#Below is some functions to get some insights of data segments with chronological order # Taobao2015
def get_data_segements_info_Taobao2015(filtered_data):
    n_records = len(filtered_data)
    n_edge_on_new_node_list = []
    n_edge_btw_new_node_list = []
    n_new_user_list, n_new_item_list = [], []
    node_first_segment_overlap_list = []
    node_acc_segments_overlap_list = []
    node_pre_segment_overlap_list = []
    
    first_month = int(str(filtered_data.iloc[0]['timestamp'])[4:6])
    first_day = int(str(filtered_data.iloc[0]['timestamp'])[6:8])

    last_day = int(str(filtered_data.iloc[-1]['timestamp'])[6:8]) + 123
    last_month = int(str(filtered_data.iloc[-1]['timestamp'])[4:6])

    filtered_data['month'] = filtered_data.timestamp.apply(lambda x: int(str(x)[4:6]))
    filtered_data['day'] = filtered_data.timestamp.apply(lambda x: int(str(x)[6:8]))

    first_segment_data = filtered_data[(filtered_data.month == first_month) & (filtered_data.day == first_day)] 
    first_segment_user_set = set(first_segment_data['user_id'].unique())
    first_segment_item_set = set(first_segment_data['item_id'].unique())
    acc_user = first_segment_user_set
    acc_item = first_segment_item_set
    pre_user = first_segment_user_set
    pre_item = first_segment_item_set

    for i in range(first_day + 1, last_day + 1):
        if i <= 31:
            print('month', first_month)
            print('day', i)
            sub_data = filtered_data[(filtered_data.day == i) & (filtered_data.month == first_month)]
            acc_data = filtered_data[(filtered_data.month == first_month) & (filtered_data.day <= i)]
        else:
            if i > 31 and i <= 62:
                month = 8
                day = i - 31
            elif i > 62 and i <= 92:
                month = 9
                day = i - 62
            elif i > 92 and i <= 123:
                month = 10
                day = i - 92
            else:
                month = 11
                day = i - 123

            print('month', month)
            print('day', day)
          #  sub_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[5:7])) == month) & (filtered_data.timestamp.apply(lambda x: int(x[:4])) == year)]
          #  acc_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[:4])) <= year) | \
          #                           ((filtered_data.timestamp.apply(lambda x: int(x[:4])) == year) & (filtered_data.timestamp.apply(lambda x: int(x[5:7])) <= month))]  
            sub_data = filtered_data[(filtered_data.month == month) & (filtered_data.day == day)]
            acc_data = filtered_data[(filtered_data.month < month) | \
                                     ((filtered_data.month == month) & (filtered_data.day <= day))]          

        cur_acc_user = set(acc_data['user_id'].unique())
        new_user = cur_acc_user - acc_user
        acc_user = cur_acc_user
        cur_user = set(sub_data['user_id'].unique())

        cur_acc_item = set(acc_data['item_id'].unique())
        new_item = cur_acc_item - acc_item
        acc_item = cur_acc_item
        cur_item = set(sub_data['item_id'].unique())   

        n_edge_on_new_node = len(sub_data[sub_data['user_id'].isin(new_user) | sub_data['item_id'].isin(new_item)])
        n_edge_btw_new_node = len(sub_data[sub_data['user_id'].isin(new_user) & sub_data['item_id'].isin(new_item)])
        n_edge_on_new_node_list.append(n_edge_on_new_node)
        n_edge_btw_new_node_list.append(n_edge_btw_new_node)    
        
        if len(cur_user) != 0 and len(cur_item) != 0:
            node_first_segment_overlap = (len((cur_user & first_segment_user_set)) + len((cur_item & first_segment_item_set))) \
                                / (len((cur_user | first_segment_user_set)) + len((cur_item | first_segment_item_set)))   
            n_new_user, n_new_item = len(new_user), len(new_item)
            n_new_user_list.append(n_new_user)
            n_new_item_list.append(n_new_item)
            node_first_segment_overlap_list.append(node_first_segment_overlap)  
            node_acc_segments_overlap = (len((cur_user & acc_user)) + len((cur_item & acc_item))) \
                                / (len((cur_user | acc_user)) + len((cur_item | acc_item)))  
            node_acc_segments_overlap_list.append(node_acc_segments_overlap)
            node_pre_segment_overlap = (len((cur_user & pre_user)) + len((cur_item & pre_item))) \
                                / (len((cur_user | pre_user)) + len((cur_item | pre_item))) 
            node_pre_segment_overlap_list.append(node_pre_segment_overlap)

            pre_user = cur_user
            pre_item = cur_item
        
    return n_edge_on_new_node_list, n_edge_btw_new_node_list, n_new_user_list, n_new_item_list, node_first_segment_overlap_list, node_acc_segments_overlap_list, node_pre_segment_overlap_list

In [100]:
n_edge_on_new_node_list, \
n_edge_btw_new_node_list, \
n_new_user_list, \
n_new_item_list, \
node_first_segment_overlap_list, \
node_acc_segments_overlap_list, \
node_pre_segment_overlap_list = get_data_segements_info_Taobao2015(filtered_data)

print("edge_on_new_node_list: ", n_edge_on_new_node_list)
print("edge_btw_new_node_list: ", n_edge_btw_new_node_list)
print("new_user_node_list", n_new_user_list)
print("new_item_node_list", n_new_item_list)
print("new_user_node(mean, std): ", np.mean(n_new_user_list), np.std(n_new_user_list))
print("new_item_node(mean, std): ", np.mean(n_new_item_list), np.std(n_new_item_list))
print("node_first_segment_overlap_list: ", node_first_segment_overlap_list)
print("mean_node_first_segment_overlap", np.mean(node_first_segment_overlap_list))
print("node_acc_segments_overlap_list: ", node_acc_segments_overlap_list)
print("mean_node_acc_segments_overlap", np.mean(node_acc_segments_overlap_list))
print("node_pre_segment_overlap_list: ", node_pre_segment_overlap_list)
print("mean_node_pre_segment_overlap: ", np.mean(node_pre_segment_overlap_list))

print("Done.")

month 7
day 2
month 7
day 3
month 7
day 4
month 7
day 5
month 7
day 6
month 7
day 7
month 7
day 8
month 7
day 9
month 7
day 10
month 7
day 11
month 7
day 12
month 7
day 13
month 7
day 14
month 7
day 15
month 7
day 16
month 7
day 17
month 7
day 18
month 7
day 19
month 7
day 20
month 7
day 21
month 7
day 22
month 7
day 23
month 7
day 24
month 7
day 25
month 7
day 26
month 7
day 27
month 7
day 28
month 7
day 29
month 7
day 30
month 7
day 31
month 8
day 1
month 8
day 2
month 8
day 3
month 8
day 4
month 8
day 5
month 8
day 6
month 8
day 7
month 8
day 8
month 8
day 9
month 8
day 10
month 8
day 11
month 8
day 12
month 8
day 13
month 8
day 14
month 8
day 15
month 8
day 16
month 8
day 17
month 8
day 18
month 8
day 19
month 8
day 20
month 8
day 21
month 8
day 22
month 8
day 23
month 8
day 24
month 8
day 25
month 8
day 26
month 8
day 27
month 8
day 28
month 8
day 29
month 8
day 30
month 8
day 31
month 9
day 1
month 9
day 2
month 9
day 3
month 9
day 4
month 9
day 5
month 9
day 6
month 9
day 7
mont

In [24]:
#Below is some functions to get some insights of data segments with chronological order # Foursquare
def get_data_segements_info_Foursquare(filtered_data):
    n_records = len(filtered_data)
    n_edge_on_new_node_list = []
    n_edge_btw_new_node_list = []
    n_new_user_list, n_new_item_list = [], []
    node_first_segment_overlap_list = []
    node_acc_segments_overlap_list = []
    node_pre_segment_overlap_list = []
    
    first_month = time.strptime(filtered_data.iloc[0]['timestamp'], '%Y-%m-%d').tm_mon
    first_year = time.strptime(filtered_data.iloc[0]['timestamp'], '%Y-%m-%d').tm_year
    last_month = time.strptime(filtered_data.iloc[-1]['timestamp'], '%Y-%m-%d').tm_mon + 24
    last_year = time.strptime(filtered_data.iloc[-1]['timestamp'], '%Y-%m-%d').tm_year
    filtered_data['month'] = filtered_data.timestamp.apply(lambda x: int(x[5:7]))
    filtered_data['year'] = filtered_data.timestamp.apply(lambda x: int(x[:4]))

    #first_segment_data = filtered_data[(pd.to_datetime(filtered_data.timestamp, '%Y-%m-%d').tm_mon == first_month) & (pd.to_datetime(filtered_data.timestamp, '%Y-%m-%d').tm_year == first_year)]
    #first_segment_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[5:7])) == first_month) & (filtered_data.timestamp.apply(lambda x: int(x[:4])) == first_year)]    
    first_segment_data = filtered_data[(filtered_data.month == first_month) & (filtered_data.year == first_year)] 
    first_segment_user_set = set(first_segment_data['user_id'].unique())
    first_segment_item_set = set(first_segment_data['item_id'].unique())
    acc_user = first_segment_user_set
    acc_item = first_segment_item_set
    pre_user = first_segment_user_set
    pre_item = first_segment_item_set

    for i in range(first_month + 1, last_month + 1):
        if i <= 12:
            print('year', first_year)
            print('month', i)
           # sub_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[5:7])) == i) & (filtered_data.timestamp.apply(lambda x: int(x[:4])) == first_year)]
           # acc_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[:4])) == first_year) & (filtered_data.timestamp.apply(lambda x: int(x[5:7])) <= i)]
            sub_data = filtered_data[(filtered_data.month == i) & (filtered_data.year == first_year)]
            acc_data = filtered_data[(filtered_data.year == first_year) & (filtered_data.month <= i)]
        else:
            year = (i - 1) // 12 + first_year
            print('year', year)
            month = (i - 1) % 12 +1
            print('month', month)
          #  sub_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[5:7])) == month) & (filtered_data.timestamp.apply(lambda x: int(x[:4])) == year)]
          #  acc_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[:4])) <= year) | \
          #                           ((filtered_data.timestamp.apply(lambda x: int(x[:4])) == year) & (filtered_data.timestamp.apply(lambda x: int(x[5:7])) <= month))]  
            sub_data = filtered_data[(filtered_data.month == month) & (filtered_data.year == year)]
            acc_data = filtered_data[(filtered_data.year < year) | \
                                     ((filtered_data.year == year) & (filtered_data.month <= month))]          

        cur_acc_user = set(acc_data['user_id'].unique())
        new_user = cur_acc_user - acc_user
        acc_user = cur_acc_user
        cur_user = set(sub_data['user_id'].unique())

        cur_acc_item = set(acc_data['item_id'].unique())
        new_item = cur_acc_item - acc_item
        acc_item = cur_acc_item
        cur_item = set(sub_data['item_id'].unique())   

        n_edge_on_new_node = len(sub_data[sub_data['user_id'].isin(new_user) | sub_data['item_id'].isin(new_item)])
        n_edge_btw_new_node = len(sub_data[sub_data['user_id'].isin(new_user) & sub_data['item_id'].isin(new_item)])
        n_edge_on_new_node_list.append(n_edge_on_new_node)
        n_edge_btw_new_node_list.append(n_edge_btw_new_node)    

        node_first_segment_overlap = (len((cur_user & first_segment_user_set)) + len((cur_item & first_segment_item_set))) \
                            / (len((cur_user | first_segment_user_set)) + len((cur_item | first_segment_item_set)))   
        n_new_user, n_new_item = len(new_user), len(new_item)
        n_new_user_list.append(n_new_user)
        n_new_item_list.append(n_new_item)
        node_first_segment_overlap_list.append(node_first_segment_overlap)  
        node_acc_segments_overlap = (len((cur_user & acc_user)) + len((cur_item & acc_item))) \
                            / (len((cur_user | acc_user)) + len((cur_item | acc_item)))  
        node_acc_segments_overlap_list.append(node_acc_segments_overlap)
        node_pre_segment_overlap = (len((cur_user & pre_user)) + len((cur_item & pre_item))) \
                            / (len((cur_user | pre_user)) + len((cur_item | pre_item)))
        node_pre_segment_overlap_list.append(node_pre_segment_overlap)

        pre_user = cur_user
        pre_item = cur_item
        
    return n_edge_on_new_node_list, n_edge_btw_new_node_list, n_new_user_list, n_new_item_list, node_first_segment_overlap_list, node_acc_segments_overlap_list, node_pre_segment_overlap_list


In [26]:
n_edge_on_new_node_list, \
n_edge_btw_new_node_list, \
n_new_user_list, \
n_new_item_list, \
node_first_segment_overlap_list, \
node_acc_segments_overlap_list, \
node_pre_segment_overlap_list = get_data_segements_info_Foursquare(filtered_data)

print("edge_on_new_node_list: ", n_edge_on_new_node_list)
print("edge_btw_new_node_list: ", n_edge_btw_new_node_list)
print("new_user_node_list", n_new_user_list)
print("new_item_node_list", n_new_item_list)
print("new_user_node(mean, std): ", np.mean(n_new_user_list), np.std(n_new_user_list))
print("new_item_node(mean, std): ", np.mean(n_new_item_list), np.std(n_new_item_list))
print("node_first_segment_overlap_list: ", node_first_segment_overlap_list)
print("mean_node_first_segment_overlap", np.mean(node_first_segment_overlap_list))
print("node_acc_segments_overlap_list: ", node_acc_segments_overlap_list)
print("mean_node_acc_segments_overlap", np.mean(node_acc_segments_overlap_list))
print("node_pre_segment_overlap_list: ", node_pre_segment_overlap_list)
print("mean_node_pre_segment_overlap: ", np.mean(node_pre_segment_overlap_list))

print("Done.")

year 2012
month 5
year 2012
month 6
year 2012
month 7
year 2012
month 8
year 2012
month 9
year 2012
month 10
year 2012
month 11
year 2012
month 12
year 2013
month 1
year 2013
month 2
year 2013
month 3
year 2013
month 4
year 2013
month 5
year 2013
month 6
year 2013
month 7
year 2013
month 8
year 2013
month 9
year 2013
month 10
year 2013
month 11
year 2013
month 12
year 2014
month 1
edge_on_new_node_list:  [36817, 11860, 13425, 4426, 1883, 9515, 6656, 6544, 4121, 2526, 4236, 2164, 1549, 148, 324, 353, 328, 121, 7, 26, 29]
edge_btw_new_node_list:  [1698, 180, 156, 17, 18, 139, 62, 40, 22, 10, 22, 9, 7, 0, 0, 0, 0, 0, 0, 0, 0]
new_user_node_list [6273, 2768, 2692, 1317, 898, 2395, 1517, 1231, 824, 523, 452, 195, 79, 21, 27, 10, 0, 0, 0, 0, 0]
new_item_node_list [3449, 852, 494, 188, 100, 405, 279, 234, 146, 98, 153, 73, 56, 26, 26, 17, 14, 8, 1, 3, 1]
new_user_node(mean, std):  1010.5714285714286 1481.77472651414
new_item_node(mean, std):  315.3809523809524 729.756807253229
node_first_segm

In [16]:
#Below is some functions to get some insights of data segments with chronological order # Netflix
def get_data_segements_info_Netflix(filtered_data):
    n_records = len(filtered_data)
    n_edge_on_new_node_list = []
    n_edge_btw_new_node_list = []
    n_new_user_list, n_new_item_list = [], []
    node_first_segment_overlap_list = []
    node_acc_segments_overlap_list = []
    node_pre_segment_overlap_list = []
    
    first_month = time.strptime(filtered_data.iloc[0]['timestamp'], '%Y-%m-%d').tm_mon
    first_year = time.strptime(filtered_data.iloc[0]['timestamp'], '%Y-%m-%d').tm_year
    last_month = time.strptime(filtered_data.iloc[-1]['timestamp'], '%Y-%m-%d').tm_mon + 72
    last_year = time.strptime(filtered_data.iloc[-1]['timestamp'], '%Y-%m-%d').tm_year
    filtered_data['month'] = filtered_data.timestamp.apply(lambda x: int(x[5:7]))
    filtered_data['year'] = filtered_data.timestamp.apply(lambda x: int(x[:4]))

    #first_segment_data = filtered_data[(pd.to_datetime(filtered_data.timestamp, '%Y-%m-%d').tm_mon == first_month) & (pd.to_datetime(filtered_data.timestamp, '%Y-%m-%d').tm_year == first_year)]
    #first_segment_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[5:7])) == first_month) & (filtered_data.timestamp.apply(lambda x: int(x[:4])) == first_year)]    
    first_segment_data = filtered_data[(filtered_data.month == first_month) & (filtered_data.year == first_year)] 
    first_segment_user_set = set(first_segment_data['user_id'].unique())
    first_segment_item_set = set(first_segment_data['item_id'].unique())
    acc_user = first_segment_user_set
    acc_item = first_segment_item_set
    pre_user = first_segment_user_set
    pre_item = first_segment_item_set

    for i in range(first_month + 1, last_month + 1):
        if i <= 12:
            print('year', first_year)
            print('month', i)
           # sub_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[5:7])) == i) & (filtered_data.timestamp.apply(lambda x: int(x[:4])) == first_year)]
           # acc_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[:4])) == first_year) & (filtered_data.timestamp.apply(lambda x: int(x[5:7])) <= i)]
            sub_data = filtered_data[(filtered_data.month == i) & (filtered_data.year == first_year)]
            acc_data = filtered_data[(filtered_data.year == first_year) & (filtered_data.month <= i)]
        else:
            year = (i - 1) // 12 + first_year
            print('year', year)
            month = (i - 1) % 12 +1
            print('month', month)
          #  sub_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[5:7])) == month) & (filtered_data.timestamp.apply(lambda x: int(x[:4])) == year)]
          #  acc_data = filtered_data[(filtered_data.timestamp.apply(lambda x: int(x[:4])) <= year) | \
          #                           ((filtered_data.timestamp.apply(lambda x: int(x[:4])) == year) & (filtered_data.timestamp.apply(lambda x: int(x[5:7])) <= month))]  
            sub_data = filtered_data[(filtered_data.month == month) & (filtered_data.year == year)]
            acc_data = filtered_data[(filtered_data.year < year) | \
                                     ((filtered_data.year == year) & (filtered_data.month <= month))]          

        cur_acc_user = set(acc_data['user_id'].unique())
        new_user = cur_acc_user - acc_user
        acc_user = cur_acc_user
        cur_user = set(sub_data['user_id'].unique())

        cur_acc_item = set(acc_data['item_id'].unique())
        new_item = cur_acc_item - acc_item
        acc_item = cur_acc_item
        cur_item = set(sub_data['item_id'].unique())   

        n_edge_on_new_node = len(sub_data[sub_data['user_id'].isin(new_user) | sub_data['item_id'].isin(new_item)])
        n_edge_btw_new_node = len(sub_data[sub_data['user_id'].isin(new_user) & sub_data['item_id'].isin(new_item)])
        n_edge_on_new_node_list.append(n_edge_on_new_node)
        n_edge_btw_new_node_list.append(n_edge_btw_new_node)    

        node_first_segment_overlap = (len((cur_user & first_segment_user_set)) + len((cur_item & first_segment_item_set))) \
                            / (len((cur_user | first_segment_user_set)) + len((cur_item | first_segment_item_set)))   
        n_new_user, n_new_item = len(new_user), len(new_item)
        n_new_user_list.append(n_new_user)
        n_new_item_list.append(n_new_item)
        node_first_segment_overlap_list.append(node_first_segment_overlap)  
        node_acc_segments_overlap = (len((cur_user & acc_user)) + len((cur_item & acc_item))) \
                            / (len((cur_user | acc_user)) + len((cur_item | acc_item)))  
        node_acc_segments_overlap_list.append(node_acc_segments_overlap)
        node_pre_segment_overlap = (len((cur_user & pre_user)) + len((cur_item & pre_item))) \
                            / (len((cur_user | pre_user)) + len((cur_item | pre_item)))
        node_pre_segment_overlap_list.append(node_pre_segment_overlap)

        pre_user = cur_user
        pre_item = cur_item
        
    return n_edge_on_new_node_list, n_edge_btw_new_node_list, n_new_user_list, n_new_item_list, node_first_segment_overlap_list, node_acc_segments_overlap_list, node_pre_segment_overlap_list


In [17]:
n_edge_on_new_node_list, \
n_edge_btw_new_node_list, \
n_new_user_list, \
n_new_item_list, \
node_first_segment_overlap_list, \
node_acc_segments_overlap_list, \
node_pre_segment_overlap_list = get_data_segements_info_Netflix(filtered_data)

print("edge_on_new_node_list: ", n_edge_on_new_node_list)
print("edge_btw_new_node_list: ", n_edge_btw_new_node_list)
print("new_user_node_list", n_new_user_list)
print("new_item_node_list", n_new_item_list)
print("new_user_node(mean, std): ", np.mean(n_new_user_list), np.std(n_new_user_list))
print("new_item_node(mean, std): ", np.mean(n_new_item_list), np.std(n_new_item_list))
print("node_first_segment_overlap_list: ", node_first_segment_overlap_list)
print("mean_node_first_segment_overlap", np.mean(node_first_segment_overlap_list))
print("node_acc_segments_overlap_list: ", node_acc_segments_overlap_list)
print("mean_node_acc_segments_overlap", np.mean(node_acc_segments_overlap_list))
print("node_pre_segment_overlap_list: ", node_pre_segment_overlap_list)
print("mean_node_pre_segment_overlap: ", np.mean(node_pre_segment_overlap_list))

print("Done.")


year 1999
month 12
year 2000
month 1
year 2000
month 2
year 2000
month 3
year 2000
month 4
year 2000
month 5
year 2000
month 6
year 2000
month 7
year 2000
month 8
year 2000
month 9
year 2000
month 10
year 2000
month 11
year 2000
month 12
year 2001
month 1
year 2001
month 2
year 2001
month 3
year 2001
month 4
year 2001
month 5
year 2001
month 6
year 2001
month 7
year 2001
month 8
year 2001
month 9
year 2001
month 10
year 2001
month 11
year 2001
month 12
year 2002
month 1
year 2002
month 2
year 2002
month 3
year 2002
month 4
year 2002
month 5
year 2002
month 6
year 2002
month 7
year 2002
month 8
year 2002
month 9
year 2002
month 10
year 2002
month 11
year 2002
month 12
year 2003
month 1
year 2003
month 2
year 2003
month 3
year 2003
month 4
year 2003
month 5
year 2003
month 6
year 2003
month 7
year 2003
month 8
year 2003
month 9
year 2003
month 10
year 2003
month 11
year 2003
month 12
year 2004
month 1
year 2004
month 2
year 2004
month 3
year 2004
month 4
year 2004
month 5
year 2004
month

In [16]:
#Below is some functions to get some insights of data segments with chronological order # Alimama
def get_data_segements_info_Alimama(filtered_data):
    n_records = len(filtered_data)
    n_edge_on_new_node_list = []
    n_edge_btw_new_node_list = []
    n_new_user_list, n_new_item_list = [], []
    node_first_segment_overlap_list = []
    node_acc_segments_overlap_list = []
    node_pre_segment_overlap_list = []
    
    first_day = time.localtime(filtered_data.iloc[0]['timestamp']).tm_mday
    last_day = time.localtime(filtered_data.iloc[-1]['timestamp']).tm_mday + 30
    first_segment_data = filtered_data[(filtered_data.timestamp >= time.mktime(time.strptime(str('2017-11-{} 00:00:00'.format(first_day)), "%Y-%m-%d %H:%M:%S"))) & (filtered_data.timestamp <= time.mktime(time.strptime(str('2017-11-{} 23:59:59'.format(first_day)), "%Y-%m-%d %H:%M:%S")))]
    first_segment_user_set = set(first_segment_data['user_id'].unique())
    first_segment_item_set = set(first_segment_data['item_id'].unique())
    acc_user = first_segment_user_set
    acc_item = first_segment_item_set
    pre_user = first_segment_user_set
    pre_item = first_segment_item_set

    for i in range(first_day + 1, last_day + 1):
        if i <= 30:
            print('day', i)
            sub_data = filtered_data[(filtered_data.timestamp >= time.mktime(time.strptime(str('2017-11-{} 00:00:00'.format(i)), "%Y-%m-%d %H:%M:%S"))) & (filtered_data.timestamp <= time.mktime(time.strptime(str('2017-11-{} 23:59:59'.format(i)), "%Y-%m-%d %H:%M:%S")))]
            acc_data = filtered_data[filtered_data.timestamp <= time.mktime(time.strptime(str('2017-11-{} 23:59:59'.format(i)), "%Y-%m-%d %H:%M:%S"))]
        else:
            print('day', i)
            sub_data = filtered_data[(filtered_data.timestamp >= time.mktime(time.strptime(str('2017-12-{} 00:00:00'.format(i-30)), "%Y-%m-%d %H:%M:%S"))) & (filtered_data.timestamp <= time.mktime(time.strptime(str('2017-12-{} 23:59:59'.format(i-30)), "%Y-%m-%d %H:%M:%S")))]
            acc_data = filtered_data[filtered_data.timestamp <= time.mktime(time.strptime(str('2017-12-{} 23:59:59'.format(i-30)), "%Y-%m-%d %H:%M:%S"))]           

        cur_acc_user = set(acc_data['user_id'].unique())
        new_user = cur_acc_user - acc_user
        acc_user = cur_acc_user
        cur_user = set(sub_data['user_id'].unique())

        cur_acc_item = set(acc_data['item_id'].unique())
        new_item = cur_acc_item - acc_item
        acc_item = cur_acc_item
        cur_item = set(sub_data['item_id'].unique())   

        n_edge_on_new_node = len(sub_data[sub_data['user_id'].isin(new_user) | sub_data['item_id'].isin(new_item)])
        n_edge_btw_new_node = len(sub_data[sub_data['user_id'].isin(new_user) & sub_data['item_id'].isin(new_item)])
        n_edge_on_new_node_list.append(n_edge_on_new_node)
        n_edge_btw_new_node_list.append(n_edge_btw_new_node)    

        node_first_segment_overlap = (len((cur_user & first_segment_user_set)) + len((cur_item & first_segment_item_set))) \
                            / (len((cur_user | first_segment_user_set)) + len((cur_item | first_segment_item_set)))   
        n_new_user, n_new_item = len(new_user), len(new_item)
        n_new_user_list.append(n_new_user)
        n_new_item_list.append(n_new_item)
        node_first_segment_overlap_list.append(node_first_segment_overlap)  
        node_acc_segments_overlap = (len((cur_user & acc_user)) + len((cur_item & acc_item))) \
                            / (len((cur_user | acc_user)) + len((cur_item | acc_item)))  
        node_acc_segments_overlap_list.append(node_acc_segments_overlap)
        node_pre_segment_overlap = (len((cur_user & pre_user)) + len((cur_item & pre_item))) \
                            / (len((cur_user | pre_user)) + len((cur_item | pre_item)))
        node_pre_segment_overlap_list.append(node_pre_segment_overlap)

        pre_user = cur_user
        pre_item = cur_item
        
    return n_edge_on_new_node_list, n_edge_btw_new_node_list, n_new_user_list, n_new_item_list, node_first_segment_overlap_list, node_acc_segments_overlap_list, node_pre_segment_overlap_list



In [17]:
n_edge_on_new_node_list, \
n_edge_btw_new_node_list, \
n_new_user_list, \
n_new_item_list, \
node_first_segment_overlap_list, \
node_acc_segments_overlap_list, \
node_pre_segment_overlap_list = get_data_segements_info_Alimama(filtered_data)

print("edge_on_new_node_list: ", n_edge_on_new_node_list)
print("edge_btw_new_node_list: ", n_edge_btw_new_node_list)
print("new_user_node_list", n_new_user_list)
print("new_item_node_list", n_new_item_list)
print("new_user_node(mean, std): ", np.mean(n_new_user_list), np.std(n_new_user_list))
print("new_item_node(mean, std): ", np.mean(n_new_item_list), np.std(n_new_item_list))
print("node_first_segment_overlap_list: ", node_first_segment_overlap_list)
print("mean_node_first_segment_overlap", np.mean(node_first_segment_overlap_list))
print("node_acc_segments_overlap_list: ", node_acc_segments_overlap_list)
print("mean_node_acc_segments_overlap", np.mean(node_acc_segments_overlap_list))
print("node_pre_segment_overlap_list: ", node_pre_segment_overlap_list)
print("mean_node_pre_segment_overlap: ", np.mean(node_pre_segment_overlap_list))

print("Done.")



day 26
day 27
day 28
day 29
day 30
day 31
day 32
day 33
edge_on_new_node_list:  [497744, 164613, 67835, 40484, 27745, 10110, 7182, 1494]
edge_btw_new_node_list:  [1512, 341, 85, 45, 17, 8, 1, 0]
new_user_node_list [43754, 13464, 5017, 2406, 1191, 271, 42, 0]
new_item_node_list [2169, 1194, 850, 633, 460, 265, 138, 14]
new_user_node(mean, std):  8268.125 14059.025103447784
new_item_node(mean, std):  715.375 657.681521996019
node_first_segment_overlap_list:  [0.8158130943806762, 0.800889477303547, 0.7935135342316406, 0.791916345047058, 0.7908778313318973, 0.7929624324199436, 0.8256012839524557, 0.8175777378785303]
mean_node_first_segment_overlap 0.8036439670682185
node_acc_segments_overlap_list:  [0.9151380661012953, 0.8893144573792109, 0.8819013576984568, 0.8858462899889898, 0.8902917758826497, 0.8957246641070924, 0.9549051572656785, 0.9425369038452086]
mean_node_acc_segments_overlap 0.9069573340335728
node_pre_segment_overlap_list:  [0.8158130943806762, 0.8180854579083111, 0.8261028345

In [19]:
def get_inc_blocks_info(filtered_data, base_user_set, base_item_set):
    n_records = len(filtered_data)
    n_split = 10
    n_newedge = n_records/n_split

    n_edge_on_new_node_list = []
    n_edge_btw_new_node_list = []
    n_day_list = []
    n_new_user_list, n_new_item_list = [], []
    node_base_overlap_list = []
    acc_user = base_user_set
    acc_item = base_item_set
    
    base_data = filtered_data[0:int(n_records*0.1)]
    base_user = set(base_data['user_id'].unique())
    base_item = set(base_data['item_id'].unique())
    
    
    for i in range(1, n_split):
        sub_data = filtered_data[int(n_records*i/n_split):int(n_records*(i+1)/n_split)]
        acc_data = filtered_data[0:int(n_records*(i+1)/n_split)]

        cur_acc_user = set(acc_data['user_id'].unique())
        new_user = cur_acc_user - acc_user
        acc_user = cur_acc_user
        cur_user = set(sub_data['user_id'].unique())
        
        cur_acc_item = set(acc_data['item_id'].unique())
        new_item = cur_acc_item - acc_item
        acc_item = cur_acc_item
        cur_item = set(sub_data['item_id'].unique())
        
        n_edge_on_new_node = len(sub_data[sub_data['user_id'].isin(new_user) | sub_data['item_id'].isin(new_item)])
        n_edge_btw_new_node = len(sub_data[sub_data['user_id'].isin(new_user) & sub_data['item_id'].isin(new_item)])
        n_edge_on_new_node_list.append(n_edge_on_new_node)
        n_edge_btw_new_node_list.append(n_edge_btw_new_node)
        
        start_day = time.localtime(sub_data.iloc[0]['timestamp']).tm_mday + time.localtime(sub_data.iloc[0]['timestamp']).tm_mon * 30
        end_day = time.localtime(sub_data.iloc[-1]['timestamp']).tm_mday + time.localtime(sub_data.iloc[-1]['timestamp']).tm_mon * 30
        diff_day = end_day - start_day
        print('diff_day', diff_day)
        # Gowalla
        #start_day = sub_data.iloc[0]['timestamp']
        #end_day = sub_data.iloc[-1]['timestamp']
        #diff_day = (end_day - start_day).days
        
#         # Tb2015 date format
#         start_day = time.mktime(datetime.datetime.strptime(str(start_day), "%Y%m%d").timetuple()) 
#         end_day = time.mktime(datetime.datetime.strptime(str(end_day), "%Y%m%d").timetuple())
#         # Movie-lens date format
#         diff_day = int((end_day - start_day) / 3600 / 24) # movie-lens
#         # Some other dataset date format
#         diff_day = int((end_day - start_day) / 3600 / 24 / 1000)

        n_day_list.append(diff_day)
        
        node_base_overlap = (len((cur_user & base_user_set)) + len((cur_item & base_item_set))) \
                            / (len((cur_user | base_user_set)) + len((cur_item | base_item_set)))
        
        node_base_overlap_2 = (len((cur_user & base_user)) + len((cur_item & base_item))) \
                            / (len(base_user_set) + len(base_item_set))
        
        n_new_user, n_new_item = len(new_user), len(new_item)
        n_new_user_list.append(n_new_user)
        n_new_item_list.append(n_new_item)
        node_base_overlap_list.append(node_base_overlap)
        
    return n_edge_on_new_node_list, n_edge_btw_new_node_list, n_day_list, n_new_user_list, n_new_item_list, node_base_overlap_list

In [20]:
base_block = filtered_data[:int(n_data * 0.1)]
inc_data = filtered_data[int(n_data * 0.1):]

print("% of user in base block: ", len(base_block['user_id'].unique()) / n_user)
print("% of item in base block: ", len(base_block['item_id'].unique()) / n_item)

n_edge_on_new_node_list, \
n_edge_btw_new_node_list, \
n_day_list, \
n_new_user_list, \
n_new_item_list, \
node_base_overlap_list = get_inc_blocks_info(inc_data, set(base_block['user_id'].unique()), set(base_block['item_id'].unique()))

print("#edge for 1 inc block: ", int(len(inc_data)/10))
print("edge_on_new_node_list: ", n_edge_on_new_node_list)
print("edge_btw_new_node_list: ", n_edge_btw_new_node_list)
print("timespan(mean, std): ", np.mean(n_day_list), np.std(n_day_list))
print("new_user_node(mean, std): ", np.mean(n_new_user_list), np.std(n_new_user_list))
print("new_item_node(mean, std): ", np.mean(n_new_item_list), np.std(n_new_item_list))
print("node_base_overlap_list: ", node_base_overlap_list)

print("Done.")

% of user in base block:  0.721394153751437
% of item in base block:  0.9220903332271491
diff_day 1
diff_day 1
diff_day 1
diff_day 1
diff_day 1
diff_day 1
diff_day 0
diff_day 1
diff_day 0
#edge for 1 inc block:  5345220
edge_on_new_node_list:  [1196181, 417271, 211653, 129321, 81057, 43089, 12898, 9860, 292]
edge_btw_new_node_list:  [22909, 2185, 695, 298, 120, 24, 5, 3, 0]
timespan(mean, std):  0.7777777777777778 0.41573970964154905
new_user_node(mean, std):  30270.666666666668 53381.49631556696
new_item_node(mean, std):  5394.111111111111 9457.706328050917
node_base_overlap_list:  [0.7391554620654368, 0.7330323089256453, 0.7237747839230312, 0.7168927698783271, 0.712017476131561, 0.7063116471768909, 0.7252336961325462, 0.7124896845191718, 0.713649995014611]
Done.
