In [5]:
import pandas as pd
import numpy as np
import gc
import os
from os import listdir
from os.path import isfile, join
import time

In [6]:
# Arguments
# Bounds on the number of items per session (used by folder2arr as an
# inclusive filter).
min_length = 2
max_length = 20
# Name of the output sub-folder; NOTE(review): looks like it is meant to
# mirror the two bounds above -- keep them in sync when changing either.
target_folder_name = 'min_2_max_20'
project_folder = '/data/workspace/yeqi/projects/RNN4REC/GRU4REC'
# Derived from project_folder so that relocating the project needs one edit.
data_folder = join(project_folder, 'Data', 'paths')

# Raw input folders
train_folder = join(data_folder, 'training set')
test_folder = join(data_folder, 'test set')
full_folder = join(data_folder, 'full data')

# Output folders
pro_data_folder = join(project_folder, 'Processed Data')
target_folder = join(pro_data_folder, target_folder_name)

In [7]:
# Create the output folder (and any missing parents).  exist_ok=True replaces
# the separate existence check, so there is no window between "check" and
# "create" and the cell can be re-run safely.
os.makedirs(target_folder, exist_ok=True)

In [8]:
# function for processing all the path files in folder
def txt2list(txt_path):
    '''
    Read one path file and keep the rows that hold a multi-item path.

    input:
        1. txt_path: path to the text file; every line is split on '|'
    output:
        1. a list containing individual lists, each of which contains the
           string fields of one line:
            userid, pathid, itemids in path (comma separated)
           Only rows whose third field contains a ',' (i.e. at least two
           entries) are returned.  Lines with fewer than three fields,
           such as blank lines, are skipped instead of raising IndexError.
    '''
    valid_results = []
    with open(txt_path) as inputfile:
        for line in inputfile:
            row = line.strip().split('|')
            # row[2] is the comma separated item list; guard the index so
            # short / blank lines are ignored rather than crashing the load
            if len(row) > 2 and ',' in row[2]:
                valid_results.append(row)

    return valid_results

def get_paths_from_folder(folder_path):
    '''
    input:
        1. folder_path: folder to scan (not recursive)
    output:
        1. a list with the joined path of every regular file directly
           inside folder_path, in the order os.listdir reports them
    '''
    file_paths = []
    for entry in listdir(folder_path):
        candidate = join(folder_path, entry)
        # sub-folders and other non-file entries are left out
        if isfile(candidate):
            file_paths.append(candidate)
    return file_paths

In [9]:
def folder2arr(folder_path):
    '''
    Load every path file found in folder_path into one object array.

    input:
        1. folder_path: folder holding the daily path text files
    output:
        1. a 2-D numpy object array with one row per kept session:
            element 0 = userid (int), element 1 = sessid (int),
            element 2 = list of itemids (ints),
            element 3 = session length (number of items presented)
           Only sessions with min_length <= length <= max_length (the
           notebook-level arguments) are kept, in file / line order.
           A folder without any valid row yields an array of shape (0, 4).
    '''
    rows = []
    # this is a list of all the daily paths (txtfiles)
    for txt_path in get_paths_from_folder(folder_path):
        if txt_path.endswith('.DS_Store'):
            continue
        for fields in txt2list(txt_path):
            user_id = int(fields[0])
            sess_id = int(fields[1])
            item_ids = [int(item) for item in fields[2].split(',')]
            sess_len = len(item_ids)
            # keep only the sessions with the desired length
            if min_length <= sess_len <= max_length:
                rows.append((user_id, sess_id, item_ids, sess_len))

    # Fill a pre-sized object array cell by cell: this keeps each item list
    # as a single cell and gives a well-formed (0, 4) result for no rows.
    new_arr = np.empty((len(rows), 4), dtype=object)
    for row_idx, row in enumerate(rows):
        for col_idx, value in enumerate(row):
            new_arr[row_idx, col_idx] = value
    return new_arr

In [10]:
# Run the same loader over the full data, the training set and the test set
# (in that order).
arr, train_arr, test_arr = [
    folder2arr(source_folder)
    for source_folder in (full_folder, train_folder, test_folder)
]

In [11]:
# np array to dataframe for better statistics
df = pd.DataFrame(arr)
df.columns = ['userid', 'sess_id', 'sess_path', 'sess_length']

# order the sessions from the shortest to the longest
df = df.sort_values(by = 'sess_length').reset_index(drop = True)

# number of sessions for every session length
summary = df.groupby('sess_length')['userid'].count().reset_index(name = 'count')

# slice_table is the same object as summary, extended with the row range
# [start_index, end_index) that each session length occupies in the sorted df
slice_table = summary

# here is the logic for create the start index + end index:
# end_index is the running total of the counts and start_index is the previous
# group's end_index (0 for the first group).  cumsum is used instead of
# element-wise chained assignment (slice_table['end_index'][i] = ...), which
# pandas does not guarantee to write back into the frame.
cumulative_count = slice_table['count'].cumsum()
slice_table['start_index'] = cumulative_count - slice_table['count']
slice_table['end_index'] = cumulative_count

print('The raw data is as followed: ')
print(slice_table)

The raw data is as followed: 
    sess_length    count  start_index  end_index
0             2  1592038            0    1592038
1             3   392060      1592038    1984098
2             4   153887      1984098    2137985
3             5    78750      2137985    2216735
4             6    44196      2216735    2260931
5             7    27154      2260931    2288085
6             8    18229      2288085    2306314
7             9    13102      2306314    2319416
8            10     8727      2319416    2328143
9            11     7256      2328143    2335399
10           12     5661      2335399    2341060
11           13     4090      2341060    2345150
12           14     3227      2345150    2348377
13           15     2860      2348377    2351237
14           16     2226      2351237    2353463
15           17     1559      2353463    2355022
16           18     1373      2355022    2356395
17           19     1240      2356395    2357635
18           20      797      2357635  

In [None]:
arr = df.values

print("start processing at", time.ctime())

# now we gonna count the active users and active items
userid_list = []
itemid_list = []
for row in arr:
    userid_list.append(row[0])
    # extend in place: `itemid_list = itemid_list + row[2]` copies the whole
    # accumulated list on every row, which is quadratic in the data size
    itemid_list.extend(row[2])

userid_set = set(userid_list)
itemid_set = set(itemid_list)

print('Total amount of user presented before and after dropping duplication:')
print(len(userid_list), len(userid_set))
print('Total amount of item presented before and after dropping duplication:')
print(len(itemid_list), len(itemid_set))

print("end processing at", time.ctime())

start processing at Mon Sep 24 09:42:59 2018


In [None]:
# One-column tables of the distinct ids, each with an extra leading 0
# (presumably reserving index 0 for the padding value -- TODO confirm).
userid_df = pd.DataFrame({'userid': [0] + list(userid_set)})
itemid_df = pd.DataFrame({'itemid': [0] + list(itemid_set)})

# make the ids numeric, sort them ascending and renumber the rows from 0
userid_df = (
    userid_df
    .assign(userid=pd.to_numeric(userid_df['userid']))
    .sort_values(by='userid')
    .reset_index(drop=True)
)
itemid_df = (
    itemid_df
    .assign(itemid=pd.to_numeric(itemid_df['itemid']))
    .sort_values(by='itemid')
    .reset_index(drop=True)
)

In [None]:
# save the id tables; the row index written to the csv is the model based id
userid_df.to_csv(target_folder + '/userid_map.csv')
itemid_df.to_csv(target_folder + '/itemid_map.csv')

# below is the dictionary for mapping item/user ids in shopee db into model based id
# (raw id -> row position in the table).  The column is converted to a plain
# list once instead of doing one Series lookup per id inside a Python loop.
userid_dict = {raw_id: idx for idx, raw_id in enumerate(userid_df['userid'].tolist())}
itemid_dict = {raw_id: idx for idx, raw_id in enumerate(itemid_df['itemid'].tolist())}

def sort_and_map(arr, user_map=None, item_map=None):
    '''
    Sort the sessions by length and translate raw ids into model based ids.

    input:
        1. arr: 2-D object array with the columns
            userid, sess_id, sess_path (list of itemids), sess_length
        2. user_map: dict raw userid -> model id; defaults to the
           notebook-level userid_dict
        3. item_map: dict raw itemid -> model id; defaults to the
           notebook-level itemid_dict
    output:
        1. a new object array, rows ordered by ascending sess_length, with
           userid and every itemid replaced by its mapped value.
           The input array and its item lists are left untouched, so the
           function can be called again on the same input.
    raises:
        KeyError if an id is missing from the corresponding map
    '''
    if user_map is None:
        user_map = userid_dict
    if item_map is None:
        item_map = itemid_dict

    # sorting
    df = pd.DataFrame(arr)
    df.columns = ['userid', 'sess_id', 'sess_path', 'sess_length']
    df = df.sort_values(by = 'sess_length').reset_index(drop = True)
    # explicit copy: the array behind df.values may be a read-only view
    arr = df.values.copy()

    # mapping the shopee userid/itemid into training data index
    for i in range(arr.shape[0]):
        arr[i][0] = user_map[arr[i][0]]
        # build a fresh list instead of overwriting the caller's list in place
        arr[i][2] = [item_map[item] for item in arr[i][2]]
    return arr

# sort + id-map the test split first, then the training split
test_arr, train_arr = [sort_and_map(split_arr) for split_arr in (test_arr, train_arr)]

def add_label(arr):
    '''
    Split the last item of every path off into an extra trailing column.

    input:
        1. arr: 2-D object array whose element 2 of each row is the
           (non-empty) item sequence of the session
    output:
        1. a new array with one more column than arr: element 2 holds the
           path without its last item and the last element holds that
           removed item.  All other columns are carried over unchanged.
           The input array is not modified, so calling the function again
           on the same input gives the same result.
    '''
    # shallow copy: rows are re-pointed to sliced paths below, and the
    # caller's array must keep the complete paths
    labelled = arr.copy()
    Y_list = []
    for i in range(labelled.shape[0]):
        path = labelled[i][2]
        Y_list.append(path[-1])
        labelled[i][2] = path[:-1]
    Y_arr = np.array(Y_list)
    Y_arr = Y_arr.reshape([len(Y_list),1])

    return np.concatenate((labelled,Y_arr),axis = 1)

# split the label column off the test split first, then the training split
test_arr, train_arr = [add_label(split_arr) for split_arr in (test_arr, train_arr)]

In [None]:
# we deduct 1 from every bucket size, to fulfill the length of path without Y item
buckets_list = [bucket - 1 for bucket in (5, 6, 10, 20, 50, 100)]
    
def pad2buckets(buckets_list, in_arr):
    '''
    Zero-pad every path up to the smallest bucket length that fits it.

    Input:
        1. buckets_list: an ascending list of ints, indicating the step
           lengths of data we want to generate
        2. in_arr: the half-processed data array; element 2 of each row is
           the path and element 3 its session length.  Rows are expected
           to be ordered by ascending path length, because the bucket
           cursor only ever moves forward.
    Output:
        1. pro_arr: the processed array.  Every kept path is a numpy array
           padded with trailing 0s to its bucket length; for padded rows
           element 3 is set to bucket length + 1.  Processing stops at the
           first path longer than the largest bucket, and that row and
           everything after it is dropped.  When no path is too long,
           every row is returned (including the last one).
    The input array and its path lists are not modified.
    '''
    # shallow copy so that the caller's rows are not overwritten
    data_arr = in_arr.copy()
    # cursor labels the current max length of our training data
    cursor = 0
    # number of rows processed so far == index of the first abandoned row
    kept = 0
    for row_idx in range(data_arr.shape[0]):
        row = data_arr[row_idx]
        path = row[2]
        path_len = len(path)
        if path_len > buckets_list[-1]:
            break
        # move the cursor to the correct place,
        # by right the max_length should be bigger or equal to the length of the current row of data
        while path_len > buckets_list[cursor]:
            cursor = cursor + 1

        bucket = buckets_list[cursor]
        if path_len < bucket:
            # pad a new list, leaving the original path object untouched
            path = list(path) + [0] * (bucket - path_len)
            row[3] = bucket + 1
        row[2] = np.array(path)
        kept = row_idx + 1
    print(kept)
    return data_arr[:kept]

# pad the test split first, then the training split (each call prints once)
test_arr, train_arr = [
    pad2buckets(buckets_list, split_arr) for split_arr in (test_arr, train_arr)
]

In [None]:
def summarize(pro_arr):
    '''
    Build the summary/slice table of a processed data array.

    Input:
        1. pro_arr: 2-D array with the columns
            userid, pathid, path, sess_length, Y
    Output:
        1. slice_table: a DataFrame with one row per distinct sess_length
           (ascending) and the columns
            sess_length, count, start_index, end_index
           where count is the number of rows with that length, end_index
           is the running total of count, and start_index is the previous
           row's end_index (0 for the first row).
    '''
    pro_df = pd.DataFrame(pro_arr)
    pro_df.columns = ['userid', 'pathid', 'path', 'sess_length', 'Y']
    # derive the summary/slice table for the training data
    slice_table = (
        pro_df.groupby('sess_length')['userid']
        .count()
        .reset_index(name='count')
    )
    # here is the logic for create the start index + end index; cumsum is
    # used instead of element-wise chained assignment
    # (slice_table['end_index'][i] = ...), which pandas does not guarantee
    # to write back and which fails on an empty table
    cumulative_count = slice_table['count'].cumsum()
    slice_table['start_index'] = cumulative_count - slice_table['count']
    slice_table['end_index'] = cumulative_count
    return slice_table

# print the slice table of the training split, then of the test split
for caption, split_arr in (
    ("Here is the summarization of the training data: ", train_arr),
    ("Here is the summarization of the test data: ", test_arr),
):
    print(caption)
    print(summarize(split_arr))

In [None]:
# Write both processed splits into the target folder.
# NOTE(review): these look like object-dtype arrays, which np.save stores via
# pickle; reading them back would then need np.load(..., allow_pickle=True)
# -- confirm against the loading code.
for file_name, split_arr in (('/X_train.npy', train_arr), ('/X_test.npy', test_arr)):
    np.save(target_folder + file_name, split_arr)