In [19]:
import pandas as pd
import numpy as np
import gc
import os
from os import listdir
from os.path import isfile, join
import time

In [20]:
# Arguments
min_length = 5
max_length = 20
target_folder_name = 'min_5_max_20'
project_folder = '/data/workspace/yeqi/projects/RNN4REC/GRU4REC'
data_folder = '/data/workspace/yeqi/projects/RNN4REC/GRU4REC/Data/paths'

train_folder = data_folder + '/training set'
test_folder = data_folder + '/test set'
full_folder = data_folder + '/full data'

pro_data_folder = project_folder + '/Processed Data'
target_folder = pro_data_folder + '/' + target_folder_name
sliced_data_folder = target_folder + '/sliced data'

In [21]:
if not os.path.exists(target_folder):
    os.makedirs(target_folder)
if not os.path.exists(sliced_data_folder):
    os.makedirs(sliced_data_folder)

In [22]:
# function for processing all the path files in folder
def txt2list(txt_path):
    '''
    input:
        1. txt_path: path to the text file
    output:
        2. a list containing individual lists, each of which contains:
            userid, pathid, itemids in path
    '''
    results = []
    with open(txt_path) as inputfile:
        for line in inputfile:
            results.append(line.strip().split('|'))
            
    valid_results = []
    for row in results:
        if ',' in row[2]:
            valid_results.append(row)
            
    return valid_results

def get_paths_from_folder(folder_path):    
    return [join(folder_path, f) for f in listdir(folder_path) if isfile(join(folder_path, f))]

In [23]:
def folder2arr(folder_path):
    # this is a list of all the daily paths (txtfiles)
    file_paths = get_paths_from_folder(folder_path)

    all_paths = []
    for txt_path in file_paths:
        if not txt_path.endswith('.DS_Store'):
            all_paths = all_paths + txt2list(txt_path)
    len(all_paths)    

    arr = np.array(all_paths)
    arr = arr.astype(object)

    # process array so that in each row: element 0 = userid, element 1 = sessid, element 2 = list of itemids
    for j in range(len(arr)):
        arr[j][0] = int(arr[j][0])
        arr[j][1] = int(arr[j][1])
        arr[j][2] = list(arr[j][2].split(','))
        for i in range(len(arr[j][2])):
            arr[j][2][i] = int(arr[j][2][i])

    # create a list containing the session lengths
    sess_len_list = []
    for i in range(arr.shape[0]):
        sess_len_list.append(len(arr[i][2]))
    sess_len_arr = np.array(sess_len_list)
    sess_len_arr = sess_len_arr.reshape([len(sess_len_list),1])

    # concatenate the array so that in each row element 3 = session length (number of items presented)
    arr = np.concatenate((arr,sess_len_arr),axis = 1)
    
    # create a new array with desired session length
    new_arr = []
    for row in arr:
        if row[3] >= min_length and row[3] <= max_length:
            new_arr.append(row)
    new_arr = np.array(new_arr)
    return new_arr

In [24]:
train_arr = folder2arr(train_folder)

In [25]:
def summarize(arr):
    # np array to dataframe for better statistics
    df = pd.DataFrame(arr)
    df.columns = ['userid', 'sess_id', 'sess_path', 'sess_length']

    df = df.sort_values(by = 'sess_length')
    df = df.reset_index(drop = 'True')

    summary = df.groupby('sess_length').count()
    summary = pd.DataFrame(summary.drop(columns=['sess_id','sess_path']))
    summary = summary.reset_index()
    summary.columns = ['sess_length', 'count']

    slice_table = summary
    slice_table['start_index'] = slice_table['count']
    slice_table['end_index'] = slice_table['count']

    # here is the logic for create the start index + end index
    for i in range(1, len(slice_table)):
        slice_table['end_index'][i] = slice_table['end_index'][i-1] + slice_table['count'][i]

    for i in range(1, len(slice_table)):
        slice_table['start_index'][i] = slice_table['end_index'][i-1]

    slice_table['start_index'][0] = 0

    print('The raw data is as followed: ')
    print(slice_table)
    
    return slice_table

In [26]:
userid_df = pd.read_csv(target_folder + '/userid_map.csv')
itemid_df = pd.read_csv(target_folder + '/itemid_map.csv')

# below is the dictionary for mapping item/user ids in shopee db into model based id
userid_dict = dict([(userid_df['userid'][i], i) for i in range(len(userid_df))])
itemid_dict = dict([(itemid_df['itemid'][i], i) for i in range(len(itemid_df))])

def sort_and_map(arr):
    # sorting
    df = pd.DataFrame(arr)
    df.columns = ['userid', 'sess_id', 'sess_path', 'sess_length']
    df = df.sort_values(by = 'sess_length')
    df = df.reset_index(drop = 'True')
    arr = df.values
    # mapping the shopee userid/itemid into training data index
    for i in range(arr.shape[0]):
        arr[i][0] = userid_dict[arr[i][0]]
        for j in range(len(arr[i][2])):
            arr[i][2][j] = itemid_dict[arr[i][2][j]]
    return arr

train_arr = sort_and_map(train_arr)

In [27]:
slice_table = summarize(train_arr)

The raw data is as followed: 
    sess_length  count  start_index  end_index
0             5  73170            0      73170
1             6  40952        73170     114122
2             7  24961       114122     139083
3             8  17025       139083     156108
4             9  12074       156108     168182
5            10   8117       168182     176299
6            11   6698       176299     182997
7            12   5280       182997     188277
8            13   3699       188277     191976
9            14   2964       191976     194940
10           15   2660       194940     197600
11           16   2039       197600     199639
12           17   1460       199639     201099
13           18   1263       201099     202362
14           19   1172       202362     203534
15           20    760       203534     204294


In [28]:
train_arr

array([[24054, 7, list([56972, 27704, 27703, 30325, 73218]), 5],
       [4508, 17, list([268603, 16780, 65518, 138916, 132]), 5],
       [13569, 52, list([167594, 100456, 169896, 100620, 57662]), 5],
       ...,
       [9001, 22,
        list([19647, 11802, 25455, 10036, 7129, 85363, 4024, 34407, 85362, 249334, 103496, 103500, 35766, 20413, 9095, 16562, 59215, 46568, 5783, 118062]),
        20],
       [10637, 27,
        list([42952, 45777, 36806, 99393, 90428, 135333, 95011, 80151, 172621, 106914, 151749, 8149, 71764, 31121, 116243, 39153, 110723, 41736, 41735, 41737]),
        20],
       [18707, 6,
        list([179952, 131399, 120910, 23214, 68651, 22129, 44564, 65493, 58320, 65008, 11934, 72181, 53141, 58312, 133875, 17987, 74185, 922, 43862, 175837]),
        20]], dtype=object)

In [34]:
new_data_arr = []
for row in train_arr:
    for new_length in range(min_length, row[3]):
        new_row = []
        new_row.append(row[0])
        new_row.append(row[1])
        new_row.append(row[2][:new_length])
        new_row.append(new_length)
        new_data_arr.append(new_row)
new_data_arr = np.array(new_data_arr, dtype=object)
new_data_arr

array([[14211, 8, list([140867, 174524, 187194, 66103, 258720]), 5],
       [24576, 50, list([34795, 274828, 210678, 224771, 127576]), 5],
       [3489, 50, list([98659, 145464, 127125, 67534, 98632]), 5],
       ...,
       [18707, 6,
        list([179952, 131399, 120910, 23214, 68651, 22129, 44564, 65493, 58320, 65008, 11934, 72181, 53141, 58312, 133875, 17987, 74185]),
        17],
       [18707, 6,
        list([179952, 131399, 120910, 23214, 68651, 22129, 44564, 65493, 58320, 65008, 11934, 72181, 53141, 58312, 133875, 17987, 74185, 922]),
        18],
       [18707, 6,
        list([179952, 131399, 120910, 23214, 68651, 22129, 44564, 65493, 58320, 65008, 11934, 72181, 53141, 58312, 133875, 17987, 74185, 922, 43862]),
        19]], dtype=object)

In [36]:
aug_train_arr = np.concatenate((train_arr, new_data_arr))

def sort(arr):
    # sorting
    df = pd.DataFrame(arr)
    df.columns = ['userid', 'sess_id', 'sess_path', 'sess_length']
    df = df.sort_values(by = 'sess_length')
    df = df.reset_index(drop = 'True')
    arr = df.values

    return arr

aug_train_arr = sort(aug_train_arr)

In [37]:
aug_slice_table = summarize(aug_train_arr)

The raw data is as followed: 
    sess_length   count  start_index  end_index
0             5  204294            0     204294
1             6  131124       204294     335418
2             7   90172       335418     425590
3             8   65211       425590     490801
4             9   48186       490801     538987
5            10   36112       538987     575099
6            11   27995       575099     603094
7            12   21297       603094     624391
8            13   16017       624391     640408
9            14   12318       640408     652726
10           15    9354       652726     662080
11           16    6694       662080     668774
12           17    4655       668774     673429
13           18    3195       673429     676624
14           19    1932       676624     678556
15           20     760       678556     679316


In [38]:
def add_label(arr):
    Y_list = []
    for i in range(arr.shape[0]):
        Y_list.append(arr[i][2][-1])
        arr[i][2] = arr[i][2][:-1]
    Y_arr = np.array(Y_list)
    Y_arr = Y_arr.reshape([len(Y_list),1])

    arr = np.concatenate((arr,Y_arr),axis = 1)
    return arr

aug_train_arr = add_label(aug_train_arr)

In [39]:
buckets_list = [5,6,10,20,50,100]
# we deduct all elements in the buckets_list by 1, to fulfill the length of path without Y item
for i in range(len(buckets_list)):
    buckets_list[i] = buckets_list[i] - 1
    
def pad2buckets(buckets_list, in_arr):
    '''
    Input:
        1. buckets_list: a list of ints, indicating the step lengths of data we want to generate
        2. data_arr: the half-processed data array
    Output: 
        1. pro_arr: the processed array
    '''
    data_arr = in_arr
    # cursor labels the current max length of our training data
    cursor = 0
    for i in range(data_arr.shape[0]):
        # the fourth (index 3) column is the session length of the current user path
        if len(data_arr[i][2]) > buckets_list[-1]:
            break
        # move the cursor to the correct place, 
        # by right the max_length should be bigger or equal to the length of the current row of data
        while len(data_arr[i][2]) > buckets_list[cursor]:
            cursor = cursor + 1
        
        if len(data_arr[i][2]) == buckets_list[cursor]:
            data_arr[i][2] = np.array(data_arr[i][2])
        if len(data_arr[i][2]) < buckets_list[cursor]:
            # create a temp numpy array 
            #temp_path = np.array([0 for i in range(buckets_list[cursor])])
            for j in range(buckets_list[cursor]-len(data_arr[i][2])):
                # temp_path[j] = data_arr[i][2][j]
                data_arr[i][2].append(0)
            # data_arr[i][2] = temp_path
            data_arr[i][2] = np.array(data_arr[i][2])
            data_arr[i][3] = buckets_list[cursor] + 1
    print(i)       
    # now i is at the first row we want to abandon
    return data_arr[:i]

aug_train_arr = pad2buckets(buckets_list, aug_train_arr)

679315


In [41]:
def summarize(pro_arr):
    pro_df = pd.DataFrame(pro_arr)
    pro_df.columns = ['userid', 'pathid', 'path', 'sess_length', 'Y']
    # derive the summary/slice table for the training data
    summary = pro_df.groupby('sess_length').count()
    summary = pd.DataFrame(summary.drop(columns=['pathid','path','Y']))
    summary = summary.reset_index()
    summary.columns = ['sess_length', 'count']

    slice_table = summary
    slice_table['start_index'] = slice_table['count']
    slice_table['end_index'] = slice_table['count']
    # here is the logic for create the start index + end index
    for i in range(1, len(slice_table)):
        slice_table['end_index'][i] = slice_table['end_index'][i-1] + slice_table['count'][i]

    for i in range(1, len(slice_table)):
        slice_table['start_index'][i] = slice_table['end_index'][i-1]

    slice_table['start_index'][0] = 0
    return slice_table

print("Here is the summarization of the augmented training data: ")
print(summarize(aug_train_arr))   

Here is the summarization of the augmented training data: 
   sess_length   count  start_index  end_index
0            5  204294            0     204294
1            6  131124       204294     335418
2           10  239681       335418     575099
3           20  104216       575099     679315


In [42]:
np.save(sliced_data_folder +'/X_train.npy', aug_train_arr)