In [1]:
import pandas as pd
import numpy as np
import gc
import os
from os import listdir
from os.path import isfile, join
import time

In [2]:
# ---- Run configuration ----
# Length bounds; presumably the limits encoded in `source_folder_name`
# (min_5_max_20) -- TODO confirm, they are not read anywhere in this notebook.
min_length, max_length = 5, 20
source_folder_name = 'min_5_max_20'

# Root locations of the project and of the raw path data.
project_folder = '/data/workspace/yeqi/projects/RNN4REC/GRU4REC'
data_folder = '/data/workspace/yeqi/projects/RNN4REC/GRU4REC/Data/paths'

# Raw data splits live in sibling sub-folders of `data_folder`.
train_folder, test_folder, full_folder = [
    data_folder + suffix
    for suffix in ('/training set', '/test set', '/full data')
]

# Pre-processed arrays and id maps for the chosen configuration.
pro_data_folder = project_folder + '/Processed Data'
source_folder = '/'.join((pro_data_folder, source_folder_name))

In [3]:
# The saved arrays have dtype=object (each row mixes scalar ids with a
# variable-length array of item ids), so they are stored pickled.
# NumPy >= 1.16.3 defaults to allow_pickle=False and refuses to load such
# files, hence the explicit flag.
# NOTE(security): unpickling can execute arbitrary code -- only load .npy
# files produced by this project's own preprocessing.
X_train = np.load(source_folder + '/X_train.npy', allow_pickle=True)
X_test = np.load(source_folder + '/X_test.npy', allow_pickle=True)

In [4]:
X_train[0:3]

array([[24054, 7, array([56972, 27704, 27703, 30325]), 5, 73218],
       [4508, 17, array([268603,  16780,  65518, 138916]), 5, 132],
       [13569, 52, array([167594, 100456, 169896, 100620]), 5, 57662]],
      dtype=object)

In [40]:
X_test.shape

(16152, 5)

In [58]:
userid_df = pd.read_csv(source_folder + '/userid_map.csv')
itemid_df = pd.read_csv(source_folder + '/itemid_map.csv')

# Dictionaries mapping the original (Shopee DB) user/item ids to the
# model-based ids, i.e. the row position of each id in its mapping file.
# Built in a single pass over the column values instead of a per-row label
# lookup (`df[col][i]`), which is slow on large maps and only correct while
# the index is the default 0..n-1 range.
userid_dict = {raw_id: model_id
               for model_id, raw_id in enumerate(userid_df['userid'].values)}
itemid_dict = {raw_id: model_id
               for model_id, raw_id in enumerate(itemid_df['itemid'].values)}

In [13]:
# Per-user occurrence counts in the training and test sets.
# Work on a copy so the id-mapping frame loaded earlier is not mutated.
user_df = userid_df.copy()

# Column 0 of every X_train / X_test row is the model-based user id, which is
# also the row position in user_df, so a bincount padded to len(user_df)
# yields exactly one count per user.  This replaces the chained assignment
# `user_df['train_count'][uid] = ...`, which is very slow, triggers
# SettingWithCopyWarning and silently does nothing under pandas Copy-on-Write.
train_user_ids = np.asarray([row[0] for row in X_train], dtype=np.int64)
test_user_ids = np.asarray([row[0] for row in X_test], dtype=np.int64)
user_df['train_count'] = np.bincount(train_user_ids, minlength=len(user_df))
user_df['test_count'] = np.bincount(test_user_ids, minlength=len(user_df))

# Several figures need to be considered:
# - how many users appear in the training data, and how many of them also appear in the test data?
# - how many users appear in the test data, and how many of them also appear in the training data?
user_in_train = user_df['train_count'] > 0
user_in_test = user_df['test_count'] > 0
train_user_count = int(user_in_train.sum())
test_user_count = int(user_in_test.sum())
co_user_count = int((user_in_train & user_in_test).sum())

# Plain array form of user_df; later cells index it positionally
# (train_count is column 2, test_count is column 3).
user_arr = user_df.values

In [27]:
train_user_count, test_user_count, co_user_count

(23943, 3308, 1990)

In [28]:
# Per-item occurrence counts in the training and test sets.
# Work on a copy so the id-mapping frame loaded earlier is not mutated.
item_df = itemid_df.copy()

# Column 2 of every X_train / X_test row is an array of model-based item ids
# (row positions in item_df).  Flatten all of them and count with bincount
# instead of incrementing cells one by one through chained assignment
# (`item_df['train_count'][itemid] = ...`), which is extremely slow, triggers
# SettingWithCopyWarning and silently does nothing under pandas Copy-on-Write.
train_item_seqs = [np.asarray(row[2], dtype=np.int64) for row in X_train]
test_item_seqs = [np.asarray(row[2], dtype=np.int64) for row in X_test]
# np.concatenate rejects an empty list, so fall back to an empty id array.
train_item_ids = (np.concatenate(train_item_seqs) if train_item_seqs
                  else np.empty(0, dtype=np.int64))
test_item_ids = (np.concatenate(test_item_seqs) if test_item_seqs
                 else np.empty(0, dtype=np.int64))
item_df['train_count'] = np.bincount(train_item_ids, minlength=len(item_df))
item_df['test_count'] = np.bincount(test_item_ids, minlength=len(item_df))

# Number of items seen in training, in test, and in both.
item_in_train = item_df['train_count'] > 0
item_in_test = item_df['test_count'] > 0
train_item_count = int(item_in_train.sum())
test_item_count = int(item_in_test.sum())
co_item_count = int((item_in_train & item_in_test).sum())

# Plain array form of item_df; later cells index it positionally
# (train_count is column 2, test_count is column 3).
item_arr = item_df.values

train_item_count, test_item_count, co_item_count

(193077, 24344, 12597)

In [30]:
user_arr[:5]

array([[    0,     0,     0,     0],
       [    1, 10002,     3,     0],
       [    2, 10051,     2,     0],
       [    3, 10108,    30,     2],
       [    4, 10116,     1,     0]])

In [37]:
# Select the active users of the test data: users seen at least
# `min_train_occurrences` times in training and at least once in test.
# user_arr is indexed positionally: column 2 holds the training count and
# column 3 the test count; columns 0 and 1 become 'user_id' and 'shopee_id'.
min_train_occurrences = 10
is_active = (user_arr[:, 2] >= min_train_occurrences) & (user_arr[:, 3] > 0)
# A boolean mask keeps the result 2-D (shape (0, 4)) even when no user
# qualifies; building the array from an empty Python list would give a 1-D
# array and make the four-column labelling below fail.
active_users = user_arr[is_active]
au_df = pd.DataFrame(
    active_users,
    columns=['user_id', 'shopee_id', 'train_occur', 'test_occur'],
)

In [38]:
au_df

Unnamed: 0,user_id,shopee_id,train_occur,test_occur
0,3,10108,30,2
1,12,10287,12,4
2,89,19666,15,21
3,128,28555,13,1
4,131,29258,31,6
5,156,45584,93,1
6,212,70119,40,1
7,279,84959,34,2
8,305,93640,12,1
9,315,100204,39,1


In [39]:
np.mean(au_df['train_occur'])

48.76624857468643

## Drop inactive users in the test data

In [41]:
X_test

array([[3810, 20, array([206023, 234726, 160147, 208956]), 5, 151959],
       [11914, 200, array([118534, 175612, 187618,  60579]), 5, 231929],
       [3929, 16, array([152449, 249690, 241665, 123236]), 5, 144114],
       ...,
       [9746, 14,
        array([ 92156, 185133,  57196, 253033,   3870,  65979, 206379, 273302,
       265714, 230003, 206352, 178775, 124815, 113209,  81542,  72602,
        67898,  44884,  30868]),
        20, 26604],
       [8850, 168,
        array([249708, 153817, 166293, 126337, 183178, 135054,  47040,  47145,
       102815, 177163,  50361,   8148, 190085, 183497, 213009, 136311,
       266601, 173964, 173914]),
        20, 276027],
       [742, 167,
        array([106803, 246106, 261014, 195521, 216143, 184504, 184536, 147013,
       197592, 253330, 238733, 235892, 246510,  75673,  86210, 110127,
       124613,  31687,   3653]),
        20, 9154]], dtype=object)

In [42]:
# Model-based ids of the active users as a plain NumPy array (a copy of the
# 'user_id' column), used to filter the test rows.
au_ids = np.array(au_df['user_id'])

In [45]:
# Keep only the test rows whose user (column 0) is an active user.
# np.isin evaluates the membership test for all rows at once instead of
# scanning `au_ids` linearly for every row, and boolean masking keeps the
# (n_rows, 5) object-array layout even when no row survives the filter.
test_row_user_ids = np.asarray([row[0] for row in X_test], dtype=np.int64)
keep_row = np.isin(test_row_user_ids, au_ids)
new_X_test = X_test[keep_row]

In [46]:
new_X_test.shape

(7379, 5)

In [47]:
# Output folder for the filtered test set.  exist_ok=True makes the call
# idempotent without a separate exists() check, which avoids the
# check-then-create race and still raises if the path exists but is not a
# directory.
save_folder = source_folder + '/test data for active users'
os.makedirs(save_folder, exist_ok=True)

In [48]:
# Persist the filtered test rows next to the source data.
# NOTE(review): the rows come from an object-dtype array, so the file is
# presumably pickled and needs np.load(..., allow_pickle=True) to read back
# on recent NumPy versions -- confirm.
np.save(save_folder + '/X_test.npy', new_X_test)

## Study the popularity of the items

In [49]:
item_df

Unnamed: 0.1,Unnamed: 0,itemid,train_count,test_count
0,0,0,296740,24733
1,1,100061,1,0
2,2,100223,1,0
3,3,100275,0,0
4,4,100278,2,0
5,5,100279,6,0
6,6,100281,5,0
7,7,100282,9,0
8,8,100544,1,0
9,9,101043,1,0


In [52]:
# Re-take the array form of item_df; the filter below reads the training count
# from column 2 and the test count from column 3.
item_arr = item_df.values

In [53]:
np.mean(np.array(item_df['train_count']))

5.662536261323868

In [54]:
# Popular items: seen at least `min_train_count` times in training and at
# least once in test (training count is column 2, test count is column 3).
# Row 0 is deliberately skipped -- presumably item id 0 is a placeholder
# entry, since its counts dwarf every other row; TODO confirm.
min_train_count = 5
candidate_items = item_arr[1:]
is_top_item = (candidate_items[:, 2] >= min_train_count) & (candidate_items[:, 3] > 0)
# Boolean masking keeps the array 2-D even when no item qualifies, whereas
# np.array([]) built from an empty list would be 1-D.
top_items = candidate_items[is_top_item]

In [55]:
top_items.shape

(7625, 4)

In [57]:
# Wrap the popular-item rows in a DataFrame; the columns keep pandas' default
# integer labels and follow the column order of item_arr.
top_i_df = pd.DataFrame(top_items)

array([[        31,     104576,         26,          1],
       [        39,     105537,         12,          1],
       [        61,     107789,         22,          8],
       ...,
       [    280009, 1325467811,          7,          1],
       [    280014, 1325482337,          8,          1],
       [    280062, 1325835291,          6,          1]])