In [10]:
import numpy as np
from easydict import EasyDict as edict
import os

In [11]:
# NOTE: absolute path from the original author's machine; adjust it for your environment.
X_test_path = "/data/workspace/yeqi/projects/RNN4REC/GRU4REC/Processed Data/min_len_5/X_test.npy"
# The saved array stores a Python list (the item path) in one of its columns, so it is an
# object array. NumPy >= 1.16.3 refuses to load object arrays unless allow_pickle=True.
# Unpickling can execute arbitrary code, so only load files you trust.
X_test = np.load(X_test_path, allow_pickle=True)

In [12]:
# Display the dimensions of the loaded test array as (rows, columns).
X_test.shape

(16755, 5)

In [13]:
# demo a new tester for batch by batch testing
import numpy as np
import pandas as pd

class TestDataGenerator:
    """Serve the test set batch by batch, one session length at a time.

    The array loaded from ``config.X_test_path`` holds one row per session with columns:
        [0] userid as integer
        [1] pathid as integer
        [2] path as a sequence of itemids (integers)
        [3] session_length as integer = length_input + length_output = length_input + 1
        [4] label (output itemid) as integer

    NOTE(review): the slicing table treats the rows as contiguous runs ordered by
    ascending session length. That ordering is assumed here, not verified -- confirm
    it against the preprocessing step that writes the file.
    """

    def __init__(self, config):
        """Load the test array and build its slicing table.

        Input:
            1. config: object exposing ``X_test_path`` (path of the ``.npy`` file)
        """
        self.config = config
        # The 'path' column stores Python lists, so the file is an object array and
        # NumPy >= 1.16.3 only loads it with allow_pickle=True. Unpickling can run
        # arbitrary code: only point this at trusted files.
        self.pro_arr = np.load(self.config.X_test_path, allow_pickle=True)
        self.slice_table = self.analyze_data()

    def test_data(self, batch_size):
        '''
        Walk through the whole test set once, in order, yielding consecutive batches.

        Every batch is taken from a single row of the slicing table, so all sessions
        in a batch share the same length and the item paths stack into a 2-D array.

        Input:
            1. batch_size as integer (must be >= 1)
            2. self.pro_arr as numpy array, with columns:
                [0] userid as integer
                [1] pathid as integer
                [2] path as numpy array containing itemids as integers
                [3] session_length as integer = length_input + length_output = length_input + 1
                [4] label (output itemid) as integer
        Output (yielded once per batch):
            1. next_batch_userids as numpy array, with size: [n, ]
            2. next_batch_itemids as numpy array, with size: [n, num_time_step]
            3. next_batch_y as numpy array, with size: [n, ]
            where n == batch_size, except for the last batch of each session length,
            which holds whatever rows remain (1 <= n <= batch_size).
        Raises:
            ValueError if batch_size < 1 (raised when the generator is first advanced)
        '''
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        bounds = zip(self.slice_table['start_index'], self.slice_table['end_index'])
        for start_index, end_index in bounds:
            start_index, end_index = int(start_index), int(end_index)
            # Sequential, non-overlapping windows: every test row is served exactly once.
            for batch_start in range(start_index, end_index, batch_size):
                batch_end = min(batch_start + batch_size, end_index)
                batch = self.pro_arr[batch_start:batch_end]

                next_batch_userids = batch[:, 0]
                # Equal-length paths stack into a [n, num_time_step] array.
                next_batch_itemids = np.array(batch[:, 2].tolist())
                next_batch_y = batch[:, -1]

                yield next_batch_userids, next_batch_itemids, next_batch_y

    def analyze_data(self):
        '''
        Build the table that maps each session length to its row range in pro_arr.

        Input:
            1. self.pro_arr as numpy array
        Output:
            1. the slicing table of pro_arr, a DataFrame with columns
               ['length', 'count', 'start_index', 'end_index'], one row per distinct
               session length in ascending order; rows of that length are taken to
               occupy pro_arr[start_index:end_index].
        '''
        df = pd.DataFrame(self.pro_arr)
        df.columns = ['userid', 'pathid', 'path', 'length', 'Y']

        print("Sample of training data: ")
        print(df.head())

        # Number of sessions per length; groupby sorts the lengths in ascending order.
        slice_table = df.groupby('length').size().reset_index(name='count')

        # Running totals give the exclusive end of each range; each range starts where
        # the previous one ended (0 for the first). Vectorised, so no chained-indexing
        # assignment and no failure on an empty table.
        end_index = slice_table['count'].cumsum()
        slice_table['start_index'] = end_index - slice_table['count']
        slice_table['end_index'] = end_index

        print("Here is the slicing table of the data: ")
        print(slice_table)

        return slice_table
        


In [14]:
def process_config(config_dict):
    """Wrap a plain settings dict in an EasyDict and attach the experiment output paths.

    Args:
        config_dict: mapping of settings; must contain ``exp_name``.

    Returns:
        The EasyDict built from ``config_dict`` with two extra entries,
        ``summary_dir`` and ``checkpoint_dir``, both located under
        ``../experiments/<exp_name>/``.
    """
    config = edict(config_dict)
    # Both output folders live under the same per-experiment directory.
    experiment_root = os.path.join("../experiments", config.exp_name)
    for attr_name, subdir in (("summary_dir", "summary/"), ("checkpoint_dir", "checkpoint/")):
        config[attr_name] = os.path.join(experiment_root, subdir)
    return config

In [15]:
# Raw experiment settings; process_config() wraps this dict and derives the output
# directories from it.
# NOTE(review): apart from "exp_name" and "X_test_path", none of these keys is read in
# the cells shown here -- their meaning is presumed from their names; confirm against
# the model / trainer code that consumes them.
config_dict = {
  # Used by process_config() as the sub-directory name under ../experiments.
  "exp_name": "GRU",
  "num_epochs": 100,
  "num_iter_per_epoch": 100,
  "learning_rate": 0.01,
  "batch_size": 256,

  "max_to_keep": 5,
  
  "num_steps": 2,
  "item_emb_size": 150,
  "user_emb_size": 150,
  "w2v_size": 1000,
  "num_item": 298000,
  "num_user": 25400,
  "num_layers": 1,
  "dropout": 0.0,
  "num_units_4emb": 300,
  

  # Absolute paths from the original author's machine; adjust for your environment.
  "X_train_path": "/data/workspace/yeqi/projects/RNN4REC/GRU4REC/Processed Data/min_len_5/X_train.npy",
  # Loaded by TestDataGenerator.__init__ via np.load.
  "X_test_path": "/data/workspace/yeqi/projects/RNN4REC/GRU4REC/Processed Data/min_len_5/X_test.npy"
  
}

In [16]:
# Convert the raw settings dict into an attribute-style config and attach the
# summary / checkpoint directory paths.
config = process_config(config_dict)

In [18]:
# Constructing the generator loads the array at config.X_test_path and prints a sample
# of the rows plus the per-length slicing table (shown in the output below).
tdg = TestDataGenerator(config)

Sample of training data: 
  userid pathid                              path length       Y
0  11005      1     [32814, 151616, 30251, 20452]      5   32046
1   7023     38   [164225, 85683, 123109, 146134]      5   98443
2   4288     43  [125822, 126082, 172605, 220459]      5  207481
3  23598     34   [69876, 102562, 100926, 149212]      5  175247
4  22643     49   [190805, 206507, 166357, 98085]      5   53280
Here is the slicing table of the data: 
   length  count  start_index  end_index
0       5   5580            0       5580
1       6   3244         5580       8824
2      10   5035         8824      13859
3      20   2294        13859      16153
4      50    534        16153      16687
5     100     68        16687      16755
