# Import Packages

In [65]:
import os.path
import time
import pandas as pd
import numpy as np

# Parameters

In [66]:
# name of model. Used for saving conventions
name = 'recsys' # 'imusic'

# set sise of data (number of samples). If None (suggested), full datasets are applied.
limit = None

# how often would you like to check results?
show_every_n_batches = 3000

# decide on wether to show full validation statistics. Computational time is high when this is True
full_validation_stats = False

# decide whether to log testing
log_testing = True

# top k products to determine accuracy
top_k = 20

notes = 'Final GRU Model'

# Hyperparameters

In [67]:
# 512 - Number of sequences running through the network in one pass.
batch_size = 512

# 50 - Embedding dimensions
embed_dim = 300

# The dropout drop probability when training on input. If you're network is overfitting, try decreasing this.
x_drop_probability = 0.00

# The dropout keep probability when training on RNN neurons. If you're network is overfitting, try decreasing this.
rnn_keep_probability = 1.00

# 100 - The number of units in the hidden layers.
rnn_size = 200

# 1
num_layers = 1

# Learning rate for training
# typically 0.0001 up to 1: http://datascience.stackexchange.com/questions/410/choosing-a-learning-rate
# best learning_rate = 0.0025
learning_rate = 0.0025

# 10 epochs
num_epochs = 50

# Create model folder for hyperparameters, statistics and the model itself

## Update and get model_counter

In [68]:
model_counter_path = '../models/model_counter.txt'
# os.path.isfile() method in Python is used to check whether the specified path is an existing regular file or not.
if os.path.isfile(model_counter_path):
    # The open() function returns a file object, which has a read() method for reading the content of the file
    # Read Only (‘r’)
    model_counter_file = open(model_counter_path, 'r') 
    model_count = int(model_counter_file.read())
    model_counter_file.close()
    # Write Only (‘w’)
    model_counter_file = open(model_counter_path, 'w')
    model_counter_file.write(str(model_count + 1))
    model_counter_file.close()
else:
    # Write and Read (‘w+’)
    model_counter_file = open(model_counter_path, 'w+')
    model_count = 1000 # initial model count/number
    model_counter_file.write(str(model_count + 1))
    model_counter_file.close()

## Make model directory

In [69]:
model_path_dir = '../models/model_count/' + str(model_count) + '-' + name + '-' + time.strftime("%y%m%d") + '/'
if not os.path.exists(model_path_dir):
    os.makedirs(model_path_dir)

## Update stats_file

In [50]:
stats_file_path = model_path_dir + name + '-' + time.strftime("%y%m%d%H%M") + '-statsfile' + '.txt'
stats_file = open(stats_file_path, 'w+')
stats_file.write('model number: {}\n'.format(model_count))
stats_file.write('name: {}\n\n'.format(name))
stats_file.write('limit: {}\n'.format(limit))
stats_file.write('batch_size: {}\n'.format(batch_size))
stats_file.write('embed_dim: {}\n'.format(embed_dim))
stats_file.write('x_drop_probability: {}\n'.format(x_drop_probability))
stats_file.write('rnn_keep_probability: {}\n'.format(rnn_keep_probability))
stats_file.write('rnn_size: {}\n'.format(rnn_size))
stats_file.write('num_layers: {}\n'.format(num_layers))
stats_file.write('learning_rate: {}\n'.format(learning_rate))
stats_file.write('num_epochs: {}\n'.format(num_epochs))
stats_file.write('show_every_n_batches: {}\n'.format(show_every_n_batches))
stats_file.write('top_k: {}\n'.format(top_k))
stats_file.write('full_validation_stats: {}\n'.format(full_validation_stats))
stats_file.write('notes: {}\n'.format(notes))
stats_file.close()

# Load Data

In [51]:
def load_our_data(path, limit):
    return pd.read_csv(path, nrows = limit, sep="\t")

In [52]:
if limit == None:
    validation_limit = None
    testing_limit = None
else:
    validation_limit = int(0.2 * limit)
    testing_limit = int(0.2 * limit)

prepared_data_path = "../data/rsc15/prepared/"

tr_data = load_our_data(path=f"{prepared_data_path}yoochoose-clicks-100k_train_full.txt", limit=limit)
va_data = load_our_data(path=f"{prepared_data_path}yoochoose-clicks-100k_train_valid.txt", limit=validation_limit)
te_data = load_our_data(path=f"{prepared_data_path}yoochoose-clicks-100k_test.txt", limit=testing_limit)

# Data Preprocessing

## Get unique items

In [53]:
# get number of unique products
print('uniques in training  ', np.unique(tr_data['ItemId']).shape[0])
print('uniques in validation', np.unique(va_data['ItemId']).shape[0])
print('uniques in testing   ', np.unique(te_data['ItemId']).shape[0])

# unique item_ids
uniques = np.unique(np.append(np.append(tr_data['ItemId'], va_data['ItemId']), te_data['ItemId']))
depth = uniques.shape[0]
print('\ndepth (unique items) ', depth)
if depth != np.unique(tr_data['ItemId']).shape[0]:
    print('\nWARNING! Number of uniques in training should equal the depth (uniques in full set)')

uniques in training   2933
uniques in validation 2029
uniques in testing    1771

depth (unique items)  2933


## Creating a lookup table

In [54]:
def create_lookup_tables(item_ids):    
    
    items_to_int = pd.Series(data=np.arange(len(item_ids)),index=item_ids)
    int_to_items = pd.DataFrame({"ItemId":item_ids,'item_idx':items_to_int[item_ids].values})
    
    return items_to_int, int_to_items

In [55]:
items_to_int, int_to_items = create_lookup_tables(list(uniques))

## Transforming and splitting the data

In each session, the number of events
is also referred to as the number of timesteps in a session. Sessions with a single
timestep (one event) are dropped as it is not possible to train a model on inputs
with no targets. 

The remaining sessions will be split into input (**X**) and target values (**y**).

## Session Length

The span of session lengths can be a problem for recurrent neural networks as they require fixed sized inputs.
The main proportion of the RecSys and AVM sessions spans **19** or fewer events.

Sessions with more than 19 timesteps are split into multiple sessions and act as separate independent sessions. Loss of information by splitting long sessions is substantially low and the advantages of a much higher computation speed is valuable.

In [56]:
# 19 - Number of timesteps the rnn should take in
timesteps = 19

### Padding

A method called zero padding makes sessions the same length by adding zeros to missing timesteps in sessions shorter than n (Hearty, 2016). Padding is later reversed by masking, which ensures the added zeros of padding have no effect on model performance.

In [57]:
session_key='SessionId'
item_key='ItemId'
time_key='Time'

tr_data.sort_values([session_key, time_key], inplace=True)
print(tr_data)

       SessionId     ItemId          Time
0              3  214716935  1.396437e+09
1              3  214832672  1.396438e+09
2              6  214701242  1.396796e+09
3              6  214826623  1.396797e+09
4              7  214826835  1.396414e+09
...          ...        ...           ...
68914      32764  214717567  1.396431e+09
68915      32764  214717567  1.396431e+09
68911      32766  214585554  1.396711e+09
68912      32766  214585554  1.396711e+09
68913      32766  214819762  1.396711e+09

[70278 rows x 3 columns]


In [58]:
test= tr_data.groupby(session_key).size().cumsum() 
test1= tr_data.groupby(session_key)

print(test1.size())

SessionId
3        2
6        2
7        2
8        2
9        3
        ..
32759    4
32762    2
32763    6
32764    2
32766    3
Length: 17794, dtype: int64


In [59]:
offsets = np.zeros(tr_data[session_key].nunique() + 1, dtype=np.int32)
# group & sort the df by session_key and get the offset values
offsets[1:] = tr_data.groupby(session_key).size().cumsum()

print(offsets[1:])
print(offsets)


[    2     4     6 ... 70273 70275 70278]
[    0     2     4 ... 70273 70275 70278]


In [60]:
t1 = tr_data.groupby('SessionId').size()
print(t1)



SessionId
3        2
6        2
7        2
8        2
9        3
        ..
32759    4
32762    2
32763    6
32764    2
32766    3
Length: 17794, dtype: int64


In [61]:
data = tr_data
session_lengths = data.groupby('SessionId').size()
data = data[np.in1d(data.SessionId, session_lengths[session_lengths>=3].index)]
print(tr_data)
print(data)

       SessionId     ItemId          Time
0              3  214716935  1.396437e+09
1              3  214832672  1.396438e+09
2              6  214701242  1.396796e+09
3              6  214826623  1.396797e+09
4              7  214826835  1.396414e+09
...          ...        ...           ...
68914      32764  214717567  1.396431e+09
68915      32764  214717567  1.396431e+09
68911      32766  214585554  1.396711e+09
68912      32766  214585554  1.396711e+09
68913      32766  214819762  1.396711e+09

[70278 rows x 3 columns]
       SessionId     ItemId          Time
8              9  214576500  1.396776e+09
9              9  214576500  1.396777e+09
10             9  214576500  1.396777e+09
11            11  214821275  1.396515e+09
12            11  214821275  1.396515e+09
...          ...        ...           ...
68920      32763  214552151  1.396611e+09
68921      32763  214552151  1.396611e+09
68911      32766  214585554  1.396711e+09
68912      32766  214585554  1.396711e+09
68913   

In [None]:
def delete_rare_clicked_items(data):
    
    #delete records of items which appeared less than 5 times
    itemLen = data.groupby('ItemID').size() #groupby itemID and get size of each item
    data = data[np.in1d(train.ItemID, itemLen[itemLen > 4].index)]
    
    return data

In [62]:
def drop_single_timestep_sessions(data):
    
    ''' Sessions with a single timestep (one event) are dropped 
    as it is not possible to train a model on inputs with no targets '''
    session_lengths = data.groupby('SessionId').size()
    data = data[np.in1d(data.SessionId, session_lengths[session_lengths>1].index)]
    
    return data 

In [63]:
def transform_and_split_our_data(data, timesteps):
    
    drop_single_timestep_sessions(data)
    
    # The remaining sessions will be split into input and target values.
    
    # After puting events with same session as a group, group length should be checked
    # if length < timesteps (19) padding should be used, adding some ziro ???
    # if length > timesteps, these sessions should be split into multiple sessions having 19 length
    # Loss of information by splitting long sessions is substantially low 
    # and the advantages of a much higher computation speed is valuable.a
    
    # Sort the df by time, and then by session ID. That is, df is sorted by session ID and
    # clicks within a session are next to each other, where the clicks within a session are time-ordered.
    data.sort_values([session_key, time_key], inplace=True)
    
    return X, y

In [64]:
# Transforming and splitting the data
X_tr, y_tr = transform_and_split_our_data(tr_data, timesteps)
X_va, y_va = transform_and_split_our_data(va_data, timesteps)
X_te, y_te = transform_and_split_our_data(te_data, timesteps)

NameError: name 'self' is not defined