## Imports

In [26]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join('..')))


import pandas as pd
import numpy as np

import papermill as pm

from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.utils.timer import Timer
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED
from recommenders.models.cornac.cornac_utils import predict_ranking
import cornac

## Constants

In [2]:
# --- Input locations -------------------------------------------------------
# All raw CSVs live under RAW_PATH, which is relative to the notebook directory.
RAW_PATH = '../data/raw'
TRANSACTION_PATH, CUSTOMER_PATH, ARTICLE_PATH = (
    os.path.join(RAW_PATH, file_name)
    for file_name in ('transactions_train.csv', 'customers.csv', 'articles.csv')
)

# --- Sampling --------------------------------------------------------------
# Number of transaction rows drawn from the full table before modelling.
SAMPLE_ROW = 500_000

# --- Recommendation --------------------------------------------------------
# Number of top items to recommend per user.
TOP_K = 10

# --- BPR hyper-parameters --------------------------------------------------
NUM_FACTORS = 200  # latent dimension passed to the model as `k`
NUM_EPOCHS = 100   # passed to the model as `max_iter`

## Load data

In [3]:
# Read the full transaction log into memory; the row count is printed as a sanity check.
# NOTE(review): with default dtypes `article_id` is parsed as an integer (see the preview) —
# confirm this matches the key format used in articles.csv before joining on it.
transaction_data = pd.read_csv(filepath_or_buffer=TRANSACTION_PATH)

n_transactions = len(transaction_data)
print(n_transactions)

transaction_data.head()

31788324


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [40]:
# Draw a fixed-size random subset of the transactions to keep the experiment fast.
# `random_state=SEED` makes the draw repeatable, so every downstream count and metric
# is reproducible across kernel restarts. The `min(...)` guard keeps the cell usable
# when the table holds fewer than SAMPLE_ROW rows (sampling without replacement
# would otherwise raise).
data = transaction_data.sample(
    n=min(SAMPLE_ROW, len(transaction_data)),
    random_state=SEED,
)

# `t_dat` holds ISO 'YYYY-MM-DD' strings, whose lexicographic order is already
# chronological, so no datetime conversion is needed for sorting.
data = data.sort_values(by='t_dat')

print(len(data))

data.head()

500000


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
15337,2018-09-20,50fe3d12b84ff8678e16cb3b72c530ee78386f3584d431...,635741002,0.010153,2
28836,2018-09-20,97021824157849206c3dddb417bfefc87eeb06ddb02103...,658304001,0.050831,1
173,2018-09-20,00da14cc2ed13b9df1425be332f190877d6607aa7c79f7...,684588008,0.025407,2
40388,2018-09-20,d5ac9e386a2d10eda52086889caebe02d61fba296b25be...,657203001,0.06778,2
21783,2018-09-20,7218650b2784b9d54ba22c5a3b939b3b6c1bf867d18941...,630391002,0.018627,2


In [41]:
# Reduce the frame to the three columns consumed downstream and give them the
# userID / itemID / rating names used by the splitter and the evaluation helpers.
# NOTE(review): `sales_channel_id` is carried as the 'rating' column — presumably only
# a placeholder so the frame has a (user, item, rating) layout; confirm that is intended.
column_map = {
    'customer_id': 'userID',
    'article_id': 'itemID',
    'sales_channel_id': 'rating',
}
data = data[list(column_map)].rename(columns=column_map)

### Data preprocessing

In [42]:
# Keep only interactions whose item AND user each occur more than 5 times in the sample.
# Both frequency tables are computed on the frame as it is before filtering, so after the
# joint filter an individual user or item may be left with 5 or fewer rows.
item_counts = data['itemID'].value_counts()
user_counts = data['userID'].value_counts()

frequent_items = list(item_counts[item_counts > 5].index)
frequent_users = list(user_counts[user_counts > 5].index)

item_is_frequent = data['itemID'].isin(frequent_items)
user_is_frequent = data['userID'].isin(frequent_users)
data = data[item_is_frequent & user_is_frequent]

print(len(data))

data.head()

27314


Unnamed: 0,userID,itemID,rating
32295,a8d14751a68b4cab69fed60b169c03c5d62f1c8b73fb1c...,532578009,2
27303,8ecbac3466886ba06b611fc52dd86762a661f57a7ccd2a...,622240003,2
14479,4cccf0bb281678f22d83dfbdb0e350c3b6640d68bc33d9...,682285001,2
34473,b63620c6c9efc92571226ab827015266fa8f3ed2814b40...,662980001,2
41564,dba8abd2f743f8b8b35bd5fc25a31f8694bcda228b26fc...,677341001,2


In [38]:
# Positional 70/30 hold-out: the first 70% of rows become train, the remainder test.
# `data` was sorted by `t_dat` upstream, so this presumably acts as a chronological split.
# NOTE(review): the following cell reassigns `train`/`test` with python_stratified_split,
# so the result of this cell is superseded — confirm whether it should be kept.
len_data = len(data)
split_at = int(len_data * 0.7)

train = data[:split_at]
holdout = data[split_at:]

# Drop held-out rows whose user or item never appears in the training part.
user_seen_in_train = holdout['userID'].isin(train['userID'].values)
item_seen_in_train = holdout['itemID'].isin(train['itemID'].values)
test = holdout[user_seen_in_train & item_seen_in_train]

print(f'train len: {len(train)}')
print(f'test len: {len(test)}')

train.head()

train len: 19165
test len: 1930


Unnamed: 0,userID,itemID,sales_channel_id
14380,4c5a78ba45694fe8a8978af2c9c66645e2069e697b543c...,399201005,2
45224,ef8404cf52fb277cfb388f8d35dcd2d4c88c255e72bc9f...,633109001,2
34583,b6c36415205d0a2ad1e9943fe4707737b5d3e8b833ad5d...,679494004,2
9961,363c43c6b1160f5a4b2371532197116875ea5d37b50ca8...,632813005,2
17464,5b9162256d6e071146a9ce951991e707272f9f97ffe0e4...,589514001,2


In [43]:
# Per-user stratified split: 75% of each user's interactions go to train, the rest to test.
# The shared SEED constant replaces the hard-coded literal so that every stochastic step
# in the notebook (dataset build, model, split) is driven by the same seed.
train, test = python_stratified_split(
    data,
    ratio=0.75,
    col_user='userID',
    col_item='itemID',
    seed=SEED,
)
train.head()

Unnamed: 0,userID,itemID,rating
27530034,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,855893001,1
5874856,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,713428003,2
23663277,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,759814011,2
2516092,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,505230004,2
30990055,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,777148006,1


In [45]:
# Convert the training frame into cornac's dataset structure.
# Rows are streamed as plain (userID, itemID, rating) tuples, without the DataFrame index.
uir_triples = train.itertuples(index=False)
train_set = cornac.data.Dataset.from_uir(uir_triples, seed=SEED)

print(
    'Number of users: {}'.format(train_set.num_users),
    'Number of items: {}'.format(train_set.num_items),
    sep='\n',
)

Number of users: 4612
Number of items: 11641


## BPR (Bayesian Personalized Ranking)

In [46]:
# Hyper-parameters for the BPR model; the latent size and epoch count come from the
# constants cell, the optimisation settings are fixed here.
bpr_params = dict(
    k=NUM_FACTORS,
    max_iter=NUM_EPOCHS,
    learning_rate=0.01,
    lambda_reg=0.001,
    verbose=True,
    seed=SEED,
)
bpr = cornac.models.BPR(**bpr_params)

In [47]:
# Train the model on the cornac dataset and report the wall-clock time taken.
with Timer() as t:
    bpr.fit(train_set)

training_message = "Took {} seconds for training."
print(training_message.format(t))

  0%|          | 0/100 [00:00<?, ?it/s]

Optimization finished!
Took 0.9063 seconds for training.


In [48]:
# Score items for the users in `train`; `remove_seen=True` asks the helper to leave out
# user/item pairs that already occur in the training data.
with Timer() as t:
    all_predictions = predict_ranking(
        bpr,
        train,
        usercol='userID',
        itemcol='itemID',
        remove_seen=True,
    )

prediction_message = "Took {} seconds for prediction."
print(prediction_message.format(t))

Took 34.0803 seconds for prediction.


In [49]:
# Preview the first rows of the scored user/item pairs.
all_predictions.head(n=5)

Unnamed: 0,userID,itemID,prediction
20035,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,753802007,-0.184871
20036,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,895804005,0.153108
20037,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,720125040,0.218479
20038,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,691177005,0.678511
20039,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,648627002,-0.231011


In [50]:
# Ranking metrics of the predictions against the held-out test interactions.
# The cutoff comes from the TOP_K constant in the constants cell instead of a second
# hard-coded literal, so changing TOP_K there actually changes the evaluation.
k = TOP_K

# Arguments shared by all four metric calls.
metric_kwargs = dict(col_prediction='prediction', k=k)

eval_map = map_at_k(test, all_predictions, **metric_kwargs)
eval_ndcg = ndcg_at_k(test, all_predictions, **metric_kwargs)
eval_precision = precision_at_k(test, all_predictions, **metric_kwargs)
eval_recall = recall_at_k(test, all_predictions, **metric_kwargs)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.001895
NDCG:	0.003081
Precision@K:	0.000861
Recall@K:	0.004931
