In [1]:
import numpy as np
import pandas as pd
from DataProcessor import DataProcessor
from evaluation import evaluate
from lib import index

In [2]:
data_file_name = "lsapp.csv"

# Only the LSApp dataset is wired up in this notebook, so fail loudly for
# anything else.  An explicit exception is used rather than `assert False`
# because assertions are stripped under `python -O`, which would let the
# notebook carry on with `process_data` undefined.
if data_file_name != "lsapp.csv":
    raise ValueError(f"Unsupported data file: {data_file_name!r}")

# Reuse `data_file_name` so the file that is checked, loaded and printed is
# always the same one.
process_data = DataProcessor(
    data_file_name,
    column_names='userid,appid,timestamp',
    session_break_delta='15min',
)
process_data.prepare_data(
    usecols=['userid', 'appid', 'timestamp'],
    test_interval='14d',
    valid_interval='7d',
    min_sess_length=2,
    window="3s",
)
# Define global weighting matrix
Cui = process_data.get_freqs(process_data.train, c0=1, gamma=1)

# Define user-context matrix (converted from a dict to a Series):
Sus_dict = process_data.get_users_sui(process_data.train, level='sessid')
Sus_dict = pd.Series(Sus_dict)
print(data_file_name)


lsapp.csv


In [3]:
class ZeroScore:
    """Trivial baseline that gives every candidate item the same score."""

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Return one zero score per entry of `item_pool`.

        `uid`, `sid` and `sess_items` are accepted only to match the scorer
        interface shared by the other models; they are not used.

        The result is always a float array of length ``len(item_pool)``.
        `np.zeros_like(item_pool)` is deliberately avoided: it inherits the
        dtype of the pool, so a pool of string ids would yield empty strings
        instead of numeric scores.
        """
        return np.zeros(len(item_pool))

class RandomGuess:
    """Baseline that scores candidates with independent uniform random draws."""

    def __init__(self, seed=None):
        # A private generator keeps results reproducible for a given `seed`
        # without touching numpy's global random state.
        self.random_state = np.random.RandomState(seed)

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Draw one score in [0, 1) for every entry of `item_pool`.

        `uid`, `sid` and `sess_items` are ignored.  Each call advances the
        generator, so consecutive calls return different scores.
        """
        n_candidates = len(item_pool)
        return self.random_state.random_sample(n_candidates)

class MostRecentlyUsed:
    """Recency baseline: items seen later in the session get higher scores."""

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """
        Score each candidate by the 1-based position of its last occurrence
        in `sess_items`.

        `item_pool` may contain items that never occur in `sess_items`; those
        keep the initial score of 0.  Session items for which the `index`
        helper returns None (presumably: not present in `item_pool`) are
        skipped.  `uid` and `sid` are not used.
        """
        scores = np.zeros(len(item_pool))
        for rank, item in enumerate(sess_items, start=1):
            pos = index(item_pool, item)
            if pos is None:
                continue
            # A later occurrence of the same item overwrites the earlier rank.
            scores[pos] = rank
        return scores

class MostFrequentlyUsed:
    """Per-user popularity baseline: score = how often the user used the item."""

    def __init__(self, userid='userid', itemid='appid'):
        # Column names of the user and item identifiers in the training frame.
        self.userid = userid
        self.itemid = itemid
        # Series of counts indexed by (user, item); populated by `fit`.
        self.frequencies = None

    def fit(self, train):
        """Count, for every user, the occurrences of each item in `train`."""
        items_by_user = train.groupby(self.userid)[self.itemid]
        counts = items_by_user.value_counts(sort=False)
        self.frequencies = counts.sort_index()

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Return the training count of every `item_pool` entry for user `uid`.

        (user, item) pairs absent from the training counts score -1.
        `sid` and `sess_items` are not used.
        """
        candidates = pd.MultiIndex.from_product([[uid], item_pool])
        user_counts = self.frequencies.reindex(candidates, fill_value=-1)
        return user_counts.values

class MarkovianI2I:
    """Item-to-item model built from transition counts pooled over all groups.

    `fit` pairs every event with the preceding item of the same group
    (`group_key`) and counts how often each (previous item, next item) pair
    occurs.  Row order inside a group is taken as-is from `train`;
    presumably the frame is already in chronological order (TODO confirm
    against DataProcessor).
    """

    # Pseudo item standing for "no previous item".  `fit` uses it as the source
    # of the first event in every group.
    _START = -1

    def __init__(self, group_key='sessid_global', itemid='appid'):
        # Column that delimits the sequences within which transitions are counted.
        self.group_key = group_key
        # Column holding the item identifier.
        self.itemid = itemid
        # Series of counts indexed by (source item, destination item); set by `fit`.
        self.transitions = None

    def fit(self, train):
        """Count source -> destination item transitions in `train`."""
        source_items = (
            train
            .groupby(self.group_key)[self.itemid]
            .shift(fill_value=self._START)
        )
        dest_items = train[self.itemid]
        self.transitions = dest_items.groupby(source_items).value_counts(sort=False)

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Score `item_pool` by transition counts from the last session item.

        Transitions never observed in training score -1.  When `sess_items`
        is empty, the start-of-group pseudo item is used as the source
        instead of raising IndexError, so the scores are the counts of each
        candidate opening a group.  `uid` and `sid` are not used.
        """
        last_item = sess_items[-1] if len(sess_items) else self._START
        idx = pd.MultiIndex.from_product([[last_item], item_pool])
        return self.transitions.reindex(idx, fill_value=-1).values

class OnDeviceMarkovianI2I:
    """Item-to-item model whose transition counts are kept separately per user.

    Same construction as a pooled item-to-item model, except that each
    (previous item, next item) count is additionally keyed by the user who
    produced it, so a user is scored only from their own history.  Row order
    inside a group is taken as-is from `train`; presumably the frame is
    already in chronological order (TODO confirm against DataProcessor).
    """

    # Pseudo item standing for "no previous item".  `fit` uses it as the source
    # of the first event in every group.
    _START = -1

    def __init__(self, group_key='sessid_global', userid='userid', itemid='appid'):
        # Column that delimits the sequences within which transitions are counted.
        self.group_key = group_key
        # Columns holding the user and item identifiers.
        self.userid = userid
        self.itemid = itemid
        # Series of counts indexed by (user, source item, destination item).
        self.transitions = None

    def fit(self, train):
        """Count per-user source -> destination item transitions in `train`."""
        source_items = (
            train
            .groupby(self.group_key)[self.itemid]
            .shift(fill_value=self._START)
        )
        dest_items = train[self.itemid]
        self.transitions = (
            dest_items
            .groupby([train[self.userid], source_items])
            .value_counts(sort=False)
        )

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Score `item_pool` by user `uid`'s own transitions from the last item.

        Transitions the user never made in training score -1.  The lookup is
        a single reindex on (user, source, candidate) rather than
        `transitions.loc[uid]`, so a user absent from the training data gets
        all -1 scores instead of a KeyError.  When `sess_items` is empty, the
        start-of-group pseudo item is used as the source instead of raising
        IndexError.  `sid` is not used.
        """
        last_item = sess_items[-1] if len(sess_items) else self._START
        idx = pd.MultiIndex.from_product([[uid], [last_item], item_pool])
        return self.transitions.reindex(idx, fill_value=-1).values

In [4]:
# --- Baselines that need no fitting ---------------------------------------
zrs = ZeroScore()
rnd = RandomGuess(seed=42)
mru = MostRecentlyUsed()

# --- Models fitted on the training split ----------------------------------
# "ub" variants count transitions within a user's whole history, "sb" variants
# within a global session id; "od" marks the per-user (on-device) models.
mfu = MostFrequentlyUsed()
i2i_ub = MarkovianI2I(group_key='userid')
i2i_sb = MarkovianI2I(group_key='sessid_global')
i2i_od_ub = OnDeviceMarkovianI2I(group_key='userid')
i2i_od_sb = OnDeviceMarkovianI2I(group_key='sessid_global')

# Each fit only touches its own model, so the order below is irrelevant.
mfu.fit(process_data.train)
i2i_ub.fit(process_data.train)
i2i_sb.fit(process_data.train)
i2i_od_ub.fit(process_data.train)
i2i_od_sb.fit(process_data.train)

In [5]:
# Positional arguments passed unchanged (after the scoring function) to every
# `evaluate` call in the cells below.
data_args = (process_data.valid_sessions, process_data.seen_interactions)

In [6]:
# Evaluate both on-device (per-user) I2I variants on the validation sessions.
# Each `evaluate` result is unpacked into a metrics object and a stats object.
i2i_od_ub_metrics, i2i_od_ub_stats = evaluate(i2i_od_ub.generate_scores, *data_args)
i2i_od_sb_metrics, i2i_od_sb_stats = evaluate(i2i_od_sb.generate_scores, *data_args)

In [7]:
# Evaluate both pooled (all users together) I2I variants with the same arguments.
i2i_ub_metrics, i2i_ub_stats = evaluate(i2i_ub.generate_scores, *data_args)
i2i_sb_metrics, i2i_sb_stats = evaluate(i2i_sb.generate_scores, *data_args)

In [8]:
# Evaluate the baselines: most-frequently-used, most-recently-used, random
# guessing and constant zero scores.
mfu_metrics, mfu_stats = evaluate(mfu.generate_scores, *data_args)
mru_metrics, mru_stats = evaluate(mru.generate_scores, *data_args)
rnd_metrics, rnd_stats = evaluate(rnd.generate_scores, *data_args)
zrs_metrics, zrs_stats = evaluate(zrs.generate_scores, *data_args)

# I2I models

In [9]:
# On-device I2I (group_key='sessid_global'): metrics averaged within each `topk` level.
i2i_od_sb_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.579786,0.579786,0.579786
3,0.791206,0.674434,0.704487
5,0.875706,0.69436,0.739829


In [10]:
# On-device I2I (group_key='userid'): metrics averaged within each `topk` level.
i2i_od_ub_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.567783,0.567783,0.567783
3,0.79318,0.666956,0.699367
5,0.880679,0.687667,0.736031


In [11]:
# Pooled I2I (group_key='userid'): metrics averaged within each `topk` level.
i2i_ub_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.586045,0.586045,0.586045
3,0.793623,0.678397,0.708027
5,0.862847,0.694234,0.736553


In [12]:
# Pooled I2I (group_key='sessid_global'): metrics averaged within each `topk` level.
i2i_sb_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.586039,0.586039,0.586039
3,0.792124,0.678053,0.7074
5,0.861034,0.693864,0.735836


# Baselines: MFU, MRU, RND, and zero score

In [13]:
# Most-frequently-used baseline: metrics averaged within each `topk` level.
mfu_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.323508,0.323508,0.323508
3,0.666823,0.4777,0.526395
5,0.803243,0.509655,0.583263


In [14]:
# Most-recently-used baseline: metrics averaged within each `topk` level.
mru_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.57903,0.57903,0.57903
3,0.794683,0.681587,0.710952
5,0.831591,0.690457,0.726535


In [15]:
# Random-guess baseline (seed=42): metrics averaged within each `topk` level.
rnd_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.084865,0.084865,0.084865
3,0.226634,0.146181,0.166794
5,0.365081,0.178021,0.223991


In [16]:
# Zero-score baseline: metrics averaged within each `topk` level.
zrs_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,0.0,0.0
3,0.023438,0.011285,0.014446
5,0.0625,0.02105,0.03127


# LaTeX export

In [17]:
def to_latex(metrics):
    """Render a metrics table as a one-row LaTeX ``tabular``.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Numeric table whose index carries a level named ``topk``.  In this
        notebook the columns are the metric names (hr, mrr, ndcg).

    Returns
    -------
    str
        LaTeX source with every value formatted to three decimals and one
        column per (metric, topk) pair.
    """
    def three_decimals(value):
        return f"{value:.3f}"

    # Format column by column through `Series.map`.  `DataFrame.applymap` is
    # deprecated since pandas 2.1 (renamed to `DataFrame.map`), whereas
    # `Series.map` is available in every pandas version.
    formatted = metrics.apply(lambda column: column.map(three_decimals))
    return (
        formatted
        .unstack('topk')       # Series indexed by (metric, topk)
        .to_frame('values')
        .T                     # single row, (metric, topk) as columns
        .to_latex(index=False)
    )

In [18]:
# LaTeX table of the per-`topk` means for the on-device I2I model (group_key='sessid_global').
print(to_latex(i2i_od_sb_metrics.groupby(level='topk').mean()))

\begin{tabular}{lllllllll}
\toprule
   hr & \multicolumn{3}{l}{mrr} & \multicolumn{3}{l}{ndcg} \\
    1 &     3 &     5 &     1 &     3 &     5 &     1 &     3 &     5 \\
\midrule
0.580 & 0.791 & 0.876 & 0.580 & 0.674 & 0.694 & 0.580 & 0.704 & 0.740 \\
\bottomrule
\end{tabular}



In [19]:
# LaTeX table of the per-`topk` means for the on-device I2I model (group_key='userid').
print(to_latex(i2i_od_ub_metrics.groupby(level='topk').mean()))

\begin{tabular}{lllllllll}
\toprule
   hr & \multicolumn{3}{l}{mrr} & \multicolumn{3}{l}{ndcg} \\
    1 &     3 &     5 &     1 &     3 &     5 &     1 &     3 &     5 \\
\midrule
0.568 & 0.793 & 0.881 & 0.568 & 0.667 & 0.688 & 0.568 & 0.699 & 0.736 \\
\bottomrule
\end{tabular}



In [20]:
# LaTeX table of the per-`topk` means for the pooled I2I model (group_key='userid').
print(to_latex(i2i_ub_metrics.groupby(level='topk').mean()))

\begin{tabular}{lllllllll}
\toprule
   hr & \multicolumn{3}{l}{mrr} & \multicolumn{3}{l}{ndcg} \\
    1 &     3 &     5 &     1 &     3 &     5 &     1 &     3 &     5 \\
\midrule
0.586 & 0.794 & 0.863 & 0.586 & 0.678 & 0.694 & 0.586 & 0.708 & 0.737 \\
\bottomrule
\end{tabular}



In [21]:
# LaTeX table of the per-`topk` means for the pooled I2I model (group_key='sessid_global').
print(to_latex(i2i_sb_metrics.groupby(level='topk').mean()))

\begin{tabular}{lllllllll}
\toprule
   hr & \multicolumn{3}{l}{mrr} & \multicolumn{3}{l}{ndcg} \\
    1 &     3 &     5 &     1 &     3 &     5 &     1 &     3 &     5 \\
\midrule
0.586 & 0.792 & 0.861 & 0.586 & 0.678 & 0.694 & 0.586 & 0.707 & 0.736 \\
\bottomrule
\end{tabular}



In [22]:
# LaTeX table of the per-`topk` means for the most-frequently-used baseline.
print(to_latex(mfu_metrics.groupby(level='topk').mean()))

\begin{tabular}{lllllllll}
\toprule
   hr & \multicolumn{3}{l}{mrr} & \multicolumn{3}{l}{ndcg} \\
    1 &     3 &     5 &     1 &     3 &     5 &     1 &     3 &     5 \\
\midrule
0.324 & 0.667 & 0.803 & 0.324 & 0.478 & 0.510 & 0.324 & 0.526 & 0.583 \\
\bottomrule
\end{tabular}



In [23]:
# LaTeX table of the per-`topk` means for the most-recently-used baseline.
print(to_latex(mru_metrics.groupby(level='topk').mean()))

\begin{tabular}{lllllllll}
\toprule
   hr & \multicolumn{3}{l}{mrr} & \multicolumn{3}{l}{ndcg} \\
    1 &     3 &     5 &     1 &     3 &     5 &     1 &     3 &     5 \\
\midrule
0.579 & 0.795 & 0.832 & 0.579 & 0.682 & 0.690 & 0.579 & 0.711 & 0.727 \\
\bottomrule
\end{tabular}



In [24]:
# LaTeX table of the per-`topk` means for the random-guess baseline.
print(to_latex(rnd_metrics.groupby(level='topk').mean()))

\begin{tabular}{lllllllll}
\toprule
   hr & \multicolumn{3}{l}{mrr} & \multicolumn{3}{l}{ndcg} \\
    1 &     3 &     5 &     1 &     3 &     5 &     1 &     3 &     5 \\
\midrule
0.085 & 0.227 & 0.365 & 0.085 & 0.146 & 0.178 & 0.085 & 0.167 & 0.224 \\
\bottomrule
\end{tabular}

