In [1]:
import numpy as np
import pandas as pd
from DataProcessor import DataProcessor
from evaluation import evaluate
from lib import index

In [2]:
# Interaction log to load (path is relative to the notebook's working directory).
data_file_name = "lsapp.csv"

# The three columns used throughout: user, app and event time.
interaction_columns = ['userid', 'appid', 'timestamp']

process_data = DataProcessor(
    data_file_name,
    column_names=','.join(interaction_columns),
    # presumably the idle gap that starts a new session — confirm in DataProcessor
    session_break_delta='15min',
)

# Build the train / validation / test views used by the cells below.
# NOTE(review): the exact meaning of each option is defined in
# DataProcessor.prepare_data; the values are passed through unchanged.
process_data.prepare_data(
    usecols=list(interaction_columns),
    test_interval='14d',
    valid_interval='7d',
    min_sess_length=2,
    window="3s",
)

In [3]:
class ZeroScore:
    """Trivial baseline: every candidate item receives the same score, zero."""

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Return an all-zero array with the shape and dtype of `item_pool`.

        `uid`, `sid` and `sess_items` are accepted only so that the signature
        matches the other scorers; they are not used.
        """
        zero_scores = np.zeros_like(item_pool)
        return zero_scores

class RandomGuess:
    """Baseline that scores every candidate item with a uniform random number."""

    def __init__(self, seed=None):
        # A private generator keeps the draws reproducible for a given seed
        # without touching NumPy's global random state.
        self.random_state = np.random.RandomState(seed)

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Draw one score from [0, 1) for each entry of `item_pool`.

        `uid`, `sid` and `sess_items` are ignored. Successive calls keep
        consuming the same random stream.
        """
        n_candidates = len(item_pool)
        return self.random_state.rand(n_candidates)

class MostRecentlyUsed:
    """Recency baseline: the later an item appears in the session, the higher its score."""

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """
        Score each entry of `item_pool` by its latest 1-based position in `sess_items`.

        `item_pool` may contain items that never occur in `sess_items`;
        those keep the initial score of 0, so they all tie below every
        item that was used in the session. An item occurring several times
        ends up with the position of its last occurrence.

        `uid` and `sid` are unused.
        """
        recency = np.zeros(len(item_pool))
        for position, item in enumerate(sess_items, start=1):
            pool_pos = index(item_pool, item)
            if pool_pos is None:
                # `index` (project helper) reported no position for this item.
                continue
            recency[pool_pos] = position
        return recency

class MostFrequentlyUsed:
    """Per-user popularity baseline: score = how often the user used the item in training."""

    def __init__(self, userid='userid', itemid='appid'):
        self.userid = userid      # name of the user column in the training frame
        self.itemid = itemid      # name of the item column in the training frame
        self.frequencies = None   # filled by `fit`: Series indexed by (user, item)

    def fit(self, train):
        """Count, for every user, how many rows of `train` refer to each item."""
        items_by_user = train.groupby(self.userid)[self.itemid]
        counts = items_by_user.value_counts(sort=False)
        self.frequencies = counts.sort_index()

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Return the training counts of user `uid` for each item in `item_pool`.

        (user, item) pairs that never occurred in training score -1.
        `sid` and `sess_items` are unused.
        """
        wanted = pd.MultiIndex.from_product([[uid], item_pool])
        user_counts = self.frequencies.reindex(wanted, fill_value=-1)
        return user_counts.values

class MarkovianI2I:
    """First-order item-to-item transition model with counts pooled over all users."""

    def __init__(self, group_key='sessid_global', itemid='appid'):
        self.group_key = group_key   # column delimiting the sequences transitions are counted in
        self.itemid = itemid         # name of the item column
        self.transitions = None      # filled by `fit`: Series indexed by (source item, next item)

    def fit(self, train):
        """Count how often each item follows each other item within a group.

        The predecessor of the first row of every group is encoded as -1.
        Rows are taken in the order they appear in `train`.
        """
        next_items = train[self.itemid]
        grouped_items = train.groupby(self.group_key)[self.itemid]
        previous_items = grouped_items.shift(fill_value=-1)
        self.transitions = next_items.groupby(previous_items).value_counts(sort=False)

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Score `item_pool` by the count of transitions from the last session item.

        Transitions never observed in training score -1. `uid` and `sid`
        are unused.
        """
        last_item = sess_items[-1]
        wanted = pd.MultiIndex.from_product([[last_item], item_pool])
        return self.transitions.reindex(wanted, fill_value=-1).values

class OnDeviceMarkovianI2I:
    """First-order item-to-item transition model with separate counts per user."""

    def __init__(self, group_key='sessid_global', userid='userid', itemid='appid'):
        self.group_key = group_key   # column delimiting the sequences transitions are counted in
        self.userid = userid         # name of the user column
        self.itemid = itemid         # name of the item column
        self.transitions = None      # filled by `fit`: Series indexed by (user, source item, next item)

    def fit(self, train):
        """Count, per user, how often each item follows each other item within a group.

        The predecessor of the first row of every group is encoded as -1.
        Rows are taken in the order they appear in `train`.
        """
        source_items = train.groupby(self.group_key)[self.itemid].shift(fill_value=-1)
        dest_items = train[self.itemid]
        self.transitions = dest_items.groupby([train[self.userid], source_items]).value_counts(sort=False)

    def generate_scores(self, uid, sid, sess_items, item_pool):
        """Score `item_pool` by user `uid`'s transition counts from the last session item.

        Transitions never observed for this user score -1. A user that did
        not occur in the training data gets -1 for every item instead of
        raising `KeyError`, matching how the other count-based models treat
        unseen keys. `sid` is unused.
        """
        # Look up the full (user, source, destination) key in one reindex so
        # that a missing user is handled the same way as a missing transition.
        idx = pd.MultiIndex.from_product([[uid], [sess_items[-1]], item_pool])
        scores = self.transitions.reindex(idx, fill_value=-1).values
        return scores

In [4]:
# Baselines that need no training.
zrs = ZeroScore()
rnd = RandomGuess(seed=42)
mru = MostRecentlyUsed()

# Count-based models. For the Markov variants the suffix encodes the grouping
# used when counting transitions: `_ub` groups by user, `_sb` by session;
# `_od` marks the variants that keep separate counts per user.
mfu = MostFrequentlyUsed()
i2i_ub = MarkovianI2I(group_key='userid')
i2i_sb = MarkovianI2I(group_key='sessid_global')
i2i_od_ub = OnDeviceMarkovianI2I(group_key='userid')
i2i_od_sb = OnDeviceMarkovianI2I(group_key='sessid_global')

# Fit every trainable model on the training split.
for trainable in (mfu, i2i_ub, i2i_sb, i2i_od_ub, i2i_od_sb):
    trainable.fit(process_data.train)

In [5]:
# Positional arguments shared by the test-set `evaluate` calls:
# the test sessions and the processor's `seen_interactions`.
data_args = (process_data.test_sessions, process_data.seen_interactions)

In [6]:
# Evaluate the per-user Markov models (user-grouped and session-grouped) on the test sessions.
# Each `evaluate` call returns a (metrics, stats) pair.
i2i_od_ub_metrics, i2i_od_ub_stats = evaluate(i2i_od_ub.generate_scores, *data_args)
i2i_od_sb_metrics, i2i_od_sb_stats = evaluate(i2i_od_sb.generate_scores, *data_args)

In [7]:
# Evaluate the Markov models whose transition counts are pooled over all users.
i2i_ub_metrics, i2i_ub_stats = evaluate(i2i_ub.generate_scores, *data_args)
i2i_sb_metrics, i2i_sb_stats = evaluate(i2i_sb.generate_scores, *data_args)

In [8]:
# Evaluate the simple baselines: most frequently used, most recently used,
# random scores and constant zero scores.
mfu_metrics, mfu_stats = evaluate(mfu.generate_scores, *data_args)
mru_metrics, mru_stats = evaluate(mru.generate_scores, *data_args)
rnd_metrics, rnd_stats = evaluate(rnd.generate_scores, *data_args)
zrs_metrics, zrs_stats = evaluate(zrs.generate_scores, *data_args)

# Item-to-item (I2I) Markov models

In [9]:
# Per-user Markov model, session-grouped transitions: mean of each metric per top-k cutoff.
i2i_od_sb_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.584861,0.584861,0.584861
3,0.787535,0.676744,0.705307
5,0.857406,0.692375,0.733789


In [10]:
# Per-user Markov model, user-grouped transitions: mean of each metric per top-k cutoff.
i2i_od_ub_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.588859,0.588859,0.588859
3,0.79669,0.680627,0.710443
5,0.872612,0.697493,0.741288


In [11]:
# Pooled Markov model, user-grouped transitions: mean of each metric per top-k cutoff.
i2i_ub_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.611228,0.611228,0.611228
3,0.796811,0.694881,0.721138
5,0.863654,0.710061,0.748585


In [12]:
# Pooled Markov model, session-grouped transitions: mean of each metric per top-k cutoff.
i2i_sb_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.611206,0.611206,0.611206
3,0.796208,0.694938,0.72104
5,0.860992,0.709542,0.747546


# Baselines: most frequently used (MFU), most recently used (MRU), random (RND), and zero score (ZRS)

In [13]:
# Most-frequently-used baseline: mean of each metric per top-k cutoff.
mfu_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.387227,0.387227,0.387227
3,0.638604,0.495672,0.532282
5,0.763393,0.525104,0.584479


In [14]:
# Most-recently-used baseline: mean of each metric per top-k cutoff.
mru_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.618601,0.618601,0.618601
3,0.823762,0.716533,0.744392
5,0.848818,0.722165,0.754628


In [15]:
# Random-score baseline: mean of each metric per top-k cutoff.
rnd_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.078964,0.078964,0.078964
3,0.238191,0.147947,0.171074
5,0.387557,0.181821,0.232363


In [16]:
# Constant-zero baseline: mean of each metric per top-k cutoff.
zrs_metrics.groupby(level='topk').mean()

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.0,0.0,0.0
3,0.035714,0.017857,0.022533
5,0.066964,0.024107,0.034622


## PureSVD:

In [17]:
from scipy.sparse.linalg import svds
from itertools import product

def build_svd_model(rank, c0, gamma, data_processor):
    """Compute PureSVD item factors from the training interactions.

    Parameters
    ----------
    rank : int
        Number of singular triplets requested from `svds`.
    c0, gamma
        Forwarded unchanged to `data_processor.get_freqs`; their meaning is
        defined there.
    data_processor
        Object exposing `train` and `get_freqs(train, c0=..., gamma=...)`.
        NOTE(review): `get_freqs` presumably returns a sparse user-by-item
        matrix — confirm against DataProcessor.

    Returns
    -------
    tuple
        `((None, None), (item_factors, None))`, where `item_factors` is a
        C-contiguous array with one row per matrix column and one column per
        singular component, ordered by decreasing singular value.
    """
    Cui = data_processor.get_freqs(data_processor.train, c0=c0, gamma=gamma)
    _, s, vt = svds(Cui, k=rank, return_singular_vectors='vh')
    # Order components by decreasing singular value explicitly rather than
    # assuming the order in which the solver returns them: callers keep only
    # the leading columns when they evaluate a smaller rank.
    order = np.argsort(s, kind='stable')[::-1]
    item_factors = np.ascontiguousarray(vt[order].T)
    return (None, None), (item_factors, None)

def get_scores_generator(local_factors, global_factors):
    """Build a scoring callback around the item factors in `global_factors`.

    `global_factors` is a pair whose first element is the item-factor
    matrix (rows indexed by item id); the second element and
    `local_factors` are not used.

    The returned `generate_scores(uid, sid, sess_items, item_pool)` scores
    each candidate by the dot product of its factor row with the sum of the
    factor rows of the session items. `uid` and `sid` are ignored.
    """
    item_factors, _unused = global_factors

    def generate_scores(uid, sid, sess_items, item_pool):
        candidate_factors = item_factors[item_pool]
        session_profile = item_factors[sess_items].sum(axis=0)
        return candidate_factors @ session_profile

    return generate_scores

In [18]:
# Hyper-parameter grid for PureSVD.
rank_range = np.arange(50, 87)
gamma_range = [0.5, 1, 2]
c0 = 1.0
show_result = True

# Model selection: keep the (gamma, rank) pair with the highest HR@5 on the
# validation sessions.
best_params = None
best_hr = 0.0
for gamma in gamma_range:
    # Decompose once per gamma at the largest rank; smaller ranks reuse the
    # leading columns of the same factor matrix instead of refitting.
    local_factors, global_factors_ = build_svd_model(
        np.max(rank_range),
        c0,
        gamma,
        process_data,
    )
    for rank in rank_range:
        global_factors = (global_factors_[0][:, :rank], None)
        up_generate_scores = get_scores_generator(local_factors, global_factors)
        metrics_df, user_stats = evaluate(
            up_generate_scores,
            process_data.valid_sessions,
            process_data.seen_interactions,
        )
        # Average only the metric columns per cutoff. Averaging after
        # reset_index() would also try to average the former index levels.
        valid_results = (
            metrics_df
            .groupby(level="topk")[["hr", "mrr", "ndcg"]]
            .mean()
        )
        hr = valid_results.loc[5, "hr"]

        if hr > best_hr:
            best_hr = hr
            best_params = (gamma, rank)
            if show_result:
                mrr = valid_results.loc[5, "mrr"]
                print(
                    "PureSVD:"
                    f"\nBest HR@5: {best_hr}; MRR@5: {mrr}"
                    "\nThe best performance parameters:"
                    f"\nGamma={gamma}, Rank={rank}"
                )
        

PureSVD:
Best HR@5: 0.8712991752458233; MRR@5: 0.6490695197027583
The best performance parameters:
Gamma=0.5, Rank=50
PureSVD:
Best HR@5: 0.8815568552644231; MRR@5: 0.6571098844543022
The best performance parameters:
Gamma=0.5, Rank=59
PureSVD:
Best HR@5: 0.8818967380707076; MRR@5: 0.6576091810965145
The best performance parameters:
Gamma=0.5, Rank=60
PureSVD:
Best HR@5: 0.8838193670571508; MRR@5: 0.6589605427732163
The best performance parameters:
Gamma=0.5, Rank=61
PureSVD:
Best HR@5: 0.8848431983039485; MRR@5: 0.6601735245066581
The best performance parameters:
Gamma=0.5, Rank=62
PureSVD:
Best HR@5: 0.8860403978888801; MRR@5: 0.6620033652225418
The best performance parameters:
Gamma=0.5, Rank=63
PureSVD:
Best HR@5: 0.8887606475619921; MRR@5: 0.6675687667720017
The best performance parameters:
Gamma=0.5, Rank=81
PureSVD:
Best HR@5: 0.889994086559962; MRR@5: 0.664444663896932
The best performance parameters:
Gamma=1, Rank=83


In [19]:
# Refit PureSVD with the configuration selected on the validation sessions
# and report its metrics on the test sessions.
if best_params is None:
    # The search keeps `best_params` at None when no configuration reached HR@5 > 0.
    raise RuntimeError("No PureSVD configuration was selected on the validation sessions.")
gamma, rank = best_params

local_factors, global_factors = build_svd_model(
    rank,
    c0,
    gamma,
    process_data,
)

up_generate_scores = get_scores_generator(local_factors, global_factors)

metrics_df, user_stats = evaluate(
    up_generate_scores,
    process_data.test_sessions,
    process_data.seen_interactions,
)

# Average only the metric columns per cutoff. Averaging after reset_index()
# would also try to average the former index levels.
test_results = (
    metrics_df
    .groupby(level="topk")[["hr", "mrr", "ndcg"]]
    .mean()
)

test_results

metrics,hr,mrr,ndcg
topk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.538037,0.538037,0.538037
3,0.807137,0.660353,0.69821
5,0.878165,0.67686,0.727704
