## Application of count models on Movielens

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from collections import deque
import numpy as np
import pandas as pd
from subprocess import call
import matplotlib.pyplot as plt
from functools import partial
from dataclasses import dataclass
from pprint import pprint
from typing import List, Tuple, Union, Optional, Dict



In [3]:
import count_model.dataset.movielens as movielens
from count_model.evaluate.evaluate import *
from count_model.evaluate.evaluation import Evaluation
from count_model.evaluate.score_summary import ScoreSummary
from count_model.assessor.avg_posterior import AvgPosterior
from count_model.assessor.bayes_posterior import BayesPosterior
from count_model.assessor.combined_posterior import CombinedPosterior
from count_model.assessor.max_posterior import MaxPosterior
from count_model.assessor.naive_bayes import NaiveBayes



In [4]:
print(os.getcwd())


/home/ctripp/zazzle/pybpr/examples


In [40]:
# %%time
# df = load_movielens_data('ml-1m')
dataset_name = 'ml-100k'

df = movielens.load_movielens_data(dataset_name)
df['positive'] = (df['rating'] >= 4).astype(np.int8)
df.head()


Unnamed: 0,user_id,item_id,rating,timestamp,positive
0,259,255,4,874724710,1
1,259,286,4,874724727,1
2,259,298,4,874724754,1
3,259,185,4,874724781,1
4,259,173,4,874724843,1


In [42]:
movies = movielens.load_movielens_items(dataset_name)
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

movies_ = movies.copy()
stemmer = PorterStemmer()
movies['stems'] = movies['movie title'].apply(lambda text : [
    stemmer.stem(token) for token in nltk.tokenize.word_tokenize(text[:-6])])
movies.head()


Unnamed: 0_level_0,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,stems
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,"[toy, stori]"
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,[goldeney]
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,"[four, room]"
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,"[get, shorti]"
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,[copycat]


In [43]:
# %%time
test_proportion = 0.2

total = len(df)
slice_index = int(test_proportion * total)
print(f'{total} total entries. {test_proportion*100}% = {slice_index}')

# train_df = df.iloc[:slice_index].sort_values(['user_id', 'timestamp']).reset_index()
# test_df = df.iloc[slice_index:].sort_values(['user_id', 'timestamp']).reset_index()
train_df = df.iloc[:slice_index]
test_df = df.iloc[slice_index:]
test_timestamp = test_df['timestamp'].iloc[0]
print(f'test_timestamp: {test_timestamp}')

100000 total entries. 20.0% = 20000
test_timestamp: 878963305


In [44]:
from count_model.link_counter import LinkCounter
from count_model.uniform_prior_model import UniformPriorModel
from count_model.window_counter import WindowCounter
from count_model.permutation_counter import PermutationCounter
from count_model.link_count_data import LinkCountData


positive_counter = PermutationCounter(LinkCounter())
negative_counter = PermutationCounter(LinkCounter())
link_counter = LinkCounter()
co_occurrence_counter = PermutationCounter(link_counter)
event_counter = LinkCounter()
# counter = WindowCounter(LinkCounter(), 10)

from count_model.feature_counter import FeatureCounter
from count_model.porter_stem_counter import PorterStemCounter



In [14]:

from count_model.interaction import Interaction


user_groups = df.groupby('user_id')
for user_id, group in user_groups:
    user_df = user_groups.get_group(user_id).join(movies, on='item_id')
    # positive_interactions = user_df[user_df['positive'] >= 1]
    
    interactions = list(movielens.make_user_item_interaction_sequence(user_df))
    for interaction in interactions:
        event_counter.observe_link(interaction.object, interaction.verb)
    
    co_occurrence_counter.observe_sequence(interactions)
    for row_index, row in user_df.iterrows():
        for stem in row['stem']:
            interaction = Interaction(
                movielens.get_user_id(row),
                movielens.get_action_id(row),
                stem,
            )
            
            # base feature interaction
            link_counter.observe_link(interaction.verb)

            #  user-feature interaction
            link_counter.observe_link(interaction.object, inter)
    
    
    # link_counter.observe_sequence(interactions)

    

    # positive_counter.observe_sequence(
    #     movielens.make_action_sequence(user_df[user_df['positive'] == 1])
    # )
    # negative_counter.observe_sequence(
    #     movielens.make_action_sequence(user_df[user_df['positive'] == 0])
    # )
    
    



In [11]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

stem_counter = FeatureCounter(
    LinkCounter(),
    lambda row: (stemmer.stem(token) for token in word_tokenize(row['Movie Title'])),
)

# user liking or disliking linked to feature
# P(previously clicked/didn't click product with feature F | click on A with feature F)
# "forward" way: P(click on product with F | clicked on a product with F)
# P(clicked on a product with F | click on product with F)
# source to dest / sum(all sources to dest)
# (click on F, click on F again) / sum(click on F, click or nonclick on F again) -> how much does a click on an item with F mean the user will click on another with F
# "what proportion of the time does the user click on products with F when shown them?"
# "If a user clicks on a product with feature F, how likely are they to click on another one with F?"
# (click on F, click on F again) / (click on F, nonclick on F)
# in aggregate: (interact with F >= 2 times)
# num clicks on F, num non-clicks on F
# simple model might have repeated probability ~= (num F clicks / total F interactions )
# (one previous click on F | click on F) vs (one previous click on F | nonclick on F) -> high
# one previous nonclick on F | click on F -> low
# many previous clicks on F | click on F -> very high

user_groups = df.groupby('user_id')
for user_id, group in user_groups:
    user_df = user_groups.get_group(user_id)
    for idx, row in user_df.iterrows():
        stem_counter.observe_link(make_action_tuple(row, False), make_action_tuple(row))


In [24]:
test_df_sample = test_df.sample(4000)
print(len(test_df_sample))
pos_df = df[df['positive'] == 1]
pos_train_df = train_df[train_df['positive'] == 1]
pos_test_df = test_df_sample[test_df_sample['positive'] == 1]

neg_df = df[df['positive'] == 0]
neg_train_df = train_df[train_df['positive'] == 0]
neg_test_df = test_df_sample[test_df_sample['positive'] == 0]


4000


In [25]:
pos_bayes = len(train_df[train_df['positive'] == 1]) / len(train_df)
print(pos_bayes)


0.55975


+ Low vs High interaction items/actions
+ Sequence-Based Factors
    + Previous N actions -> Next action
    + A,B,C,D sequence counts up how many times [A,B,C] happened and then [D] happened
    + P(D | A, B, C happened most recently)
    + Ordered and unordered conditioning 
        + P(D | A then B then C happened)
            + flipped: P(A,B,C happened | D happened next)
                + ~= (# times A,B,C happened before D / # times D happened)
        + P(D | A, B, and C all recently happened)
            + flipped: P(A,B,C happened | D happened next)
                + ~= (# times A,B,C happened before D / # times D happened)
+ Destination-Based Factors
    + A,B,C,D sequence counts up how many times A preceeded D, B, preceeded, D, and C preceeded D in the previous N actions.
    + P(D | A happened within the previous N actions), P(D | B happened within the previous N actions), etc...
        + flipped version used in NB calculation: P(A happened in previous N actions | D happened)
            + ~= (# times D happened after A / # times D happened)
+ Heuristics to compensate for Conditional Independance Assumption
+ Feature-Based Factors
    + A has feature X, B has feature X:
        + P(click another product with feature X | clicked N products with feature X)
        + flipped: P(clicked N products with X | clicked another product with X)
            + ~= (# times clicked N+1 products with X) / (# times clicked N products with X)
            + maybe simplified into (# times clicked N+1 products with X with N+1 >= 2) / (# times clicked N products with X)
                + How often a clicker of a product with X clicks on another product with X
    + alternate framing / method:
        + use clicks to update probabilities that user likes or dislikes a feature
            + or 'affinity' for feature
        + then use the feature affinities to predict likelihood of action/click


In [26]:
nb_scores = compute_scores(
    df,
    train_df,
    test_df_sample,
    make_assessment_function(
        compute_naive_bayes_posterior(
            pos_bayes * 100,
            100,
            1e-9,
            2e-9,
            1e-9,
            2e-9,
        ),
        event_counter,
        co_occurrence_counter.link_counter,
    ),
)

pprint(nb_scores)


ScoreSummary(dynamic=[Evaluation(name='log',
                                 score=-0.2635477872338648,
                                 positives=-0.2395985891226623,
                                 negatives=-0.2924957688271404),
                      Evaluation(name='brier',
                                 score=0.14237455113263894,
                                 positives=0.12460648251176422,
                                 negatives=0.16385125031049352),
                      Evaluation(name='prob',
                                 score=0.8856213831261657,
                                 positives=0.8934122290216193,
                                 negatives=0.8762043971155922),
                      Evaluation(name='accuracy',
                                 score=0.901,
                                 positives=0.9237094563727729,
                                 negatives=0.8735505245720596)],
             dynamic_ndcg=0.9647023078504892)


In [27]:
max_scores = compute_scores(
    df,
    train_df,
    test_df_sample,
    make_assessment_function(
        compute_max_posterior(
            pos_bayes * 10,
            10,
            pos_bayes * 10,
            10,
        ),
        event_counter,
        co_occurrence_counter.link_counter,
    ),
)
pprint(max_scores)


ScoreSummary(dynamic=[Evaluation(name='log',
                                 score=-0.6520316051545292,
                                 positives=-0.2741244189430363,
                                 negatives=-1.1088172653516348),
                      Evaluation(name='brier',
                                 score=0.46373837376566673,
                                 positives=0.12689560155986,
                                 negatives=0.8708884722518683),
                      Evaluation(name='prob',
                                 score=0.5773672051555998,
                                 positives=0.7664181912702543,
                                 negatives=0.3488566537447887),
                      Evaluation(name='accuracy',
                                 score=0.565,
                                 positives=0.9936043855641845,
                                 negatives=0.046935394809497516)],
             dynamic_ndcg=1.0)


In [28]:
avg_scores = compute_scores(
    df,
    train_df,
    test_df_sample,
    make_assessment_function(
        compute_avg_posterior(
            pos_bayes * 10,
            10,
            pos_bayes * 10,
            10,
        ),
        event_counter,
        co_occurrence_counter.link_counter,
    ),
)

pprint(avg_scores)


ScoreSummary(dynamic=[Evaluation(name='log',
                                 score=-0.5850898973785197,
                                 positives=-0.47868329541798904,
                                 negatives=-0.7137061600464392),
                      Evaluation(name='brier',
                                 score=0.3974702146555803,
                                 positives=0.2980141181932548,
                                 negatives=0.5176852313071709),
                      Evaluation(name='prob',
                                 score=0.5723377067977854,
                                 positives=0.6294279487525614,
                                 negatives=0.5033313348270482),
                      Evaluation(name='accuracy',
                                 score=0.72325,
                                 positives=0.8675194152581087,
                                 negatives=0.548868028713418)],
             dynamic_ndcg=1.0)


In [29]:
combined_scores = compute_scores(
    df,
    train_df,
    test_df_sample,
    make_assessment_function(
        compute_combined_posterior(
            pos_bayes * 10,
            10,
        ),
        event_counter,
        co_occurrence_counter.link_counter,
    ),
)

pprint(combined_scores)


ScoreSummary(dynamic=[Evaluation(name='log',
                                 score=-0.5503050778330895,
                                 positives=-0.46140056202867163,
                                 negatives=-0.6577661408346746),
                      Evaluation(name='brier',
                                 score=0.3720707765363809,
                                 positives=0.29018038530344437,
                                 negatives=0.4710536955915426),
                      Evaluation(name='prob',
                                 score=0.6103759819092439,
                                 positives=0.6527489613022984,
                                 negatives=0.5591587252049944),
                      Evaluation(name='accuracy',
                                 score=0.7205,
                                 positives=0.8309730470534491,
                                 negatives=0.5869685256764219)],
             dynamic_ndcg=0.927982050368625)


In [30]:
bayes_scores = compute_scores(
    df,
    train_df,
    test_df_sample,
    make_assessment_function(
        compute_bayes_posterior(
            pos_bayes * 10,
            10,
        ),
        event_counter,
        co_occurrence_counter.link_counter,
    ),
)

pprint(bayes_scores)


ScoreSummary(dynamic=[Evaluation(name='log',
                                 score=-0.6030591970301514,
                                 positives=-0.49315899541221353,
                                 negatives=-0.7358982590630978),
                      Evaluation(name='brier',
                                 score=0.41555275773249184,
                                 positives=0.31592031764465023,
                                 negatives=0.535980925237895),
                      Evaluation(name='prob',
                                 score=0.5752704135582849,
                                 positives=0.6303156187053142,
                                 negatives=0.5087359276019915),
                      Evaluation(name='accuracy',
                                 score=0.67675,
                                 positives=0.8209227957971676,
                                 negatives=0.5024848150193263)],
             dynamic_ndcg=0.927982050368625)


In [31]:
inb_scores = compute_scores(
    df,
    train_df,
    test_df_sample,
    make_assessment_function(
        compute_naive_bayes_posterior(
            pos_bayes * 100,
            100,
            1e-9,
            2e-9,
            1e3,
            2e3,
        ),
        event_counter,
        co_occurrence_counter.link_counter,
    ),
)

pprint(nb_scores)
pprint(inb_scores)


  return np.log(p)


ScoreSummary(dynamic=[Evaluation(name='log',
                                 score=-0.2635477872338648,
                                 positives=-0.2395985891226623,
                                 negatives=-0.2924957688271404),
                      Evaluation(name='brier',
                                 score=0.14237455113263894,
                                 positives=0.12460648251176422,
                                 negatives=0.16385125031049352),
                      Evaluation(name='prob',
                                 score=0.8856213831261657,
                                 positives=0.8934122290216193,
                                 negatives=0.8762043971155922),
                      Evaluation(name='accuracy',
                                 score=0.901,
                                 positives=0.9237094563727729,
                                 negatives=0.8735505245720596)],
             dynamic_ndcg=0.9647023078504892)
ScoreSummary(dynamic=[Evalua

In [32]:
inb_scores = compute_scores(
    df,
    train_df,
    test_df_sample,
    make_assessment_function(
        compute_naive_bayes_posterior(
            pos_bayes * 10,
            10,
            1e-3,
            2e-3,
            1e-3,
            2e-3,
        ),
        event_counter,
        co_occurrence_counter.link_counter,
    ),
)

pprint(nb_scores)
pprint(inb_scores)


ScoreSummary(dynamic=[Evaluation(name='log',
                                 score=-0.2635477872338648,
                                 positives=-0.2395985891226623,
                                 negatives=-0.2924957688271404),
                      Evaluation(name='brier',
                                 score=0.14237455113263894,
                                 positives=0.12460648251176422,
                                 negatives=0.16385125031049352),
                      Evaluation(name='prob',
                                 score=0.8856213831261657,
                                 positives=0.8934122290216193,
                                 negatives=0.8762043971155922),
                      Evaluation(name='accuracy',
                                 score=0.901,
                                 positives=0.9237094563727729,
                                 negatives=0.8735505245720596)],
             dynamic_ndcg=0.9647023078504892)
ScoreSummary(dynamic=[Evalua

In [33]:
inb_scores = compute_scores(
    df,
    train_df,
    test_df_sample,
    make_assessment_function(
        compute_naive_bayes_posterior(
            pos_bayes * 10,
            10,
            1e-6,
            2e-6,
            1e-6,
            2e-6,
        ),
        event_counter,
        co_occurrence_counter.link_counter,
    ),
)

pprint(inb_scores)


ScoreSummary(dynamic=[Evaluation(name='log',
                                 score=-0.26396614576264105,
                                 positives=-0.23490285160475105,
                                 negatives=-0.2990956603466395),
                      Evaluation(name='brier',
                                 score=0.14094228358638689,
                                 positives=0.12098409263220378,
                                 negatives=0.16506623720245908),
                      Evaluation(name='prob',
                                 score=0.8872920913808325,
                                 positives=0.8966410068310909,
                                 negatives=0.875991828586456),
                      Evaluation(name='accuracy',
                                 score=0.90225,
                                 positives=0.9241662859753312,
                                 negatives=0.8757592490336831)],
             dynamic_ndcg=0.927982050368625)


In [34]:
inb_scores = compute_scores(
    df,
    train_df,
    test_df_sample,
    make_assessment_function(
        compute_naive_bayes_posterior(
            pos_bayes * 1,
            1,
            1e-6,
            2e-6,
            1e-6,
            2e-6,
        ),
        event_counter,
        co_occurrence_counter.link_counter,
    ),
)

pprint(inb_scores)


ScoreSummary(dynamic=[Evaluation(name='log',
                                 score=-0.2634085747300902,
                                 positives=-0.23425259519093528,
                                 negatives=-0.2986501204016585),
                      Evaluation(name='brier',
                                 score=0.140475313776115,
                                 positives=0.12055865032561669,
                                 negatives=0.16454907208265332),
                      Evaluation(name='prob',
                                 score=0.8878201900341296,
                                 positives=0.8971036232888882,
                                 negatives=0.8765990771712545),
                      Evaluation(name='accuracy',
                                 score=0.9025,
                                 positives=0.9241662859753312,
                                 negatives=0.8763114301490889)],
             dynamic_ndcg=0.9100043955169862)


In [35]:
inb_scores = compute_scores(
    df,
    train_df,
    test_df_sample,
    make_assessment_function(
        compute_naive_bayes_posterior(
            pos_bayes * 1,
            1,
            1e-9,
            2e-9,
            1e-9,
            2e-9,
        ),
        event_counter,
        co_occurrence_counter.link_counter,
    ),
)

pprint(inb_scores)


ScoreSummary(dynamic=[Evaluation(name='log',
                                 score=-0.2600960657691353,
                                 positives=-0.23213335647449942,
                                 negatives=-0.29389527650682595),
                      Evaluation(name='brier',
                                 score=0.1395686235188014,
                                 positives=0.11946302532950498,
                                 negatives=0.16387075186577532),
                      Evaluation(name='prob',
                                 score=0.8884868592856908,
                                 positives=0.8978346455916086,
                                 negatives=0.8771879613156996),
                      Evaluation(name='accuracy',
                                 score=0.903,
                                 positives=0.9246231155778895,
                                 negatives=0.8768636112644947)],
             dynamic_ndcg=0.9100043955169862)
