# Collaborative Filtering Recommendations using Original Values

In [1]:
import pandas as pd
import numpy as np
import implicit

from scipy.sparse import csr_matrix

In [2]:
# Load the training plays, the test plays and the game coding table.
# Forward slashes are used because a backslash is a path separator only on
# Windows; "data/..." resolves on Windows, Linux and macOS alike.
train = pd.read_csv("data/train-plays.csv")
test = pd.read_csv("data/test-plays.csv")
game_coding = pd.read_csv("data/game-coding.csv")

In [3]:
# Number of rows in the training table.
len(train)

39893

In [4]:
# Number of distinct game_id values present in the training table.
len(train["game_id"].unique())

1033

## Format training data

In [5]:
# Reshape the long training table into a wide one: one row per game_id,
# one column per user_id, each cell holding that pair's "amount".
game_user = train.pivot(
    index="game_id",
    columns="user_id",
    values="amount",
)

In [6]:
# (game, user) pairs that never occur in the training table come out of the
# pivot as NaN; store them as 0 instead.
game_user = game_user.fillna(value=0)

In [7]:
# Lookup table pairing every user_id column label of game_user ("original")
# with its zero-based column position ("coded").
user_coding = pd.DataFrame(
    {
        "original": game_user.columns,
        "coded": np.arange(len(game_user.columns)),
    }
)

In [8]:
# Sparse (CSR) copy of the game x user table, built from its underlying array.
game_user_sparse = csr_matrix(game_user.values)

In [9]:
# Same data with the axes swapped: one row per user_id, one column per game_id.
user_game = game_user.transpose()

In [10]:
# Sparse (CSR) copy of the user x game table, built from its underlying array.
user_game_sparse = csr_matrix(user_game.values)

In [11]:
# Distinct user_id values of the training table, in order of first appearance.
user_ids = pd.unique(train["user_id"])

In [12]:
# Sparsity of the game x user matrix, as the percentage of cells that are zero.
# Total number of cells, i.e. every possible (game, user) interaction.
matrix_size = game_user_sparse.shape[0] * game_user_sparse.shape[1]
# Number of cells holding a non-zero value, i.e. observed interactions.
num_played = int(game_user_sparse.count_nonzero())
sparsity = 100 * (1 - num_played / matrix_size)
sparsity

98.86649290751829

## Alternating Least Squares

In [13]:
# ALS model; the hyper-parameters are spelled out by keyword instead of position.
als = implicit.als.AlternatingLeastSquares(
    factors=128,
    regularization=0.05,
    iterations=50,
)



In [14]:
# Train the ALS model on the sparse game x user matrix (rows are game_id,
# columns are user_id, as laid out by the pivot).
# NOTE(review): which orientation fit() expects depends on the installed
# implicit version -- confirm before upgrading the library.
als.fit(game_user_sparse)

100%|██████████| 50/50 [00:01<00:00, 30.04it/s]


In [15]:
# Top-5 recommendations for every row (user) of user_game_sparse, flattened
# user by user into one long vector.
top5_rows = np.ndarray.flatten(als.recommend_all(user_game_sparse, N=5))
# The model identifies items by their row position in the matrix it was fitted
# on (game_user_sparse). Translate positions back into game_id labels so they
# can be compared with test.game_id; this is the identity mapping only when the
# training game_ids happen to be exactly 0..n-1.
game_recs = game_user.index.values[top5_rows]
# Repeat each user_id label 5 times so it lines up with that user's 5 games.
user_5 = np.repeat(user_game.index, 5)
recommendations = pd.DataFrame({"user_id":user_5, "game_id":game_recs})

100%|██████████| 3407/3407 [00:00<00:00, 63092.71it/s]


In [16]:
# Pair every recommendation with the same user's test rows (inner join on
# user_id); the two game_id columns become game_id_x (recommended) and
# game_id_y (test).
results = pd.merge(recommendations, test, on="user_id")

In [17]:
# 1 where the recommended game equals the test game on that row, otherwise 0.
results["recommended"] = (results["game_id_x"] == results["game_id_y"]).astype(int)

In [18]:
# accuracy: matching (recommendation, test) rows divided by the number of test rows
int((results["recommended"] == 1).sum()) / len(test)

0.2207220428529498

In [19]:
# Share of the training games that appear in at least one recommendation list.
len(np.unique(game_recs)) / len(train["game_id"].unique())

0.6979670861568248

## Bayesian Personalized Ranking

In [20]:
# BPR model pinned to the CPU; hyper-parameters are spelled out by keyword
# instead of position.
bpr = implicit.bpr.BayesianPersonalizedRanking(
    factors=128,
    learning_rate=0.01,
    regularization=0.05,
    iterations=200,
    use_gpu=False,
)

In [21]:
# Train the BPR model on the sparse game x user matrix (rows are game_id,
# columns are user_id, as laid out by the pivot).
# NOTE(review): which orientation fit() expects depends on the installed
# implicit version -- confirm before upgrading the library.
bpr.fit(game_user_sparse)

100%|██████████| 200/200 [00:03<00:00, 51.24it/s, train_auc=87.61%, skipped=9.08%]


In [22]:
# Top-5 recommendations for every row (user) of user_game_sparse, flattened
# user by user into one long vector.
top5_rows = np.ndarray.flatten(bpr.recommend_all(user_game_sparse, N=5))
# The model identifies items by their row position in the matrix it was fitted
# on (game_user_sparse). Translate positions back into game_id labels so they
# can be compared with test.game_id; this is the identity mapping only when the
# training game_ids happen to be exactly 0..n-1.
game_recs = game_user.index.values[top5_rows]
# Repeat each user_id label 5 times so it lines up with that user's 5 games.
user_5 = np.repeat(user_game.index, 5)
recommendations = pd.DataFrame({"user_id":user_5, "game_id":game_recs})

100%|██████████| 3407/3407 [00:00<00:00, 69517.39it/s]


In [23]:
# Pair every recommendation with the same user's test rows (inner join on
# user_id); the two game_id columns become game_id_x (recommended) and
# game_id_y (test).
results = pd.merge(recommendations, test, on="user_id")

In [24]:
# 1 where the recommended game equals the test game on that row, otherwise 0.
results["recommended"] = (results["game_id_x"] == results["game_id_y"]).astype(int)

In [25]:
# accuracy: matching (recommendation, test) rows divided by the number of test rows
int((results["recommended"] == 1).sum()) / len(test)

0.19694746110948047

In [26]:
# Share of the training games that appear in at least one recommendation list.
len(np.unique(game_recs)) / len(train["game_id"].unique())

0.6940948693126815

## Logistic Matrix Factorization

In [27]:
# Logistic matrix factorization model; only the factor count is set here.
lmf = implicit.lmf.LogisticMatrixFactorization(factors=128)

In [28]:
# Train the LMF model on the sparse game x user matrix (rows are game_id,
# columns are user_id, as laid out by the pivot).
# NOTE(review): which orientation fit() expects depends on the installed
# implicit version -- confirm before upgrading the library.
lmf.fit(game_user_sparse)

100%|██████████| 30/30 [00:06<00:00,  4.82it/s]


In [29]:
# Top-5 recommendations for every row (user) of user_game_sparse, flattened
# user by user into one long vector.
top5_rows = np.ndarray.flatten(lmf.recommend_all(user_game_sparse, N=5))
# The model identifies items by their row position in the matrix it was fitted
# on (game_user_sparse). Translate positions back into game_id labels so they
# can be compared with test.game_id; this is the identity mapping only when the
# training game_ids happen to be exactly 0..n-1.
game_recs = game_user.index.values[top5_rows]
# Repeat each user_id label 5 times so it lines up with that user's 5 games.
user_5 = np.repeat(user_game.index, 5)
recommendations = pd.DataFrame({"user_id":user_5, "game_id":game_recs})

100%|██████████| 3407/3407 [00:00<00:00, 70981.84it/s]


In [30]:
# Pair every recommendation with the same user's test rows (inner join on
# user_id); the two game_id columns become game_id_x (recommended) and
# game_id_y (test).
results = pd.merge(recommendations, test, on="user_id")

In [31]:
# 1 where the recommended game equals the test game on that row, otherwise 0.
results["recommended"] = (results["game_id_x"] == results["game_id_y"]).astype(int)

In [32]:
# accuracy: matching (recommendation, test) rows divided by the number of test rows
int((results["recommended"] == 1).sum()) / len(test)

0.06398591135896683

In [33]:
# Share of the training games that appear in at least one recommendation list.
len(np.unique(game_recs)) / len(train["game_id"].unique())

0.4036786060019361

## Baseline: Guessing the Most Popular Games

In [34]:
# The five game_ids with the most rows in the training table, most frequent first.
most_pop = train["game_id"].value_counts().head(5).index.to_list()

In [35]:
# Popularity baseline: count the training users whose test game is one of the
# most_pop games.
num_total = 0
num_correct = 0
for user_id in user_ids:
    # Select this user's test game(s) by the user_id column value. Looking the
    # value up by row label (test_df['game_id'][user_id]) only works when every
    # test row's index label happens to equal its user_id.
    test_games = test.loc[test['user_id'] == user_id, 'game_id']
    num_total = num_total + 1
    # A user with no test row counts as a miss instead of raising KeyError.
    if test_games.isin(most_pop).any():
        num_correct = num_correct + 1

In [36]:
# Fraction of the looped-over users that were counted as correct.
print(num_correct / num_total)

0.05283240387437629


## ALS + BPR

In [37]:
# Build {user_id: set of recommended game_ids} from the fitted ALS model.
# NOTE(review): recommend() is called without N, so the library's default list
# length applies (10 in implicit's documented API -- confirm), whereas the
# single-model evaluations above use N=5.
user_recs = dict()
for user_id in user_ids:
    # recommend() addresses users by row position in user_game_sparse, so
    # convert the user_id label into its row position first.
    user_row = user_game.index.get_loc(user_id)
    # Returned item indexes are row positions of game_user_sparse; map them
    # back to game_id labels so they are comparable with test.game_id.
    res = set(game_user.index[idx] for idx, score in als.recommend(user_row, user_game_sparse))
    user_recs[user_id] = res

In [38]:
# Add the BPR recommendations to each user's existing recommendation set.
for user_id in user_ids:
    # recommend() addresses users by row position in user_game_sparse, so
    # convert the user_id label into its row position first.
    user_row = user_game.index.get_loc(user_id)
    # Returned item indexes are row positions of game_user_sparse; map them
    # back to game_id labels so they are comparable with test.game_id.
    res = set(game_user.index[idx] for idx, score in bpr.recommend(user_row, user_game_sparse))
    # In-place union: the set already stored in user_recs is extended.
    user_recs[user_id].update(res)

In [39]:
# Evaluate the combined recommendation sets: count the training users whose
# test game is contained in user_recs[user_id].
num_total = 0
num_correct = 0
for user_id in user_ids:
    # Select this user's test game(s) by the user_id column value. Looking the
    # value up by row label (test_df['game_id'][user_id]) only works when every
    # test row's index label happens to equal its user_id.
    test_games = test.loc[test['user_id'] == user_id, 'game_id']
    num_total = num_total + 1
    # A user with no test row counts as a miss instead of raising KeyError.
    if test_games.isin(user_recs[user_id]).any():
        num_correct = num_correct + 1

print(num_correct / num_total)

0.3486938655708835


## Average Accuracy Calculation

In [40]:
# Re-fit ALS 100 times and accumulate accuracy and catalogue coverage, because
# each fit starts from a different random initialisation.
accuracy = 0
seen = 0
# Loop-invariant values, computed once instead of on each of the 100 passes.
user_5 = np.repeat(user_game.index, 5)
game_labels = game_user.index.values
num_test_rows = test.shape[0]
num_train_games = train.game_id.unique().size
for i in range(0, 100):
    als = implicit.als.AlternatingLeastSquares(128, 0.05, iterations=50)
    als.fit(game_user_sparse)
    # recommend_all yields row positions of game_user_sparse; map them back to
    # game_id labels before comparing with test.game_id.
    game_recs = game_labels[np.ndarray.flatten(als.recommend_all(user_game_sparse, N=5))]
    recommendations = pd.DataFrame({"user_id":user_5, "game_id":game_recs})
    results = recommendations.merge(test, on="user_id")
    results['recommended'] = np.where(results.game_id_x == results.game_id_y, 1, 0)
    # accuracy
    accuracy = accuracy + (results[results.recommended == 1].shape[0] / num_test_rows)
    # % of games recommended
    seen = seen + (np.unique(game_recs).size / num_train_games)

100%|██████████| 50/50 [00:01<00:00, 30.33it/s]
100%|██████████| 3407/3407 [00:00<00:00, 64283.34it/s]
100%|██████████| 50/50 [00:01<00:00, 29.49it/s]
100%|██████████| 3407/3407 [00:00<00:00, 57788.02it/s]
100%|██████████| 50/50 [00:01<00:00, 27.86it/s]
100%|██████████| 3407/3407 [00:00<00:00, 65363.63it/s]
100%|██████████| 50/50 [00:01<00:00, 30.05it/s]
100%|██████████| 3407/3407 [00:00<00:00, 60478.04it/s]
100%|██████████| 50/50 [00:01<00:00, 29.04it/s]
100%|██████████| 3407/3407 [00:00<00:00, 61276.53it/s]
100%|██████████| 50/50 [00:01<00:00, 30.15it/s]
100%|██████████| 3407/3407 [00:00<00:00, 52341.81it/s]
100%|██████████| 50/50 [00:01<00:00, 30.42it/s]
100%|██████████| 3407/3407 [00:00<00:00, 50851.90it/s]
100%|██████████| 50/50 [00:01<00:00, 27.39it/s]
100%|██████████| 3407/3407 [00:00<00:00, 54930.67it/s]
100%|██████████| 50/50 [00:01<00:00, 29.05it/s]
100%|██████████| 3407/3407 [00:00<00:00, 55795.66it/s]
100%|██████████| 50/50 [00:01<00:00, 30.62it/s]
100%|██████████| 3407/340

In [41]:
# Mean accuracy per run; the divisor 100 must match range(0, 100) in the loop cell.
accuracy / 100

0.22132081009685944

In [42]:
# Mean share of games recommended per run; 100 must match range(0, 100) in the loop cell.
seen / 100

0.7009196515004834

In [43]:
# Re-fit BPR 100 times and accumulate accuracy and catalogue coverage, because
# each fit starts from a different random initialisation.
accuracy = 0
seen = 0
# Loop-invariant values, computed once instead of on each of the 100 passes.
user_5 = np.repeat(user_game.index, 5)
game_labels = game_user.index.values
num_test_rows = test.shape[0]
num_train_games = train.game_id.unique().size
for i in range(0, 100):
    # use_gpu=False matches the single-run BPR model defined earlier, so both
    # evaluations exercise the same (CPU) implementation.
    bpr = implicit.bpr.BayesianPersonalizedRanking(128, 0.01, 0.05, iterations=200, use_gpu=False)
    bpr.fit(game_user_sparse)
    # recommend_all yields row positions of game_user_sparse; map them back to
    # game_id labels before comparing with test.game_id.
    game_recs = game_labels[np.ndarray.flatten(bpr.recommend_all(user_game_sparse, N=5))]
    recommendations = pd.DataFrame({"user_id":user_5, "game_id":game_recs})
    results = recommendations.merge(test, on="user_id")
    results['recommended'] = np.where(results.game_id_x == results.game_id_y, 1, 0)
    # accuracy
    accuracy = accuracy + (results[results.recommended == 1].shape[0] / num_test_rows)
    # % of games recommended
    seen = seen + (np.unique(game_recs).size / num_train_games)

100%|██████████| 200/200 [00:03<00:00, 57.32it/s, train_auc=87.67%, skipped=9.24%]
100%|██████████| 3407/3407 [00:00<00:00, 55853.01it/s]
100%|██████████| 200/200 [00:03<00:00, 56.51it/s, train_auc=87.59%, skipped=9.42%]
100%|██████████| 3407/3407 [00:00<00:00, 57747.61it/s]
100%|██████████| 200/200 [00:03<00:00, 54.60it/s, train_auc=87.59%, skipped=8.98%]
100%|██████████| 3407/3407 [00:00<00:00, 63096.05it/s]
100%|██████████| 200/200 [00:03<00:00, 54.39it/s, train_auc=87.68%, skipped=8.92%]
100%|██████████| 3407/3407 [00:00<00:00, 50104.29it/s]
100%|██████████| 200/200 [00:02<00:00, 68.82it/s, train_auc=87.84%, skipped=9.15%]
100%|██████████| 3407/3407 [00:00<00:00, 72496.82it/s]
100%|██████████| 200/200 [00:02<00:00, 70.70it/s, train_auc=87.42%, skipped=9.23%]
100%|██████████| 3407/3407 [00:00<00:00, 70982.90it/s]
100%|██████████| 200/200 [00:02<00:00, 71.53it/s, train_auc=87.46%, skipped=9.49%]
100%|██████████| 3407/3407 [00:00<00:00, 70986.78it/s]
100%|██████████| 200/200 [00:02<00

In [44]:
# Mean accuracy per run; the divisor 100 must match range(0, 100) in the loop cell.
accuracy / 100

0.18717346639272084

In [45]:
# Mean share of games recommended per run; 100 must match range(0, 100) in the loop cell.
seen / 100

0.7106873184898356

In [46]:
# Re-fit LMF 100 times and accumulate accuracy and catalogue coverage, because
# each fit starts from a different random initialisation.
accuracy = 0
seen = 0
# Loop-invariant values, computed once instead of on each of the 100 passes.
user_5 = np.repeat(user_game.index, 5)
game_labels = game_user.index.values
num_test_rows = test.shape[0]
num_train_games = train.game_id.unique().size
for i in range(0, 100):
    lmf = implicit.lmf.LogisticMatrixFactorization(128)
    lmf.fit(game_user_sparse)
    # recommend_all yields row positions of game_user_sparse; map them back to
    # game_id labels before comparing with test.game_id.
    game_recs = game_labels[np.ndarray.flatten(lmf.recommend_all(user_game_sparse, N=5))]
    recommendations = pd.DataFrame({"user_id":user_5, "game_id":game_recs})
    results = recommendations.merge(test, on="user_id")
    results['recommended'] = np.where(results.game_id_x == results.game_id_y, 1, 0)
    # accuracy
    accuracy = accuracy + (results[results.recommended == 1].shape[0] / num_test_rows)
    # % of games recommended
    seen = seen + (np.unique(game_recs).size / num_train_games)

100%|██████████| 30/30 [00:05<00:00,  5.10it/s]
100%|██████████| 3407/3407 [00:00<00:00, 63092.43it/s]
100%|██████████| 30/30 [00:06<00:00,  4.99it/s]
100%|██████████| 3407/3407 [00:00<00:00, 69534.98it/s]
100%|██████████| 30/30 [00:06<00:00,  4.89it/s]
100%|██████████| 3407/3407 [00:00<00:00, 55855.63it/s]
100%|██████████| 30/30 [00:06<00:00,  4.79it/s]
100%|██████████| 3407/3407 [00:00<00:00, 56795.35it/s]
100%|██████████| 30/30 [00:06<00:00,  4.60it/s]
100%|██████████| 3407/3407 [00:00<00:00, 46677.06it/s]
100%|██████████| 30/30 [00:06<00:00,  4.71it/s]
100%|██████████| 3407/3407 [00:00<00:00, 47987.14it/s]
100%|██████████| 30/30 [00:06<00:00,  4.81it/s]
100%|██████████| 3407/3407 [00:00<00:00, 69531.25it/s]
100%|██████████| 30/30 [00:05<00:00,  5.05it/s]
100%|██████████| 3407/3407 [00:00<00:00, 66807.20it/s]
100%|██████████| 30/30 [00:05<00:00,  5.02it/s]
100%|██████████| 3407/3407 [00:00<00:00, 69538.02it/s]
100%|██████████| 30/30 [00:05<00:00,  5.11it/s]
100%|██████████| 3407/340

In [47]:
# Mean accuracy per run; the divisor 100 must match range(0, 100) in the loop cell.
accuracy / 100

0.05872321690636926

In [48]:
# Mean share of games recommended per run; 100 must match range(0, 100) in the loop cell.
seen / 100

0.39765730880929334