<h1>Item-Based Collaborative Filtering</h1>

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import time
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the review CSV into df_ur and print the wall-clock seconds the read took.
# NOTE: the path below is absolute and specific to one machine.
csv_path = 'C:/Users/guije/Documents/boardgames_databases/bgg-19m-revgen.csv'
start = time.time()
df_ur = pd.read_csv(csv_path)
print(f"Completed in {time.time() - start} seconds.")

Completed in 30.951889753341675 seconds.


In [3]:
# Display the column labels of the loaded frame.
df_ur.columns

Index(['user', 'rating', 'name', 'genre'], dtype='object')

In [4]:
# Count the rows whose 'user' value is exactly 'Torsten' and report the total.
torsten_reviews = df_ur[df_ur.user == 'Torsten']
print(f"The user 'Torsten' has reviewed {len(torsten_reviews)} boardgames.")

The user 'Torsten' has reviewed 1460 boardgames.


In [5]:
# Print a summary of the frame: index range, column dtypes and memory usage.
df_ur.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18964807 entries, 0 to 18964806
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   user    object 
 1   rating  float64
 2   name    object 
 3   genre   object 
dtypes: float64(1), object(3)
memory usage: 578.8+ MB


In [6]:
# Per column: does it contain at least one missing value?
df_ur.isna().any()

user       True
rating    False
name      False
genre     False
dtype: bool

In [7]:
# Number of rows whose 'user' value is missing.
int(df_ur['user'].isna().sum())

66

In [8]:
# Remove, in place, every row whose 'user' value is missing.
df_ur.dropna(subset=['user'], inplace=True)
# Re-run the per-column null check; it is the cell's displayed result.
df_ur.isnull().any()

user      False
rating    False
name      False
genre     False
dtype: bool

In [9]:
# Number of review rows per genre label, most frequent first.
df_ur['genre'].value_counts()

genre
strategygames     6536126
familygames       4273743
thematic          2859560
partygames        1587615
unknown           1402455
wargames          1211371
abstracts          581781
cgs                267038
childrensgames     245052
Name: count, dtype: int64

In [10]:
# Split the reviews into one frame per genre label.
# The target names and the labels are listed in the same order.
(df_sg, df_fg, df_tg, df_pg, df_un, df_wg, df_ag, df_cg, df_chg) = (
    df_ur[df_ur.genre == label]
    for label in (
        'strategygames',
        'familygames',
        'thematic',
        'partygames',
        'unknown',
        'wargames',
        'abstracts',
        'cgs',
        'childrensgames',
    )
)

In [11]:
# Every per-genre frame, collected for lookup by bg_recommendation.
df_genres = [
    df_sg,
    df_fg,
    df_tg,
    df_pg,
    df_un,
    df_wg,
    df_ag,
    df_cg,
    df_chg,
]

In [12]:
def _item_user_matrix(df):
    """Build a sparse game-by-user rating matrix from one genre frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'user', 'name' and 'rating' columns.

    Returns
    -------
    tuple
        ``(matrix, games)`` where ``matrix`` is a SciPy CSR matrix with one
        row per game and one column per user, and ``games`` is the sorted
        index of game names labelling the rows.
    """
    # Average repeated (game, user) ratings -- the aggregation pivot_table
    # applies by default -- and discard pairs that have no usable rating.
    mean_ratings = df.groupby(['name', 'user'])['rating'].mean().dropna()

    # Integer codes become row/column positions; sort=True keeps the games in
    # sorted order, matching the row order a pivot table would produce.
    game_codes, games = pd.factorize(
        mean_ratings.index.get_level_values('name'), sort=True
    )
    user_codes, users = pd.factorize(
        mean_ratings.index.get_level_values('user'), sort=True
    )

    matrix = sp.sparse.csr_matrix(
        (mean_ratings.to_numpy(dtype=float), (game_codes, user_codes)),
        shape=(len(games), len(users)),
    )
    # A stored 0 rating is the same as "no rating" for cosine similarity.
    matrix.eliminate_zeros()
    return matrix, pd.Index(games, name='name')


def bg_recommendation(game, list_of_dfs):
    """Print the games most similar to ``game`` by item-based cosine similarity.

    The frame in ``list_of_dfs`` that contains ``game`` is located, a
    game-by-user rating matrix is built from it, pairwise cosine similarity
    between games is computed, and the result is handed to
    ``recommendation_output`` for printing.

    The rating matrix is assembled directly in sparse form; a dense
    user-by-game pivot table is never materialised.

    Parameters
    ----------
    game : str
        Name of the board game to find recommendations for.
    list_of_dfs : iterable of pandas.DataFrame
        Frames with 'user', 'name' and 'rating' columns. If several frames
        contain ``game``, the last one is used.

    Returns
    -------
    None
        Output is printed. If no frame contains ``game`` a message is printed
        instead.
    """
    df = None

    # Find the correct df for a specific boardgame (last match wins).
    for data in list_of_dfs:
        if (data['name'] == game).any():
            df = data

    if df is None:
        print("Game not found in the database.")
        return None

    matrix, games = _item_user_matrix(df)

    # Creating a board game similarity dataframe based on cosine similarity
    boardgame_similarity = cosine_similarity(matrix)
    bg_sim_df = pd.DataFrame(boardgame_similarity, index=games, columns=games)

    return recommendation_output(bg_name=game, df_sim=bg_sim_df)

In [13]:
def recommendation_output(bg_name, df_sim, top_n=5):
    """Print the ``top_n`` games most similar to ``bg_name``.

    Parameters
    ----------
    bg_name : str
        Game the recommendations are based on; must be a label of ``df_sim``.
    df_sim : pandas.DataFrame
        Square game-by-game similarity frame whose index and columns are
        game names.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    KeyError
        If ``bg_name`` is not a column of ``df_sim``.
    """
    # Remove the queried game by label instead of assuming it sorts first:
    # another game tied at the top score could otherwise push it into the
    # recommendations and hide a real match.
    scores = df_sim[bg_name].drop(labels=bg_name, errors="ignore")

    # Rank only the relevant column rather than sorting the whole frame.
    best_matches = scores.nlargest(top_n).index

    print(f"Recommended because you like {bg_name}:\n")
    for number, bg in enumerate(best_matches, start=1):
        # Same cell the chained lookup df_sim[bg][bg_name] read.
        match = df_sim.loc[bg_name, bg]
        print(f"#{number}: {bg}, {round(match*100,2)}% match")

In [14]:
# Print recommendations for 'Pandemic' using the per-genre frames.
bg_recommendation(game='Pandemic', list_of_dfs=df_genres)

Recommended because you like Pandemic:

#1: 7 Wonders, 53.72% match
#2: Dominion, 52.37% match
#3: Small World, 49.12% match
#4: Power Grid, 45.07% match
#5: Agricola, 44.95% match
