<h1>Item-Based Collaborative Filtering</h1>

In [1]:
import time

import numpy as np
import pandas as pd
import scipy as sp
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Location of the BGG user-review export; change this one constant to point
# at a local copy of the file.
DATA_PATH = 'C:/Users/guije/Documents/boardgames_databases/bgg-19m-revgen.csv'

# perf_counter() is a monotonic clock, so the reported duration cannot be
# skewed by a system clock adjustment while the large file is being read.
start = time.perf_counter()
df_ur = pd.read_csv(DATA_PATH)
print(f"Completed in {time.perf_counter() - start} seconds.")

Completed in 15.474953889846802 seconds.


In [3]:
# Show the column labels of the loaded ratings frame.
df_ur.columns

Index(['user', 'rating', 'name', 'genre'], dtype='object')

In [4]:
# Sanity check on one known user: count the rows whose `user` equals 'Torsten'.
print(f"The user 'Torsten' has reviewed {int(df_ur['user'].eq('Torsten').sum())} boardgames.")

The user 'Torsten' has reviewed 1460 boardgames.


In [5]:
# Print the row count, per-column dtypes and memory footprint of the ratings frame.
df_ur.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18964807 entries, 0 to 18964806
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   user    object 
 1   rating  float64
 2   name    object 
 3   genre   object 
dtypes: float64(1), object(3)
memory usage: 578.8+ MB


In [6]:
# Per column: does it contain at least one missing value?
df_ur.isna().any(axis=0)

user       True
rating    False
name      False
genre     False
dtype: bool

In [7]:
# Number of reviews whose `user` field is missing.
int(df_ur['user'].isna().sum())

66

In [8]:
# Discard the reviews that carry no user name, then re-check every column for gaps.
df_ur = df_ur.dropna(subset=['user'])
df_ur.isna().any(axis=0)

user      False
rating    False
name      False
genre     False
dtype: bool

In [9]:
# Number of reviews per genre, most frequent first.
df_ur['genre'].value_counts()

genre
strategygames     6536126
familygames       4273743
thematic          2859560
partygames        1587615
unknown           1402455
wargames          1211371
abstracts          581781
cgs                267038
childrensgames     245052
Name: count, dtype: int64

In [12]:
# Restrict the analysis to reviews of games tagged 'strategygames'.
df_sg = df_ur.loc[df_ur['genre'] == 'strategygames']
df_sg.shape[0]

6536126

In [13]:
# Build the user x game rating table: one row per user, one column per game title.
# Repeated (user, game) pairs are averaged, which is pivot_table's default aggregation.
pivot = pd.pivot_table(
    df_sg,
    values='rating',
    index=['user'],
    columns=['name'],
    aggfunc='mean',
)
pivot.head(5)

name,1347: De Nigrae Pestis Ludo,15 Dias: The Spanish Golden Age,1655: Habemus Papam,1800: Colorado,1812: The Cradle of Steam Railways,1817,1822: The Railways of Great Britain,1822MX,1824: Austria-Hungary,1825 Unit 1,...,Zapotec,Zauberschwert & Drachenei,Zena 1814,Zeppelin Attack!,ZhanGuo,Zoff in Buffalo,Zoocracy,Zoon,another damn Civilization game,oddball Äeronauts
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
beastvol,,,,,,,,,,,...,,,,,,,,,,
mycroft,,,,,,,,,,,...,,,,,,,,,,
woh,,,,,,,,,,,...,,,,,,,,,,
(mostly) harmless,,,,,,,,,,,...,,,,,,,,,,
- V -,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# Replace every missing rating with 0 so the table is fully numeric.
# Done in place on purpose: the dense table is very large, so a second copy is avoided.
pivot.fillna(value=0, inplace=True)
pivot.head(5)

name,1347: De Nigrae Pestis Ludo,15 Dias: The Spanish Golden Age,1655: Habemus Papam,1800: Colorado,1812: The Cradle of Steam Railways,1817,1822: The Railways of Great Britain,1822MX,1824: Austria-Hungary,1825 Unit 1,...,Zapotec,Zauberschwert & Drachenei,Zena 1814,Zeppelin Attack!,ZhanGuo,Zoff in Buffalo,Zoocracy,Zoon,another damn Civilization game,oddball Äeronauts
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
beastvol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mycroft,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
woh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(mostly) harmless,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
- V -,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Put games on the rows (items) and users on the columns.
# The guard makes the cell idempotent: a bare `pivot = pivot.T` would flip the
# table back to users-as-rows if the cell were run a second time, and the
# similarity computed afterwards would silently become user-user instead of item-item.
if pivot.index.name == 'user':
    pivot = pivot.T

In [16]:
# (rows, columns) of the rating table.
pivot.shape

(2252, 304738)

In [17]:
# Keep only the columns that hold at least one non-zero entry.
has_nonzero = pivot.ne(0).any(axis=0)
pivot = pivot.loc[:, has_nonzero]
pivot.shape

(2252, 304738)

In [18]:
# Convert the (mostly zero) rating table to compressed-sparse-row form.
# Uses the explicitly imported `sparse` submodule: reaching it as an attribute
# of the top-level `scipy` package only works when something else has already
# loaded the submodule.
piv_sparse = sparse.csr_matrix(pivot.to_numpy())

<h1>Modeling</h1>

In [19]:
# Pairwise cosine similarity between the rows of the sparse rating matrix,
# returned as a dense square array.
boardgame_similarity = cosine_similarity(X=piv_sparse, dense_output=True)

In [20]:
# Label both axes of the similarity matrix with the row labels of `pivot`,
# so entries can be looked up by label.
bg_sim_df = pd.DataFrame(
    data=boardgame_similarity,
    index=pivot.index,
    columns=pivot.index,
)

In [23]:
# Membership on a DataFrame tests its column labels: is 'Pandemic' one of them?
'Pandemic' in bg_sim_df

True

<h2>Making Recommendation - Example Boardgame: Pandemic</h2>

In [24]:
def bg_rec(bg_name, top_n=5, sim_df=None):
    """Print the boardgames most similar to ``bg_name``.

    Parameters
    ----------
    bg_name : str
        Title of the reference game; must be a column label of the
        similarity table.
    top_n : int, default 5
        Number of recommendations to print.
    sim_df : pandas.DataFrame, optional
        Square item-item similarity table with the same labels on both
        axes. Defaults to the notebook-level ``bg_sim_df``. The table is
        assumed to be symmetric, as a cosine-similarity matrix is.

    Raises
    ------
    KeyError
        If ``bg_name`` is not a column of the similarity table.
    """
    if sim_df is None:
        sim_df = bg_sim_df
    if bg_name not in sim_df.columns:
        raise KeyError(f"{bg_name!r} is not in the similarity table")

    # Remove the reference game by label instead of assuming it sorts into
    # first place: when another game ties with it at the top score, skipping
    # position 0 could drop that neighbour and recommend the game to itself.
    neighbours = (
        sim_df[bg_name]
        .drop(labels=bg_name, errors="ignore")
        .sort_values(ascending=False)
        .head(top_n)
    )

    print(f"Recommended because you like {bg_name}:\n")
    for number, (bg, score) in enumerate(neighbours.items(), start=1):
        print(f"#{number}: {bg}, {round(score*100,2)}% match")

In [25]:
# Example: print the recommendations for 'Pandemic'.
bg_rec('Pandemic')

Recommended because you like Pandemic:

#1: 7 Wonders, 53.72% match
#2: Dominion, 52.37% match
#3: Small World, 49.12% match
#4: Power Grid, 45.07% match
#5: Agricola, 44.95% match
