In [2]:
import pandas as pd
import difflib
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import KNNBasic,  KNNWithMeans, KNNBaseline
from surprise.model_selection import KFold
from surprise import Reader
from surprise import NormalPredictor
from surprise.model_selection import cross_validate
import matplotlib.pyplot as plt
import seaborn as sns
from surprise.model_selection import GridSearchCV

In [3]:
def load_data():
    user_game_df = pd.read_csv("../raw_data/rawg_user_games.csv")
    return user_game_df

In [4]:
def get_ratings_and_meta(df):
    filter_df = df[df['user_rating']>0]
    ratings_df = filter_df[['user_id','game_id','user_rating']]
    metadata = df[['game_id','game_name', 'released', 'metacritic', 'rawg_rating']]
    metadata['dummies'] = 0
    meta = metadata.groupby(by=['game_id','game_name', 'released', 'metacritic', 'rawg_rating']).sum().drop(columns='dummies').reset_index()
    return ratings_df, meta

In [5]:
data = load_data()
rating_df, metadata = get_ratings_and_meta(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata['dummies'] = 0


In [6]:
rating_df

Unnamed: 0,user_id,game_id,user_rating
0,1,446900,4
1,1,264830,5
2,1,58443,4
4,1,21889,4
5,1,1190,4
...,...,...,...
254743,997,5286,3
254744,997,3328,4
254745,997,4200,5
254746,997,3498,4


In [7]:
reader = Reader(rating_scale=(1, 5))
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(rating_df, reader)
anti_set = data.build_full_trainset().build_anti_testset()

In [11]:
games = pd.DataFrame(rating_df['game_id']).drop_duplicates(['game_id'])
users = pd.DataFrame(rating_df['user_id']).drop_duplicates(['user_id'])

In [22]:
pd.DataFrame(data.raw_ratings)

Unnamed: 0,0,1,2,3
0,1,446900,4.0,
1,1,264830,5.0,
2,1,58443,4.0,
3,1,21889,4.0,
4,1,1190,4.0,
...,...,...,...,...
24373,997,5286,3.0,
24374,997,3328,4.0,
24375,997,4200,5.0,
24376,997,3498,4.0,


In [32]:
pd.DataFrame(anti_set)

Unnamed: 0,0,1,2
0,1,34405,3.675814
1,1,195,3.675814
2,1,23062,3.675814
3,1,33908,3.675814
4,1,56045,3.675814
...,...,...,...
1295329,998,8845,3.675814
1295330,998,12120,3.675814
1295331,998,16029,3.675814
1295332,998,3203,3.675814


In [31]:
24378*239

5826342

In [33]:
kf = KFold(n_splits=3)
algo = KNNBasic(k=1)
best_algo = None
best_rmse = 1000.0
best_pred = None
for trainset, testset in kf.split(data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_algo = algo
        best_pred = predictions

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2443
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2397
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2452


In [37]:
rating_df

Unnamed: 0,user_id,game_id,user_rating
0,1,446900,4
1,1,264830,5
2,1,58443,4
4,1,21889,4
5,1,1190,4
...,...,...,...
254743,997,5286,3
254744,997,3328,4
254745,997,4200,5
254746,997,3498,4


In [42]:
def transform_df(df):
    game_matrix_df = df.pivot(index = 'user_id', columns ='game_id', values = 'user_rating').fillna(0)
    return game_matrix_df

In [141]:
rating_matrix = transform_df(rating_df)

In [142]:
rating_matrix.head()

game_id,20,21,22,24,25,26,27,28,29,30,...,446900,447797,452634,452646,457729,461649,463727,484913,491003,527389
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,3.0,0.0,0.0,4.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,4.0,0.0,0.0,0.0,5.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,3.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,4.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [143]:
game_id_matrix = rating_matrix.columns

In [144]:
game_id_matrix

Int64Index([    20,     21,     22,     24,     25,     26,     27,     28,
                29,     30,
            ...
            446900, 447797, 452634, 452646, 457729, 461649, 463727, 484913,
            491003, 527389],
           dtype='int64', name='game_id', length=5592)

In [162]:
real_sample = rating_matrix.iloc[1].values
real_sample

array([0., 0., 0., ..., 0., 0., 0.])

In [145]:
new_user = [
    { 
        "game_id": 26,
        "user_rating": 4 
    },
    { 
        "game_id": 28,
        "user_rating": 5 
    },
    { 
        "game_id": 30,
        "user_rating": 0 
    }
    ]

In [146]:
X_matrix = pd.DataFrame(game_id_matrix)

In [147]:
X_matrix['ratings'] = 0

In [148]:
X_matrix = X_matrix.set_index('game_id')

In [149]:
new_user[0]["user_rating"]

4

In [150]:
new_user

[{'game_id': 26, 'user_rating': 4},
 {'game_id': 28, 'user_rating': 5},
 {'game_id': 30, 'user_rating': 0}]

In [151]:
for game in new_user:
    game_id = game["game_id"]
    ratings = game["user_rating"]
    #X_matrix[X_matrix['game_id'] == game_id]['ratings'] = ratings
    X_matrix.loc[game_id,'ratings'] = ratings

In [152]:
X_matrix[X_matrix.index == 26]

Unnamed: 0_level_0,ratings
game_id,Unnamed: 1_level_1
26,4


In [153]:
X = X_matrix['ratings'].values

In [154]:
X_matrix.shape

(5592, 1)

In [155]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
samples = rating_matrix

In [156]:
neigh = NearestNeighbors(n_neighbors=1)
neigh.fit(samples)


NearestNeighbors(n_neighbors=1)

In [165]:
neigh.kneighbors(real_sample.reshape(-1, 1).transpose(), 1, return_distance=False)

array([[1]])

In [158]:
X.shape

(5592,)

In [130]:
rating_matrix

game_id,20,21,22,24,25,26,27,28,29,30,...,446900,447797,452634,452646,457729,461649,463727,484913,491003,527389
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,3.0,0.0,0.0,4.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,4.0,0.0,0.0,0.0,5.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,3.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,4.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
