# Wymagane biblioteki

In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
from tqdm.notebook import tqdm_notebook as tqdm
import random
import implicit


# Przygotowanie danych

## Wczytywanie danych i ich wstępna obróbka

## Obróbka danych pod model ALS

Ze zbioru zostały usunięte gry, które występują w nie więcej niż 10 rekordach zakupu (warunek w kodzie: `len(x) > 10`). Było to spowodowane błędami programistycznymi.

## Osobna miara
Jako że sama liczba godzin nie jest w pełni miarodajna co do oceny gry, został zastosowany podział dystrybucji godzin każdej z gier na kwintyle (5 części).

In [18]:
def prepare_data(path="steam-200k.csv", min_purchases=10):
    """Load the Steam interaction log and derive a 1-5 rating per purchase.

    The CSV is read without a header row as the columns ``User_ID``, ``Game``,
    ``Interaction``, ``Hours`` and ``Ignore``. Every ``purchase`` row is
    left-joined with the matching ``play`` row for the same user and game;
    purchases without a ``play`` row get 0 hours.

    Parameters
    ----------
    path : str or path-like, default "steam-200k.csv"
        Location of the CSV file.
    min_purchases : int, default 10
        Games with this many purchase rows or fewer are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``User_ID``, ``Game``, ``Hours_y`` (hours played, 0 when never
        played) and ``quartile_rating``. Despite its name, ``quartile_rating``
        holds five bins labelled 1-5.
    """
    steam_raw = pd.read_csv(
        path,
        header=None,
        names=["User_ID", "Game", "Interaction", "Hours", "Ignore"],
    )
    steam = steam_raw.drop(columns="Ignore")

    steam_purchase = steam[steam["Interaction"] == "purchase"]
    steam_play = steam[steam["Interaction"] == "play"]

    # Left join keeps every purchase, including games that were never played.
    steam_pp = pd.merge(steam_purchase, steam_play, how="left", on=["User_ID", "Game"])
    steam_pp["Hours_y"] = steam_pp["Hours_y"].fillna(0)
    steam_clean = steam_pp.drop(columns=["Interaction_x", "Interaction_y", "Hours_x"])

    # .copy() makes the filtered frame independent, so adding a column below
    # does not raise SettingWithCopyWarning.
    steam_clean_rare = (
        steam_clean.groupby("Game").filter(lambda rows: len(rows) > min_purchases).copy()
    )

    # Percentile rank of the hours within each game, then five equal-frequency
    # bins over those ranks. qcut is called on the whole Series directly; the
    # former Series.transform(lambda ...) wrapper only reached the same call
    # after its element-wise attempt failed.
    pct_rank = steam_clean_rare.groupby("Game")["Hours_y"].rank(pct=True)
    steam_clean_rare["quartile_rating"] = pd.qcut(
        pct_rank, 5, labels=range(1, 6), duplicates="drop"
    )
    return steam_clean_rare
# Build the cleaned purchase table with its rating column once; the model cell
# below reads it through the notebook-level name `steam`.
steam = prepare_data()

# Preprocessing

## Rzadka macierz
W tej sekcji dane są obrabiane w taki sposób, żeby były zgodne z rzadką macierzą ocen gier.

In [40]:
class ALS_model:
    """ALS recommender over a sparse users x games rating matrix.

    Attributes
    ----------
    data_sparse : scipy.sparse.csr_matrix
        Ratings with one row per user and one column per game.
    ul, ulr : dict
        Row index -> User_ID and User_ID -> row index.
    gl, glr : dict
        Column index -> game name and game name -> column index.
    model : implicit ALS model or None
        ``None`` until :meth:`fit` has been called.

    NOTE(review): the calls into ``implicit`` (item-user matrix passed to
    ``fit``, ``recommend`` yielding ``(item, score)`` pairs) look like the
    pre-0.5 API of that library - confirm against the installed version.
    """

    def __init__(self, data):
        """Encode users and games as indices and build the rating matrix.

        ``data`` must expose the columns ``User_ID``, ``Game`` and
        ``quartile_rating``.
        """
        # factorize numbers values in order of first appearance and returns
        # the distinct values in that same order, so code i <-> uniques[i].
        user_codes, user_tags = pd.factorize(data.User_ID)
        game_codes, game_names = pd.factorize(data.Game)
        ratings = list(data.quartile_rating)

        # users x games. The (data, (row, col)) constructor sums entries that
        # share the same (user, game) coordinate.
        self.data_sparse = sparse.csr_matrix(
            (ratings, (user_codes, game_codes)),
            shape=(len(user_tags), len(game_names)),
        )

        # Lookup tables between matrix indices and the original labels.
        self.ul = dict(enumerate(user_tags))
        self.ulr = {tag: row for row, tag in self.ul.items()}
        self.gl = dict(enumerate(game_names))
        self.glr = {name: col for col, name in self.gl.items()}
        self.model = None

    def fit(self, factors=20, regularization=0.1, iterations=50, alpha=40):
        """Train the ALS model on the ratings scaled by ``alpha``.

        ``factors``, ``regularization`` and ``iterations`` are forwarded to
        ``implicit.als.AlternatingLeastSquares``; ``alpha`` is the confidence
        multiplier applied to every rating before training.
        """
        self.model = implicit.als.AlternatingLeastSquares(
            factors=factors, regularization=regularization, iterations=iterations
        )
        # Training uses the transposed (games x users) matrix, scaled by the
        # caller-supplied alpha rather than a fixed constant.
        data_conf = (self.data_sparse.T * alpha).astype('double')
        self.model.fit(data_conf)

    def check_validity(self):
        """Return True when at most 99.5 % of the matrix cells are empty.

        An empty matrix (no users or no games) is reported as not valid.
        """
        n_users, n_games = self.data_sparse.shape
        matrix_size = n_users * n_games  # number of possible interactions
        if matrix_size == 0:
            return False
        num_purchases = len(self.data_sparse.nonzero()[0])  # cells with a rating
        # Sparsity is kept as a fraction so it is comparable with 0.995.
        sparsity = 1 - num_purchases / matrix_size
        return sparsity <= 0.995

    def recommend(self, user_tag):
        """Return recommended game names for the user with ID ``user_tag``.

        Raises
        ------
        RuntimeError
            If :meth:`fit` has not been called yet.
        KeyError
            If ``user_tag`` was not present in the training data.
        """
        if self.model is None:
            raise RuntimeError('You need to train first')
        user_id = self.ulr[user_tag]
        scored_items = self.model.recommend(user_id, self.data_sparse)
        # Drop the scores and translate column indices back to game names.
        return [self.gl[item_id] for item_id, _ in scored_items]

    # Original (misspelled) public name, kept so existing callers keep working.
    recommmend = recommend

    def similar_items(self, item_name, n_similar = 10):
        """Return the model's ``similar_items`` result for the game ``item_name``.

        The result is passed through unchanged, i.e. it refers to games by
        column index; use ``self.gl`` to translate indices to names.

        Raises
        ------
        RuntimeError
            If :meth:`fit` has not been called yet.
        KeyError
            If ``item_name`` was not present in the training data.
        """
        if self.model is None:
            raise RuntimeError('You need to train first')
        item_id = self.glr[item_name]
        return self.model.similar_items(item_id, n_similar)

# Training and evaluating

In [41]:
# Build the sparse users x games matrix from the prepared table, then train
# ALS with the default arguments of ALS_model.fit.
model = ALS_model(steam)
model.fit()

100%|██████████| 50/50 [00:09<00:00,  5.17it/s]


In [42]:
# Spot check: print the recommended game names for a fixed list of User_ID
# values. Each ID must exist in the training data, otherwise the lookup in
# ALS_model.recommmend raises KeyError.
validation_user_tags = [63276972, 107948398, 248444377,25096601, 65958466, 90711776, 186452037, 90033155, 50818751, 142999522]
for tag in validation_user_tags:
    print(model.recommmend(tag))

['Terraria', 'Team Fortress 2', 'Star Wars - Battlefront II', "Garry's Mod", 'Portal 2', 'Unturned', 'Counter-Strike Source', 'Alien Swarm', 'Goat Simulator', 'Game Dev Tycoon']
['Warhammer 40,000 Dawn of War II', 'Warhammer 40,000 Dawn of War II - Chaos Rising', 'Warhammer 40,000 Dawn of War II  Retribution', 'Aliens vs. Predator', 'Supreme Commander 2', 'Age of Empires II HD The Forgotten', 'RollerCoaster Tycoon 3 Platinum!', 'Titan Quest Immortal Throne', 'Age of Empires II HD Edition', 'Titan Quest']
["Sid Meier's Civilization V", 'City of Steam Arkadia', 'Prime World', 'Ascend Hand of Kul', 'Blacklight Retribution', 'Solstice Arena', 'Ragnarok', 'Lost Saga North America', 'Haunted Memories', 'Kings Bounty Legions']
['Ultra Street Fighter IV', 'Alien Isolation', 'BRINK', 'DmC Devil May Cry', 'Worms Revolution', 'Dead Island', 'South Park The Stick of Truth', 'Mark of the Ninja', 'Dying Light', 'Trine 2']
['This War of Mine', 'FINAL FANTASY VIII', 'Magic 2014 ', 'Darkest Dungeon', '