# Implementacja rekomendacji przy użyciu K-nearest neighbours classification
Kod oparty na implementacji z platformy kaggle https://www.kaggle.com/kellyfuruya/steam-game-recommendations/notebook

In [6]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import TruncatedSVD
import math
import statistics as st
import operator

In [7]:
data = pd.read_csv('steam-200k.csv/steam-200k.csv', header=None, names=["User_ID", "Game", "Interaction", "Hours", "Ignore"])

In [8]:
data.head()

Unnamed: 0,User_ID,Game,Interaction,Hours,Ignore
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0


In [9]:
data.head()
steam_raw = data.drop("Ignore", axis =1)
steam1 = steam_raw

In [10]:
steam1 = steam_raw[steam_raw['Interaction'] == "purchase"]
steam2 = steam_raw[steam_raw['Interaction'] == "play"]
    steam3 = pd.merge(steam1, steam2, how = 'left', left_on = ['User_ID', 'Game'], right_on = ['User_ID', 'Game'])
steam3['Hours_y'] = steam3['Hours_y'].fillna(0)

#put it all into a clean table
steam_clean = steam3.drop(['Interaction_x', 'Interaction_y', 'Hours_x'], axis = 1)
steam_clean.head()
steam_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129535 entries, 0 to 129534
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   User_ID  129535 non-null  int64  
 1   Game     129535 non-null  object 
 2   Hours_y  129535 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 4.0+ MB


In [11]:
#user_hours is a list of all the hours that each user put into the game we are calculating the frequency for
#hours_i is the hours for that specific user for the game
def frequency(hours_i, user_hours):
    if user_hours == 0:
        return 0
    elif hours_i == user_hours:
        return 1
    return hours_i/(user_hours - hours_i)

#frequency_list is the list of all the frequencies between the user k and the top user
#frequency is the calculated frequency for that user
def rating(frequency_sum):
    return 4*(1-frequency_sum)+1

In [13]:
#first we need to create a table with all of the games and the total hours put into each
game_hours = steam_clean.groupby(['Game'])['Hours_y'].sum().reset_index()

#now we can use this information in the frequency function
steam_clean2 = steam_clean
steam_length = int(len(steam_clean2.index))
i = 0
steam_clean2['Frequency'] = np.nan
for i in range(0, steam_length):
    hours_i = steam_clean2.iloc[i][2]
    user_hours = game_hours[game_hours['Game'] == steam_clean2.iloc[i][1]].iloc[0][1] #- steam_clean2.iloc[i][2]
    steam_clean2.at[i, 'Frequency'] = frequency(hours_i, user_hours)

In [14]:
steam_clean3 = steam_clean2

steam_clean3['Rating'] = np.nan
steam_clean3.sort_values(by = ['Game', 'Frequency'], inplace = True, ascending = False, ignore_index = True)
print(steam_clean3)

          User_ID                                               Game  Hours_y  \
0       159800136                                   theHunter Primal     71.0   
1        62878249                                   theHunter Primal      9.4   
2       207424334                                   theHunter Primal      4.6   
3       157080495                                   theHunter Primal      0.9   
4        43913966                                          theHunter     95.0   
...           ...                                                ...      ...   
129530   35701646  1... 2... 3... KICK IT! (Drop That Beat Like a...      0.0   
129531   11940338                                          0RBITALIS      0.6   
129532   86055705                                          0RBITALIS      0.3   
129533   93030550                                          0RBITALIS      0.3   
129534   46055854                                        007 Legends      0.7   

        Frequency  Rating  

In [15]:
store = "" #flag to see if we have moved on to a new game
sum_f = 0 #running frequency sum
last_f = 0 #flag to see if there are mutliple users with the same frequency
last_r = 0 #if multiple users with same frequency, want to give same rating

for i in range(0, steam_length): #go through the entire dataframe
    temp = steam_clean3.iloc[i][1] #find out which game we are one
    f_i = steam_clean3.iloc[i][3] #find out the frequency for that game and user    
    
    if temp != store: #if it's the first time we are going over the game
        store = temp #indicate we are now on a new game and save it's name
        if f_i == 0.0: #if there are no hours for the game
            sum_f = 0 #reset running sum
            last_f = 0 #reset last frequency
            last_r = 1 #set last rating
            steam_clean3.at[i, 'Rating'] = 1  #if there are no hours, give it the lowest rating
        else: #if there are hours for the game, must be the top user
            sum_f = f_i #first time on this game so sum is the frequency
            last_f = f_i #set last frequency
            last_r = 5 #set last rating
            steam_clean3.at[i, 'Rating'] = 5 #return the highest rating b/c top user
        
    else: #it's not the first time on this game
        if f_i == 0: #multiple users have zero hours
            last_r = 1 #set last rating 
            steam_clean3.at[i, 'Rating'] = 1 #return lowest score; don't need to reset anything
        elif last_f == f_i: #if there are multiple users with the same frequency
            sum_f += f_i #add to running sum
            steam_clean3.at[i, 'Rating'] = last_r #return last rating
        else:
            rating_f = rating(sum_f)  #calculate the rating
            sum_f += f_i #update the sum
            last_f = f_i #update the last frequency
            last_r = rating_f #update the last rating
            steam_clean3.at[i, 'Rating'] = rating_f

In [16]:
steam_clean4 = steam_clean3
steam_clean4.sort_values(by = ['User_ID', 'Game'], inplace = True, ignore_index = True)
#remove the games with no hours played
steam_clean4 = steam_clean4[steam_clean4['Rating'] != 1]
print(steam_clean4)

          User_ID                          Game  Hours_y     Frequency  \
0            5250                   Alien Swarm      4.9  3.266449e-03   
1            5250               Cities Skylines    144.0  4.710192e-02   
6            5250      Deus Ex Human Revolution     62.0  1.758666e-02   
7            5250                        Dota 2      0.2  2.037315e-07   
17           5250                      Portal 2     13.6  1.493931e-03   
...           ...                           ...      ...           ...   
129528  309434439                        Dota 2      0.8  8.149264e-07   
129529  309554670             Mitos.is The Game      5.9  2.987342e-02   
129530  309626088  Age of Empires II HD Edition      6.7  9.778452e-04   
129533  309824202                        Dota 2      0.7  7.130605e-07   
129534  309903146                        Dota 2      0.2  2.037315e-07   

          Rating  
0       1.765477  
1       4.609080  
6       3.214684  
7       0.992064  
17      1.681515

In [17]:
#distance function; takes array q and p and calculates modified euclidean distance
def distance(q, p):
    total = 0
    for i in range(0, len(q)):
        total += (q[i]-p[i])**2
#    print(total)
    return math.sqrt(total)/len(q)

#find the k nearest neighbors
def neighbors(df, k_neighbors, user):
    distances = []
    #subset of the original table that contains only the rows for specific user
    user_games = df[df['User_ID'] == user]
    #subset of the original table minus the rows for specific user
    df_subset = df[df['User_ID'] != user]
    #temporary list to hold the ratings for the specific user
    user_temp = []
    #temporary list to hold the ratings for the user we are currently indexed on
    temp = []
    #flag to see if we have moved on to a new user id
    temp_id = 0
    #iterate through the entire subset
    for index, row in df_subset.iterrows():
        #if the game at that particual row is a game that the specific user has
        if row['Game'] in set(user_games['Game']):
            #if it is, check to see if we are on a new user or not
            if row['User_ID'] == temp_id:
                #if not, add the rating to the temp list
                temp.append(row['Rating'])
                #also add the rating to the user temp list
                user_temp.append(user_games.loc[user_games['Game'] == row['Game'], 'Rating'].iloc[0])
            #if it's the first time running the loop; set temp_id, add temp_id, game, and ratings
            #but do not calculate distance
            elif temp_id == 0:
                temp_id = row['User_ID']
                temp.append(row['Rating'])
                user_temp.append(user_games.loc[user_games['Game'] == row['Game'], 'Rating'].iloc[0])
            #not the first time running the loop
            #new user
            else:
                #calculate distance for previous user
                dist = distance(user_temp, temp)
                #add that to distances along with the id
                distances.append((temp_id, dist))
                #set the flag to the new id
                temp_id = row['User_ID']
                #reset temp and user_temp
                temp = []
                temp.append(row['Rating'])
                user_temp = []
                user_temp.append(user_games.loc[user_games['Game'] == row['Game'], 'Rating'].iloc[0])
    #once we finish for loop, sort distances so smallest are first
    distances.sort(key=operator.itemgetter(1))
    neighbor_list =[]
    #insert neighbors into the list, smallest distance first up to the kth neighbor
    for i in range(k_neighbors):
        neighbor_list.append(distances[i])
    #return the list of k neighbors
    return neighbor_list
        
#recommend games based on the neighbors' ratings
def recommend(user, neighbor_list, df):
    #which games the user already has
    user_games = df[df['User_ID'] == user]
    dissim_games = []
    #go through all the neighbors
    for neighbor in neighbor_list:
        #make a temporary table containing all of the games that the neighbor has but the user does not
        temp = df[(df['User_ID'] == neighbor[0]) & (~df['Game'].isin(user_games['Game']))]
        #loop through the games in temp
        for index, game in temp.iterrows():
            #add the game and its rating to the dissimilar games list
            dissim_games.append((game['Game'], game['Rating']))
    #sort the dissimilar games list by the game name
    dissim_games.sort(key=operator.itemgetter(0))
    #flag to see if moved on to a new game
    flag = ""
    #running sum of all the ratings
    running_sum = 0
    #list we will add the recomendations to
    rec_list = []
    #count of how many times the game was in dissim_games
    count = 0
    #loop through all of the games
    for dis in dissim_games:
        #if it's the first time the game has come up in the loop
        if flag != dis[0]:
            #if it's not the first time the loop has run
            #if it was then we do not want to append anything
            if flag != "":
                #append the last game name and the average rating
                rec_list.append((flag, running_sum/count))
            #set the flag to the new gae
            flag = dis[0]
            #set the running sum to the current rating
            running_sum = dis[1]
            #reset the counter
            count = 1
        #multiple ratings for the same game
        else:
            #add the current rating to the running sum
            running_sum += dis[1]
            #increment the counter
            count += 1
    #sort the list of recommended games with the highest rating first
    sort_list = sorted(rec_list, key=operator.itemgetter(1), reverse = True)
    return(sort_list)
        
def rec_games(rec_tuple):
    games = []
    for pair in rec_tuple:
        games.append(pair[0])
    return games

In [18]:
test_neighbors = neighbors(steam_clean4, 5, 5250)
print(test_neighbors)

[(7249363, 0.0), (30425578, 0.0), (41124938, 0.0), (58893462, 0.0), (68532738, 0.0)]


In [19]:
recs = recommend(159800136, test_neighbors, steam_clean4)
recommended_games = rec_games(recs)
print(recommended_games)

['Dust An Elysian Tail', 'Hotline Miami', 'METAL SLUG 3', 'METAL SLUG X', 'Septerra Core', 'Borderlands 2', 'Mark of the Ninja', 'Half-Life Opposing Force', 'Burnout Paradise The Ultimate Box', 'Resident Evil 6 / Biohazard 6', 'Half-Life Blue Shift', 'Monaco', 'Darksiders', 'METAL GEAR RISING REVENGEANCE', 'Castle Crashers', 'Borderlands', 'Bastion', 'Skullgirls', 'SpeedRunners', 'Devil May Cry 3 Special Edition', 'Half-Life', "Mirror's Edge", 'DmC Devil May Cry', 'FEZ', 'Counter-Strike', 'Borderlands The Pre-Sequel', 'Saints Row The Third', 'Team Fortress 2', 'Dead Island', 'The Walking Dead', 'BioShock', 'Devil May Cry 4', 'Crysis 2 Maximum Edition', 'Gun Monkeys', "Don't Starve", 'Audiosurf', 'Sniper Ghost Warrior', 'Magic 2014 ', 'Awesomenauts', 'Mercenary Kings', 'FINAL FANTASY VIII', "Tom Clancy's Ghost Recon Phantoms - NA", 'Spec Ops The Line', 'Dragon Nest']


In [20]:
def knn(user, k_neighbors, df):
    knearest = neighbors(df, k_neighbors, user)
    rec_list = recommend(user, knearest, df)
    #games = rec_games(rec_list)
    if len(rec_list) > 10:
        rec_list = rec_list[:10]
    games = rec_list
    return games

In [21]:
steam_clean4.head()

Unnamed: 0,User_ID,Game,Hours_y,Frequency,Rating
0,5250,Alien Swarm,4.9,0.003266449,1.765477
1,5250,Cities Skylines,144.0,0.04710192,4.60908
6,5250,Deus Ex Human Revolution,62.0,0.01758666,3.214684
7,5250,Dota 2,0.2,2.037315e-07,0.992064
17,5250,Portal 2,13.6,0.001493931,1.681515


In [22]:
unique_users = steam_clean4['User_ID'].unique()

In [23]:
final_final_preds = pd.DataFrame(unique_users, columns=['user_id'])

In [26]:
#final_final_preds['preds_10'] = final_final_preds['user_id'].apply(lambda row: knn(row,1,steam_clean4))

In [None]:
final_final_preds['preds_10'][0]

In [44]:

def generate_recomendations(list_ids:list,data, k_nn=5)->list:
    results = []
    for elem in list_ids:
        print(1)
        temp = knn(elem, k_nn, data)
        results.append(temp)
    return results


In [45]:
list_id = [63276972,107948398,248444377,25096601,65958466,90711776,186452037,90033155,50818751,142999522]
result = generate_recomendations(list_id,steam_clean4, 3)

1
1
1
1
1
1
1
1
1
1


In [46]:
result

[[('Infestation Survivor Stories', 5.0),
  ('Counter-Strike', 3.662658897531197),
  ('Counter-Strike Global Offensive', 2.5496623238155647),
  ('Ori and the Blind Forest', 1.857745834734756),
  ('Age of Mythology Extended Edition', 1.3330201358807412)],
 [('Caribbean!', 5.0),
  ('Clockwork Empires', 5.0),
  ('Expeditions Conquistador', 5.0),
  ('Life is Feudal Your Own', 5.0),
  ('Sir, You Are Being Hunted', 5.0)],
 [('4 Elements', 5.0),
  ('Noir Syndrome', 5.0),
  ('Redshirt', 5.0),
  ('Renowned Explorers International Society', 5.0),
  ('Reverse Crawl', 5.0)],
 [('Company of Heroes 2', 2.950891134596457)],
 [('Half-Life 2 Lost Coast', 2.1522604594130175),
  ('Half-Life 2 Deathmatch', -0.14961316945891046)],
 [('Portal', 2.151686883166593),
  ('Terraria', 2.126887501222568),
  ('FINAL FANTASY VII', 2.0413196515435046),
  ('Half-Life 2 Lost Coast', 1.8908225509162886),
  ('Starbound', 1.817636516302937)],
 [('3DMark 11', 5.0),
  ('Age of Wonders', 5.0),
  ('Arma 2 Private Military Comp

In [50]:
only_games = {}

for index,user in enumerate(result):
    for games in user:
        if list_id[index] not in only_games:
            only_games[list_id[index]]= [games[0]]
        else:
            only_games[list_id[index]].append(games[0])
            
only_games

{63276972: ['Infestation Survivor Stories',
  'Counter-Strike',
  'Counter-Strike Global Offensive',
  'Ori and the Blind Forest',
  'Age of Mythology Extended Edition'],
 107948398: ['Caribbean!',
  'Clockwork Empires',
  'Expeditions Conquistador',
  'Life is Feudal Your Own',
  'Sir, You Are Being Hunted'],
 248444377: ['4 Elements',
  'Noir Syndrome',
  'Redshirt',
  'Renowned Explorers International Society',
  'Reverse Crawl'],
 25096601: ['Company of Heroes 2'],
 65958466: ['Half-Life 2 Lost Coast', 'Half-Life 2 Deathmatch'],
 90711776: ['Portal',
  'Terraria',
  'FINAL FANTASY VII',
  'Half-Life 2 Lost Coast',
  'Starbound'],
 186452037: ['3DMark 11',
  'Age of Wonders',
  'Arma 2 Private Military Company',
  'Artemis Spaceship Bridge Simulator',
  'BloodLust Shadowhunter'],
 90033155: ['Darksiders',
  'The Witcher 2 Assassins of Kings Enhanced Edition',
  'Company of Heroes 2',
  'Warframe',
  'Call of Duty Modern Warfare 2 - Multiplayer'],
 50818751: ['Half-Life 2 Lost Coast'