In [37]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import sparse
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity


In [38]:
df = pd.read_csv('../../data\join_02.csv')
df.info

<bound method DataFrame.info of                 gamename  year      month      avg    gain  peak  \
0      100% Orange Juice  2021   February   278.67  -25.23   645   
1      100% Orange Juice  2021    January   303.89   -4.16   614   
2      100% Orange Juice  2020   December   308.05   23.02   743   
3      100% Orange Juice  2020   November   285.03   -7.41   621   
4      100% Orange Juice  2020    October   292.45   46.68   900   
...                  ...   ...        ...      ...     ...   ...   
63849  theHunter Classic  2014    October   909.46  258.77  2293   
63850  theHunter Classic  2014  September   650.69  -63.60  1717   
63851  theHunter Classic  2014     August   714.29 -337.67  1181   
63852  theHunter Classic  2014       July  1051.96 -110.53  2059   
63853  theHunter Classic  2014       June  1162.48     NaN  2648   

      avg_peak_perc        date release_date  required_age  ...  genre_sports  \
0          43.2047%  2021-02-01   2014-05-16             0  ...       

In [39]:
#df.date.unique()
df_recency = df.sort_values(by='date')
indices_to_keep = df_recency.groupby('gamename')['year'].idxmax()
df = df.loc[indices_to_keep]

In [40]:
df.isna().sum()

gamename                      0
year                          0
month                         0
avg                           0
gain                          3
peak                          0
avg_peak_perc                 0
date                          0
release_date                  0
required_age                  0
price                         0
dlc_count                     0
windows                       0
mac                           0
linux                         0
metacritic_score              0
positive                      0
negative                      0
developers                    0
publishers                    3
average_playtime_forever      0
average_playtime_two_weeks    0
median_playtime_forever       0
median_playtime_two_weeks     0
multi_player                  0
pvp                           0
co-op                         0
genre_action                  0
genre_adventure               0
genre_casual                  0
genre_sexual_content          0
genre_st

In [41]:
df.dropna(inplace=True)

In [42]:
df.duplicated().sum()

0

In [43]:
df.columns

Index(['gamename', 'year', 'month', 'avg', 'gain', 'peak', 'avg_peak_perc',
       'date', 'release_date', 'required_age', 'price', 'dlc_count', 'windows',
       'mac', 'linux', 'metacritic_score', 'positive', 'negative',
       'developers', 'publishers', 'average_playtime_forever',
       'average_playtime_two_weeks', 'median_playtime_forever',
       'median_playtime_two_weeks', 'multi_player', 'pvp', 'co-op',
       'genre_action', 'genre_adventure', 'genre_casual',
       'genre_sexual_content', 'genre_strategy', 'genre_sports',
       'genre_racing', 'genre_rpg', 'genre_simulation', 'indie', 'full_audio',
       'full_controller_support', 'age_0_plus', 'age_13_plus', 'age_18_plus'],
      dtype='object')

In [44]:
df.windows.sum()

942

In [45]:
df.mac.sum()

451

In [46]:
df.linux.sum()

353

In [47]:
df['publishers'].isna().sum()

0

In [48]:
for index in df[df['publishers'].isna()].index:
  df.drop(index, axis=0, inplace=True)

In [49]:
df['publishers'].isna().sum()

0

In [50]:
df.info

<bound method DataFrame.info of                   gamename  year    month       avg     gain   peak  \
1        100% Orange Juice  2021  January    303.89    -4.16    614   
85     12 is Better Than 6  2021  January      5.94     0.78     18   
149                    140  2021  January      2.98    -0.02     11   
238            60 Seconds!  2021  January     33.29     4.64     59   
308          7 Days to Die  2021  January  25366.69  3925.06  39581   
...                    ...   ...      ...       ...      ...    ...   
63522               Zup! 3  2021  January      5.83    -2.59     38   
63572            ibb & obb  2021  January     41.45     2.61    110   
63654               klocki  2021  January      2.60    -0.18      7   
63710            rFactor 2  2021  January    567.28   -38.22   1272   
63774    theHunter Classic  2021  January   1281.06   -41.93   2573   

      avg_peak_perc        date release_date  required_age  ...  genre_sports  \
1          49.4935%  2021-01-01   

In [51]:
df['publishers'].unique()

array(['Fruitbat Factory', 'HypeTrain Digital', 'Carlsen Games',
       'Robot Gentleman', 'The Fun Pimps Entertainment LLC',
       'Games Operators', '7th Beat Games,indienova',
       'Gears for Breakfast', 'Coffee Stain Publishing',
       'Electronic Arts', '505 Games', 'Little Orbit', 'Studio Wildcard',
       'System Era Softworks', 'Grapeshot Games', 'Devolver Digital',
       'Infini-Brain inc.', 'Hyper Hippo Games', 'Xbox Game Studios',
       'Łukasz Jakowski Games', 'Paradox Interactive', 'Statespace',
       'Carbon Games', 'Remedy Entertainment', 'Sandbox Interactive GmbH',
       'Valve', 'Reactive Drop Team',
       'SEGA,Feral Interactive (Mac),Feral Interactive (Linux)',
       'Gamera Games', 'U.S. Army', 'SCS Software', 'Frictional Games',
       'Demruth', 'Lince Works', 'Kakao Games Europe B.V.',
       'Bohemia Interactive', 'League of Geeks', 'Game Science',
       'Igara Studio', 'Kunos Simulazioni', 'Trion Worlds',
       'Dylan Fitterer', 'Aeria Games', 'Boxe

In [52]:
df['developers'].unique()

array(['Orange_Juice', 'Ink Stains Games', 'Carlsen Games',
       'Robot Gentleman', 'The Fun Pimps', 'Jutsu Games',
       '7th Beat Games', 'Gears for Breakfast', 'Gone North Games',
       'Hazelight', 'Giant Squid', 'Reloaded Productions',
       'Studio Wildcard,Instinct Games,Efecto Studios,Virtual Basement LLC',
       'System Era Softworks', 'Grapeshot Games,Instinct Games',
       'Sloclap', 'LiLith', 'Hyper Hippo Games',
       'Skybox Labs,Hidden Path Entertainment,Ensemble Studios,Forgotten Empires',
       'Forgotten Empires,Tantalus Media,Wicked Witch',
       'Tantalus Media,Forgotten Empires', 'Forgotten Empires,Tantalus',
       'Ensemble Studios', 'Łukasz Jakowski',
       'SkyBox Labs,Ensemble Studios', 'Triumph Studios', 'Statespace',
       'Carbon Games', 'Remedy Entertainment', 'Sandbox Interactive GmbH',
       'Valve', 'Reactive Drop Team',
       'Creative Assembly,Feral Interactive (Mac),Feral Interactive (Linux)',
       'GSQ Games', 'U.S. Army', 'SCS Softw

In [53]:
df['year'].unique()

array([2021, 2020], dtype=int64)

In [54]:
df.gamename.unique()

array(['100% Orange Juice', '12 is Better Than 6', '140', '60 Seconds!',
       '7 Days to Die', '911 Operator', 'A Dance of Fire and Ice',
       'A Hat in Time', 'A Story About My Uncle', 'A Way Out', 'ABZU',
       'APB Reloaded', 'ARK: Survival Evolved', 'ASTRONEER', 'ATLAS',
       'Absolver', 'Action Taimanin', 'AdVenture Capitalist',
       'Age of Empires II (2013)',
       'Age of Empires II: Definitive Edition',
       'Age of Empires III: Definitive Edition',
       'Age of Empires: Definitive Edition', 'Age of Empires® III (2007)',
       'Age of History II', 'Age of Mythology: Extended Edition',
       'Age of Wonders III', 'Aim Lab', 'AirMech Strike', 'Alan Wake',
       "Alan Wake's American Nightmare", 'Albion Online', 'Alien Swarm',
       'Alien Swarm: Reactive Drop', 'Alien: Isolation',
       'Amazing Cultivation Simulator', "America's Army: Proving Grounds",
       'American Truck Simulator', 'Amnesia: A Machine for Pigs',
       'Amnesia: The Dark Descent', 'Antic

In [55]:
df.set_index('gamename', inplace=True)
df.head()

Unnamed: 0_level_0,year,month,avg,gain,peak,avg_peak_perc,date,release_date,required_age,price,...,genre_sports,genre_racing,genre_rpg,genre_simulation,indie,full_audio,full_controller_support,age_0_plus,age_13_plus,age_18_plus
gamename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,2021,January,303.89,-4.16,614,49.4935%,2021-01-01,2014-05-16,0,1.74,...,0,0,0,0,1,1,0,1,0,0
12 is Better Than 6,2021,January,5.94,0.78,18,33%,2021-01-01,2015-11-20,0,9.99,...,0,0,0,0,1,0,1,1,0,0
140,2021,January,2.98,-0.02,11,27.0909%,2021-01-01,2013-10-16,0,4.99,...,0,0,0,0,1,1,1,1,0,0
60 Seconds!,2021,January,33.29,4.64,59,56.4237%,2021-01-01,2015-05-25,0,8.99,...,0,0,0,1,1,1,0,1,0,0
7 Days to Die,2021,January,25366.69,3925.06,39581,64.088%,2021-01-01,2013-12-13,0,24.99,...,0,0,1,1,1,0,1,1,0,0


In [56]:
column_object = df.dtypes[df.dtypes == 'object'].keys()
column_object


Index(['month', 'avg_peak_perc', 'date', 'release_date', 'developers',
       'publishers'],
      dtype='object')

In [57]:
column_object.drop(['month','avg_peak_perc','date','release_date'])

Index(['developers', 'publishers'], dtype='object')

In [58]:
one_hot_label = pd.get_dummies(df[column_object])
one_hot_label.head(3)

Unnamed: 0_level_0,month_February,month_January,avg_peak_perc_13.2268%,avg_peak_perc_13.3797%,avg_peak_perc_13.7215%,avg_peak_perc_14.0609%,avg_peak_perc_15.3421%,avg_peak_perc_16.4606%,avg_peak_perc_16.9515%,avg_peak_perc_18.1829%,...,publishers_Zombie Panic! Team,publishers_gamigo US Inc.,publishers_inXile Entertainment,publishers_kChamp Games,publishers_like Charlie,publishers_marbenx,publishers_tinyBuild,publishers_tobyfox,publishers_Łukasz Jakowski Games,publishers_墨鱼玩游戏
gamename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12 is Better Than 6,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
140,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [59]:
df.drop(column_object,axis=1,inplace=True)
df.head()

Unnamed: 0_level_0,year,avg,gain,peak,required_age,price,dlc_count,windows,mac,linux,...,genre_sports,genre_racing,genre_rpg,genre_simulation,indie,full_audio,full_controller_support,age_0_plus,age_13_plus,age_18_plus
gamename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,2021,303.89,-4.16,614,0,1.74,34,1,0,0,...,0,0,0,0,1,1,0,1,0,0
12 is Better Than 6,2021,5.94,0.78,18,0,9.99,2,1,1,1,...,0,0,0,0,1,0,1,1,0,0
140,2021,2.98,-0.02,11,0,4.99,1,1,1,1,...,0,0,0,0,1,1,1,1,0,0
60 Seconds!,2021,33.29,4.64,59,0,8.99,0,1,1,0,...,0,0,0,1,1,1,0,1,0,0
7 Days to Die,2021,25366.69,3925.06,39581,0,24.99,0,1,1,1,...,0,0,1,1,1,0,1,1,0,0


In [60]:
df = pd.concat([df,one_hot_label],axis=1)
df.head()

Unnamed: 0_level_0,year,avg,gain,peak,required_age,price,dlc_count,windows,mac,linux,...,publishers_Zombie Panic! Team,publishers_gamigo US Inc.,publishers_inXile Entertainment,publishers_kChamp Games,publishers_like Charlie,publishers_marbenx,publishers_tinyBuild,publishers_tobyfox,publishers_Łukasz Jakowski Games,publishers_墨鱼玩游戏
gamename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,2021,303.89,-4.16,614,0,1.74,34,1,0,0,...,False,False,False,False,False,False,False,False,False,False
12 is Better Than 6,2021,5.94,0.78,18,0,9.99,2,1,1,1,...,False,False,False,False,False,False,False,False,False,False
140,2021,2.98,-0.02,11,0,4.99,1,1,1,1,...,False,False,False,False,False,False,False,False,False,False
60 Seconds!,2021,33.29,4.64,59,0,8.99,0,1,1,0,...,False,False,False,False,False,False,False,False,False,False
7 Days to Die,2021,25366.69,3925.06,39581,0,24.99,0,1,1,1,...,False,False,False,False,False,False,False,False,False,False


In [61]:
column_numeric = list(df.dtypes[df.dtypes == 'float64'].keys())
column_numeric.remove('price')

In [62]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df[column_numeric])
i=0
for column in column_numeric:
    df[column] = scaled[:,i]
    i += 1
     


In [63]:
df.memory_usage(deep=True).sum() / 1024**2

3.048013687133789

In [71]:
#Too much mem. we must reduce. going to take most recent year for each game.
df.describe

<bound method NDFrame.describe of                      year       avg      gain   peak  required_age  price  \
gamename                                                                    
100% Orange Juice    2021  0.000408  0.755582    614             0   1.74   
12 is Better Than 6  2021  0.000007  0.755597     18             0   9.99   
140                  2021  0.000003  0.755595     11             0   4.99   
60 Seconds!          2021  0.000044  0.755609     59             0   8.99   
7 Days to Die        2021  0.034130  0.767446  39581             0  24.99   
...                   ...       ...       ...    ...           ...    ...   
Zup! 3               2021  0.000007  0.755587     38             0   0.99   
ibb & obb            2021  0.000055  0.755603    110             0  11.99   
klocki               2021  0.000003  0.755594      7             0   1.99   
rFactor 2            2021  0.000762  0.755480   1272             0   7.99   
theHunter Classic    2021  0.001723  0.755

In [72]:

# Model initiation
model = NearestNeighbors(metric='euclidean')

# Fit model to the data
model.fit(df)

In [77]:
def getRecommended(gamename:str):
    
    distances, neighbors = model.kneighbors(df.loc[gamename],n_neighbors=5)
    similar_games = []
    for gamename in df.loc[neighbors[0][:]].values:
        similar_games.append(gamename[0])
    similar_distance = []
    for distance in distances[0]:
        similar_distance.append(f"{round(100-distance,2)}%")
    return pd.Dataframe(data = {"Game" : similar_games[1:], "Similarity" : similar_distance[1:]})


In [78]:

getRecommended(df.loc['Yakuza Kiwami 2'])

KeyError: "None of [Index([                 2021, 0.0009133097518992537,    0.7562299553262629,\n                        1083,                    17,                 29.99,\n                           1,                     1,                     0,\n                           0,\n       ...\n                       False,                 False,                 False,\n                       False,                 False,                 False,\n                       False,                 False,                 False,\n                       False],\n      dtype='object', name='gamename', length=3074)] are in the [index]"