In [53]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import sparse
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity


In [54]:
df = pd.read_csv('../../data\join_02.csv')
df.info

<bound method DataFrame.info of                 gamename  year      month      avg    gain  peak  \
0      100% Orange Juice  2021   February   278.67  -25.23   645   
1      100% Orange Juice  2021    January   303.89   -4.16   614   
2      100% Orange Juice  2020   December   308.05   23.02   743   
3      100% Orange Juice  2020   November   285.03   -7.41   621   
4      100% Orange Juice  2020    October   292.45   46.68   900   
...                  ...   ...        ...      ...     ...   ...   
63849  theHunter Classic  2014    October   909.46  258.77  2293   
63850  theHunter Classic  2014  September   650.69  -63.60  1717   
63851  theHunter Classic  2014     August   714.29 -337.67  1181   
63852  theHunter Classic  2014       July  1051.96 -110.53  2059   
63853  theHunter Classic  2014       June  1162.48     NaN  2648   

      avg_peak_perc        date release_date  required_age  ...  genre_sports  \
0          43.2047%  2021-02-01   2014-05-16             0  ...       

In [55]:
df.isna().sum()

gamename                        0
year                            0
month                           0
avg                             0
gain                          966
peak                            0
avg_peak_perc                   0
date                            0
release_date                    0
required_age                    0
price                           0
dlc_count                       0
windows                         0
mac                             0
linux                           0
metacritic_score                0
positive                        0
negative                        0
developers                      0
publishers                    227
average_playtime_forever        0
average_playtime_two_weeks      0
median_playtime_forever         0
median_playtime_two_weeks       0
multi_player                    0
pvp                             0
co-op                           0
genre_action                    0
genre_adventure                 0
genre_casual  

In [77]:
df.gamename.unique()

AttributeError: 'DataFrame' object has no attribute 'gamename'

In [56]:
df.duplicated().sum()

0

In [57]:
df.columns

Index(['gamename', 'year', 'month', 'avg', 'gain', 'peak', 'avg_peak_perc',
       'date', 'release_date', 'required_age', 'price', 'dlc_count', 'windows',
       'mac', 'linux', 'metacritic_score', 'positive', 'negative',
       'developers', 'publishers', 'average_playtime_forever',
       'average_playtime_two_weeks', 'median_playtime_forever',
       'median_playtime_two_weeks', 'multi_player', 'pvp', 'co-op',
       'genre_action', 'genre_adventure', 'genre_casual',
       'genre_sexual_content', 'genre_strategy', 'genre_sports',
       'genre_racing', 'genre_rpg', 'genre_simulation', 'indie', 'full_audio',
       'full_controller_support', 'age_0_plus', 'age_13_plus', 'age_18_plus'],
      dtype='object')

In [58]:
df.windows.sum()

63854

In [59]:
df.mac.sum()

32569

In [60]:
df.linux.sum()

26757

In [61]:
df['publishers'].isna().sum()

227

In [62]:
for index in df[df['publishers'].isna()].index:
  df.drop(index, axis=0, inplace=True)

In [63]:
df['publishers'].isna().sum()

0

In [64]:
df.info

<bound method DataFrame.info of                 gamename  year      month      avg    gain  peak  \
0      100% Orange Juice  2021   February   278.67  -25.23   645   
1      100% Orange Juice  2021    January   303.89   -4.16   614   
2      100% Orange Juice  2020   December   308.05   23.02   743   
3      100% Orange Juice  2020   November   285.03   -7.41   621   
4      100% Orange Juice  2020    October   292.45   46.68   900   
...                  ...   ...        ...      ...     ...   ...   
63849  theHunter Classic  2014    October   909.46  258.77  2293   
63850  theHunter Classic  2014  September   650.69  -63.60  1717   
63851  theHunter Classic  2014     August   714.29 -337.67  1181   
63852  theHunter Classic  2014       July  1051.96 -110.53  2059   
63853  theHunter Classic  2014       June  1162.48     NaN  2648   

      avg_peak_perc        date release_date  required_age  ...  genre_sports  \
0          43.2047%  2021-02-01   2014-05-16             0  ...       

In [65]:
df['publishers'].unique()

array(['Fruitbat Factory', 'HypeTrain Digital', 'Carlsen Games',
       'Robot Gentleman', 'The Fun Pimps Entertainment LLC',
       'Games Operators', '7th Beat Games,indienova',
       'Gears for Breakfast', 'Coffee Stain Publishing',
       'Electronic Arts', '505 Games', 'Little Orbit', 'Studio Wildcard',
       'System Era Softworks', 'Grapeshot Games', 'Devolver Digital',
       'Infini-Brain inc.', 'Hyper Hippo Games', 'Xbox Game Studios',
       'Łukasz Jakowski Games', 'Paradox Interactive', 'Statespace',
       'Carbon Games', 'Remedy Entertainment', 'Sandbox Interactive GmbH',
       'Valve', 'Reactive Drop Team',
       'SEGA,Feral Interactive (Mac),Feral Interactive (Linux)',
       'Gamera Games', 'U.S. Army', 'SCS Software', 'Frictional Games',
       'Demruth', 'Lince Works', 'Kakao Games Europe B.V.',
       'Bohemia Interactive', 'League of Geeks', 'Game Science',
       'Mesmerizers', 'Igara Studio', 'Kunos Simulazioni', 'Trion Worlds',
       'Dylan Fitterer', 'Aeri

In [66]:
df['developers'].unique()

array(['Orange_Juice', 'Ink Stains Games', 'Carlsen Games',
       'Robot Gentleman', 'The Fun Pimps', 'Jutsu Games',
       '7th Beat Games', 'Gears for Breakfast', 'Gone North Games',
       'Hazelight', 'Giant Squid', 'Reloaded Productions',
       'Studio Wildcard,Instinct Games,Efecto Studios,Virtual Basement LLC',
       'System Era Softworks', 'Grapeshot Games,Instinct Games',
       'Sloclap', 'LiLith', 'Hyper Hippo Games',
       'Skybox Labs,Hidden Path Entertainment,Ensemble Studios,Forgotten Empires',
       'Forgotten Empires,Tantalus Media,Wicked Witch',
       'Tantalus Media,Forgotten Empires', 'Forgotten Empires,Tantalus',
       'Ensemble Studios', 'Łukasz Jakowski',
       'SkyBox Labs,Ensemble Studios', 'Triumph Studios', 'Statespace',
       'Carbon Games', 'Remedy Entertainment', 'Sandbox Interactive GmbH',
       'Valve', 'Reactive Drop Team',
       'Creative Assembly,Feral Interactive (Mac),Feral Interactive (Linux)',
       'GSQ Games', 'U.S. Army', 'SCS Softw

In [67]:
df['year'].unique()

array([2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012],
      dtype=int64)

In [68]:
df.set_index('gamename', inplace=True)
df.head()

Unnamed: 0_level_0,year,month,avg,gain,peak,avg_peak_perc,date,release_date,required_age,price,...,genre_sports,genre_racing,genre_rpg,genre_simulation,indie,full_audio,full_controller_support,age_0_plus,age_13_plus,age_18_plus
gamename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,2021,February,278.67,-25.23,645,43.2047%,2021-02-01,2014-05-16,0,1.74,...,0,0,0,0,1,1,0,1,0,0
100% Orange Juice,2021,January,303.89,-4.16,614,49.4935%,2021-01-01,2014-05-16,0,1.74,...,0,0,0,0,1,1,0,1,0,0
100% Orange Juice,2020,December,308.05,23.02,743,41.4603%,2020-12-01,2014-05-16,0,1.74,...,0,0,0,0,1,1,0,1,0,0
100% Orange Juice,2020,November,285.03,-7.41,621,45.8986%,2020-11-01,2014-05-16,0,1.74,...,0,0,0,0,1,1,0,1,0,0
100% Orange Juice,2020,October,292.45,46.68,900,32.4944%,2020-10-01,2014-05-16,0,1.74,...,0,0,0,0,1,1,0,1,0,0


In [69]:
column_object = df.dtypes[df.dtypes == 'object'].keys()
column_object


Index(['month', 'avg_peak_perc', 'date', 'release_date', 'developers',
       'publishers'],
      dtype='object')

In [70]:
column_object.drop(['month','avg_peak_perc','date','release_date'])

Index(['developers', 'publishers'], dtype='object')

In [71]:
one_hot_label = pd.get_dummies(df[column_object])
one_hot_label.head(3)

Unnamed: 0_level_0,month_April,month_August,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,...,publishers_Zombie Panic! Team,publishers_gamigo US Inc.,publishers_inXile Entertainment,publishers_kChamp Games,publishers_like Charlie,publishers_marbenx,publishers_tinyBuild,publishers_tobyfox,publishers_Łukasz Jakowski Games,publishers_墨鱼玩游戏
gamename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
100% Orange Juice,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
100% Orange Juice,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [72]:
df.drop(column_object,axis=1,inplace=True)
df.head()

Unnamed: 0_level_0,year,avg,gain,peak,required_age,price,dlc_count,windows,mac,linux,...,genre_sports,genre_racing,genre_rpg,genre_simulation,indie,full_audio,full_controller_support,age_0_plus,age_13_plus,age_18_plus
gamename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,2021,278.67,-25.23,645,0,1.74,34,1,0,0,...,0,0,0,0,1,1,0,1,0,0
100% Orange Juice,2021,303.89,-4.16,614,0,1.74,34,1,0,0,...,0,0,0,0,1,1,0,1,0,0
100% Orange Juice,2020,308.05,23.02,743,0,1.74,34,1,0,0,...,0,0,0,0,1,1,0,1,0,0
100% Orange Juice,2020,285.03,-7.41,621,0,1.74,34,1,0,0,...,0,0,0,0,1,1,0,1,0,0
100% Orange Juice,2020,292.45,46.68,900,0,1.74,34,1,0,0,...,0,0,0,0,1,1,0,1,0,0


In [73]:
df = pd.concat([df,one_hot_label],axis=1)
df.head()

Unnamed: 0_level_0,year,avg,gain,peak,required_age,price,dlc_count,windows,mac,linux,...,publishers_Zombie Panic! Team,publishers_gamigo US Inc.,publishers_inXile Entertainment,publishers_kChamp Games,publishers_like Charlie,publishers_marbenx,publishers_tinyBuild,publishers_tobyfox,publishers_Łukasz Jakowski Games,publishers_墨鱼玩游戏
gamename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,2021,278.67,-25.23,645,0,1.74,34,1,0,0,...,False,False,False,False,False,False,False,False,False,False
100% Orange Juice,2021,303.89,-4.16,614,0,1.74,34,1,0,0,...,False,False,False,False,False,False,False,False,False,False
100% Orange Juice,2020,308.05,23.02,743,0,1.74,34,1,0,0,...,False,False,False,False,False,False,False,False,False,False
100% Orange Juice,2020,285.03,-7.41,621,0,1.74,34,1,0,0,...,False,False,False,False,False,False,False,False,False,False
100% Orange Juice,2020,292.45,46.68,900,0,1.74,34,1,0,0,...,False,False,False,False,False,False,False,False,False,False


In [74]:
column_numeric = list(df.dtypes[df.dtypes == 'float64'].keys())
column_numeric.remove('price')

In [75]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df[column_numeric])
i=0
for column in column_numeric:
    df[column] = scaled[:,i]
    i += 1
     
df.head()

Unnamed: 0_level_0,year,avg,gain,peak,required_age,price,dlc_count,windows,mac,linux,...,publishers_Zombie Panic! Team,publishers_gamigo US Inc.,publishers_inXile Entertainment,publishers_kChamp Games,publishers_like Charlie,publishers_marbenx,publishers_tinyBuild,publishers_tobyfox,publishers_Łukasz Jakowski Games,publishers_墨鱼玩游戏
gamename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,2021,0.000325,0.57283,645,0,1.74,34,1,0,0,...,False,False,False,False,False,False,False,False,False,False
100% Orange Juice,2021,0.000354,0.572879,614,0,1.74,34,1,0,0,...,False,False,False,False,False,False,False,False,False,False
100% Orange Juice,2020,0.000359,0.572941,743,0,1.74,34,1,0,0,...,False,False,False,False,False,False,False,False,False,False
100% Orange Juice,2020,0.000332,0.572871,621,0,1.74,34,1,0,0,...,False,False,False,False,False,False,False,False,False,False
100% Orange Juice,2020,0.000341,0.572995,900,0,1.74,34,1,0,0,...,False,False,False,False,False,False,False,False,False,False


In [76]:

# Model initiation
model = NearestNeighbors(metric='euclidean')

# Fit model to the data
model.fit(df)

MemoryError: Unable to allocate 26.8 GiB for an array with shape (56458, 63627) and data type float64