In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.spatial.distance import cdist, pdist
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

In [4]:
# Path (relative to this notebook) of the cleaned Steam feature table.
data = "../data/steam_cleaned.csv"

In [8]:
# Load the cleaned feature table, keyed by app id (the Unique_ID column).
df = pd.read_csv(data, index_col='Unique_ID')
df.head()

Unnamed: 0_level_0,win_comp,mac_comp,linux_comp,percent_review,total_review,discount,price,"""1990s""","""Beatemup""","""ShootEmUp""",...,Werewolves,Western,Wholesome,WordGame,WorldWarI,WorldWarII,Wrestling,Zombies,eSports,year
Unique_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1643320,1,0,0,0,0,0,59.99,0,0,0,...,0,0,0,0,0,0,0,0,0,2023.0
2172860,1,1,0,0,0,0,39.99,0,0,0,...,0,0,0,0,0,0,0,0,0,2023.0
1364780,1,0,0,0,0,0,59.99,0,0,0,...,0,0,0,0,0,0,0,0,0,2023.0
1868180,1,0,0,0,0,0,39.99,0,0,0,...,0,0,0,0,0,0,0,0,0,2023.0
1868170,1,0,0,0,0,0,39.99,0,0,0,...,0,0,0,0,0,0,0,0,0,2023.0


In [14]:
# App ids in row order; reused below to label both axes of every pairwise matrix.
idLabels = df.index.tolist()

9286

## Cosine Similarity

In [9]:
# Standardise every feature column. Note that the result is a NumPy array,
# not a DataFrame, despite the variable name.
scaled_df = StandardScaler().fit(df).transform(df)

In [18]:
# Pairwise cosine similarity between all standardised rows, labelled by app id
# on both axes so a row can be looked up with cos_df.loc[app_id].
cos_df = pd.DataFrame(
    cosine_similarity(scaled_df),
    index=idLabels,
    columns=idLabels,
)
cos_df.head()

Unnamed: 0,1643320,2172860,1364780,1868180,1868170,1810820,1485590,1294810,1774580,1798020,...,592370,418800,418790,564310,237810,335610,249860,49540,418770,57925
1643320,1.0,0.136589,0.043632,0.056806,0.05584,0.05584,0.016036,0.4011,0.059817,0.010533,...,-0.005388,-0.006058,-0.003496,0.046075,0.003363,0.016141,-0.022846,0.071663,-0.007898,0.128139
2172860,0.136589,1.0,0.055725,0.179138,0.060406,0.060406,0.018852,0.189843,0.069052,0.029298,...,0.003048,0.026487,0.029549,-0.044207,0.023695,0.019161,-0.018141,-0.009927,0.026929,-0.007415
1364780,0.043632,0.055725,1.0,0.039466,0.0388,0.0388,0.139189,0.104632,0.07719,0.127066,...,0.139055,0.012754,0.014924,-0.048375,0.000186,0.011665,-0.011208,-0.001637,0.011494,-0.043265
1868180,0.056806,0.179138,0.039466,1.0,0.883715,0.883715,0.22754,0.067414,0.04972,0.015514,...,-0.002344,0.006995,0.009442,-0.082805,0.014669,0.010331,-0.020004,-0.012832,0.007219,-0.082968
1868170,0.05584,0.060406,0.0388,0.883715,1.0,1.0,0.223502,0.066268,0.048875,0.015285,...,-0.002276,0.006896,0.0093,-0.081241,0.014431,0.010173,-0.019622,-0.012579,0.007117,-0.081346


In [22]:
def top10_cos(id, df=None):
    """
    Returns top 10 app ids along with their cosine similarity to argument id

    The looked-up app is removed from the candidates by label rather than by
    position. Duplicate rows have a similarity tied with the self-match, so
    "skip the first sorted entry" can discard a real match and keep the app
    itself in the result.

    Args:
        id (int): id of app to lookup
        df (pd.DataFrame, optional): similarity matrix labelled by app id on
            both axes. Defaults to the notebook-level ``cos_df``.

    Returns:
        pd.DataFrame: at most 10 rows indexed by app id, most similar first,
        with columns ``Title`` and ``Cosine_Similarity``. ``Title`` is NaN for
        ids missing from ``../data/steam.csv``.

    Raises:
        KeyError: if ``id`` is not a row label of the matrix.
    """
    if df is None:
        df = cos_df

    # Create ID-Title Dictionary (only the two needed columns are read)
    titles = pd.read_csv('../data/steam.csv', usecols=['Unique_ID', 'title'])
    idNamesDict = titles.set_index('Unique_ID')['title'].to_dict()

    # Get top 10 matches, excluding the queried app itself
    scores = (
        df.loc[id]
        .drop(labels=id, errors='ignore')
        .sort_values(ascending=False, kind='mergesort')
        .head(10)
    )

    res = scores.to_frame(name='Cosine_Similarity')
    res.insert(0, 'Title', res.index.map(idNamesDict))
    return res

In [36]:
# Ten most cosine-similar apps to app 1245620 on the full standardised features.
top10_cos(id=1245620)

Unnamed: 0,Title,Cosine_Similarity
1343240,Thymesia,0.883818
335300,DARK SOULS™ II: Scholar of the First Sin,0.877885
265300,Lords Of The Fallen™ 2014,0.857842
1108590,Eldest Souls,0.837568
1448440,Wo Long: Fallen Dynasty,0.829018
1054700,Back To Ashes,0.81062
1110910,Mortal Shell,0.799705
1456650,Pascal's Wager: Definitive Edition,0.785882
1672210,Mortal Shell: The Virtuous Cycle,0.759416
1766100,The Last Hero of Nostalgaia,0.712582


## Cosine Similarity on TSVD-Reduced Data

In [38]:
# Standardise the features, project them onto 383 SVD components, then compute
# pairwise cosine similarity in the reduced space.
scaled_data = StandardScaler().fit_transform(df)
# TruncatedSVD uses a randomized solver by default; pin the seed so the
# embedding (and every similarity derived from it) is identical on each run.
# NOTE(review): 383 components is hard-coded; presumably chosen from an
# explained-variance analysis that is not shown here. Confirm.
tsvd = TruncatedSVD(n_components=383, random_state=42)
data_tsvd = tsvd.fit_transform(scaled_data)
cos_TSVD_df = pd.DataFrame(
    cosine_similarity(data_tsvd),
    index=idLabels,
    columns=idLabels,
)
cos_TSVD_df.head()

Unnamed: 0,1643320,2172860,1364780,1868180,1868170,1810820,1485590,1294810,1774580,1798020,...,592370,418800,418790,564310,237810,335610,249860,49540,418770,57925
1643320,1.0,0.136619,0.037531,0.047129,0.043419,0.043419,0.01335,0.405318,0.061245,-0.006085,...,-0.012425,-0.005546,-0.002183,0.066916,0.003312,0.014608,-0.017295,0.076227,-0.00598,0.151658
2172860,0.136619,1.0,0.059711,0.186536,0.056469,0.056469,-0.003282,0.193161,0.058241,0.020891,...,0.003232,0.027024,0.030932,-0.045202,0.034147,0.015432,-0.020075,-0.006921,0.028707,-0.059815
1364780,0.037531,0.059711,1.0,0.035569,0.03115,0.03115,0.139919,0.104379,0.081156,0.151852,...,0.148471,0.011466,0.015079,-0.06101,0.001552,0.014197,-0.007127,0.001943,0.011829,-0.077371
1868180,0.047129,0.186536,0.035569,1.0,0.905177,0.905177,0.231336,0.072428,0.048731,-0.005792,...,-0.005508,0.005621,0.008976,-0.093138,0.01578,0.009654,-0.016785,-0.010971,0.006854,-0.108726
1868170,0.043419,0.056469,0.03115,0.905177,1.0,1.0,0.242359,0.060583,0.036221,0.018885,...,-0.010454,0.00296,0.006679,-0.086585,0.016359,0.01111,-0.01565,-0.011409,0.00461,-0.118617


In [39]:
def top10_cos(id, df=None):
    """
    Returns top 10 app ids along with their cosine similarity to argument id

    The looked-up app is removed from the candidates by label rather than by
    position. Duplicate rows have a similarity tied with the self-match, so
    "skip the first sorted entry" can discard a real match and keep the app
    itself in the result.

    Args:
        id (int): id of app to lookup
        df (pd.DataFrame, optional): similarity matrix labelled by app id on
            both axes. Defaults to the notebook-level ``cos_df`` so the
            earlier one-argument call form keeps working after this
            redefinition.

    Returns:
        pd.DataFrame: at most 10 rows indexed by app id, most similar first,
        with columns ``Title`` and ``Cosine_Similarity``. ``Title`` is NaN for
        ids missing from ``../data/steam.csv``.

    Raises:
        KeyError: if ``id`` is not a row label of the matrix.
    """
    if df is None:
        df = cos_df

    # Create ID-Title Dictionary (only the two needed columns are read)
    titles = pd.read_csv('../data/steam.csv', usecols=['Unique_ID', 'title'])
    idNamesDict = titles.set_index('Unique_ID')['title'].to_dict()

    # Get top 10 matches, excluding the queried app itself
    scores = (
        df.loc[id]
        .drop(labels=id, errors='ignore')
        .sort_values(ascending=False, kind='mergesort')
        .head(10)
    )

    res = scores.to_frame(name='Cosine_Similarity')
    res.insert(0, 'Title', res.index.map(idNamesDict))
    return res

In [40]:
# Same lookup as before, but ranked on the TSVD-reduced cosine similarities.
top10_cos(id=1245620, df=cos_TSVD_df)

Unnamed: 0,Title,Cosine_Similarity
1343240,Thymesia,0.903654
335300,DARK SOULS™ II: Scholar of the First Sin,0.893949
265300,Lords Of The Fallen™ 2014,0.869411
1108590,Eldest Souls,0.845653
1448440,Wo Long: Fallen Dynasty,0.841292
1054700,Back To Ashes,0.828549
1110910,Mortal Shell,0.827697
1456650,Pascal's Wager: Definitive Edition,0.800453
1672210,Mortal Shell: The Virtuous Cycle,0.790734
1766100,The Last Hero of Nostalgaia,0.737233


## Euclidean Distance

In [30]:
# Pairwise Euclidean distance between all standardised rows, labelled by app id
# on both axes ('euclid' is the metric name passed through to cdist).
euc_df = pd.DataFrame(
    cdist(scaled_df, scaled_df, 'euclid'),
    index=idLabels,
    columns=idLabels,
)
euc_df.head()

Unnamed: 0,1643320,2172860,1364780,1868180,1868170,1810820,1485590,1294810,1774580,1798020,...,592370,418800,418790,564310,237810,335610,249860,49540,418770,57925
1643320,0.0,18.281652,22.852627,20.740453,20.961396,20.961396,22.27053,14.526682,20.98936,21.066711,...,33.105313,31.975853,31.686032,16.992243,33.741213,31.380471,35.572885,35.527203,31.752637,14.75137
2172860,18.281652,0.0,22.221073,18.859703,20.374074,20.374074,21.709519,16.234096,20.355424,20.30679,...,32.635609,31.198199,30.901086,17.016149,33.136751,30.973934,35.160258,36.280025,30.943866,14.70236
1364780,22.852627,22.221073,0.0,23.820029,24.012654,24.012654,23.446417,20.902792,23.59432,22.582088,...,32.760325,33.743056,33.468542,21.299937,35.716899,33.468459,37.31128,38.309463,33.531607,19.558985
1868180,20.740453,18.859703,23.820029,0.0,7.70808,7.70808,20.605668,19.188427,22.089839,22.045032,...,33.744639,32.501134,32.216039,19.301436,34.239422,32.150869,36.192009,37.280798,32.257076,17.296112
1868170,20.961396,20.374074,24.012654,7.70808,0.0,0.0,20.828042,19.42703,22.297416,22.253026,...,33.880886,32.642572,32.358723,19.538659,34.373708,32.293841,36.319076,37.404167,32.399579,17.560446


In [33]:
def top10_euc(id, df=None):
    """
    Returns top 10 app ids along with their Euclidean Distance to argument id

    The looked-up app is removed from the candidates by label rather than by
    position. Duplicate rows sit at distance 0.0, tied with the self-match, so
    "skip the first sorted entry" can discard a real match and keep the app
    itself in the result.

    Args:
        id (int): id of app to lookup
        df (pd.DataFrame, optional): distance matrix labelled by app id on
            both axes. Defaults to the notebook-level ``euc_df``.

    Returns:
        pd.DataFrame: at most 10 rows indexed by app id, nearest first, with
        columns ``Title`` and ``Euclidean_Distance``. ``Title`` is NaN for ids
        missing from ``../data/steam.csv``.

    Raises:
        KeyError: if ``id`` is not a row label of the matrix.
    """
    if df is None:
        df = euc_df

    # Create ID-Title Dictionary (only the two needed columns are read)
    titles = pd.read_csv('../data/steam.csv', usecols=['Unique_ID', 'title'])
    idNamesDict = titles.set_index('Unique_ID')['title'].to_dict()

    # Get top 10 matches, excluding the queried app itself
    distances = (
        df.loc[id]
        .drop(labels=id, errors='ignore')
        .sort_values(ascending=True, kind='mergesort')
        .head(10)
    )

    res = distances.to_frame(name='Euclidean_Distance')
    res.insert(0, 'Title', res.index.map(idNamesDict))
    return res

In [35]:
# Ten nearest apps to app 1245620 by Euclidean distance on the full standardised features.
top10_euc(id=1245620)

Unnamed: 0,Title,Euclidean_Distance
335300,DARK SOULS™ II: Scholar of the First Sin,9.294261
1343240,Thymesia,9.388622
265300,Lords Of The Fallen™ 2014,9.982138
1108590,Eldest Souls,10.650063
1448440,Wo Long: Fallen Dynasty,11.176794
1054700,Back To Ashes,11.269385
1110910,Mortal Shell,11.630913
1456650,Pascal's Wager: Definitive Edition,12.018141
1672210,Mortal Shell: The Virtuous Cycle,12.805354
378540,The Surge,14.441094


## Euclidean Distance on TSVD-Reduced Data

In [41]:
# Standardise the features, project them onto 383 SVD components, then compute
# pairwise Euclidean distance in the reduced space.
scaled_data = StandardScaler().fit_transform(df)
# TruncatedSVD uses a randomized solver by default; pin the seed so the
# embedding (and every distance derived from it) is identical on each run.
tsvd = TruncatedSVD(n_components=383, random_state=42)
data_tsvd = tsvd.fit_transform(scaled_data)
euc_TSVD_df = pd.DataFrame(
    cdist(data_tsvd, data_tsvd, 'euclid'),
    index=idLabels,
    columns=idLabels,
)
# Legacy alias: this cell historically stored the distance matrix under
# cos_TSVD_df, and the lookup call further down still reads that name.
# From here on cos_TSVD_df holds Euclidean distances, not cosine similarity;
# prefer euc_TSVD_df in new code.
cos_TSVD_df = euc_TSVD_df
euc_TSVD_df.head()

Unnamed: 0,1643320,2172860,1364780,1868180,1868170,1810820,1485590,1294810,1774580,1798020,...,592370,418800,418790,564310,237810,335610,249860,49540,418770,57925
1643320,0.0,17.847789,22.411074,20.532061,20.709463,20.709463,21.484279,14.105496,20.441153,18.9011,...,32.50083,31.641063,31.442551,16.186177,33.470525,31.100659,35.212403,35.235221,31.500851,14.12162
2172860,17.847789,0.0,21.668332,18.447892,19.92227,19.92227,21.10478,15.734328,19.869205,18.048578,...,31.879157,30.815921,30.601935,16.277564,32.735891,30.657609,34.825021,35.952486,30.640446,14.194503
1364780,22.411074,21.668332,0.0,23.257192,23.38631,23.38631,22.678744,20.305775,22.749641,20.495655,...,31.936869,33.328907,33.147431,20.560925,35.302627,33.076327,36.958357,37.908223,33.206279,18.876097
1868180,20.532061,18.447892,23.257192,0.0,6.976561,6.976561,19.909973,18.824376,21.604745,20.010747,...,33.108778,32.123958,31.954561,18.690747,33.940388,31.827534,35.828374,36.988407,31.993216,16.683067
1868170,20.709463,19.92227,23.38631,6.976561,0.0,0.0,19.874107,19.015483,21.769827,20.057092,...,33.220682,32.266566,32.080941,18.866444,34.110242,31.904859,35.92174,37.088129,32.123482,16.907555


In [44]:
def top10_euc(id, df=None):
    """
    Returns top 10 app ids along with their Euclidean Distance to argument id

    The looked-up app is removed from the candidates by label rather than by
    position. Duplicate rows sit at distance 0.0, tied with the self-match, so
    "skip the first sorted entry" can discard a real match and keep the app
    itself in the result.

    Args:
        id (int): id of app to lookup
        df (pd.DataFrame, optional): distance matrix labelled by app id on
            both axes. Defaults to the notebook-level ``euc_df`` so the
            earlier one-argument call form keeps working after this
            redefinition.

    Returns:
        pd.DataFrame: at most 10 rows indexed by app id, nearest first, with
        columns ``Title`` and ``Euclidean_Distance``. ``Title`` is NaN for ids
        missing from ``../data/steam.csv``.

    Raises:
        KeyError: if ``id`` is not a row label of the matrix.
    """
    if df is None:
        df = euc_df

    # Create ID-Title Dictionary (only the two needed columns are read)
    titles = pd.read_csv('../data/steam.csv', usecols=['Unique_ID', 'title'])
    idNamesDict = titles.set_index('Unique_ID')['title'].to_dict()

    # Get top 10 matches, excluding the queried app itself
    distances = (
        df.loc[id]
        .drop(labels=id, errors='ignore')
        .sort_values(ascending=True, kind='mergesort')
        .head(10)
    )

    res = distances.to_frame(name='Euclidean_Distance')
    res.insert(0, 'Title', res.index.map(idNamesDict))
    return res

In [45]:
# Nearest apps by Euclidean distance in the TSVD-reduced space.
# NOTE: despite its name, cos_TSVD_df holds Euclidean distances at this point;
# it was reassigned from the cdist(...) result in the preceding TSVD cell.
top10_euc(id=1245620, df=cos_TSVD_df)

Unnamed: 0,Title,Euclidean_Distance
335300,DARK SOULS™ II: Scholar of the First Sin,8.496723
1343240,Thymesia,8.527293
265300,Lords Of The Fallen™ 2014,9.543963
1108590,Eldest Souls,10.256283
1054700,Back To Ashes,10.423606
1110910,Mortal Shell,10.459138
1448440,Wo Long: Fallen Dynasty,10.542279
1456650,Pascal's Wager: Definitive Edition,11.468814
1672210,Mortal Shell: The Virtuous Cycle,11.597084
1413990,The Heart of Darkness,13.22903
