In [59]:
import numpy as np
import pandas as pd


In [60]:
df = pd.read_json("review_data_small.json")

# normalize the JSON data to extract the fields
df = pd.json_normalize(df['fields'])

# count the occurrences of each steamid
counts = df.groupby('steamid').size()

# create a boolean mask for rows with steamid that appears less than 5 times
mask = df['steamid'].isin(counts[counts < 20].index)

# drop the corresponding rows from the original dataframe
df_filtered = df.drop(df[mask].index)

# print the filtered dataframe
df_filtered.head(20)

Unnamed: 0,gameid,steamid,playtime
25,493520,76561198163915061,1195
487,493520,76561198086514998,233
766,493520,76561198150994235,460
1086,346900,76561198089074577,5
1185,346900,76561198119316678,565
1223,228280,76561198315585536,268
1269,228280,76561197965049785,2389
1284,228280,76561198025696681,4976
1315,228280,76561198030123832,8344
1331,228280,76561198049368871,1817


In [61]:
df_filtered.info()
# df.head()|

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9024 entries, 25 to 995161
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   gameid    9024 non-null   object
 1   steamid   9024 non-null   object
 2   playtime  9024 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 282.0+ KB


In [62]:
top_hours = df_filtered.groupby('gameid')['playtime'].max().reset_index()
top_hours = top_hours.sort_values(by=['playtime'], ascending = False, ignore_index=True)
top_hours.head(20)

Unnamed: 0,gameid,playtime
0,201270,855573
1,570,601623
2,474960,522251
3,712100,324060
4,10190,292535
5,236850,213922
6,1100600,209613
7,872790,207070
8,202990,198240
9,1263850,195865


In [63]:
def frequency(user_playtime, all_playtime):
    if not user_playtime:
        return 0
    elif user_playtime >= all_playtime:
        return 1
    return user_playtime / all_playtime

def rating(frequency_sum):
    return 4 * (1 - frequency_sum) + 1

In [64]:
game_playtime = df_filtered.groupby(['gameid'])['playtime'].sum().reset_index()
length = len(df_filtered.index)
df_filtered['frequency'] = np.nan
# game_playtime.head(20)
df_filtered.head(20)

Unnamed: 0,gameid,steamid,playtime,frequency
25,493520,76561198163915061,1195,
487,493520,76561198086514998,233,
766,493520,76561198150994235,460,
1086,346900,76561198089074577,5,
1185,346900,76561198119316678,565,
1223,228280,76561198315585536,268,
1269,228280,76561197965049785,2389,
1284,228280,76561198025696681,4976,
1315,228280,76561198030123832,8344,
1331,228280,76561198049368871,1817,


In [65]:
count = 1
for i, row in df_filtered.iterrows():
    user_playtime = row["playtime"]
    all_playtime = game_playtime[game_playtime["gameid"] == row["gameid"]].iloc[0][1]
    df_filtered.at[i, "frequency"] = frequency(user_playtime , all_playtime)
    # print(df_filtered.at[i, "frequency"])
    print(f"{count} / {length}", end="\r", flush=True)
    count += 1


9024 / 9024

In [66]:
df_filtered.head(20)

Unnamed: 0,gameid,steamid,playtime,frequency
25,493520,76561198163915061,1195,0.632945
487,493520,76561198086514998,233,0.123411
766,493520,76561198150994235,460,0.243644
1086,346900,76561198089074577,5,0.008772
1185,346900,76561198119316678,565,0.991228
1223,228280,76561198315585536,268,0.005133
1269,228280,76561197965049785,2389,0.045754
1284,228280,76561198025696681,4976,0.0953
1315,228280,76561198030123832,8344,0.159804
1331,228280,76561198049368871,1817,0.034799


In [67]:
df_filter_saved = df_filtered.copy(deep=True)

# df_filtered['rating'] = np.nan
# df_filtered.sort_values(by = ['gameid', 'frequency'], inplace = True, ascending = False, ignore_index = True)
# df_filtered.head(20)
df_filtered.sort_values(by = ['gameid', 'frequency'], ascending = False, ignore_index=True)
df_filtered['rating'] = np.nan
# df_filter_saved.head(20)
df_filtered.head(20)

Unnamed: 0,gameid,steamid,playtime,frequency,rating
25,493520,76561198163915061,1195,0.632945,
487,493520,76561198086514998,233,0.123411,
766,493520,76561198150994235,460,0.243644,
1086,346900,76561198089074577,5,0.008772,
1185,346900,76561198119316678,565,0.991228,
1223,228280,76561198315585536,268,0.005133,
1269,228280,76561197965049785,2389,0.045754,
1284,228280,76561198025696681,4976,0.0953,
1315,228280,76561198030123832,8344,0.159804,
1331,228280,76561198049368871,1817,0.034799,


In [68]:
last_game = "" # 마지막에 본 게임
sum_f = 0 # 현재 frequency의 sum
last_f = 0 # 마지막 frequency(같은 frequency를 가진 유저 처리)
last_r = 0 # 마지막 rating(같은 frequency 가진 유저 처리)

for i, row in df_filtered.iterrows(): #go through the entire dataframe
    current_game = row[0] #find out which game we are one
    f_i = row[3] #find out the frequency for that game and user 
    # print(current_game)

    if current_game != last_game: #if it's the first time we are going over the game
        last_game = current_game #indicate we are now on a new game and save it's name
        if f_i == 0.0: #if there are no hours for the game
            sum_f = 0 #reset running sum
            last_f = 0 #reset last frequency
            last_r = 1 #set last rating
            df_filtered.at[i, 'rating'] = 1  #if there are no hours, give it the lowest rating
        else: #if there are hours for the game, must be the top user
            sum_f = f_i #first time on this game so sum is the frequency
            last_f = f_i #set last frequency
            last_r = 5 #set last rating
            df_filtered.at[i, 'rating'] = 5 #return the highest rating b/c top user

    else: #it's not the first time on this game
        if f_i == 0: #multiple users have zero hours
            last_r = 1 #set last rating 
            df_filtered.at[i, 'rating'] = 1 #return lowest score; don't need to reset anything
        elif last_f == f_i: #if there are multiple users with the same frequency
            sum_f += f_i #add to running sum
            df_filtered.at[i, 'rating'] = last_r #return last rating
        else:
            rating_f = rating(sum_f)  #calculate the rating
            print(rating_f)
            sum_f += f_i #update the sum
            last_f = f_i #update the last frequency
            last_r = rating_f #update the last rating
            df_filtered.at[i, 'rating'] = rating_f


25 gameid                  493520
steamid      76561198163915061
playtime                  1195
frequency             0.632945
rating                     NaN
Name: 25, dtype: object 493520 76561198163915061
487 gameid                  493520
steamid      76561198086514998
playtime                   233
frequency             0.123411
rating                     NaN
Name: 487, dtype: object 493520 76561198086514998
2.468220338983051
766 gameid                  493520
steamid      76561198150994235
playtime                   460
frequency             0.243644
rating                     NaN
Name: 766, dtype: object 493520 76561198150994235
1.9745762711864407
1086 gameid                  346900
steamid      76561198089074577
playtime                     5
frequency             0.008772
rating                     NaN
Name: 1086, dtype: object 346900 76561198089074577
1185 gameid                  346900
steamid      76561198119316678
playtime                   565
frequency             0.99122

In [69]:
df_filtered[:80]

Unnamed: 0,gameid,steamid,playtime,frequency,rating
25,493520,76561198163915061,1195,0.632945,5.000000
487,493520,76561198086514998,233,0.123411,2.468220
766,493520,76561198150994235,460,0.243644,1.974576
1086,346900,76561198089074577,5,0.008772,5.000000
1185,346900,76561198119316678,565,0.991228,4.964912
...,...,...,...,...,...
8649,606150,76561197970406584,822,0.063109,4.121075
8743,606150,76561197993240784,758,0.058196,3.868637
8767,606150,76561198192625621,22,0.001689,3.635854
8866,606150,76561197963852846,17,0.001305,3.629098


In [70]:
pivot_table = pd.pivot_table(df_filtered, values='rating', index=['steamid'], columns=['gameid'])
pivot_table.tail(20)

gameid,10,100,10090,1016920,1017180,10180,10190,1029780,1030210,1030830,...,976730,977880,977950,979690,985890,9900,992300,994280,996580,997070
steamid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
76561198327764276,,,,,,,,,,,...,,,,,,,,,,
76561198333469466,,,,,,,,,,,...,,,,,,,,,,
76561198367541956,,,,,,,,,,,...,,,,,,,2.360134,,,
76561198367894395,,,,,,,,,,,...,,,,,,,,,,
76561198372421801,,,,,,4.083663,,,,,...,,,,,,,,,,
76561198376809656,,,,,,,,,,,...,,,,,,,,,,
76561198397700047,,,,,,,,,,,...,,,,,,,,,,
76561198401442310,,,,,,,,,,,...,,,,,,,,,,
76561198412953565,,,,,,,,,,,...,,,,1.656631,,,,,,
76561198449034673,,,,,,,,,,,...,,,,3.117378,,,,,,


In [71]:
from sklearn.metrics.pairwise import cosine_similarity

In [72]:
cos_sim_matrix = cosine_similarity(pivot_table.fillna(0))

cos_sim_df = pd.DataFrame(cos_sim_matrix, columns=pivot_table.index, index=pivot_table.index)


In [73]:
cos_sim_df.tail(20)

steamid,76561197961093505,76561197962973671,76561197963852846,76561197965049785,76561197965986699,76561197966788390,76561197967982736,76561197968210099,76561197969062133,76561197969379991,...,76561198825459467,76561198854641697,76561198855141224,76561198856712497,76561198913734471,76561198922800008,76561198960579742,76561198963788044,76561198981658318,76561198989051349
steamid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
76561198327764276,0.0,0.077938,0.0,0.0,0.0,0.012814,0.080709,0.0,0.0,0.065169,...,0.198502,0.0,0.007313,0.039744,0.0,0.026145,0.0,0.085404,0.0,0.051326
76561198333469466,0.04983,0.0,0.083957,0.02439,0.0,0.019998,0.0,0.080363,0.0,0.0,...,0.03138,0.012394,0.048318,0.019365,0.075481,0.065665,0.0,0.0,0.0,0.055327
76561198367541956,0.0,0.0,0.0,0.0,0.0803,0.012053,0.019671,0.010852,0.0,0.074449,...,0.063223,0.060097,0.068752,0.041948,0.063273,0.0,0.0,0.0,0.07086,0.0
76561198367894395,0.0,0.0,0.129392,0.0,0.0,0.0,0.065652,0.0,0.031804,0.081897,...,0.0,0.0,0.0,0.016112,0.027291,0.008964,0.10487,0.0,0.047741,0.03684
76561198372421801,0.0,0.0,0.0,0.0,0.018029,0.050154,0.044989,0.023253,0.078207,0.0,...,0.081486,0.005588,0.059011,0.012504,0.0,0.037621,0.0,0.065451,0.0,0.040999
76561198376809656,0.083512,0.0,0.0,0.0,0.025649,0.073251,0.0,0.0,0.03537,0.0,...,0.042069,0.003119,0.065055,0.104144,0.0,0.019011,0.03221,0.0,0.0,0.112313
76561198397700047,0.0,0.0,0.0,0.070128,0.042676,0.027519,0.0,0.0,0.036066,0.0,...,0.0584,0.037894,0.01566,0.063324,0.05666,0.0,0.014774,0.0,0.0,0.010288
76561198401442310,0.0,0.033198,0.03109,0.020838,0.013072,0.104449,0.13196,0.075878,0.0,0.0,...,0.009799,0.028362,0.0,0.0,0.021183,0.0,0.0,0.0,0.0,0.172171
76561198412953565,0.0,0.0,0.022376,0.031908,0.089777,0.0,0.026069,0.069227,0.02415,0.0,...,0.0,0.112083,0.016461,0.0,0.0,0.0,0.044929,0.0,0.0,0.0
76561198449034673,0.014904,0.0,0.0,0.010744,0.018898,0.0,0.0,0.06698,0.047663,0.019676,...,0.064165,0.046712,0.016182,0.0,0.0,0.0,0.0,0.017682,0.121383,0.143501


In [74]:
top_100_neighbors = cos_sim_df["76561198315585536"].sort_values(ascending=False)[1:30]
top_100_neighbors = list(top_100_neighbors.index)

In [75]:
print(top_100_neighbors)

['76561198061975321', '76561198048447597', '76561198049368871', '76561198067090064', '76561198025696681', '76561198027973295', '76561198194419315', '76561198058852804', '76561198239163744', '76561198013146396', '76561198030123832', '76561198011965365', '76561198072094217', '76561198062813911', '76561198134513001', '76561197989807190', '76561198011551145', '76561198142553612', '76561198275956300', '76561198016573541', '76561197989746995', '76561198095217331', '76561197981027148', '76561198138642342', '76561198136250191', '76561198155755820', '76561198038757354', '76561198113014119', '76561197977450814']


In [76]:
#recommend games based on the neighbors' ratings
def get_recommend(user, neighbor_list, df):
    #which games the user already has
    user_games = df[df['steamid'] == user]
    candidates = []
    #go through all the neighbors
    for neighbor in neighbor_list:
        #make a temporary table containing all of the games that the neighbor has but the user does not
        temp = df[(df['steamid'] == neighbor) & (~df['gameid'].isin(user_games['gameid']))]
        #loop through the games in temp
        for index, game in temp.iterrows():
            #add the game and its rating to the dissimilar games list
            candidates.append((game['gameid'], game['rating']))
    #sort the dissimilar games list by the game name
    candidates.sort(key=lambda x: x[0])
    #flag to see if moved on to a new game
    flag = ""
    #running sum of all the ratings
    running_sum = 0
    #list we will add the recomendations to
    rec_list = []
    #count of how many times the game was in candidates
    count = 0
    #loop through all of the games
    for dis in candidates:
        #if it's the first time the game has come up in the loop
        if flag != dis[0]:
            #if it's not the first time the loop has run
            #if it was then we do not want to append anything
            if flag != "":
                #append the last game name and the average rating
                rec_list.append((flag, running_sum/count))
            #set the flag to the new gae
            flag = dis[0]
            #set the running sum to the current rating
            running_sum = dis[1]
            #reset the counter
            count = 1
        #multiple ratings for the same game
        else:
            #add the current rating to the running sum
            running_sum += dis[1]
            #increment the counter
            count += 1
    #sort the list of recommended games with the highest rating first
    sort_list = sorted(rec_list, key=lambda x: x[1], reverse = True)
    return(sort_list)

In [77]:
df_filtered_2 = df_filtered
df_filtered_2.sort_values(by = ['steamid', 'gameid'], ignore_index = True)
df_filtered_2.head(20)
# df_filtered.head(20)

Unnamed: 0,gameid,steamid,playtime,frequency,rating
25,493520,76561198163915061,1195,0.632945,5.0
487,493520,76561198086514998,233,0.123411,2.46822
766,493520,76561198150994235,460,0.243644,1.974576
1086,346900,76561198089074577,5,0.008772,5.0
1185,346900,76561198119316678,565,0.991228,4.964912
1223,228280,76561198315585536,268,0.005133,5.0
1269,228280,76561197965049785,2389,0.045754,4.979469
1284,228280,76561198025696681,4976,0.0953,4.796453
1315,228280,76561198030123832,8344,0.159804,4.415253
1331,228280,76561198049368871,1817,0.034799,3.776037


In [78]:
recommend = get_recommend("76561198316225206", top_100_neighbors, df_filtered_2)

print(len(recommend))

481


In [79]:
recommend[:481]

[('10', 5.0),
 ('10180', 5.0),
 ('1057090', 5.0),
 ('1080110', 5.0),
 ('1222670', 5.0),
 ('1222730', 5.0),
 ('1277400', 5.0),
 ('1313860', 5.0),
 ('1599340', 5.0),
 ('1938090', 5.0),
 ('261640', 5.0),
 ('273350', 5.0),
 ('282070', 5.0),
 ('307780', 5.0),
 ('332950', 5.0),
 ('339610', 5.0),
 ('365590', 5.0),
 ('368230', 5.0),
 ('386180', 5.0),
 ('466560', 5.0),
 ('492720', 5.0),
 ('501300', 5.0),
 ('56437', 5.0),
 ('569480', 5.0),
 ('582660', 5.0),
 ('719040', 5.0),
 ('730', 5.0),
 ('823500', 5.0),
 ('848450', 5.0),
 ('860510', 5.0),
 ('895400', 5.0),
 ('960090', 5.0),
 ('485510', 4.995948824116412),
 ('1066780', 4.979777698099678),
 ('493490', 4.957056336664551),
 ('594650', 4.956730769230769),
 ('633230', 4.953136441298511),
 ('367500', 4.951236422463601),
 ('477160', 4.94229272188691),
 ('611500', 4.941073307611365),
 ('552520', 4.936645203741129),
 ('282900', 4.93571340939762),
 ('9450', 4.924604263885076),
 ('55150', 4.908977436389822),
 ('397950', 4.9087691702713325),
 ('597820', 