Use another dataset and merge it with the dataset used for calculating the Wilson Score so that we can get the tags needed for content based collaborative filtering

In [38]:
import pandas as pd 
import numpy as np

In [39]:
# load the dataframe used for wilson scoring
df_wilson = pd.read_json('game_stats_wilson.json', orient='records', lines=True)

In [40]:
df_wilson.head()

Unnamed: 0,appid,game,total_votes,upvotes,downvotes,wilson_score
0,550,Left 4 Dead 2,1309,1309,0,0.997074
1,644560,Mirror,925,925,0,0.995864
2,264710,Subnautica,1333,1332,1,0.995763
3,294100,RimWorld,1294,1293,1,0.995635
4,413150,Stardew Valley,2240,2236,4,0.995417


In [41]:
df_wilson.shape

(27315, 6)

In [42]:
# just need the appid and game
df_wilson_games = df_wilson[['appid', 'game']]

In [43]:
df_wilson_games.head()

Unnamed: 0,appid,game
0,550,Left 4 Dead 2
1,644560,Mirror
2,264710,Subnautica
3,294100,RimWorld
4,413150,Stardew Valley


In [44]:
# load dataframe that has game tags
df_tags = pd.read_json('obsolete/steam_games_preprocessed.json')

In [45]:
df_tags

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,discount_price,reviews_url,specs,price,early_access,id,developer,sentiment,metascore
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.49,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140.0,Kotoshiro,,
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",,http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980.0,Secret Level SRL,Mostly Positive,
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",,http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290.0,Poolians.com,Mostly Positive,
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",0.83,http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400.0,彼岸领域,,
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",1.79,http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32130,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",1.49,http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,False,773640.0,"Nikita ""Ghost_RUS""",,
32131,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",4.24,http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530.0,Sacada,,
32132,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",1.39,http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660.0,Laush Dmitriy Sergeevich,,
32133,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",,http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870.0,"xropi,stev3ns",1 user reviews,


publisher, genres, tags, and specs all seem to be useful for content based collaborative filtering, let's take all of them

In [46]:
# take the columns I want
df_tags_important = df_tags[['id', 'app_name', 'genres', 'tags', 'publisher']]

In [47]:
df_tags_important.head()

Unnamed: 0,id,app_name,genres,tags,publisher
0,761140.0,Lost Summoner Kitty,"[Action, Casual, Indie, Simulation, Strategy]","[Strategy, Action, Indie, Casual, Simulation]",Kotoshiro
1,643980.0,Ironbound,"[Free to Play, Indie, RPG, Strategy]","[Free to Play, Strategy, Indie, RPG, Card Game...","Making Fun, Inc."
2,670290.0,Real Pool 3D - Poolians,"[Casual, Free to Play, Indie, Simulation, Sports]","[Free to Play, Simulation, Sports, Casual, Ind...",Poolians.com
3,767400.0,弹炸人2222,"[Action, Adventure, Casual]","[Action, Adventure, Casual]",彼岸领域
4,773570.0,Log Challenge,,"[Action, Indie, Casual, Sports]",


In [48]:
df_tags_important.shape

(32135, 5)

Merge both datasets but only keep the games that belong to df_wilson 

In [49]:
# Ensure game names are case-insensitive and merge datasets
df_wilson['game_lower'] = df_wilson['game'].str.lower()
df_tags_important['app_name_lower'] = df_tags_important['app_name'].str.lower()

# Merge datasets on the lowercased game name
merged_df = pd.merge(
    df_wilson,
    df_tags_important,
    left_on='game_lower',
    right_on='app_name_lower',
    how='left'
)

# Drop unnecessary columns after merging
merged_df = merged_df.drop(columns=['game_lower', 'app_name_lower'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tags_important['app_name_lower'] = df_tags_important['app_name'].str.lower()


In [50]:
# Display the first few rows of the merged DataFrame
merged_df.head(100)

Unnamed: 0,appid,game,total_votes,upvotes,downvotes,wilson_score,id,app_name,genres,tags,publisher
0,550,Left 4 Dead 2,1309,1309,0,0.997074,550.0,Left 4 Dead 2,[Action],"[Zombies, Co-op, FPS, Multiplayer, Action, Onl...",Valve
1,644560,Mirror,925,925,0,0.995864,,,,,
2,264710,Subnautica,1333,1332,1,0.995763,264710.0,Subnautica,,"[Early Access, Survival, Open World, Explorati...",
3,294100,RimWorld,1294,1293,1,0.995635,294100.0,RimWorld,"[Indie, Simulation, Strategy, Early Access]","[Early Access, Base Building, Survival, Strate...",Ludeon Studios
4,413150,Stardew Valley,2240,2236,4,0.995417,413150.0,Stardew Valley,"[Indie, RPG, Simulation]","[RPG, Simulation, Pixel Graphics, Agriculture,...",Chucklefish
...,...,...,...,...,...,...,...,...,...,...,...
95,220,Half-Life 2,406,404,2,0.982219,220.0,Half-Life 2,[Action],"[FPS, Action, Sci-fi, Singleplayer, Classic, S...",Valve
96,224260,No More Room in Hell,209,209,0,0.981951,224260.0,No More Room in Hell,"[Action, Free to Play, Indie]","[Free to Play, Zombies, Multiplayer, Survival,...",Lever Games
97,1657630,Slime Rancher 2,209,209,0,0.981951,,,,,
98,383870,Firewatch,207,207,0,0.981780,383870.0,Firewatch,"[Adventure, Indie]","[Atmospheric, Adventure, Walking Simulator, St...","Panic Inc., Campo Santo"


In [51]:
merged_df.shape

(27354, 11)

In [52]:
# Drop entries with missing genres
merged_df_valid_genres = merged_df.dropna(subset=['genres'])

In [55]:
merged_df_valid_genres.shape

(7434, 11)

Let's remove all the games that don't have genres since we can't use them for content based collaborative filtering

In [56]:
merged_df_valid_genres.head()

Unnamed: 0,appid,game,total_votes,upvotes,downvotes,wilson_score,id,app_name,genres,tags,publisher
0,550,Left 4 Dead 2,1309,1309,0,0.997074,550.0,Left 4 Dead 2,[Action],"[Zombies, Co-op, FPS, Multiplayer, Action, Onl...",Valve
3,294100,RimWorld,1294,1293,1,0.995635,294100.0,RimWorld,"[Indie, Simulation, Strategy, Early Access]","[Early Access, Base Building, Survival, Strate...",Ludeon Studios
4,413150,Stardew Valley,2240,2236,4,0.995417,413150.0,Stardew Valley,"[Indie, RPG, Simulation]","[RPG, Simulation, Pixel Graphics, Agriculture,...",Chucklefish
6,48700,Mount & Blade: Warband,689,689,0,0.994455,48700.0,Mount & Blade: Warband,"[Action, RPG]","[Medieval, RPG, Open World, Strategy, Action, ...",TaleWorlds Entertainment
7,10,Counter-Strike,611,611,0,0.993752,10.0,Counter-Strike,[Action],"[Action, FPS, Multiplayer, Shooter, Classic, T...",Valve


In [57]:
# columns that I need
merged_df_valid_genres = merged_df_valid_genres[['appid', 'game', 'genres', 'tags', 'publisher']]

In [58]:
merged_df_valid_genres.head()

Unnamed: 0,appid,game,genres,tags,publisher
0,550,Left 4 Dead 2,[Action],"[Zombies, Co-op, FPS, Multiplayer, Action, Onl...",Valve
3,294100,RimWorld,"[Indie, Simulation, Strategy, Early Access]","[Early Access, Base Building, Survival, Strate...",Ludeon Studios
4,413150,Stardew Valley,"[Indie, RPG, Simulation]","[RPG, Simulation, Pixel Graphics, Agriculture,...",Chucklefish
6,48700,Mount & Blade: Warband,"[Action, RPG]","[Medieval, RPG, Open World, Strategy, Action, ...",TaleWorlds Entertainment
7,10,Counter-Strike,[Action],"[Action, FPS, Multiplayer, Shooter, Classic, T...",Valve


Save this file as json

In [59]:
merged_df_valid_genres.to_json('merged_df_valid_genres.json', orient='records', lines=True)