In [239]:
import pandas as pd
from collections import Counter
import numpy as np

In [2]:
df = pd.read_json("./small_reviews.json",  lines=True)

In [3]:
del df['_id']
del df['base_review_id']
df['steamid']  = df.steamid.apply( lambda x : x['$numberLong'] )

# Create a game-data dataframe

In [4]:
games = pd.DataFrame(df.appid.drop_duplicates())

### Add reviews column

In [5]:
reviews_per_game = df.value_counts('appid').sort_values(ascending=False)
reviews_per_game.name = 'num_reviews'
games = games.merge(reviews_per_game.to_frame().reset_index())

### Add positive reviews column

In [6]:
pos_revs_per_game = df[df.voted_up].value_counts('appid')
pos_revs_per_game.name = 'pos_reviews'
games = games.merge(pos_revs_per_game.to_frame().reset_index())

### Create bins for number of reviews
This is a "fame (notoriety) metric" (a game is known)

In [7]:
games['fame'] = pd.qcut(games.num_reviews , q=[0, 0.25, 0.85, 1],labels=['NICHE', 'KNOWN', 'FAMOUS'])

In [8]:
games

Unnamed: 0,appid,num_reviews,pos_reviews,fame
0,204100,251,205,FAMOUS
1,204120,8,6,KNOWN
2,204180,21,16,KNOWN
3,204240,17,15,KNOWN
4,204300,149,132,FAMOUS
...,...,...,...,...
6250,598700,32,29,KNOWN
6251,598810,9,9,KNOWN
6252,598960,5,4,KNOWN
6253,598980,33,31,KNOWN


### Create a percentage of positive reviews metric
This is a "quality metric"

In [9]:
games['perc_pos_revs'] = 100 * games.pos_reviews / games.num_reviews

In [10]:
games.head()

Unnamed: 0,appid,num_reviews,pos_reviews,fame,perc_pos_revs
0,204100,251,205,FAMOUS,81.673307
1,204120,8,6,KNOWN,75.0
2,204180,21,16,KNOWN,76.190476
3,204240,17,15,KNOWN,88.235294
4,204300,149,132,FAMOUS,88.590604


# Hyp1) Does the average user prefer to stick with either popular games exor niche games?
There is a loyalty towards a level of notoriety

### For each user get list of games (s)he played

In [12]:
user_games = df.groupby('steamid').appid.agg(lambda x : list(x)).to_frame().reset_index()

### Keep users that have reviewed at least 3 games

In [13]:
user_games = user_games[user_games.appid.apply(lambda gs: len(gs)>2)]
user_games = user_games.reset_index(drop=True)

### Some utility funcs

In [14]:
def appid_to_num_reviews(appid):
    l = games[games.appid==appid].num_reviews.to_list()
    return l[0] if l else 0

In [15]:
def appid_to_fame(appid):
    l = games[games.appid==appid].fame.to_list()
    return l[0] if l else 0

In [16]:
user_games

Unnamed: 0,steamid,appid
0,76561197960270613,"[312540, 816020, 838310]"
1,76561197960271994,"[274310, 319630, 6850, 105600]"
2,76561197960279937,"[251130, 58570, 666140]"
3,76561197960281680,"[814380, 414340, 447040]"
4,76561197960319772,"[214250, 217690, 219150, 226720, 6550, 356400]"
...,...,...
2232,76561199032812732,"[205100, 271590, 808910]"
2233,76561199049375604,"[447530, 550, 519860]"
2234,76561199095484177,"[331870, 618720, 410890, 453290]"
2235,76561199117428662,"[251570, 55230, 359550]"


### Convert list of games to list of "fames"

In [17]:
user_games['fame'] = user_games.appid.apply(lambda gs : [appid_to_fame(g) for g in gs])

In [18]:
user_games

Unnamed: 0,steamid,appid,fame
0,76561197960270613,"[312540, 816020, 838310]","[KNOWN, FAMOUS, KNOWN]"
1,76561197960271994,"[274310, 319630, 6850, 105600]","[KNOWN, FAMOUS, KNOWN, FAMOUS]"
2,76561197960279937,"[251130, 58570, 666140]","[KNOWN, KNOWN, FAMOUS]"
3,76561197960281680,"[814380, 414340, 447040]","[FAMOUS, FAMOUS, FAMOUS]"
4,76561197960319772,"[214250, 217690, 219150, 226720, 6550, 356400]","[KNOWN, KNOWN, FAMOUS, KNOWN, KNOWN, FAMOUS]"
...,...,...,...
2232,76561199032812732,"[205100, 271590, 808910]","[FAMOUS, FAMOUS, KNOWN]"
2233,76561199049375604,"[447530, 550, 519860]","[FAMOUS, FAMOUS, FAMOUS]"
2234,76561199095484177,"[331870, 618720, 410890, 453290]","[KNOWN, KNOWN, KNOWN, KNOWN]"
2235,76561199117428662,"[251570, 55230, 359550]","[FAMOUS, FAMOUS, FAMOUS]"


### Calculate loyalty of each user to a level of fame

In [19]:
user_games['loyalty'] = user_games.fame.apply(lambda x : 100 * max(Counter(x).values()) / sum(Counter(x).values()) )

In [20]:
user_games.loyalty.describe()

count    2237.000000
mean       77.393383
std        19.076578
min        33.333333
25%        66.666667
50%        66.666667
75%       100.000000
max       100.000000
Name: loyalty, dtype: float64

In [21]:
def stat_sign(x_bar, s, mu_null, n): # TODO: mean_test()
    import scipy.stats as stats
    import math
    # Calculate the test statistic
    test_stat = (x_bar - mu_null)/(s/math.sqrt(n))

    # # Output the p-value of the test statistic (right tailed test)
    p =  1-stats.t.cdf(test_stat, n-1)
    
    return test_stat, p 


In [28]:
stat_sign(user_games.loyalty.mean(), user_games.loyalty.std(), 75 , len(user_games.loyalty))

(5.933962679174627, 1.7091031923044397e-09)

# Hyp2) Do players who play mostly famous games enjoy playing non-famous when they do?

In [272]:
user_games['loyalty_famous'] = user_games.fame.apply( lambda gs : 100*Counter(gs)['FAMOUS'] / sum(Counter(gs).values())  )                                                                                                
# user_games['loyalty_niche'] = user_games.fame.apply( lambda gs : 100*Counter(gs)['NICHE'] / sum(Counter(gs).values())  )                                                                                                

In [273]:
pop_game_players = user_games[ (user_games.loyalty_famous > 60)  & (user_games.loyalty_niche != 0 )]

In [274]:
def get_vote_val(userid, appid):
    return df[(df.steamid == userid) & (df.appid == appid) ].voted_up.to_list()[0]

In [275]:
pop_game_players['appreciation_niche'] = pop_game_players.apply(lambda r : [get_vote_val(r.steamid, g) for g in r.appid if appid_to_fame(g) == 'NICHE'  ], axis=1).apply(lambda x : 100 * sum(x)/len(x))
# pop_game_players['appreciation_famous'] = pop_game_players.apply(lambda r : [get_vote_val(r.steamid, g) for g in r.appid if appid_to_fame(g) == 'FAMOUS'  ], axis=1).apply(lambda x : 100 * sum(x)/len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pop_game_players['appreciation_niche'] = pop_game_players.apply(lambda r : [get_vote_val(r.steamid, g) for g in r.appid if appid_to_fame(g) == 'NICHE'  ], axis=1).apply(lambda x : 100 * sum(x)/len(x))


In [276]:
pop_game_players.appreciation_niche.mean()

82.05128205128206

In [277]:
mu = 70
stat_sign(pop_game_players.appreciation_niche.mean(), pop_game_players.appreciation_niche.std(), mu, len( pop_game_players.appreciation_niche))

(2.7556192588962674, 0.003654444477156904)