In [1]:
import pymongo, pandas as pd, numpy as np, collections, utils, scipy.stats

# Sixth and Seventh Hypothesis

In the following notebook it will be analyzed the following two hypothesis:
 - **Are players loyal to a specific game notoriety level (no matter which one)?**
 - **Do players who mostly review famous games also enjoy niche ones?**

*P.S:* The two hypothesis are considered together in this notebook since some data is shared.

## 6) Are players loyal to a specific game notoriety level (no matter which one)?

### Data import, preprocessing & obtain useful metrics to perform the analysis

#### Import json reviews and games datasets + Some utity functions

In [2]:
# Load the small reviews & games dataset from mongo local instance
mongo = pymongo.MongoClient()
mongo_db = mongo.final_project
small_reviews_df = pd.DataFrame(list(mongo_db.small_reviews.find({}, {'_id': False, 'base_review_id': False})))
games_df = pd.DataFrame(list(mongo_db.games.find({}, {'_id': False})))
mongo.close() #Close the connection

In [3]:
""" Utility functions """

def appid_to_num_reviews(appid):
    """ Given the id of a game it gives its number of reviews """
    l = games_df[games_df.appid==appid].num_reviews.to_list()
    return l[0] if l else 0

def appid_to_notoriety(appid):
    """ Given the id of a game it gives its notoriety level """
    l = games_df[games_df.appid==appid].notoriety.to_list()
    return l[0] if l else 0

def get_vote_val(userid, appid):
    """ Obtain whether a player enjoyed a particular game or not """
    return small_reviews_df[(small_reviews_df.steamid == userid) & (small_reviews_df.appid == appid) ].voted_up.to_list()[0]

#### Add the number of written reviews for a specific game (overall & only positive)

In [4]:
# Obtain all reviews written for each game
reviews_per_game = small_reviews_df.value_counts('appid').sort_values(ascending=False)
reviews_per_game.name = 'num_reviews'
games_df = games_df.merge(reviews_per_game.to_frame().reset_index())

# Obtain all positive reviews written for each game
pos_revs_per_game = small_reviews_df[small_reviews_df.voted_up].value_counts('appid')
pos_revs_per_game.name = 'pos_reviews'
games_df = games_df.merge(pos_revs_per_game.to_frame().reset_index())

#### Obtain for each game a notoriety level and a percentage of positive reviews

The notoriety level is basically a measure of how much a game has been discussed in the small reviews dataset.

In [5]:
games_df['notoriety'] = pd.qcut(games_df.num_reviews , q=[0, 0.25, 0.85, 1],labels=['NICHE', 'KNOWN', 'FAMOUS'])
games_df['perc_pos_revs'] = (games_df.pos_reviews / games_df.num_reviews) * 100

games_df[['appid', 'name', 'num_reviews', 'pos_reviews', 'perc_pos_revs', 'notoriety']]

Unnamed: 0,appid,name,num_reviews,pos_reviews,perc_pos_revs,notoriety
0,20,Team Fortress Classic,54,44,81.481481,FAMOUS
1,50,Half-Life: Opposing Force,101,98,97.029703,FAMOUS
2,60,Ricochet,29,22,75.862069,KNOWN
3,40,Deathmatch Classic,12,10,83.333333,KNOWN
4,70,Half-Life,467,448,95.931478,FAMOUS
...,...,...,...,...,...,...
6250,1032430,Blood Trail,18,17,94.444444,KNOWN
6251,1035610,Hentai Mosaique Vip Room,1,1,100.000000,NICHE
6252,1037190,Shipped,1,1,100.000000,NICHE
6253,1046030,ISLANDERS,160,154,96.250000,FAMOUS


#### Obtain for each user a list of reviewed games & keep the ones that had reviewed at least 3 games

In [6]:
# List of games each user reviewed
users_reviewed_games = small_reviews_df.groupby('steamid').appid.agg(lambda x:list(x)).to_frame().reset_index()

# Users reviewed at least 3 games
users_reviewed_least_3_games = users_reviewed_games[users_reviewed_games.appid.apply(lambda gs: len(gs)>2)].reset_index(drop=True)
users_reviewed_least_3_games

Unnamed: 0,steamid,appid
0,76561197960270613,"[312540, 816020, 838310]"
1,76561197960271994,"[274310, 319630, 6850, 105600]"
2,76561197960279937,"[251130, 58570, 666140]"
3,76561197960281680,"[814380, 414340, 447040]"
4,76561197960319772,"[214250, 217690, 219150, 226720, 6550, 356400]"
...,...,...
2232,76561199032812732,"[205100, 271590, 808910]"
2233,76561199049375604,"[447530, 550, 519860]"
2234,76561199095484177,"[331870, 618720, 410890, 453290]"
2235,76561199117428662,"[251570, 55230, 359550]"


#### Convert list of games to list of notoriety levels

In [7]:
users_reviewed_least_3_games['notoriety'] = users_reviewed_least_3_games.appid.apply(lambda gs : [appid_to_notoriety(g) for g in gs])
users_reviewed_least_3_games

Unnamed: 0,steamid,appid,notoriety
0,76561197960270613,"[312540, 816020, 838310]","[KNOWN, FAMOUS, KNOWN]"
1,76561197960271994,"[274310, 319630, 6850, 105600]","[KNOWN, FAMOUS, KNOWN, FAMOUS]"
2,76561197960279937,"[251130, 58570, 666140]","[KNOWN, KNOWN, FAMOUS]"
3,76561197960281680,"[814380, 414340, 447040]","[FAMOUS, FAMOUS, FAMOUS]"
4,76561197960319772,"[214250, 217690, 219150, 226720, 6550, 356400]","[KNOWN, KNOWN, FAMOUS, KNOWN, KNOWN, FAMOUS]"
...,...,...,...
2232,76561199032812732,"[205100, 271590, 808910]","[FAMOUS, FAMOUS, KNOWN]"
2233,76561199049375604,"[447530, 550, 519860]","[FAMOUS, FAMOUS, FAMOUS]"
2234,76561199095484177,"[331870, 618720, 410890, 453290]","[KNOWN, KNOWN, KNOWN, KNOWN]"
2235,76561199117428662,"[251570, 55230, 359550]","[FAMOUS, FAMOUS, FAMOUS]"


### Analysis

#### Calculate the maximum loyalty of each user (that have reviewed at least 3 games) to a game notoriety level

- **max_user_loyalty = (  max(user_reviewed_games[notoriety_level]) / reviewed_games  ) * 100**

In [8]:
notoriety_games_users_reviewed_least_3_games = pd.DataFrame(users_reviewed_least_3_games['notoriety'])
notoriety_games_users_reviewed_least_3_games.name = 'notoriety'
notoriety_games_users_reviewed_least_3_games

Unnamed: 0,notoriety
0,"[KNOWN, FAMOUS, KNOWN]"
1,"[KNOWN, FAMOUS, KNOWN, FAMOUS]"
2,"[KNOWN, KNOWN, FAMOUS]"
3,"[FAMOUS, FAMOUS, FAMOUS]"
4,"[KNOWN, KNOWN, FAMOUS, KNOWN, KNOWN, FAMOUS]"
...,...
2232,"[FAMOUS, FAMOUS, KNOWN]"
2233,"[FAMOUS, FAMOUS, FAMOUS]"
2234,"[KNOWN, KNOWN, KNOWN, KNOWN]"
2235,"[FAMOUS, FAMOUS, FAMOUS]"


In [9]:

users_max_loyalty_level = notoriety_games_users_reviewed_least_3_games.notoriety.apply(lambda x : 100 * max(collections.Counter(x).values()) / sum(collections.Counter(x).values()) )
users_max_loyalty_level

0        66.666667
1        50.000000
2        66.666667
3       100.000000
4        66.666667
           ...    
2232     66.666667
2233    100.000000
2234    100.000000
2235    100.000000
2236    100.000000
Name: notoriety, Length: 2237, dtype: float64

#### Statistical test

Are at least M percentage of users loyal to one notoriety level (no matter which one)?

In [10]:
# Perform the mean test
statistics, p_value = utils.mean_test(users_max_loyalty_level, 75)

print(f"Mean test coefficient: {statistics:.3f} \nP-value: {p_value:.3f}")
alpha = 0.05
if p_value < alpha:
    print('\nReject the null hypothesis')
else:
    print('\nDo not reject the null hypothesis')

Mean test coefficient: 5.934 
P-value: 0.000

Reject the null hypothesis


We can notice that our hypothesis is true (and is statistically significant).<br> 
So we can say that at least a 75% of people are loyal to review (and play) the same notoriety level game.

## 7) Do players who mostly review famous games also enjoy niche ones?

#### Obtain the loyalty level for famous & niche games, of players that have at least reviewed 3 games

In [11]:
users_reviewed_least_3_games['loyalty_famous'] = users_reviewed_least_3_games.notoriety.apply( lambda gs : 100*collections.Counter(gs)['FAMOUS'] / sum(collections.Counter(gs).values())  )
users_reviewed_least_3_games['loyalty_niche'] = users_reviewed_least_3_games.notoriety.apply( lambda gs : 100*collections.Counter(gs)['NICHE'] / sum(collections.Counter(gs).values())  )
users_reviewed_least_3_games[['steamid', 'loyalty_famous','loyalty_niche']]

Unnamed: 0,steamid,loyalty_famous,loyalty_niche
0,76561197960270613,33.333333,0.0
1,76561197960271994,50.000000,0.0
2,76561197960279937,33.333333,0.0
3,76561197960281680,100.000000,0.0
4,76561197960319772,33.333333,0.0
...,...,...,...
2232,76561199032812732,66.666667,0.0
2233,76561199049375604,100.000000,0.0
2234,76561199095484177,0.000000,0.0
2235,76561199117428662,100.000000,0.0


#### Consider only people who are very loyal to famous games and that have at least played one niche game

In [12]:
pop_game_players = users_reviewed_least_3_games[ (users_reviewed_least_3_games.loyalty_famous > 60)  & (users_reviewed_least_3_games.loyalty_niche != 0 )]
pop_game_players[['steamid', 'loyalty_famous','loyalty_niche']]

Unnamed: 0,steamid,loyalty_famous,loyalty_niche
21,76561197961349661,66.666667,33.333333
57,76561197964798094,66.666667,33.333333
70,76561197966533518,75.000000,25.000000
71,76561197966548041,66.666667,33.333333
92,76561197968514637,66.666667,33.333333
...,...,...,...
2025,76561198240308012,66.666667,33.333333
2046,76561198260725341,75.000000,25.000000
2138,76561198330649667,66.666667,33.333333
2139,76561198331789084,66.666667,33.333333


#### Obtain the appreciation to niche games of players that are very loyal to famous games (and at least played one niche game)

In [13]:
niche_appreciation_pop_game_players = pop_game_players.apply(lambda r : [get_vote_val(r.steamid, g) for g in r.appid if appid_to_notoriety(g) == 'NICHE'  ], axis=1).apply(lambda x : 100 * sum(x)/len(x))
niche_appreciation_pop_game_players.value_counts()

100.0    64
0.0      14
dtype: int64

#### Statistical test

Are users loyal to famous notoriety games also enjoying M% of niche ones?

In [14]:
statistics, p_value = utils.mean_test(niche_appreciation_pop_game_players, 70)

print(f"Mean test coefficient: {statistics:.3f} \nP-value: {p_value:.3f}")
alpha = 0.05
if p_value < alpha:
    print('\nReject the null hypothesis')
else:
    print('\nDo not reject the null hypothesis')

Mean test coefficient: 2.756 
P-value: 0.004

Reject the null hypothesis


We can notice that our hypothesis is true (and is statistically significant).<br> 
So we can say that players loyal to famous notoriety games also enjoyed 70% of niche ones.