In [1]:
import pymongo, pandas as pd, collections, scipy.stats, sys
from functools import *
sys.path.append('../../../scripts')
import utils

# Sixth and Seventh Hypothesis

This notebook analyzes the following two hypotheses:
 - **Are players loyal to a specific game notoriety level (no matter which one)?**
 - **Do players who mostly review famous games also enjoy niche ones?**

*Note:* The two hypotheses are analyzed together in this notebook because they share some of the same data.

## 6) Are players loyal to a specific game notoriety level (no matter which one)?

### Data import, preprocessing & computation of the metrics used in the analysis

#### Import json reviews

In [2]:
# Load the small reviews from the local mongo instance.
# The try/finally guarantees that the connection is closed even if the query or
# the DataFrame construction raises.
mongo = pymongo.MongoClient()
try:
    mongo_db = mongo.final_project
    # Projection: exclude the '_id' and 'base_review_id' fields from every document.
    small_reviews_df = pd.DataFrame(list(mongo_db.small_reviews.find({}, {'_id': False, 'base_review_id': False})))
finally:
    mongo.close()  # Close the connection

#### Create games dataframe - contains the number of written reviews for a specific game (overall & only positive)

In [3]:
# Count how many reviews were written for each game.
# The columns are named explicitly with rename_axis + reset_index(name=...) instead of
# renaming the default labels produced by value_counts().reset_index(): those defaults
# differ between pandas versions ('index'/'appid' before 2.0, 'appid'/'count' from 2.0),
# so a rename based on them silently mislabels the columns on newer versions.
reviews_per_game_df = (
    small_reviews_df['appid']
    .value_counts()
    .rename_axis('appid')
    .reset_index(name='num_reviews')
)

# Count how many positive reviews (voted_up is True) were written for each game.
pos_revs_per_game_df = (
    small_reviews_df.loc[small_reviews_df['voted_up'], 'appid']
    .value_counts()
    .rename_axis('appid')
    .reset_index(name='pos_reviews')
)

# Left join keeps games without any positive review; their missing count becomes 0.
games_df = (
    reviews_per_game_df
    .merge(pos_revs_per_game_df, on='appid', how='left')
    .fillna(0)
    .astype(int)
)
games_df

Unnamed: 0,appid,num_reviews,pos_reviews
0,730,19535,17159
1,359550,6592,5898
2,105600,6436,6298
3,4000,5946,5740
4,271590,5909,4496
...,...,...,...
6708,451980,1,1
6709,451920,1,1
6710,451880,1,1
6711,98900,1,1


#### Obtain for each game a notoriety level and a percentage of positive reviews

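The notoriety level measures how much a game has been discussed in the small reviews dataset. Games are ranked by their number of reviews and split by quantile: the bottom 25% are labelled `NICHE`, those between the 25th and 85th percentiles `KNOWN`, and the top 15% `FAMOUS`.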

In [4]:
# Quantile edges and the label assigned to each resulting bin of num_reviews.
notoriety_quantiles = [0, 0.25, 0.85, 1]
notoriety_labels = ['NICHE', 'KNOWN', 'FAMOUS']

games_df['notoriety'] = pd.qcut(
    games_df['num_reviews'],
    q=notoriety_quantiles,
    labels=notoriety_labels,
)
# Share of positive reviews per game, expressed as a percentage.
games_df['perc_pos_revs'] = games_df['pos_reviews'].div(games_df['num_reviews']).mul(100)
games_df

Unnamed: 0,appid,num_reviews,pos_reviews,notoriety,perc_pos_revs
0,730,19535,17159,FAMOUS,87.837215
1,359550,6592,5898,FAMOUS,89.472087
2,105600,6436,6298,FAMOUS,97.855811
3,4000,5946,5740,FAMOUS,96.535486
4,271590,5909,4496,FAMOUS,76.087324
...,...,...,...,...,...
6708,451980,1,1,NICHE,100.000000
6709,451920,1,1,NICHE,100.000000
6710,451880,1,1,NICHE,100.000000
6711,98900,1,1,NICHE,100.000000


#### Calculate, for each user who has written at least 3 reviews, the maximum loyalty to a game notoriety level

- **max_user_loyalty = (max(user_reviewed_games[notoriety_level]) / reviewed_games) * 100**

In [5]:
def get_max_user_loyalty(x):
    """Return the percentage of items in ``x`` that belong to its most frequent value.

    Args:
        x: Iterable of hashable labels (e.g. the notoriety levels of the games a
            user reviewed). It is consumed only once, so one-shot iterators and
            generators are supported as well.

    Returns:
        ``100 * (count of the most common label) / (total number of labels)``.

    Raises:
        ValueError: If ``x`` is empty (``max`` of an empty sequence).
    """
    # Count once and reuse the result: counting twice is wasted work and breaks on
    # one-shot iterables, which are already exhausted on the second pass.
    counts = collections.Counter(x)
    return 100 * max(counts.values()) / sum(counts.values())

# Attach the notoriety of the reviewed game to every review, keep only the users
# with more than 2 reviews, and group the remaining reviews by user.
small_reviews_groups = (
    small_reviews_df[['steamid', 'appid', 'voted_up']]
    .merge(games_df[['appid', 'notoriety']], on='appid')
    .groupby('steamid')
    .filter(lambda x: len(x) > 2)
    .groupby('steamid')
)

# Aggregate only the 'notoriety' column: running the function on every column and
# then discarding all but one computes meaningless values for 'appid' and 'voted_up'.
users_max_loyalty_level = small_reviews_groups['notoriety'].agg(get_max_user_loyalty)
users_max_loyalty_level

steamid
76561197960270613     66.666667
76561197960271994     50.000000
76561197960279937     66.666667
76561197960281680    100.000000
76561197960319772     66.666667
                        ...    
76561199032812732     66.666667
76561199049375604    100.000000
76561199095484177    100.000000
76561199117428662    100.000000
76561199122263574    100.000000
Name: notoriety, Length: 2237, dtype: float64

### Analysis

#### Statistical test

Are users loyal, for at least M% of their reviews, to one notoriety level (no matter which one)? Here M = 75.

In [6]:
# Significance level used to decide on the null hypothesis.
alpha = 0.05

# Run the mean test on the per-user maximum loyalty against a reference value of 75.
# NOTE(review): the exact test (and whether it is one- or two-sided) is defined in
# utils.mean_test, which is not visible here - confirm there.
statistics, p_value = utils.mean_test(users_max_loyalty_level, 75)

print(f"Mean test coefficient: {statistics:.3f}\nP-value: {p_value:.3f}")
print('\nReject the null hypothesis' if p_value < alpha else '\nDo not reject the null hypothesis')

Mean test coefficient: 10.286 
P-value: 0.000

Reject the null hypothesis


The null hypothesis is rejected at the 5% significance level, so the data support our hypothesis.<br>
So we can say that, on average, players write at least 75% of their reviews for games of the same notoriety level.

## 7) Do players who mostly review famous games also enjoy niche ones?

#### Obtain the loyalty level for famous & niche games of players who have written at least 3 reviews

Also, compute the percentage of niche games that a player liked with respect to the total number of niche games that he/she reviewed.

In [7]:
def get_user_loyalty(x, notoriety):
    """Return the percentage of items in ``x`` that are equal to ``notoriety``.

    Args:
        x: Iterable of hashable labels (e.g. the notoriety levels of the games a
            user reviewed). It is consumed only once, so one-shot iterators and
            generators are supported as well.
        notoriety: The label whose share is computed; a label that never occurs
            yields ``0.0``.

    Returns:
        ``100 * (occurrences of notoriety) / (total number of labels)``.

    Raises:
        ZeroDivisionError: If ``x`` is empty.
    """
    # Count once and reuse the result: counting twice is wasted work and breaks on
    # one-shot iterables, which are already exhausted on the second pass.
    counts = collections.Counter(x)
    return 100 * counts[notoriety] / sum(counts.values())

def get_fraction_niche_games_liked(x):
    """Return the percentage of a user's NICHE reviews that are positive.

    Args:
        x: DataFrame of reviews with a 'notoriety' column and a 'voted_up' column.

    Returns:
        ``100 * (sum of voted_up over NICHE rows) / (number of NICHE rows)``, or
        ``None`` when ``x`` contains no NICHE row.
    """
    niche_votes = x.loc[x['notoriety'] == 'NICHE', 'voted_up']
    # Guard clause: without any niche review the percentage is undefined.
    if len(niche_votes) == 0:
        return None
    return 100 * sum(niche_votes) / len(niche_votes)

# Per-user share (in percent) of reviews written for FAMOUS and for NICHE games.
# Only the 'notoriety' column is aggregated: running the function on every column
# and discarding the rest computes meaningless values for 'appid' and 'voted_up'.
notoriety_by_user = small_reviews_groups['notoriety']
users_famous_loyalty_level = notoriety_by_user.agg(partial(get_user_loyalty, notoriety='FAMOUS'))
users_niche_loyalty_level = notoriety_by_user.agg(partial(get_user_loyalty, notoriety='NICHE'))

# Per-user percentage of liked niche games (None for users without niche reviews).
# The two columns the function reads are selected explicitly so that apply does not
# operate on the grouping column (deprecated behaviour in recent pandas versions).
fraction_niche_games_liked = small_reviews_groups[['notoriety', 'voted_up']].apply(get_fraction_niche_games_liked)

#### Consider only players who mostly review famous games (more than 50% of their reviews) and who have reviewed at least one niche game

In [8]:
# Collect the three per-user metrics (all indexed by steamid) in a single frame.
loyalty_columns = {
    'famous_loyalty': users_famous_loyalty_level,
    'niche_loyalty': users_niche_loyalty_level,
    'niche_liked': fraction_niche_games_liked,
}
loyalties_df = pd.DataFrame(loyalty_columns)

# Keep users whose famous-game share exceeds 50% and whose niche-game share is not 0.
mostly_famous = loyalties_df['famous_loyalty'].gt(50)
reviewed_niche = loyalties_df['niche_loyalty'].ne(0)
loyalties_df = loyalties_df[mostly_famous & reviewed_niche]
loyalties_df

Unnamed: 0_level_0,famous_loyalty,niche_loyalty,niche_liked
steamid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
76561197961349661,66.666667,33.333333,100.0
76561197962587924,66.666667,33.333333,0.0
76561197966548041,66.666667,33.333333,100.0
76561197968514637,66.666667,33.333333,100.0
76561197968907104,66.666667,33.333333,100.0
76561197975221620,66.666667,33.333333,0.0
76561197977069906,66.666667,16.666667,100.0
76561197980349546,66.666667,33.333333,100.0
76561197986360343,60.0,20.0,100.0
76561197989591969,66.666667,33.333333,0.0


#### Statistical test

Do users who are loyal to famous games also enjoy M% of the niche games they reviewed? Here M = 60.

In [9]:
# Significance level used to decide on the null hypothesis.
alpha = 0.05

# Run the mean test on the percentage of liked niche games against a reference value of 60.
# NOTE(review): the exact test (and whether it is one- or two-sided) is defined in
# utils.mean_test, which is not visible here - confirm there.
statistics, p_value = utils.mean_test(loyalties_df['niche_liked'], 60)

print(f"Mean test coefficient: {statistics:.3f}\nP-value: {p_value:.3f}")
print('\nReject the null hypothesis' if p_value < alpha else '\nDo not reject the null hypothesis')

Mean test coefficient: 1.881 
P-value: 0.032

Reject the null hypothesis


The null hypothesis is rejected at the 5% significance level (p = 0.032), so the data support our hypothesis.<br>
So we can say that players who are loyal to famous games also enjoyed, on average, at least 60% of the niche games that they reviewed.