In [1]:
import collections
import findspark, sys
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import BooleanType, DoubleType, LongType, StringType, StructField, StructType, ArrayType, IntegerType
sys.path.append('../../../scripts')
import utils

# Locate the Spark installation (add pyspark to sys.path, see https://github.com/minrk/findspark#readme)
# NOTE(review): the pyspark imports at the top of this cell run before this call, so they
# only succeed if pyspark is already importable without findspark - confirm the intended order.
findspark.init()
# Record in the cell output which Spark installation is being used
print(f'Using Spark located in {findspark.find()}.')

# Create the Spark session, or reuse the one that already exists (getOrCreate)
spark = SparkSession.builder.getOrCreate()

Using Spark located in /usr/local/spark/.


# Sixth and Seventh Hypothesis

This notebook analyzes the following two hypotheses:
 - **Are players loyal to a specific game notoriety level (no matter which one)?**
 - **Do players who mostly review famous games also enjoy niche ones?**

*P.S:* The two hypotheses are considered together in this notebook since some data are shared.

## 6) Are players loyal to a specific game notoriety level (no matter which one)?

### Data import, preprocessing & obtain useful metrics to perform the analysis

#### Import Reviews from HDFS


In [2]:
# Schema of the reviews dataset: one nullable field per (name, type) pair listed below
reviews_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        # ('base_review_id', LongType()),  # TODO: uncomment
        ('steamid', LongType()),
        ('appid', LongType()),
        ('voted_up', BooleanType()),
        ('votes_up', LongType()),
        ('votes_funny', LongType()),
        ('weighted_vote_score', DoubleType()),
        ('playtime_forever', LongType()),
        ('playtime_at_review', LongType()),
        ('num_games_owned', LongType()),
        ('num_reviews', LongType()),
        ('review', StringType()),
        ('unix_timestamp_created', LongType()),
        ('unix_timestamp_updated', LongType()),
    )
])

# Load the reviews CSV from HDFS with the schema above and expose it through the
# pandas-on-Spark API. FAILFAST makes the read abort on malformed records; multiLine
# lets a quoted field (e.g. the review text) span several lines.
base_reviews_df = (
    spark.read
    .schema(reviews_schema)
    .options(
        escape='"',
        header=True,
        ignoreTrailingWhiteSpace=True,
        mode='FAILFAST',
        multiLine=True,
        unescapedQuoteHandling='STOP_AT_CLOSING_QUOTE',
    )
    .csv('hdfs://localhost:54310/final_project/data/base_reviews')
    .to_pandas_on_spark()
)



#### Create games dataframe - contains the number of written reviews for a specific game (overall & only positive)

In [5]:
def _reviews_per_game(reviews_df, count_col):
    """ Count the rows of `reviews_df` for each game.

    Returns a frame with the columns 'appid' and `count_col`, most reviewed game first.
    The index/column names are set explicitly (rename_axis + reset_index(name=...)) rather
    than by renaming the default 'index'/'appid' labels, because the default labels
    produced by value_counts() differ between pandas versions.
    """
    return reviews_df.appid.value_counts().rename_axis('appid').reset_index(name=count_col)

# Obtain all reviews written for each game
games_df = _reviews_per_game(base_reviews_df, 'num_reviews')

# Obtain all positive reviews written for each game
pos_revs_per_game_df = _reviews_per_game(base_reviews_df[base_reviews_df.voted_up], 'pos_reviews')

# Games without any positive review have no row in pos_revs_per_game_df: the left join
# leaves a missing value for them, which becomes 0 before every column is cast to int.
# The join key is spelled out so that it does not depend on which column names overlap.
games_df = games_df.merge(pos_revs_per_game_df, on='appid', how='left').fillna(0).astype(int)
games_df.to_spark().limit(5).toPandas()

Unnamed: 0,appid,num_reviews,pos_reviews
0,730,976212,859030
1,550,103661,100694
2,620,87843,86880
3,400,35797,35312
4,240,27880,26638


#### Utility functions

In [61]:
""" Utility functions """

def appid_to_num_reviews(appid):
    """ Look up in games_df how many reviews the game `appid` has (0 when the game is not listed) """
    same_game = games_df.appid == appid
    review_counts = games_df[same_game].num_reviews.to_list()
    if not review_counts:
        return 0
    # Only the first matching row is used
    return review_counts[0]

def appid_to_notoriety(appid):
    """ Look up in games_df the notoriety level of the game `appid`.

    Returns the value of the 'notoriety' column for the first matching row,
    or the integer 0 (not a label) when the game is not listed.
    """
    same_game = games_df.appid == appid
    levels = games_df[same_game].notoriety.to_list()
    if not levels:
        return 0
    return levels[0]

def get_vote_val(userid, appid):
    """ Obtain whether a player enjoyed a particular game or not.

    Returns the `voted_up` value of the first review written by `userid` for `appid`.
    Raises IndexError when that user has no review for that game.

    The lookup uses base_reviews_df, the reviews frame loaded at the top of the notebook;
    the previously referenced `small_reviews_df` is not defined anywhere in the notebook.
    """
    written_by_user_for_game = (base_reviews_df.steamid == userid) & (base_reviews_df.appid == appid)
    votes = base_reviews_df[written_by_user_for_game].voted_up.to_list()
    if not votes:
        # Same exception type as indexing the empty list, but with an actionable message
        raise IndexError(f'no review found for user {userid} and game {appid}')
    return votes[0]

def num_revs_to_label(num_rev:int, thresh1=None, thresh2=None)->str:
    """ Get notoriety label from number of reviews.

    num_rev  : number of reviews of the game
    thresh1  : lower cut point; defaults to the notebook-level THRESH1
    thresh2  : upper cut point; defaults to the notebook-level THRESH2

    Returns 'NICHE'  when num_rev <  thresh1,
            'KNOWN'  when thresh1 <= num_rev < thresh2,
            'FAMOUS' when num_rev >= thresh2.

    A game with exactly thresh1 reviews is 'KNOWN': the lower bound is inclusive, so that
    value no longer falls through both comparisons and ends up labelled 'FAMOUS'.
    """
    lower = THRESH1 if thresh1 is None else thresh1
    upper = THRESH2 if thresh2 is None else thresh2
    if num_rev < lower:
        return 'NICHE'
    if num_rev < upper:
        return 'KNOWN'
    return 'FAMOUS'

#### Obtain for each game a notoriety level and a percentage of positive reviews

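The notoriety level is a measure of how much a game has been discussed in the small reviews dataset: each game is labelled `NICHE`, `KNOWN` or `FAMOUS` according to where its number of reviews falls relative to the 25th and 85th percentiles of the number of reviews per game.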

In [10]:
# 25th and 85th percentiles of the number of reviews per game; the result is indexed by quantile
s = games_df.num_reviews.quantile([.25, .85])
THRESH1 = s[.25]  # games with fewer reviews than this are labelled 'NICHE' by num_revs_to_label
THRESH2 = s[.85]  # games with at least this many reviews are labelled 'FAMOUS' by num_revs_to_label

# Show the two cut points
THRESH1, THRESH2

(243.0, 6101.0)

In [12]:
# Label every game with its notoriety level, derived from its number of reviews
games_df['notoriety'] = games_df.num_reviews.apply(num_revs_to_label)
# Percentage of the game's reviews that are positive (pos_reviews over num_reviews)
games_df['perc_pos_revs'] = (games_df.pos_reviews / games_df.num_reviews) * 100
# Preview the first 5 rows
games_df.to_spark().limit(5).toPandas()

#### Obtain for each user the list of reviewed games & keep only the users who have reviewed at least 3 games

In [58]:
# For each user, collect the ids of the games they reviewed (one entry per review).
# collect_list already produces an array column, so the element type is narrowed to int
# with a native cast instead of round-tripping every row through an identity Python UDF.
# NOTE(review): the cast assumes every appid fits in a 32-bit integer - confirm.
users_reviewed_games = base_reviews_df.to_spark().groupby('steamid')\
                        .agg(F.collect_list('appid').cast(ArrayType(IntegerType())).alias('appid'))\
                        .to_pandas_on_spark()

users_reviewed_games.to_spark().limit(5).toPandas()

Unnamed: 0,steamid,appid
0,76561197960265778,[730]
1,76561197960265858,[30]
2,76561197960265876,[730]
3,76561197960265890,[620]
4,76561197960266039,"[30, 70]"


In [59]:
# Boolean mask: True for the users whose list of reviewed games has 3 or more entries
reviewed_enough_games = users_reviewed_games.appid.apply(lambda games: len(games) >= 3)
users_reviewed_least_3_games = users_reviewed_games[reviewed_enough_games]
# Preview the first 5 rows
users_reviewed_least_3_games.to_spark().limit(5).toPandas()

#### Convert list of games to list of notoriety levels

In [None]:
# Build the appid -> notoriety lookup once, on the driver, as a plain dict.
# The function applied below is executed by pandas-on-Spark over the column, so it must
# not query another Spark-backed frame (as appid_to_notoriety does with games_df):
# a dict is cheap to ship with the function and turns every lookup into a hash access.
_games_notoriety_pdf = games_df[['appid', 'notoriety']].to_pandas()
notoriety_by_appid = dict(zip(_games_notoriety_pdf.appid, _games_notoriety_pdf.notoriety))

# Same result as mapping appid_to_notoriety over each list: the label of each game,
# in the same order as the appid list, with 0 for a game missing from games_df
users_reviewed_least_3_games['notoriety'] = users_reviewed_least_3_games.appid.apply(
    lambda gs: [notoriety_by_appid.get(g, 0) for g in gs]
)
users_reviewed_least_3_games.to_spark().limit(5).toPandas()

### Analysis

#### Calculate the maximum loyalty of each user (among those who have reviewed at least 3 games) to a game notoriety level

- **max_user_loyalty = (  max(user_reviewed_games[notoriety_level]) / reviewed_games  ) * 100**

In [53]:
# Bring the per-user notoriety lists to the driver as a plain pandas frame with the single
# column 'notoriety'. The conversion is explicit (to_pandas) instead of handing a
# pandas-on-Spark column to the pandas constructor, and the former `.name = 'notoriety'`
# assignment is dropped: it only set an attribute that nothing in the notebook reads.
notoriety_games_users_reviewed_least_3_games = users_reviewed_least_3_games[['notoriety']].to_pandas()
notoriety_games_users_reviewed_least_3_games

Unnamed: 0,notoriety
0,"[KNOWN, FAMOUS, KNOWN]"
1,"[KNOWN, FAMOUS, KNOWN, FAMOUS]"
2,"[KNOWN, KNOWN, FAMOUS]"
3,"[FAMOUS, FAMOUS, FAMOUS]"
4,"[KNOWN, KNOWN, FAMOUS, KNOWN, KNOWN, FAMOUS]"
...,...
2232,"[FAMOUS, FAMOUS, KNOWN]"
2233,"[FAMOUS, FAMOUS, FAMOUS]"
2234,"[KNOWN, KNOWN, KNOWN, KNOWN]"
2235,"[FAMOUS, FAMOUS, FAMOUS]"


In [54]:

def _max_loyalty(levels):
    """ Percentage of the entries of `levels` taken by its most frequent notoriety level """
    most_frequent_count = max(collections.Counter(levels).values())
    # The counts of a Counter add up to the number of counted entries
    return 100 * most_frequent_count / len(levels)

users_max_loyalty_level = notoriety_games_users_reviewed_least_3_games.notoriety.apply(_max_loyalty)
users_max_loyalty_level

0        66.666667
1        50.000000
2        66.666667
3       100.000000
4        66.666667
           ...    
2232     66.666667
2233    100.000000
2234    100.000000
2235    100.000000
2236    100.000000
Name: notoriety, Length: 2237, dtype: float64

#### Statistical test

Do users dedicate, on average, M% of their reviews to a single notoriety level (no matter which one)?

In [55]:
# Significance level against which the p-value is compared
alpha = 0.05

# Mean test of the users' maximum loyalty against the reference value 75
# NOTE(review): the exact null/alternative hypotheses are defined by utils.mean_test - confirm there
statistics, p_value = utils.mean_test(users_max_loyalty_level, 75)

print(f"Mean test coefficient: {statistics:.3f} \nP-value: {p_value:.3f}")
print('\nReject the null hypothesis' if p_value < alpha else '\nDo not reject the null hypothesis')

Mean test coefficient: 10.286 
P-value: 0.000

Reject the null hypothesis


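The null hypothesis is rejected (p-value < 0.05) with a positive test statistic, so our hypothesis is supported and the result is statistically significant.<br> 
So we can say that, on average, people are loyal to one notoriety level: more than 75% of the games they review (and play) belong to the same notoriety level.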

## 7) Do players who mostly review famous games also enjoy niche ones?

#### Obtain the loyalty level for famous & niche games, of players that have at least reviewed 3 games

In [56]:
def _share_of(level):
    """ Build a function giving the percentage of entries equal to `level` in a list of notoriety labels """
    # The counts of a Counter add up to the number of counted entries, hence len(gs) as denominator
    return lambda gs: 100 * collections.Counter(gs)[level] / len(gs)

# Percentage of each user's reviewed games that are famous / niche
users_reviewed_least_3_games['loyalty_famous'] = users_reviewed_least_3_games.notoriety.apply(_share_of('FAMOUS'))
users_reviewed_least_3_games['loyalty_niche'] = users_reviewed_least_3_games.notoriety.apply(_share_of('NICHE'))
users_reviewed_least_3_games[['steamid', 'loyalty_famous','loyalty_niche']]

Unnamed: 0,steamid,loyalty_famous,loyalty_niche
0,76561197960270613,33.333333,0.0
1,76561197960271994,50.000000,0.0
2,76561197960279937,33.333333,0.0
3,76561197960281680,100.000000,0.0
4,76561197960319772,33.333333,0.0
...,...,...,...
2232,76561199032812732,66.666667,0.0
2233,76561199049375604,100.000000,0.0
2234,76561199095484177,0.000000,0.0
2235,76561199117428662,100.000000,0.0


#### Consider only people who are very loyal to famous games and who have reviewed at least one niche game

In [57]:
# Users for whom famous games are more than 55% of the reviewed games ...
mostly_famous = users_reviewed_least_3_games.loyalty_famous > 55
# ... and who reviewed at least one niche game (non-zero niche share)
some_niche = users_reviewed_least_3_games.loyalty_niche != 0

pop_game_players = users_reviewed_least_3_games[mostly_famous & some_niche]
pop_game_players[['steamid', 'loyalty_famous','loyalty_niche']]

Unnamed: 0,steamid,loyalty_famous,loyalty_niche
21,76561197961349661,66.666667,33.333333
33,76561197962587924,66.666667,33.333333
71,76561197966548041,66.666667,33.333333
92,76561197968514637,66.666667,33.333333
99,76561197968907104,66.666667,33.333333
224,76561197975221620,66.666667,33.333333
236,76561197977069906,66.666667,16.666667
267,76561197980349546,66.666667,33.333333
335,76561197986360343,60.0,20.0
377,76561197989591969,66.666667,33.333333


#### Obtain the appreciation of niche games by players who are very loyal to famous games (and have reviewed at least one niche game)

In [58]:
# Work on a plain pandas copy of the selected players, so that the row-wise function
# below runs on the driver: get_vote_val queries a Spark-backed frame, which cannot be
# done from inside a function that pandas-on-Spark applies to its rows.
_pop_game_players_pdf = pop_game_players[['steamid', 'appid', 'notoriety']].to_pandas()

# Votes (True = enjoyed) given by each player to the niche games they reviewed.
# The 'notoriety' list is aligned element by element with the 'appid' list, so the label
# of each game is read from it instead of being looked up again for every game.
_niche_votes = _pop_game_players_pdf.apply(
    lambda r: [get_vote_val(r.steamid, g) for g, level in zip(r.appid, r.notoriety) if level == 'NICHE'],
    axis=1,
)

# Percentage of reviewed niche games that each player enjoyed; every selected player has
# at least one niche game (loyalty_niche != 0), so the list of votes is never empty
niche_appreciation_pop_game_players = _niche_votes.apply(lambda votes: 100 * sum(votes) / len(votes))
niche_appreciation_pop_game_players.value_counts()

100.0    41
0.0      17
dtype: int64

#### Statistical test

Do users who are loyal to famous games also enjoy, on average, M% of the niche games they review?

In [59]:
# Significance level against which the p-value is compared
alpha = 0.05

# Mean test of the niche-game appreciation against the reference value 60
# NOTE(review): the exact null/alternative hypotheses are defined by utils.mean_test - confirm there
statistics, p_value = utils.mean_test(niche_appreciation_pop_game_players, 60)

print(f"Mean test coefficient: {statistics:.3f} \nP-value: {p_value:.3f}")
print('\nReject the null hypothesis' if p_value < alpha else '\nDo not reject the null hypothesis')

Mean test coefficient: 1.773 
P-value: 0.041

Reject the null hypothesis


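The null hypothesis is rejected (p-value = 0.041 < 0.05) with a positive test statistic, so our hypothesis is supported and the result is statistically significant.<br> 
So we can say that players loyal to famous games also enjoyed, on average, more than 60% of the niche games they reviewed.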