In [1]:
import sys
sys.path.append('../../../scripts')
import utils

In [2]:
import findspark

# Locate the Spark installation (add pyspark to sys.path, see https://github.com/minrk/findspark#readme)
findspark.init()
print(f'Using Spark located in {findspark.find()}.')

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import BooleanType, DoubleType, LongType, StringType, StructField, StructType, ArrayType, IntegerType

# Create or get the Spark session (singleton) and the underlying Spark context
spark = SparkSession.builder.getOrCreate()

Using Spark located in /usr/local/spark/.


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/08/25 17:42:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/25 17:42:54 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Sixth and Seventh Hypothesis

In the following notebook it will be analyzed the following two hypotheses:
 - **Are players loyal to a specific game notoriety level (no matter which one)?**
 - **Do players who mostly review famous games also enjoy niche ones?**

*P.S:* The two hypotheses are considered together in this notebook since some data are shared.

## 6) Are players loyal to a specific game notoriety level (no matter which one)?

### Data import, preprocessing & obtain useful metrics to perform the analysis

#### Import Reviews from HDFS


In [3]:
# Define a reasonable schema for the reviews dataset
reviews_schema = StructType([
    StructField('base_review_id', LongType(), True),
    StructField('steamid', LongType(), True), 
    StructField('appid', LongType(), True),
    StructField('voted_up', BooleanType(), True),
    StructField('votes_up', LongType(), True),
    StructField('votes_funny', LongType(), True),
    StructField('weighted_vote_score', DoubleType(), True),
    StructField('playtime_forever', LongType(), True),
    StructField('playtime_at_review', LongType(), True),
    StructField('num_games_owned', LongType(), True),
    StructField('num_reviews', LongType(), True),
    StructField('review', StringType(), True),
    StructField('unix_timestamp_created', LongType(), True),
    StructField('unix_timestamp_updated', LongType(), True)
])

# Read the reviews dataset from HDFS
base_reviews_df = spark.read.csv(
    path='hdfs://localhost:54310/final_project/data/base_reviews',
    schema=reviews_schema,
    escape='"',
    header=True,
    ignoreTrailingWhiteSpace=True,
    mode='FAILFAST',
    multiLine=True,
    unescapedQuoteHandling='STOP_AT_CLOSING_QUOTE'
).to_pandas_on_spark()



#### Create games dataframe - contains the number of written reviews for a specific game (overall & only positive)

In [4]:
# Obtain all reviews written for each game
games_df = base_reviews_df.appid.value_counts().reset_index().rename(columns={
    'index': 'appid',
    'appid': 'num_reviews'
})

# Obtain all positive reviews written for each game
pos_revs_per_game_df = base_reviews_df[base_reviews_df.voted_up].appid.value_counts().reset_index().rename(columns={
    'index': 'appid',
    'appid': 'pos_reviews'
})
games_df = games_df.merge(pos_revs_per_game_df, how='left').fillna(0).astype(int)
games_df.to_spark().limit(5).toPandas()

                                                                                

Unnamed: 0,appid,num_reviews,pos_reviews
0,730,195612,172188
1,4000,59644,57716
2,105600,64284,63056
3,359550,67606,60965
4,271590,58734,44674


#### Utility functions

In [20]:
""" Utility functions """

def appid_to_num_reviews(appid):
    """ Given the id of a game it gives its number of reviews """
    l = games_df[games_df.appid==appid].num_reviews.to_list()
    return l[0] if l else 0

def appid_to_notoriety(appid):
    """ Given the id of a game it gives its notoriety level """
    l = games_df[games_df.appid==appid].notoriety.to_list()
    return l[0] if l else 0

def get_vote_val(userid, appid):
    """ Obtain whether a player enjoyed a particular game or not """
    return small_reviews_df[(small_reviews_df.steamid == userid) & (small_reviews_df.appid == appid) ].voted_up.to_list()[0]

def num_revs_to_label(num_rev:int)->str:
    """ Get notoriety label from number of reviews """
    if num_rev < THRESH1:
        return 'NICHE'
    elif THRESH1 <= num_rev < THRESH2:
        return 'KNOWN'
    else:
        return 'FAMOUS'

#### Obtain for each game a notoriety level and a percentage of positive reviews

The notoriety level is basically a measure of how much a game has been discussed in the small reviews dataset.

In [6]:
s = games_df.num_reviews.quantile([.25, .85])
THRESH1 = s[.25]
THRESH2 = s[.85]

THRESH1, THRESH2

                                                                                

(14.0, 296.0)

In [7]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

udf_num_revs_to_label = udf(lambda x: num_revs_to_label(x), StringType())

games_df = games_df.to_spark().withColumn('notoriety', udf_num_revs_to_label(col('num_reviews'))).to_pandas_on_spark()
games_df.to_spark().limit(5).toPandas()

                                                                                

Unnamed: 0,appid,num_reviews,pos_reviews,notoriety
0,730,195612,172188,FAMOUS
1,4000,59644,57716,FAMOUS
2,105600,64284,63056,FAMOUS
3,359550,67606,60965,FAMOUS
4,271590,58734,44674,FAMOUS


In [8]:
games_df['perc_pos_revs'] = (games_df.pos_reviews / games_df.num_reviews) * 100
games_df.to_spark().limit(5).toPandas()

                                                                                

Unnamed: 0,appid,num_reviews,pos_reviews,notoriety,perc_pos_revs
0,730,195612,172188,FAMOUS,88.025275
1,4000,59644,57716,FAMOUS,96.767487
2,105600,64284,63056,FAMOUS,98.089727
3,359550,67606,60965,FAMOUS,90.176907
4,271590,58734,44674,FAMOUS,76.061566


#### Obtain for each user a list of reviewed games & keep the ones that had reviewed at least 3 games

In [9]:
udf_to_list = F.udf(lambda x:list(x), ArrayType(IntegerType()))

users_reviewed_games = base_reviews_df.to_spark().groupby('steamid')\
                        .agg(F.collect_list('appid').alias('appid'))\
                        .withColumn('appid', udf_to_list(F.col('appid')))\
                        .to_pandas_on_spark()

users_reviewed_games.to_spark().limit(5).toPandas()

                                                                                

Unnamed: 0,steamid,appid
0,76561197960266219,[573170]
1,76561197960266231,[92800]
2,76561197960267068,[453090]
3,76561197960267185,[351490]
4,76561197960267331,"[61700, 730]"


In [10]:
from pyspark.sql.types import BooleanType

udf_filter_at_least_3_games = udf(lambda x: (len(x) > 2), BooleanType())

users_reviewed_games = users_reviewed_games.to_spark().withColumn('at_least_3_games', udf_filter_at_least_3_games(col('appid'))).to_pandas_on_spark()
users_reviewed_least_3_games = users_reviewed_games[users_reviewed_games['at_least_3_games']]
users_reviewed_least_3_games.to_spark().limit(5).toPandas()

                                                                                

Unnamed: 0,steamid,appid,at_least_3_games
0,76561197960269579,"[4920, 980300, 252470]",True
1,76561197960270050,"[205100, 215280, 250320, 110800, 115800]",True
2,76561197960270526,"[253230, 571740, 578080, 730]",True
3,76561197960272328,"[287700, 203140, 418240, 578080]",True
4,76561197960272398,"[288160, 107200, 396750, 716630]",True


#### Convert list of games to list of notoriety levels

In [21]:
from pyspark.sql.types import ArrayType

wrong_udf = udf(lambda x: [appid_to_notoriety(g, games_df) for g in x], ArrayType(StringType()))
# wrong_udf = udf(lambda x: appid_to_notoriety(x[0]), StringType())

# users_reviewed_least_3_games['notoriety'] = users_reviewed_least_3_games.appid.apply(lambda gs: [appid_to_notoriety(g) for g in gs])
users_reviewed_least_3_games = users_reviewed_least_3_games.to_spark().withColumn('notoriety', wrong_udf(col('appid'))).to_pandas_on_spark()
users_reviewed_least_3_games.to_spark().limit(5).toPandas()

Traceback (most recent call last):
  File "/usr/local/spark/python/pyspark/serializers.py", line 437, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
  File "/usr/local/spark/python/pyspark/cloudpickle/cloudpickle_fast.py", line 73, in dumps
    cp.dump(obj)
  File "/usr/local/spark/python/pyspark/cloudpickle/cloudpickle_fast.py", line 563, in dump
    return Pickler.dump(self, obj)
TypeError: cannot pickle '_thread.RLock' object


PicklingError: Could not serialize object: TypeError: cannot pickle '_thread.RLock' object

### Analysis

#### Calculate the maximum loyalty of each user (that have reviewed at least 3 games) to a game notoriety level

- **max_user_loyalty = (  max(user_reviewed_games[notoriety_level]) / reviewed_games  ) * 100**

In [None]:
notoriety_games_users_reviewed_least_3_games = pd.DataFrame(users_reviewed_least_3_games['notoriety'])
notoriety_games_users_reviewed_least_3_games.name = 'notoriety'
notoriety_games_users_reviewed_least_3_games

In [None]:

users_max_loyalty_level = notoriety_games_users_reviewed_least_3_games.notoriety.apply(lambda x : 100 * max(collections.Counter(x).values()) / sum(collections.Counter(x).values()) )
users_max_loyalty_level

#### Statistical test

Are users loyal, for a M%, to one notoriety level (no matter which one)?

In [None]:
# Perform the mean test
statistics, p_value = utils.mean_test(users_max_loyalty_level, 75)

print(f"Mean test coefficient: {statistics:.3f} \nP-value: {p_value:.3f}")
alpha = 0.05
if p_value < alpha:
    print('\nReject the null hypothesis')
else:
    print('\nDo not reject the null hypothesis')

We can notice that our hypothesis is true (and is statistically significant).<br> 
So we can say that people are loyal to review (and play) the 75% of same notoriety level games.

## 7) Do players who mostly review famous games also enjoy niche ones?

#### Obtain the loyalty level for famous & niche games, of players that have at least reviewed 3 games

In [None]:
users_reviewed_least_3_games['loyalty_famous'] = users_reviewed_least_3_games.notoriety.apply( lambda gs : 100*collections.Counter(gs)['FAMOUS'] / sum(collections.Counter(gs).values())  )
users_reviewed_least_3_games['loyalty_niche'] = users_reviewed_least_3_games.notoriety.apply( lambda gs : 100*collections.Counter(gs)['NICHE'] / sum(collections.Counter(gs).values())  )
users_reviewed_least_3_games[['steamid', 'loyalty_famous','loyalty_niche']]

#### Consider only people who are very loyal to famous games and that have at least played one niche game

In [None]:
pop_game_players = users_reviewed_least_3_games[ (users_reviewed_least_3_games.loyalty_famous > 55)  & (users_reviewed_least_3_games.loyalty_niche != 0 )]
pop_game_players[['steamid', 'loyalty_famous','loyalty_niche']]

#### Obtain the appreciation to niche games of players that are very loyal to famous games (and at least played one niche game)

In [None]:
niche_appreciation_pop_game_players = pop_game_players.apply(lambda r : [get_vote_val(r.steamid, g) for g in r.appid if appid_to_notoriety(g) == 'NICHE'  ], axis=1).apply(lambda x : 100 * sum(x)/len(x))
niche_appreciation_pop_game_players.value_counts()

#### Statistical test

Are users loyal to famous notoriety games also enjoying M% of niche ones?

In [None]:
statistics, p_value = utils.mean_test(niche_appreciation_pop_game_players, 60)

print(f"Mean test coefficient: {statistics:.3f} \nP-value: {p_value:.3f}")
alpha = 0.05
if p_value < alpha:
    print('\nReject the null hypothesis')
else:
    print('\nDo not reject the null hypothesis')

We can notice that our hypothesis is true (and is statistically significant).<br> 
So we can say that players loyal to famous notoriety games also enjoyed 60% of niche ones.