In [1]:
import sys
sys.path.append('../../../scripts')
import utils

In [2]:
import findspark

# Locate the Spark installation (add pyspark to sys.path, see https://github.com/minrk/findspark#readme)
findspark.init()
print(f'Using Spark located in {findspark.find()}.')

from pyspark.sql import SparkSession
from pyspark.sql.types import BooleanType, DoubleType, LongType, StringType, StructField, StructType, ArrayType, IntegerType

# Create or get the Spark session (singleton) and the underlying Spark context
spark = SparkSession.builder.getOrCreate()

Using Spark located in /usr/local/spark/.


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/09/01 13:40:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Sixth and Seventh Hypothesis

In the following notebook it will be analyzed the following two hypotheses:
 - **Are players loyal to a specific game notoriety level (no matter which one)?**
 - **Do players who mostly review famous games also enjoy niche ones?**

*P.S:* The two hypotheses are considered together in this notebook since some data are shared.

## 6) Are players loyal to a specific game notoriety level (no matter which one)?

### Data import, preprocessing & obtain useful metrics to perform the analysis

#### Import Reviews from HDFS


In [3]:
# Define a reasonable schema for the reviews dataset
reviews_schema = StructType([
    StructField('base_review_id', LongType(), True),
    StructField('steamid', LongType(), True), 
    StructField('appid', LongType(), True),
    StructField('voted_up', BooleanType(), True),
    StructField('votes_up', LongType(), True),
    StructField('votes_funny', LongType(), True),
    StructField('weighted_vote_score', DoubleType(), True),
    StructField('playtime_forever', LongType(), True),
    StructField('playtime_at_review', LongType(), True),
    StructField('num_games_owned', LongType(), True),
    StructField('num_reviews', LongType(), True),
    StructField('review', StringType(), True),
    StructField('unix_timestamp_created', LongType(), True),
    StructField('unix_timestamp_updated', LongType(), True)
])

# Read the reviews dataset from HDFS
base_reviews_df = spark.read.csv(
    path='hdfs://localhost:54310/final_project/data/base_reviews',
    schema=reviews_schema,
    escape='"',
    header=True,
    ignoreTrailingWhiteSpace=True,
    mode='FAILFAST',
    multiLine=True,
    unescapedQuoteHandling='STOP_AT_CLOSING_QUOTE'
).to_pandas_on_spark()

#### Create games dataframe - contains the number of written reviews for a specific game (overall & only positive)

In [4]:
# Obtain all reviews written for each game
games_df = base_reviews_df.appid.value_counts().reset_index().rename(columns={
    'index': 'appid',
    'appid': 'num_reviews'
})

# Obtain all positive reviews written for each game
pos_revs_per_game_df = base_reviews_df[base_reviews_df.voted_up].appid.value_counts().reset_index().rename(columns={
    'index': 'appid',
    'appid': 'pos_reviews'
})
games_df = games_df.merge(pos_revs_per_game_df, how='left').fillna(0).astype(int)
games_df.to_spark().limit(5).toPandas()

                                                                                

Unnamed: 0,appid,num_reviews,pos_reviews
0,730,195612,172188
1,4000,59644,57716
2,105600,64284,63056
3,359550,67606,60965
4,271590,58734,44674


#### Obtain for each game a notoriety level and a percentage of positive reviews

The notoriety level is basically a measure of how much a game has been discussed in the small reviews dataset.

In [5]:
s = games_df.num_reviews.quantile([.25, .85])
THRESH1 = s[.25]
THRESH2 = s[.85]

THRESH1, THRESH2

                                                                                

(14.0, 296.0)

In [6]:
from pyspark.sql.functions import udf, col

def num_revs_to_label(num_rev: int) -> str:
    """ Get notoriety label from number of reviews """
    if num_rev < THRESH1:
        return 'NICHE'
    elif THRESH1 <= num_rev < THRESH2:
        return 'KNOWN'
    else:
        return 'FAMOUS'

udf_num_revs_to_label = udf(lambda x: num_revs_to_label(x), StringType())

games_df = games_df.to_spark().withColumn('notoriety', udf_num_revs_to_label(col('num_reviews'))).to_pandas_on_spark()
games_df['perc_pos_revs'] = (games_df.pos_reviews / games_df.num_reviews) * 100
games_df.to_spark().limit(5).toPandas()

                                                                                

Unnamed: 0,appid,num_reviews,pos_reviews,notoriety,perc_pos_revs
0,730,195612,172188,FAMOUS,88.025275
1,4000,59644,57716,FAMOUS,96.767487
2,105600,64284,63056,FAMOUS,98.089727
3,359550,67606,60965,FAMOUS,90.176907
4,271590,58734,44674,FAMOUS,76.061566


#### Calculate the maximum loyalty of each user (that have reviewed at least 3 games) to a game notoriety level

- **max_user_loyalty = (max(user_reviewed_games[notoriety_level]) / reviewed_games) * 100**

In [20]:
from pyspark.sql.functions import collect_list

from pyspark.sql import functions as F, Window

base_reviews_merge = (base_reviews_df[['steamid', 'appid', 'voted_up']]
    .merge(games_df[['appid', 'notoriety']])
)

w1 = Window.partitionBy('steamid', 'notoriety')
base_reviews_notoriety = base_reviews_merge.to_spark().withColumn('n_for_each_notoriety', F.count(col('notoriety')).over(w1))
base_reviews_notoriety = base_reviews_notoriety.dropDuplicates(['steamid', 'notoriety'])

w2 = Window.partitionBy('steamid')
max_for_each_notoriety = base_reviews_notoriety.withColumn('max_for_each_notoriety', F.max(col('n_for_each_notoriety')).over(w2))
max_for_each_notoriety = max_for_each_notoriety.dropDuplicates(['steamid'])

n = base_reviews_notoriety.withColumn('n', F.sum(col('n_for_each_notoriety')).over(w2))
n = n.dropDuplicates(['steamid'])

n = n[n['n'] > 2]

r = max_for_each_notoriety.join(n[['steamid', 'n']], on='steamid')

r = r.withColumn('result', col('max_for_each_notoriety') / col('n') * 100)

print(f'The mean is {r.to_pandas_on_spark()["result"].mean()}')
r.limit(100).toPandas()

22/09/01 14:59:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 14:59:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 14:59:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 14:59:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 14:59:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 14:59:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                                

The mean is 86.73383160554866


22/09/01 15:00:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:00:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:00:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:00:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:00:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:00:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:00:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:00:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:00:16 WARN RowBasedKeyValueBatch: Calling spill() on

Unnamed: 0,steamid,appid,voted_up,notoriety,n_for_each_notoriety,max_for_each_notoriety,n,result
0,76561197960269579,252470,True,FAMOUS,3,3,3,100.0
1,76561197960270050,205100,True,FAMOUS,5,5,5,100.0
2,76561197960270526,253230,True,FAMOUS,4,4,4,100.0
3,76561197960272328,287700,True,FAMOUS,4,4,4,100.0
4,76561197960272398,288160,True,FAMOUS,3,3,4,75.0
...,...,...,...,...,...,...,...,...
95,76561197960670470,292730,False,FAMOUS,3,3,3,100.0
96,76561197960670753,227160,False,KNOWN,2,2,4,50.0
97,76561197960674598,247020,True,FAMOUS,3,3,3,100.0
98,76561197960678117,215670,False,KNOWN,1,3,4,75.0


### Analysis

#### Statistical test

Are users loyal, for a M%, to one notoriety level (no matter which one)?

In [15]:
# Perform the mean test
statistics, p_value = utils.mean_test(r.to_pandas_on_spark()['result'], 85)

print(f"Mean test coefficient: {statistics:.3f} \nP-value: {p_value:.3f}")
alpha = 0.05
if p_value < alpha:
    print('\nReject the null hypothesis')
else:
    print('\nDo not reject the null hypothesis')

22/09/01 14:45:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 14:45:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 14:45:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 14:45:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 14:45:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 14:45:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 14:46:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 14:46:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 14:46:11 WARN RowBasedKeyValueBatch: Calling spill() on

Mean test coefficient: 42.758 
P-value: 0.000

Reject the null hypothesis


                                                                                

We can notice that our hypothesis is true (and is statistically significant).<br> 
So we can say that people are loyal to review (and play) the 75% of same notoriety level games.

## 7) Do players who mostly review famous games also enjoy niche ones?

#### Obtain the loyalty level for famous & niche games, of players that have at least reviewed 3 games

In [48]:
base_reviews_sum = base_reviews_merge.to_spark().withColumn('n_for_each_notoriety', F.count(col('notoriety')).over(w1))
base_reviews_sum = base_reviews_sum.withColumn('percentage_voted_up_for_each_notoriety', F.mean(col('voted_up').cast('int') * 100).over(w1))
base_reviews_sum = base_reviews_sum.dropDuplicates(['steamid', 'notoriety'])

m = base_reviews_sum.join(n[['steamid', 'n']], on='steamid')

m.limit(100).toPandas()

22/09/01 15:48:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:48:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:48:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:48:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:48:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:48:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:48:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:48:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:48:50 WARN RowBasedKeyValueBatch: Calling spill() on

Unnamed: 0,steamid,appid,voted_up,notoriety,n_for_each_notoriety,percentage_voted_up_for_each_notoriety,n
0,76561197960269579,252470,True,FAMOUS,3,100.0,3
1,76561197960270050,205100,True,FAMOUS,5,100.0,5
2,76561197960270526,253230,True,FAMOUS,4,100.0,4
3,76561197960272328,287700,True,FAMOUS,4,100.0,4
4,76561197960272398,288160,True,FAMOUS,3,100.0,4
...,...,...,...,...,...,...,...
95,76561197960460623,282140,True,FAMOUS,3,100.0,3
96,76561197960460660,234650,True,FAMOUS,3,100.0,3
97,76561197960462715,221910,True,FAMOUS,3,100.0,3
98,76561197960464070,346110,False,FAMOUS,1,0.0,3


In [49]:
m = m.withColumn('loyalty', col('n_for_each_notoriety') / col('n') * 100)
m.limit(100).toPandas()

22/09/01 15:49:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:03 WARN RowBasedKeyValueBatch: Calling spill() on

Unnamed: 0,steamid,appid,voted_up,notoriety,n_for_each_notoriety,percentage_voted_up_for_each_notoriety,n,loyalty
0,76561197960269579,252470,True,FAMOUS,3,100.0,3,100.000000
1,76561197960270050,205100,True,FAMOUS,5,100.0,5,100.000000
2,76561197960270526,253230,True,FAMOUS,4,100.0,4,100.000000
3,76561197960272328,287700,True,FAMOUS,4,100.0,4,100.000000
4,76561197960272398,288160,True,FAMOUS,3,100.0,4,75.000000
...,...,...,...,...,...,...,...,...
95,76561197960460623,282140,True,FAMOUS,3,100.0,3,100.000000
96,76561197960460660,234650,True,FAMOUS,3,100.0,3,100.000000
97,76561197960462715,221910,True,FAMOUS,3,100.0,3,100.000000
98,76561197960464070,346110,False,FAMOUS,1,0.0,3,33.333333


In [50]:
m_pd = m.to_pandas_on_spark()
loyalty_niche_not_zero = m_pd[(m_pd['notoriety'] == 'NICHE') & (m_pd['loyalty'] > 0.0)]
loyalty_niche_not_zero.to_spark().limit(100).toPandas()

22/09/01 15:49:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:14 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:15 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:15 WARN RowBasedKeyValueBatch: Calling spill() on

Unnamed: 0,steamid,appid,voted_up,notoriety,n_for_each_notoriety,percentage_voted_up_for_each_notoriety,n,loyalty
0,76561197960279937,562470,True,NICHE,1,100.0,14,7.142857
1,76561197960364346,4900,True,NICHE,1,100.0,4,25.000000
2,76561197960397498,448000,False,NICHE,2,0.0,9,22.222222
3,76561197960404790,32140,True,NICHE,1,100.0,3,33.333333
4,76561197960410177,387870,True,NICHE,1,100.0,3,33.333333
...,...,...,...,...,...,...,...,...
95,76561197985573260,437870,False,NICHE,1,0.0,13,7.692308
96,76561197987352433,371220,False,NICHE,3,0.0,4,75.000000
97,76561197987572484,327310,True,NICHE,1,100.0,7,14.285714
98,76561197988004463,289180,False,NICHE,2,0.0,20,10.000000


In [51]:
loyalty_famous_high = m_pd[(m_pd['notoriety'] == 'FAMOUS') & (m_pd['loyalty'] > 55.0)]
loyalty_famous_high.to_spark().limit(100).toPandas()

22/09/01 15:49:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:25 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:26 WARN RowBasedKeyValueBatch: Calling spill() on

Unnamed: 0,steamid,appid,voted_up,notoriety,n_for_each_notoriety,percentage_voted_up_for_each_notoriety,n,loyalty
0,76561197960269579,252470,True,FAMOUS,3,100.000000,3,100.000000
1,76561197960270050,205100,True,FAMOUS,5,100.000000,5,100.000000
2,76561197960270526,253230,True,FAMOUS,4,100.000000,4,100.000000
3,76561197960272328,287700,True,FAMOUS,4,100.000000,4,100.000000
4,76561197960272398,288160,True,FAMOUS,3,100.000000,4,75.000000
...,...,...,...,...,...,...,...,...
95,76561197960938228,232910,False,FAMOUS,3,66.666667,3,100.000000
96,76561197960951855,220200,True,FAMOUS,3,100.000000,3,100.000000
97,76561197960953632,247000,True,FAMOUS,12,83.333333,14,85.714286
98,76561197960953945,232090,True,FAMOUS,3,100.000000,4,75.000000


In [52]:
result = loyalty_niche_not_zero.merge(loyalty_famous_high['steamid'])
result.to_spark().limit(100).toPandas()

22/09/01 15:49:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:49:41 WARN RowBasedKeyValueBatch: Calling spill() on

Unnamed: 0,steamid,appid,voted_up,notoriety,n_for_each_notoriety,percentage_voted_up_for_each_notoriety,n,loyalty
0,76561197960410177,387870,True,NICHE,1,100.0,3,33.333333
1,76561197960427320,286380,True,NICHE,1,100.0,11,9.090909
2,76561197960967170,501430,False,NICHE,1,0.0,3,33.333333
3,76561197961194018,283350,False,NICHE,1,0.0,18,5.555556
4,76561197962127913,704840,True,NICHE,1,100.0,9,11.111111
...,...,...,...,...,...,...,...,...
95,76561198026999552,358960,True,NICHE,1,100.0,9,11.111111
96,76561198027094164,714010,True,NICHE,1,100.0,3,33.333333
97,76561198028059459,363410,True,NICHE,1,100.0,11,9.090909
98,76561198029094011,750210,False,NICHE,1,0.0,8,12.500000


#### Statistical test

Are users loyal to famous notoriety games also enjoying M% of niche ones?

In [55]:
statistics, p_value = utils.mean_test(result['percentage_voted_up_for_each_notoriety'], 60)

print(f"Mean test coefficient: {statistics:.3f} \nP-value: {p_value:.3f}")
alpha = 0.05
if p_value < alpha:
    print('\nReject the null hypothesis')
else:
    print('\nDo not reject the null hypothesis')

22/09/01 15:52:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:52:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:52:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:52:19 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:52:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:52:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:52:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:52:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/09/01 15:52:21 WARN RowBasedKeyValueBatch: Calling spill() on

Mean test coefficient: 2.996 
P-value: 0.001

Reject the null hypothesis


                                                                                

We can notice that our hypothesis is true (and is statistically significant).<br> 
So we can say that players loyal to famous notoriety games also enjoyed 60% of niche ones.