In [1]:
import findspark

# Make the local Spark installation importable: findspark adds pyspark to sys.path
# (see https://github.com/minrk/findspark#readme)
findspark.init()

# Report which Spark installation was picked up
spark_home = findspark.find()
print(f'Using Spark located in {spark_home}.')

from pyspark.sql import SparkSession

# Obtain the singleton Spark session (reusing an existing one when available);
# the underlying Spark context is created along with it
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

Using Spark located in /usr/local/spark/.


In [2]:
from pyspark.sql.types import BooleanType, DoubleType, LongType, StringType, StructField, StructType

# Second Hypothesis: 

In this notebook, the following hypothesis, already analyzed locally (using standard Python), is replicated on the Big Data source stored in HDFS:
 - **Does there exist a correlation between the number of owned games and the number of written reviews?**

## Correlation between the number of owned games and the number of written reviews

### Import reviews dataset from HDFS & select relevant features to perform the analysis

In [3]:
# Explicit schema for the base reviews dataset.
# Columns are listed as (name, Spark type) pairs in file order; every field is nullable.
base_reviews_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ('base_review_id', LongType),
        ('steamid', LongType),
        ('appid', LongType),
        ('voted_up', BooleanType),
        ('votes_up', LongType),
        ('votes_funny', LongType),
        ('weighted_vote_score', DoubleType),
        ('playtime_forever', LongType),
        ('playtime_at_review', LongType),
        ('num_games_owned', LongType),
        ('num_reviews', LongType),
        ('review', StringType),
        ('unix_timestamp_created', LongType),
        ('unix_timestamp_updated', LongType),
    )
])

# Load the base reviews CSV from HDFS with the explicit schema defined above,
# then expose the result through the pandas-on-Spark API
base_reviews_df = (
    spark.read
    .schema(base_reviews_schema)
    .options(
        escape='"',
        header=True,
        ignoreTrailingWhiteSpace=True,
        mode='FAILFAST',
        multiLine=True,
        unescapedQuoteHandling='STOP_AT_CLOSING_QUOTE',
    )
    .csv('hdfs://localhost:54310/final_project/data/base_reviews')
    .to_pandas_on_spark()
)



In [4]:
# Preview the reviews: take five rows on the Spark side and
# bring only those back as a plain pandas DataFrame for display
(
    base_reviews_df
    .to_spark()
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,base_review_id,steamid,appid,voted_up,votes_up,votes_funny,weighted_vote_score,playtime_forever,playtime_at_review,num_games_owned,num_reviews,review,unix_timestamp_created,unix_timestamp_updated
0,0,76561199012934585,204100,True,0,0,0.0,1671,1660,37,10,A masterpiece that is extremely underrated. Th...,1619063926,1619063926
1,1,76561198242204348,204100,True,0,0,0.0,414,414,54,28,Not like 1 and 2 of the series but its alright.,1619047384,1619047384
2,2,76561198078115373,204100,False,1,1,0.522059,119,119,91,8,Unskippable cut scenes are horrible. Gameplay ...,1619040366,1619040366
3,3,76561198255525846,204100,True,0,0,0.0,69,69,27,1,I enjoy the game. Played it to 100% on PS3 an...,1619035215,1619035215
4,4,76561199026331378,204100,True,0,0,0.0,608,608,40,1,"Feel the Payne ;)\nGreat Game, just like part ...",1619027681,1619027681


In [5]:
# Keep the most recent details of each player: order the reviews by creation time,
# group them by player and retain only the last (i.e. latest) review of every group
latest_detail_players_df = (
    base_reviews_df
    .sort_values('unix_timestamp_created')
    .groupby('steamid')
    .tail(1)
)

# One row per player is left, so the row count is the number of unique players
print('Number of unique players: ', len(latest_detail_players_df))

### Correlation test

To verify whether a correlation exists, we use *Spearman's* rank correlation method.

 - What is Spearman correlation? https://statisticsbyjim.com/basics/spearmans-correlation/

In [6]:
# Calculate the Spearman rank correlation between owned games and written reviews,
# using the latest details of each player.
# Hypothesis under test: the higher the number of owned games, the higher the number of reviews.
#
# The coefficient is read from the 2x2 correlation matrix by row/column label rather than
# by integer position: a positional lookup on a string-labelled Series depends on the column
# order and is deprecated in recent pandas versions.
statistics = (
    latest_detail_players_df[['num_games_owned', 'num_reviews']]
    .corr('spearman')
    .loc['num_games_owned', 'num_reviews']
)
print(f"Spearman correlation coefficient: {statistics:.3f}")

22/08/23 09:45:35 WARN Executor: Managed memory leak detected; size = 78676038 bytes, task 0.0 in stage 16.0 (TID 47)
22/08/23 09:45:39 WARN Executor: Managed memory leak detected; size = 78676038 bytes, task 0.0 in stage 27.0 (TID 56)
22/08/23 09:45:41 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/08/23 09:45:41 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


Spearman correlation coefficient: 0.494


In [None]:
# Stop the Spark context underlying the Spark session.
# Run this last: cells that use `spark` cannot be executed after it has been stopped.
spark.stop()