# Reviews Dataset Filtering

In [1]:
import findspark

# findspark.init() adds pyspark to sys.path (see https://github.com/minrk/findspark#readme),
# which is why the pyspark import further down comes after this call
findspark.init()
spark_home = findspark.find()
print(f'Using Spark located in {spark_home}.')

from pyspark.sql import SparkSession

# Reuse the current Spark session if one already exists, otherwise create it
# (the session is a singleton and owns the underlying Spark context)
spark = SparkSession.builder.getOrCreate()

Using Spark located in /usr/local/spark/.


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/08/30 18:13:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql.types import BooleanType, DoubleType, LongType, StringType, StructField, StructType

# Columns of the base reviews dataset, in file order, paired with their Spark types
base_reviews_fields = [
    ('base_review_id', LongType()),
    ('steamid', LongType()),
    ('appid', LongType()),
    ('voted_up', BooleanType()),
    ('votes_up', LongType()),
    ('votes_funny', LongType()),
    ('weighted_vote_score', DoubleType()),
    ('playtime_forever', LongType()),
    ('playtime_at_review', LongType()),
    ('num_games_owned', LongType()),
    ('num_reviews', LongType()),
    ('review', StringType()),
    ('unix_timestamp_created', LongType()),
    ('unix_timestamp_updated', LongType())
]

# Build the explicit schema (every column is nullable)
base_reviews_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in base_reviews_fields
])

# Load the base reviews CSV from HDFS with the schema above and expose it through the
# pandas-on-Spark API; FAILFAST makes the read abort on malformed records, and multiLine
# allows quoted review texts to span several lines
base_reviews_df = (
    spark.read
    .schema(base_reviews_schema)
    .options(
        escape='"',
        header=True,
        ignoreTrailingWhiteSpace=True,
        mode='FAILFAST',
        multiLine=True,
        unescapedQuoteHandling='STOP_AT_CLOSING_QUOTE'
    )
    .csv('hdfs://localhost:54310/final_project/data/base_reviews')
    .to_pandas_on_spark()
)



In [3]:
# Sanity check: display the first 5 rows of the base reviews dataset
# (the limit is applied on the Spark side, so only those rows are converted to pandas)
(
    base_reviews_df
    .to_spark()
    .limit(5)
    .toPandas()
)

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

Unnamed: 0,base_review_id,steamid,appid,voted_up,votes_up,votes_funny,weighted_vote_score,playtime_forever,playtime_at_review,num_games_owned,num_reviews,review,unix_timestamp_created,unix_timestamp_updated
0,0,76561199012934585,204100,True,0,0,0.0,1671,1660,37,10,A masterpiece that is extremely underrated. Th...,1619063926,1619063926
1,1,76561198242204348,204100,True,0,0,0.0,414,414,54,28,Not like 1 and 2 of the series but its alright.,1619047384,1619047384
2,2,76561198078115373,204100,False,1,1,0.522059,119,119,91,8,Unskippable cut scenes are horrible. Gameplay ...,1619040366,1619040366
3,3,76561198255525846,204100,True,0,0,0.0,69,69,27,1,I enjoy the game. Played it to 100% on PS3 an...,1619035215,1619035215
4,4,76561199026331378,204100,True,0,0,0.0,608,608,40,1,"Feel the Payne ;)\nGreat Game, just like part ...",1619027681,1619027681


In [4]:
# Keep only the reviews whose total playtime is at least the playtime recorded at review time;
# the opposite case is malformed (probably due to inconsistencies in the data gathering
# process performed by Steam)
playtime_is_consistent = base_reviews_df['playtime_forever'] >= base_reviews_df['playtime_at_review']
base_reviews_filtered_df = base_reviews_df[playtime_is_consistent]

In [5]:
# Keep only the rows whose review text is present (not null) and non-empty
review_text = base_reviews_filtered_df['review']
has_review_text = review_text.notnull() & (review_text.str.len() > 0)
base_reviews_filtered_df = base_reviews_filtered_df[has_review_text]

In [6]:
# Sanity check: display the first 5 rows of the filtered base reviews dataset
# (the limit is applied on the Spark side, so only those rows are converted to pandas)
(
    base_reviews_filtered_df
    .to_spark()
    .limit(5)
    .toPandas()
)

Unnamed: 0,base_review_id,steamid,appid,voted_up,votes_up,votes_funny,weighted_vote_score,playtime_forever,playtime_at_review,num_games_owned,num_reviews,review,unix_timestamp_created,unix_timestamp_updated
0,0,76561199012934585,204100,True,0,0,0.0,1671,1660,37,10,A masterpiece that is extremely underrated. Th...,1619063926,1619063926
1,1,76561198242204348,204100,True,0,0,0.0,414,414,54,28,Not like 1 and 2 of the series but its alright.,1619047384,1619047384
2,2,76561198078115373,204100,False,1,1,0.522059,119,119,91,8,Unskippable cut scenes are horrible. Gameplay ...,1619040366,1619040366
3,3,76561198255525846,204100,True,0,0,0.0,69,69,27,1,I enjoy the game. Played it to 100% on PS3 an...,1619035215,1619035215
4,4,76561199026331378,204100,True,0,0,0.0,608,608,40,1,"Feel the Payne ;)\nGreat Game, just like part ...",1619027681,1619027681


In [7]:
# Count the rows before and after filtering to report how many reviews were removed
base_count = len(base_reviews_df)
base_filtered_count = len(base_reviews_filtered_df)

# Percentage of removed reviews; an empty base dataset is reported as 0% instead of
# raising ZeroDivisionError
percentage = (1 - base_filtered_count / base_count) * 100 if base_count else 0.0
print(f'Removing {base_count - base_filtered_count} reviews of the {base_count} of the base reviews dataset ({percentage:.3f}%).')



Removing 6846 reviews of the 3087862 of the base reviews dataset (0.222%).


                                                                                

In [8]:
# Persist the filtered base reviews dataset to HDFS as CSV with a header row, replacing any
# previous output at that path; the escape character matches the one used when reading
(
    base_reviews_filtered_df
    .to_spark()
    .write
    .mode('overwrite')
    .options(escape='"', header=True)
    .csv('hdfs://localhost:54310/final_project/data/base_reviews_filtered')
)

                                                                                

In [9]:
# Stop the Spark context underlying the Spark session now that the filtered dataset has
# been written; `spark` must not be used for further work after this call
spark.stop()