# Reviews Dataset Subsampling and Insertion Into MongoDB Database

In [1]:
import findspark

# findspark must run before any pyspark import: it locates the Spark installation and adds pyspark to sys.path
# (see https://github.com/minrk/findspark#readme)
findspark.init()
print(f'Using Spark located in {findspark.find()}.')

from pyspark.sql import SparkSession

# Reuse the already running Spark session if there is one, otherwise start a new one
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

Using Spark located in /usr/local/spark/.


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/08/12 10:07:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql.types import BooleanType, DoubleType, LongType, StringType, StructField, StructType

# Explicit schema of the reviews dataset, built from (column name, column type) pairs;
# every column is declared as nullable
reviews_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ('steamid', LongType()),
        ('appid', LongType()),
        ('voted_up', BooleanType()),
        ('votes_up', LongType()),
        ('votes_funny', LongType()),
        ('weighted_vote_score', DoubleType()),
        ('playtime_forever', LongType()),
        ('playtime_at_review', LongType()),
        ('num_games_owned', LongType()),
        ('num_reviews', LongType()),
        ('review', StringType()),
        ('unix_timestamp_created', LongType()),
        ('unix_timestamp_updated', LongType())
    ]
])

# Load the reviews dataset from HDFS with the schema above instead of inferring it.
# The review text is free-form, hence the options for values spanning multiple lines and containing quotes;
# FAILFAST makes the read raise an error on malformed records.
reviews_df = (
    spark.read
    .schema(reviews_schema)
    .options(
        escape='"',
        header=True,
        ignoreTrailingWhiteSpace=True,
        mode='FAILFAST',
        multiLine=True,
        unescapedQuoteHandling='STOP_AT_CLOSING_QUOTE'
    )
    .csv('hdfs://localhost:54310/final_project/data/reviews')
)

In [3]:
# Sort the original reviews dataset on all of its columns (in schema order), so that the following steps
# start from a fixed row order and are easier to debug
reviews_df = reviews_df.orderBy(*reviews_schema.fieldNames())

In [4]:
# Sanity check: bring the first 5 rows of the original reviews dataset to the driver and display them
(
    reviews_df
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,steamid,appid,voted_up,votes_up,votes_funny,weighted_vote_score,playtime_forever,playtime_at_review,num_games_owned,num_reviews,review,unix_timestamp_created,unix_timestamp_updated
0,76561197960265745,275850,True,2,0,0.545455,9484,1745,91,2,This has been hovering on my wishlist for ages...,1596147287,1596147287
1,76561197960265778,730,True,7,4,0.592431,349851,112806,780,6,Fun game I like it still,1416782428,1416782428
2,76561197960265778,252490,True,1,0,0.52381,124351,69823,780,6,"Fun game, good anti-cheat. I would recommend t...",1587390881,1587390881
3,76561197960265781,39210,True,0,0,0.0,41173,15285,560,9,It's hard to find anything negative to write a...,1614500711,1614500711
4,76561197960265781,493340,True,2,0,0.52381,3406,2319,560,9,Planet Coaster is essentially what everyone wa...,1479599829,1479599829


In [5]:
# Check the column types of the reviews Spark DataFrame: they are expected to reflect the schema passed
# to the CSV reader (LongType is reported as 'bigint')
reviews_df.dtypes

[('steamid', 'bigint'),
 ('appid', 'bigint'),
 ('voted_up', 'boolean'),
 ('votes_up', 'bigint'),
 ('votes_funny', 'bigint'),
 ('weighted_vote_score', 'double'),
 ('playtime_forever', 'bigint'),
 ('playtime_at_review', 'bigint'),
 ('num_games_owned', 'bigint'),
 ('num_reviews', 'bigint'),
 ('review', 'string'),
 ('unix_timestamp_created', 'bigint'),
 ('unix_timestamp_updated', 'bigint')]

In [6]:
# Randomly subsample the original dataset due to the single-node Spark installation limitation.
# The sample is cached because it is the starting point of several later actions (counts, previews, export,
# further subsampling). Without caching, each of those actions would re-read, re-sort and re-sample the whole
# original dataset, and values that depend on the physical partitioning of the data (such as the ids produced
# by monotonically_increasing_id()) would not be guaranteed to refer to the same rows from one action to the next.
base_reviews_df = reviews_df.sample(withReplacement=False, fraction=0.2, seed=0).cache()

# Get the exact number of rows extracted from the original dataset: sampling only approximates the requested
# fraction. Counting the cached sample also materializes the cache.
original_count = reviews_df.count()
base_count = base_reviews_df.count()
percentage = base_count / original_count * 100
print(f'Taking {base_count} reviews out of the {original_count} of the original reviews dataset ({percentage:.3f}%).')



Taking 3086382 reviews out of the 15437471 of the original reviews dataset (19.993%).


                                                                                

In [7]:
from pyspark.sql.functions import monotonically_increasing_id

# Prepend a unique id column to the extracted reviews, to be used as a common reference for further analyses.
# The generated ids are unique and increasing but not necessarily consecutive, and they depend on how the
# DataFrame is partitioned when it is evaluated.
base_review_id_column_name = 'base_review_id'
base_reviews_df = base_reviews_df.select(
    monotonically_increasing_id().alias(base_review_id_column_name),
    *reviews_schema.fieldNames()
)

In [8]:
# Sanity check: bring the first 5 rows of the base reviews dataset to the driver and display them
(
    base_reviews_df
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,base_review_id,steamid,appid,voted_up,votes_up,votes_funny,weighted_vote_score,playtime_forever,playtime_at_review,num_games_owned,num_reviews,review,unix_timestamp_created,unix_timestamp_updated
0,0,76561197960265778,252490,True,1,0,0.52381,124351,69823,780,6,"Fun game, good anti-cheat. I would recommend t...",1587390881,1587390881
1,1,76561197960265822,578080,False,0,0,0.0,92857,88764,326,5,Love to loot entire places and find no guns or...,1490862846,1531677554
2,2,76561197960265890,620,True,0,0,0.0,53,52,307,10,this game fucking sucks. no buy.,1336880073,1336880073
3,3,76561197960265908,440900,True,0,0,0.0,4387,4189,47,1,It kinda feels like the sims meets world of wa...,1525909615,1527766944
4,4,76561197960265942,310110,False,118,0,0.752326,627,485,1266,10,"First off, I'm going to write about the game, ...",1414958760,1414986669


In [9]:
# Check the column types of the base reviews Spark DataFrame: the id column is expected to come first,
# followed by the columns of the original schema with unchanged types
base_reviews_df.dtypes

[('base_review_id', 'bigint'),
 ('steamid', 'bigint'),
 ('appid', 'bigint'),
 ('voted_up', 'boolean'),
 ('votes_up', 'bigint'),
 ('votes_funny', 'bigint'),
 ('weighted_vote_score', 'double'),
 ('playtime_forever', 'bigint'),
 ('playtime_at_review', 'bigint'),
 ('num_games_owned', 'bigint'),
 ('num_reviews', 'bigint'),
 ('review', 'string'),
 ('unix_timestamp_created', 'bigint'),
 ('unix_timestamp_updated', 'bigint')]

In [10]:
# Save the extracted base reviews dataset to HDFS as CSV, replacing the output of any previous run.
# NOTE(review): no header row is requested and the writer's default quoting/escaping is used, unlike the
# options used to read the original dataset - confirm that downstream readers expect this format.
(
    base_reviews_df.write
    .mode('overwrite')
    .csv('hdfs://localhost:54310/final_project/data/base_reviews')
)

                                                                                

In [11]:
# Draw a random subsample of the base dataset (fixed seed, no replacement) to obtain a small repository
# on which to perform the preliminary analyses
small_reviews_df = base_reviews_df.sample(fraction=0.1, seed=0, withReplacement=False)

# Get the exact number of rows extracted from the base dataset, since sampling only approximates
# the requested fraction
small_count = small_reviews_df.count()
percentage = small_count / base_count * 100
print(
    f'Taking {small_count} reviews out of the {base_count} of the base reviews dataset ({percentage:.3f}%).'
)



Taking 308667 reviews out of the 3086382 of the base reviews dataset (10.001%).


                                                                                

In [12]:
# Sanity check: bring the first 5 rows of the small reviews dataset to the driver and display them
(
    small_reviews_df
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,base_review_id,steamid,appid,voted_up,votes_up,votes_funny,weighted_vote_score,playtime_forever,playtime_at_review,num_games_owned,num_reviews,review,unix_timestamp_created,unix_timestamp_updated
0,2,76561197960265890,620,True,0,0,0.0,53,52,307,10,this game fucking sucks. no buy.,1336880073,1336880073
1,9,76561197960266642,450540,True,0,0,0.0,3928,3223,142,2,one of the best VR weapon simulators ever made.,1573057253,1573057253
2,16,76561197960267291,378610,False,0,0,0.46875,163,163,200,5,The game crashed on my machine 4 times in 2 ho...,1577395138,1577395138
3,26,76561197960267685,252950,True,0,0,0.0,60406,53991,76,2,is good game,1575582355,1575582355
4,46,76561197960268765,287700,True,0,0,0.0,4862,3725,157,27,"Basically, one of the best MGS games out there!",1511807430,1511807430


In [13]:
import pymongo

# Open a client with the default connection settings (i.e. the local MongoDB instance) and pick the
# final_project database, used as repository for the small reviews dataset
mongo = pymongo.MongoClient()
mongo_db = mongo['final_project']

In [14]:
# Empty the small_reviews collection first, so that re-running this cell does not duplicate the documents
mongo_db['small_reviews'].delete_many({})

# Collect the small dataset on the driver and convert it to a list holding one dict per review
# (despite its name, small_reviews_dict is a list of dicts)
small_reviews_dict = (
    small_reviews_df
    .toPandas()
    .to_dict(orient='records')
)

# Store the reviews as documents of the small_reviews collection of the final_project MongoDB database
# (the trailing semicolon suppresses the display of the insertion result)
mongo_db['small_reviews'].insert_many(small_reviews_dict);

                                                                                

In [15]:
# Release the resources acquired by the notebook.
# Close the connection to the local MongoDB instance
mongo.close()

# Stop the Spark context underlying the Spark session; the DataFrames defined above cannot be evaluated
# after this point without creating a new session
spark.stop()