In [5]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.2'
os.environ['SPARK_VERSION']=spark_version
# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark
# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"
# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (108.157.162.110)] [0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Connected to cloud.r-proje                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Connected to cloud.r-proje                                                                               Hit:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Waiting for headers] [Wait                                                                               Hit:4 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Waiting for headers] [Conn                                                                   

In [6]:
# Download the Postgres driver that will allow Spark to interact with Postgres.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-10-21 04:31:19--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar.1’


2022-10-21 04:31:20 (5.37 MB/s) - ‘postgresql-42.2.16.jar.1’ saved [1002883/1002883]



In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("M16-Amazon-Challenge").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

In [8]:
from pyspark import SparkFiles
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Video_Games_v1_00.tsv.gz"
spark.sparkContext.addFile(url)
df = spark.read.option("encoding", "UTF-8").csv(SparkFiles.get("amazon_reviews_us_Video_Games_v1_00.tsv.gz"), sep="\t", header=True, inferSchema=True)
df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   12039526| RTIS3L2M1F5SM|B001CXYMFS|     737716809|Thrustmaster T-Fl...|     Video Games|          5|            0|          0|   N|                Y|an amazing joysti...|Used this for Eli...| 2015-08-31|
|         US|    9636577| R1ZV7R40OLHKD|B00M920ND6|     569686175|Tonsee 6 buttons ...|     Video Games|          5|    

In [9]:
# Load in a function to use columns
from pyspark.sql.functions import col
# Filter the data and create a new DataFrame or table to retrieve all the rows where the total_votes count is equal to or greater than 20:
filterd_reviews_df = df.filter(col("total_votes") >= 20)
filterd_reviews_df.show(20)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|    7142190| R4PKAZRQJJX14|B00QZLVCU0|     210935604|Valve - DOTA 2 St...|     Video Games|          1|           21|         34|   N|                N|What store doesn'...|Who pays 4 dollar...| 2015-08-31|
|         US|    1085641|R2CI0Y288CC7E2|B00RHI62GY|     626589765|ONE PIECE Pirate ...|     Video Games|          1|    

In [14]:
# Filter the new dataframe where number of helpful_votes / total_votes >= 50%:
per_filtered_df = filterd_reviews_df.filter(col("helpful_votes")/col("total_votes") >= 0.5)
per_filtered_df.show(20)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|    7142190| R4PKAZRQJJX14|B00QZLVCU0|     210935604|Valve - DOTA 2 St...|     Video Games|          1|           21|         34|   N|                N|What store doesn'...|Who pays 4 dollar...| 2015-08-31|
|         US|    1085641|R2CI0Y288CC7E2|B00RHI62GY|     626589765|ONE PIECE Pirate ...|     Video Games|          1|    

In [27]:
# Filter the DataFrame or table created in Step 2, and create a new DataFrame or table that retrieves all the rows where a review was written as part of the Vine program (paid):
vine_df = per_filtered_df.filter(col("vine") == 'Y')
vine_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   48771843|R3KKUSGFZWSUIY|B00YXO5UXG|     555176890|Turtle Beach - Ea...|     Video Games|          5|           56|         63|   Y|                N|Quality Chat Headset|Not every situati...| 2015-08-04|
|         US|   53080186|R10FO5UKKVZBK2|B00XO041RQ|     238654494|PDP AG7 True Wire...|     Video Games|          3|    

In [28]:
# Retrieve all the rows where the review was not part of the Vine program (unpaid)
no_vine_df = per_filtered_df.filter(col("vine") == 'N')
no_vine_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|    7142190| R4PKAZRQJJX14|B00QZLVCU0|     210935604|Valve - DOTA 2 St...|     Video Games|          1|           21|         34|   N|                N|What store doesn'...|Who pays 4 dollar...| 2015-08-31|
|         US|    1085641|R2CI0Y288CC7E2|B00RHI62GY|     626589765|ONE PIECE Pirate ...|     Video Games|          1|    

In [29]:
# Determine the total number of reviews
vine_df.count()
print("total number of vine reviews : %f" % vine_df.count())
no_vine_df.count()
print("Total number of reviews not paid for by vine: %f" % no_vine_df.count())
total_vine_reviews = vine_df.count() + no_vine_df.count()
print("Total number of Vine reviews: %f" % total_vine_reviews)


total number of vine reviews : 94.000000
Total number of reviews not paid for by vine: 40471.000000
Total number of Vine reviews: 40565.000000


In [30]:
# Determine the number of 5-star reviews
# paid 5-star count
vine_fivestars_df = vine_df.filter(col("star_rating") == 5)
print("Number of vine 5-star reviews was: %f" % vine_fivestars_df.count())
# unpaid 5-star count
no_vine_fivestars_df = no_vine_df.filter(col("star_rating") == 5)
print("Number of 5-star reviews not paid for by vine: %f" % no_vine_fivestars_df.count())
# total 5-star count
total_five_stars = vine_fivestars_df.count() + no_vine_fivestars_df.count()
print("Total number of 5-star Vine reviews was: %f" % total_five_stars)


Number of vine 5-star reviews was: 48.000000
Number of 5-star reviews not paid for by vine: 15663.000000
Total number of 5-star Vine reviews was: 15711.000000


In [31]:
# Determine the the percentage of 5-star reviews for the two types of review
# paid vine 5-stars %
vine_fivestars_percentage = vine_fivestars_df.count() * 100 / total_vine_reviews
print("Percentage of vine 5-star reviews: %f" % vine_fivestars_percentage )
# unpaid 5-star reviews %
no_vine_fivestars_percentage = no_vine_fivestars_df.count() * 100 / total_vine_reviews
print("Percentage of 5-star reviews not paid for by vine: %f" % no_vine_fivestars_percentage)


Percentage of vine 5-star reviews: 0.118329
Percentage of 5-star reviews not paid for by vine: 38.612104


In [35]:
# Determine the % of paid 5-star reviews vs. total 5-star reviews
paid_fivestars_percent = vine_fivestars_df.count() * 100 / total_five_stars
print("Share of paid 5-star reviews was: %f" % paid_fivestars_percent + "% of 5-star reviews")

# Determine the % of unpaid 5-star reviews vs. total 5-star reviews
unpaid_fivestars_percent= no_vine_fivestars_df.count() * 100 / total_five_stars
print("Share of unpaid 5-star reviews was: %f" % unpaid_fivestars_percent + "% of 5-star reviews")

Share of paid 5-star reviews was: 0.305518% of 5-star reviews
Share of unpaid 5-star reviews was: 99.694482% of 5-star reviews
