In [1]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.3.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Packages [985 kB]
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 k

In [2]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-10-21 01:37:14--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2022-10-21 01:37:14 (11.4 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Vine_Review_Analysis").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

In [5]:
from pyspark import SparkFiles
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"
spark.sparkContext.addFile(url)
df = spark.read.option("encoding", "UTF-8").csv(SparkFiles.get(""), sep="\t", header=True, inferSchema=True)
df.show(10)

+-----------+-----------+--------------+----------+--------------+--------------------+-------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|   product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|        review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+-------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|         US|   45610553| RMDCHWD0Y5OZ9|B00HH62VB6|     618218723|AGPtek® 10 Isolat...|Musical Instruments|          3|            0|          1|   N|                N|         Three Stars|Works very good, ...|2015-08-31 00:00:00|
|         US|   14640079| RZSL0BALIYUNU|B003LRN53I|     986692292|Sennheiser

In [6]:
# Create the vine_table. DataFrame
vine_df = df.select([
    "review_id",
    "star_rating",
    "helpful_votes",
    "total_votes",
    "vine",
    "verified_purchase"
])
vine_df.show(5)

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
| RMDCHWD0Y5OZ9|          3|            0|          1|   N|                N|
| RZSL0BALIYUNU|          5|            0|          0|   N|                Y|
| RIZR67JKUDBI0|          3|            0|          1|   N|                Y|
|R27HL570VNL85F|          5|            0|          0|   N|                Y|
|R34EBU9QDWJ1GD|          5|            0|          0|   N|                Y|
+--------------+-----------+-------------+-----------+----+-----------------+
only showing top 5 rows



In [13]:
# get reviews where total votes greater than 20
top_vine_df = vine_df.filter(vine_df["total_votes"] >= 20)
top_vine_df.show(5)

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R2243Y3OD8U6KQ|          5|           47|         61|   N|                N|
|R2TGT0CDTCAAHW|          5|           21|         23|   N|                Y|
| RX4D22YSXEF4P|          1|           37|         38|   N|                Y|
|R3FL2NTLFUSPTQ|          5|           33|         37|   N|                N|
|R3QTP3YNZXAPPF|          3|           23|         24|   N|                Y|
+--------------+-----------+-------------+-----------+----+-----------------+
only showing top 5 rows



In [14]:
# get all reviews where helpful and total ratio > 0.5
helpful_vine_df = top_vine_df.filter(
    top_vine_df["helpful_votes"] / top_vine_df["total_votes"] >= 0.5
  )
helpful_vine_df.show(5)

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R2243Y3OD8U6KQ|          5|           47|         61|   N|                N|
|R2TGT0CDTCAAHW|          5|           21|         23|   N|                Y|
| RX4D22YSXEF4P|          1|           37|         38|   N|                Y|
|R3FL2NTLFUSPTQ|          5|           33|         37|   N|                N|
|R3QTP3YNZXAPPF|          3|           23|         24|   N|                Y|
+--------------+-----------+-------------+-----------+----+-----------------+
only showing top 5 rows



In [15]:
# compare amounts of past 2 dfs
helpful_vine_df.count(), top_vine_df.count()

(14537, 16520)

In [17]:
# get reviews that are sponsored (YES)
yes_vine_df = helpful_vine_df.filter(helpful_vine_df["vine"] == "Y")
yes_vine_df.show(5)

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R1R9RU7JW0MFR2|          4|           20|         23|   Y|                N|
|R19EFYNN3W8Q07|          5|           26|         32|   Y|                N|
|R34DJ1R8AEU0SG|          5|           29|         35|   Y|                N|
|R25P5CXK5L9RHF|          5|          146|        161|   Y|                N|
|R2E9VZB3I4LSN5|          5|           55|         59|   Y|                N|
+--------------+-----------+-------------+-----------+----+-----------------+
only showing top 5 rows



In [18]:
# get reviews not sponsored (NO)
no_vine_df = helpful_vine_df.filter(helpful_vine_df["vine"] == "N")
no_vine_df.show(5)

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R2243Y3OD8U6KQ|          5|           47|         61|   N|                N|
|R2TGT0CDTCAAHW|          5|           21|         23|   N|                Y|
| RX4D22YSXEF4P|          1|           37|         38|   N|                Y|
|R3FL2NTLFUSPTQ|          5|           33|         37|   N|                N|
|R3QTP3YNZXAPPF|          3|           23|         24|   N|                Y|
+--------------+-----------+-------------+-----------+----+-----------------+
only showing top 5 rows



In [19]:
# compare amounts
yes_vine_df.count(), no_vine_df.count()

(60, 14477)

5. Determine the total number of reviews, the number of 5-star reviews, and the percentage of 5-star reviews for the two types of review (paid vs unpaid).

In [23]:
# Total number of Reviews
total_count = helpful_vine_df.count()
# Number of 5-star Reviews
five_stars_count = helpful_vine_df.filter(helpful_vine_df["star_rating"] == 5).count()
total_count, five_stars_count

In [33]:
# Count of paid and unpaid 5-star Reviews
paid_five_stars_count = yes_vine_df.filter(yes_vine_df["star_rating"] == 5).count()
unpaid_five_stars_count = no_vine_df.filter(no_vine_df["star_rating"] == 5).count()
# Count of all paid and unpaid reviews
paid_count = yes_vine_df.count()
unpaid_count = no_vine_df.count()
print(
    "Unpaid:", unpaid_five_stars_count, unpaid_count,
    "\nPaid:",  paid_five_stars_count, paid_count, 
)

Unpaid: 8212 14477 
Paid: 34 60


In [35]:
# Percentages of paid and unpaid 5-star Reviews
paid_percentage = 100 * paid_five_stars_count / paid_count
unpaid_percentage = 100 * unpaid_five_stars_count / unpaid_count
print(
    "Percentages of unpaid, paid:", 
    paid_percentage, unpaid_percentage
)

Percentages of unpaid, paid: 56.666666666666664 56.72445948746287


In [59]:
# Report
print(
    f"{'-'*30}\n"
    "From the Most Helpful Reviews:\n"
    f"{'-'*30}\n"
    f"Total Count: {total_count}\n"
    f"Total Count of 5-star Reviews: {five_stars_count}\n"
    f"{'-'*30}\n"
    f"Total Count of Unpaid Reviews: {unpaid_count}\n"
    f"Total Count of Paid Reviews: {paid_count}\n"
    f"Percentage of 5-star Unpaid Reviews: {unpaid_percentage:.2f}%\n"
    f"Percentage of 5-star Paid Reviews: {paid_percentage:.2f}%\n"
    f"{'-'*30}\n"
    f"Comments:\n"
    f"{'-'*30}\n"
    f"There are {unpaid_five_stars_count/paid_five_stars_count*100:.2f}% more Unpaid reviews compared to Paid ones.\n"
    f"However, the 5-star review distribution from both samples is almost identical.\n"
    f"This means that the Paid reviews are artificially closer to the mean\n"
    f"The Paid Reviews are not as trustworthy as the Unpaid ones."
)

------------------------------
From the Most Helpful Reviews:
------------------------------
Total Count: 14537
Total Count of 5-star Reviews: 8246
------------------------------
Total Count of Unpaid Reviews: 14477
Total Count of Paid Reviews: 60
Percentage of 5-star Unpaid Reviews: 56.72%
Percentage of 5-star Paid Reviews: 56.67%
------------------------------
Comments:
------------------------------
There are 24152.94% more Unpaid reviews compared to Paid ones.
However, the 5-star review distribution from both samples is almost identical.
This means that the Paid reviews are artificially closer to the mean
The Paid Reviews are not as trustworthy as the Unpaid ones.
