<a href="https://colab.research.google.com/github/Bag0niku/Amazon_Vine_Analysis/blob/main/Vine_Review_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Spark and Java, then make pyspark importable in this runtime.
# Pick a Spark 3.x release listed at http://www.apache.org/dist/spark/ and enter it as spark_version below.
# NOTE(review): that directory may only list current releases; if the wget below fails for an
# older version, check whether it has moved to https://archive.apache.org/dist/spark/ - confirm.
import os
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
spark_version = 'spark-3.3.0'
# Export the version so the shell commands below can expand $SPARK_VERSION.
os.environ['SPARK_VERSION']=spark_version
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
# Tell Java/Spark tooling where the JDK and the unpacked Spark distribution live.
# Assumes the tarball was extracted in /content (the current directory on Colab) - TODO confirm
# when running anywhere else.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# The SQL database we will be using is Postgres; download its JDBC driver jar
# (referenced later by file name when the Spark session is configured).
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

!pip install -q findspark # dependency to find the pyspark installation
import findspark  
findspark.init()  # initialize the search for pyspark

# Import remaining dependencies (these must come after findspark.init()).
from pyspark import SparkFiles
from pyspark.sql import SparkSession
import pyspark.sql
from getpass import getpass

# Database password: prompted interactively so it is never written into the notebook.
password = getpass()


Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:2 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Packages [948 kB]
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:13 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 k

In [None]:
# Start (or reuse) the Spark session with the Postgres JDBC driver jar on the
# driver classpath.
spark = (
    SparkSession.builder
    .appName("AmazonReviews")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

# Connection settings for the Postgres database hosted on RDS.
mode = "append"
jdbc_url = "jdbc:postgresql://amazon-vine-analysis.comqyjfkggk2.us-west-2.rds.amazonaws.com:5432/postgres"
config = dict(
    user="postgres",
    password=password,
    driver="org.postgresql.Driver",
)


# Deliverable 2: Determine Bias of Vine member reviews
Is there any bias toward reviews that were written as part of the Vine program?  
 - Determine if having a paid Vine review makes a difference in the percentage of 5-star reviews.

In [None]:
# Load the vine table from the Postgres database on AWS and register it as a
# temp view so the following cells can query it with Spark SQL.
vine_table = spark.read.jdbc(
    url=jdbc_url,
    table="vine_table",
    properties=config,
)
vine_table.createOrReplaceTempView("vine_table")

# Show how many rows were loaded, followed by the first 10 rows.
row_count_query = "SELECT COUNT(*) AS row_count FROM vine_table;"
spark.sql(row_count_query).show()
vine_table.show(n=10)


In [None]:
# Keep only the reviews that received enough votes to be worth analysing:
# a minimum of 20 total votes (i.e. total_votes > 19).
usable_query = "SELECT * FROM vine_table WHERE total_votes > 19;"
usable_vine_table = spark.sql(usable_query)
usable_vine_table.createOrReplaceTempView("usable_vine_table")

# Show how many rows remain, followed by the first 10 rows.
spark.sql(
    "SELECT COUNT(*) AS row_count FROM usable_vine_table;"
).show()
usable_vine_table.show(n=10)

In [None]:
# Filter usable_vine_table down to reviews where helpful votes make up at
# least 50% of the total votes. The comparison must be >= 0.5; the previous
# <= 0.5 kept the opposite set (reviews that were mostly NOT found helpful).
# total_votes is > 19 for every row of usable_vine_table, so the division
# never sees a zero denominator.
helpful_votes_table = spark.sql("SELECT * FROM usable_vine_table WHERE (helpful_votes/total_votes) >= 0.5;")
helpful_votes_table.createOrReplaceTempView("helpful_votes_table")

# display the row count and the first 10 rows
spark.sql("SELECT COUNT(*) AS row_count FROM helpful_votes_table;").show()
helpful_votes_table.show(10)

In [None]:
# Keep only the reviews written by Vine members (vine == 'Y') out of
# usable_vine_table, and expose them as their own temp view.
# NOTE(review): this reads from usable_vine_table, not helpful_votes_table -
# confirm that skipping the helpfulness filter here is intended.
vine_only_query = "SELECT * FROM usable_vine_table WHERE vine == 'Y' ;"
vine_reviews_table = spark.sql(vine_only_query)
vine_reviews_table.createOrReplaceTempView("vine_reviews_table")

# Show how many rows matched, followed by the first 10 rows.
spark.sql(
    "SELECT COUNT(*) AS row_count FROM vine_reviews_table;"
).show()
vine_reviews_table.show(n=10)


In [None]:
# Keep only the reviews written by non-Vine reviewers (vine == 'N') out of
# usable_vine_table, and expose them as their own temp view.
not_member_query = "SELECT * FROM usable_vine_table WHERE vine =='N';"
not_member_reviews_table = spark.sql(not_member_query)
not_member_reviews_table.createOrReplaceTempView("not_member_reviews_table")

# Show how many rows matched, followed by the first 10 rows.
spark.sql(
    "SELECT COUNT(*) AS row_count FROM not_member_reviews_table;"
).show()
not_member_reviews_table.show(n=10)

In [None]:
# Usable reviews summary table.
#
# One pass over usable_vine_table using conditional counts. The vine = 'Y' and
# vine = 'N' rows counted here are the same rows that vine_reviews_table and
# not_member_reviews_table select from usable_vine_table.
#
# Fixes relative to the previous query:
#   * the select list was missing commas after both percentage columns;
#   * aliases containing '%' must be backquoted to be valid identifiers;
#   * each percentage is now "5-star reviews in the group / all reviews in the
#     group" (previously divided by all usable reviews, and by the Vine 5-star
#     count for the non-Vine column), scaled to 0-100 and rounded to 2 places.
# NULLIF(..., 0) yields NULL instead of dividing by zero when a group is empty.
spark.sql("""
    SELECT COUNT(*) AS usable_reviews,
           COUNT(CASE WHEN star_rating = 5 THEN 1 END) AS total_usable_5_star_reviews,
           COUNT(CASE WHEN vine = 'Y' AND star_rating = 5 THEN 1 END) AS total_usable_vine_5_star,
           ROUND(100 * COUNT(CASE WHEN vine = 'Y' AND star_rating = 5 THEN 1 END)
                     / NULLIF(COUNT(CASE WHEN vine = 'Y' THEN 1 END), 0), 2) AS `vine_%_5_star`,
           COUNT(CASE WHEN vine = 'N' AND star_rating = 5 THEN 1 END) AS total_usable_not_vine_5_star,
           ROUND(100 * COUNT(CASE WHEN vine = 'N' AND star_rating = 5 THEN 1 END)
                     / NULLIF(COUNT(CASE WHEN vine = 'N' THEN 1 END), 0), 2) AS `not_vine_%_5_star`
    FROM usable_vine_table
""").show()

In [None]:
# Total summary table (all reviews in vine_table, no vote threshold).
#
# One pass over vine_table using conditional counts.
#
# Fixes relative to the previous query:
#   * the select list was missing commas after both percentage columns;
#   * aliases containing '%' must be backquoted to be valid identifiers;
#   * "+ '%'" is numeric addition in Spark SQL, not string concatenation, so
#     the percent sign is now appended with CONCAT on the value cast to STRING;
#   * each percentage is now "5-star reviews in the group / all reviews in the
#     group" instead of dividing by every review in the table.
# NULLIF(..., 0) yields NULL instead of dividing by zero when a group is empty.
spark.sql("""
    SELECT COUNT(*) AS total_reviews,
           COUNT(CASE WHEN star_rating = 5 THEN 1 END) AS total_5_star_reviews,
           COUNT(CASE WHEN vine = 'Y' AND star_rating = 5 THEN 1 END) AS total_vine_5_star,
           CONCAT(CAST(ROUND(100 * COUNT(CASE WHEN vine = 'Y' AND star_rating = 5 THEN 1 END)
                                 / NULLIF(COUNT(CASE WHEN vine = 'Y' THEN 1 END), 0), 2) AS STRING),
                  '%') AS `vine_%_5_star`,
           COUNT(CASE WHEN vine = 'N' AND star_rating = 5 THEN 1 END) AS total_not_vine_5_star,
           CONCAT(CAST(ROUND(100 * COUNT(CASE WHEN vine = 'N' AND star_rating = 5 THEN 1 END)
                                 / NULLIF(COUNT(CASE WHEN vine = 'N' THEN 1 END), 0), 2) AS STRING),
                  '%') AS `not_vine_%_5_star`
    FROM vine_table
""").show()


Based on the two summary tables above, I would say there is no bias toward Vine members based solely on the reviews; in fact, I would make the opposite claim. Next, let's check whether there is a bias for Vine members when the products are included as well.

In [11]:
# Load review_id_table from the database and register it as a temp view so it
# can be joined to the vine data in the analysis below.
review_id_table = spark.read.jdbc(
    url=jdbc_url,
    table="review_id_table",
    properties=config,
)
review_id_table.createOrReplaceTempView("review_id_table")
review_id_table.show(n=10)

+--------------+-----------+----------+--------------+-----------+
|     review_id|customer_id|product_id|product_parent|review_date|
+--------------+-----------+----------+--------------+-----------+
|R3BFCCY087FHCZ|   39647090|B005B8DRVU|     322065043| 2014-10-17|
| RGEV9ZHM58A8S|   13165573|B00002SW9Z|     306819164| 2014-10-17|
|R2W7Z9I9ZTOPR1|    6822230|B00NCA8VJC|     417080633| 2014-10-17|
|R2YEKMA2HDOEHN|   23693692|B002BCOC3G|     424917491| 2014-10-17|
|R3IMVI8568X6QJ|   11701832|B00DP6Q1L8|     654246495| 2014-10-17|
|R2DVQS6WXF6RM0|   27604638|B00F4H2N3G|     982274353| 2014-10-17|
|R1FT1DU18JEZ1Z|    2943285|B00BGA9ZZ4|     307251827| 2014-10-17|
|R37CTBJNQ0S1I0|   45171788|B005EZ5GUU|     664089138| 2014-10-17|
|R1AO4UUJ1506UQ|   46709185|B00GANWVJE|     511800795| 2014-10-17|
|R3D813I5LEUVLN|   52443880|B0049LPV7S|     840743636| 2014-10-17|
+--------------+-----------+----------+--------------+-----------+
only showing top 10 rows



In [18]:
# Attach each usable review's product_id by joining on review_id, then preview
# the first 10 joined rows.
usable_with_product_query = """SELECT vine.*, review.product_id
             FROM usable_vine_table AS vine
             JOIN review_id_table AS review ON review.review_id == vine.review_id 
             ;"""
spark.sql(usable_with_product_query).show(n=10)

+--------------+-----------+-------------+-----------+----+-----------------+----------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|product_id|
+--------------+-----------+-------------+-----------+----+-----------------+----------+
| RWL8NELB9YVUK|          1|           12|         21|   N|                N|B00YO2PRPS|
|R35SW52TDDF87S|          1|          158|        299|   N|                N|B00GLZQO30|
|R22X5WVKP9ZS3K|          5|           10|         23|   N|                Y|B00J48MUS4|
| R1ZB12NQC1BQ8|          1|           13|         46|   N|                Y|B00KVR4HEC|
|R26I6J47HE2PEN|          4|          140|        162|   N|                N|B00J48C36S|
|R3OQYM8TIVVAOR|          5|           27|         30|   N|                Y|B00ND0EBP4|
|R3UMIE11UREPV8|          1|           17|         26|   N|                N|B00JDOX2PE|
|R2RQBEFRD5W6HP|          1|           10|         24|   N|                Y|B00CJXYTGM|
|R2MT86YP52BQ4N|     