In [1]:
# import dependencies
import pandas as pd

In [2]:
# Load the Vine review table from CSV and preview its first ten rows
vine_csv_path = "resources/vine_table.csv"
vine_df = pd.read_csv(vine_csv_path)
vine_df.head(n=10)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,R9CO86UUJCAW5,3.0,0.0,0.0,N,Y
1,R3PR8X6QGVJ8B1,5.0,0.0,0.0,N,Y
2,R39BO2819ABUPF,4.0,0.0,0.0,N,Y
3,R3ADL7V6EGGEEP,4.0,0.0,0.0,N,Y
4,R1OXYPBPLVRMI5,5.0,0.0,0.0,N,Y
5,R1WYM8Z5ATQ98O,3.0,0.0,0.0,N,Y
6,R3LCIANTN1H9EC,4.0,1.0,1.0,N,Y
7,R3U2M23N1P0KQ6,5.0,0.0,0.0,N,Y
8,R29MB6N7HB6NZI,1.0,2.0,2.0,N,Y
9,RGEQ6DGRG7DQG,5.0,0.0,0.0,N,Y


In [3]:
# Keep reviews with at least 20 total votes, order them from most to fewest
# votes, and drop every row that contains a missing value.
# Note: the result is assigned back to vine_df, replacing the loaded frame.
vine_df = (
    vine_df.loc[vine_df["total_votes"].ge(20)]
    .sort_values("total_votes", ascending=False)
    .dropna()
)
vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
187493,R2F0QWDSFE4FWH,3.0,1992.0,2030.0,N,N
332926,R1388A5JLKT2BY,5.0,1027.0,1059.0,N,Y
105025,R17EFWJBACTRH,5.0,1010.0,1014.0,N,Y
196161,R2ZS9AUN31WJLR,5.0,926.0,948.0,N,N
301947,R1AGAG6A6WKO8L,5.0,922.0,937.0,N,Y


In [4]:
# Keep only reviews where helpful_votes / total_votes is at least 50%.
# Note: the result is assigned back to vine_df.
helpful_ratio = vine_df["helpful_votes"].div(vine_df["total_votes"])
vine_df = vine_df.loc[helpful_ratio.ge(0.5)]
vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
187493,R2F0QWDSFE4FWH,3.0,1992.0,2030.0,N,N
332926,R1388A5JLKT2BY,5.0,1027.0,1059.0,N,Y
105025,R17EFWJBACTRH,5.0,1010.0,1014.0,N,Y
196161,R2ZS9AUN31WJLR,5.0,926.0,948.0,N,N
301947,R1AGAG6A6WKO8L,5.0,922.0,937.0,N,Y


In [5]:
# Select the reviews written through the Vine program (vine == "Y");
# this notebook calls them "paid" reviews
is_vine_review = vine_df["vine"].eq("Y")
paid_review_df = vine_df.loc[is_vine_review]
paid_review_df.head(n=10)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
44926,RD006IWFCUA5A,5.0,122.0,133.0,Y,N
69653,R3HJAGVD491IFV,5.0,105.0,105.0,Y,N
40555,R2T83WIPG3IX5D,4.0,72.0,76.0,Y,N
69191,R140DK42LYQWL,4.0,62.0,65.0,Y,N
65596,R2336R80ZEOKTX,5.0,63.0,63.0,Y,N
21361,R3SB08XK0M7993,4.0,44.0,49.0,Y,N
66906,R2QSLH5GY4OMNH,5.0,44.0,45.0,Y,N
67955,R2A0F7MYQGBPEG,4.0,33.0,40.0,Y,N
191376,R1MZ7RY5264PE4,5.0,38.0,38.0,Y,N
38418,R17HKKDE9ZRVOO,4.0,31.0,35.0,Y,N


In [6]:
# Count the paid reviews (number of non-null review_id values)
total_reviews = paid_review_df["review_id"].count()
print("There are", total_reviews, "paid reviews.")

There are 21 paid reviews.


In [7]:
# Count the paid reviews that have a 5-star rating.
# The 5-star rows are selected with a boolean mask rather than by looking up
# the 5.0 label in a groupby result, so the count is 0 (instead of a KeyError)
# when no paid review has 5 stars.
is_five_star = paid_review_df["star_rating"] == 5.0
star_number = paid_review_df.loc[is_five_star, "review_id"].count()
print("Out of the paid reviews", star_number, "gave 5-stars")

Out of the paid reviews 10 gave 5-stars


In [8]:
# Share of paid reviews with a 5-star rating, as a percentage rounded to
# two decimal places
star_fraction = star_number / total_reviews
star_percent = (star_fraction * 100).round(2)
print(star_percent,"% of the reviews had a 5-star rating")

47.62 % of the reviews had a 5-star rating


In [9]:
# Select the reviews that are not part of the Vine program (vine == "N");
# this notebook calls them "unpaid" reviews
is_regular_review = vine_df["vine"].eq("N")
unpaid_review_df = vine_df.loc[is_regular_review]
unpaid_review_df.head(n=10)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
187493,R2F0QWDSFE4FWH,3.0,1992.0,2030.0,N,N
332926,R1388A5JLKT2BY,5.0,1027.0,1059.0,N,Y
105025,R17EFWJBACTRH,5.0,1010.0,1014.0,N,Y
196161,R2ZS9AUN31WJLR,5.0,926.0,948.0,N,N
301947,R1AGAG6A6WKO8L,5.0,922.0,937.0,N,Y
299545,R1CHOWQSXF4OT2,4.0,895.0,921.0,N,Y
208838,R24E1CH0MH069W,2.0,726.0,767.0,N,Y
341090,R3L7YM0A9KI364,5.0,749.0,761.0,N,N
335410,RIUE7PEQNJ995,5.0,744.0,751.0,N,Y
340202,R1XJT1MPQH1V6J,1.0,711.0,728.0,N,Y


In [11]:
# Determine the total number of unpaid (non-Vine) reviews, counted as the
# number of non-null review_id values.
# unpaid_review_df holds the vine == "N" rows, so the message says "unpaid".
total_reg_reviews = unpaid_review_df["review_id"].count()
print("There are", total_reg_reviews, "unpaid reviews.")

There are 6690 paid reviews.


In [12]:
# Count the unpaid (non-Vine) reviews that have a 5-star rating.
# The 5-star rows are selected with a boolean mask rather than by looking up
# the 5.0 label in a groupby result, so the count is 0 (instead of a KeyError)
# when no unpaid review has 5 stars.
is_reg_five_star = unpaid_review_df["star_rating"] == 5.0
reg_star_number = unpaid_review_df.loc[is_reg_five_star, "review_id"].count()
# unpaid_review_df holds the vine == "N" rows, so the message says "unpaid".
print("Out of the unpaid reviews", reg_star_number, "gave 5-stars")

Out of the paid reviews 3448 gave 5-stars


In [13]:
# Share of unpaid reviews with a 5-star rating, as a percentage rounded to
# two decimal places
reg_star_fraction = reg_star_number / total_reg_reviews
reg_star_percentage = (reg_star_fraction * 100).round(2)
print(reg_star_percentage,"% of the reviews had a 5-star rating")

51.54 % of the reviews had a 5-star rating


In [14]:
# Write the unpaid and paid review frames to CSV files, without the row index
csv_exports = (
    (unpaid_review_df, "resources/unpaid_reviews.csv"),
    (paid_review_df, "resources/paid_reviews.csv"),
)
for review_frame, csv_path in csv_exports:
    review_frame.to_csv(csv_path, index=False)