In [1]:
import pandas as pd

In [2]:
# Read in vine_table.csv and change columns names

vine_df = pd.read_csv("Resources/vine_table")
vine_df.rename(columns = {"R35T75OLUGHL5C":"review_id", "4":"star_rating", "0":"helpful_votes", "0.1":"total_votes", "N":"vine", "Y":"verified_purchase"}, inplace = True) 
vine_df.dropna()
vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,R2BV735O46BN33,5.0,0.0,0.0,N,Y
1,R2NBEUGPQQGXP1,4.0,0.0,0.0,N,Y
2,R17LLAOJ8ITK0S,3.0,1.0,1.0,N,Y
3,R39PEQBT5ISEF4,1.0,0.0,0.0,N,Y
4,R3GNM3SU9VHJFT,4.0,1.0,1.0,N,Y


In [4]:
vine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2302400 entries, 0 to 2302399
Data columns (total 6 columns):
 #   Column             Dtype  
---  ------             -----  
 0   review_id          object 
 1   star_rating        float64
 2   helpful_votes      float64
 3   total_votes        float64
 4   vine               object 
 5   verified_purchase  object 
dtypes: float64(3), object(3)
memory usage: 105.4+ MB


## Filter Total Votes and Determine Vine Bias Reviews

In [5]:
# Filter DF to retrieve rows where total_votes >= 20
filtered_vine_df = vine_df[vine_df['total_votes'] >= 20]
filtered_vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
64,R2FP3U4NHNFNL2,5.0,25.0,29.0,N,Y
158,R1UUK1977O38MU,5.0,31.0,31.0,N,Y
602,RXO216LWUDV6O,3.0,29.0,31.0,N,Y
654,R3NMJF7EBMM19V,3.0,26.0,27.0,N,Y
934,R2ZY0ZBDUO0XUY,3.0,20.0,21.0,N,Y


In [16]:
filtered_vine_df.count()

review_id            43574
star_rating          43574
helpful_votes        43574
total_votes          43574
vine                 43574
verified_purchase    43574
dtype: int64

In [6]:
# Create new DF where number of helpful votes/total votes is over 50%

helpful_df = filtered_vine_df.loc[filtered_vine_df["helpful_votes"]/filtered_vine_df["total_votes"]>=0.5]
helpful_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
64,R2FP3U4NHNFNL2,5.0,25.0,29.0,N,Y
158,R1UUK1977O38MU,5.0,31.0,31.0,N,Y
602,RXO216LWUDV6O,3.0,29.0,31.0,N,Y
654,R3NMJF7EBMM19V,3.0,26.0,27.0,N,Y
934,R2ZY0ZBDUO0XUY,3.0,20.0,21.0,N,Y


In [7]:
# Create a new DF from helpful_df where written as part of the vine program
vine_review_df = helpful_df[helpful_df["vine"] == "Y"]
vine_review_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
30394,R3KPC0NBUDASX3,5.0,25.0,25.0,Y,N
206170,R119P2A95GGXX4,5.0,26.0,28.0,Y,N
222171,R1HKIRME8AJ89Z,5.0,79.0,82.0,Y,N
281436,R3FY3GMBGOBR22,5.0,12.0,20.0,Y,N
287861,R15KH3FBSVYGBU,5.0,32.0,37.0,Y,N


In [8]:
nonvine_review_df = helpful_df[helpful_df["vine"] == "N"]
nonvine_review_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
64,R2FP3U4NHNFNL2,5.0,25.0,29.0,N,Y
158,R1UUK1977O38MU,5.0,31.0,31.0,N,Y
602,RXO216LWUDV6O,3.0,29.0,31.0,N,Y
654,R3NMJF7EBMM19V,3.0,26.0,27.0,N,Y
934,R2ZY0ZBDUO0XUY,3.0,20.0,21.0,N,Y


In [14]:
# Get count, number of 5 star reviews, and percentage of 5 star reviews for vine_review_df

vine_review_count = len(vine_review_df)
vine_review_5stars = len(vine_review_df[vine_review_df["star_rating"]==5])
vine_percent = vine_review_5stars/vine_review_count

print(vine_review_count, vine_review_5stars, vine_percent)

107 56 0.5233644859813084


In [15]:
# Get count, number of 5 star reviews, and percentage of 5 star reviews for nonvine_review_df

nonvine_review_count = len(nonvine_review_df)
nonvine_review_5stars = len(nonvine_review_df[nonvine_review_df["star_rating"]==5])
nonvine_percent = nonvine_review_5stars/nonvine_review_count

print(nonvine_review_count, nonvine_review_5stars, nonvine_percent)


39869 21005 0.5268504351751988


## Results

### Vine (paid) reviews totaled 107 with 56 of them receiving 5 star reviews, which is 52% of the total Vine reviews.

### NonVine (unpaid) reviews totaled 39,869 with 21,005 receiving a 5 star review, which is 53% of the total nonvine reviews.