In [1]:
import pandas as pd
import plotly.graph_objects as go

In [2]:
# Load the review records stored in vine_table.csv into a pandas DataFrame.
vine_df = pd.read_csv(filepath_or_buffer="vine_table.csv")

In [3]:
# Display the freshly loaded dataframe to inspect its columns and a sample of rows.
vine_df

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,R3VR960AHLFKDV,4,0,0,N,Y
1,R16LGVMFKIUT0G,5,0,0,N,Y
2,R1AIMEEPYHMOE4,5,1,1,N,Y
3,R1892CCSZWZ9SR,3,0,0,N,Y
4,R285P679YWVKD1,3,0,0,N,N
...,...,...,...,...,...,...
792108,R1F7BNIK2R72ZC,5,21,21,N,N
792109,RVR3U3QA0D95B,5,10,11,N,N
792110,R1X5JB9UJRZW31,5,3,5,N,N
792111,R235PHRTF7BFK7,4,5,5,N,N


In [4]:
# 1. Keep only the reviews whose total_votes is greater than or equal to 20.
high_votes_df = vine_df.loc[vine_df['total_votes'].ge(20)]

In [5]:
# Display high_votes_df to inspect the rows kept by the total_votes filter.
high_votes_df

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
73,RL8D0KJ0J9L0O,5,152,165,N,Y
157,R1BEINAIQFBRJC,5,21,23,N,Y
190,R2L59KIJH302P9,4,26,26,N,Y
221,RR99CPG695T0I,5,215,248,N,N
237,R1XQNKKUPCMWVO,5,43,44,N,Y
...,...,...,...,...,...,...
792079,R1HWY1Z29N8XUU,4,29,30,N,N
792086,R1OBNEXHLWO188,5,59,59,N,Y
792091,R3A76JWPT014A,5,31,35,N,N
792101,R15YY6Z5KN7F9,5,44,45,N,N


In [6]:
# 2. From high_votes_df, keep the reviews whose helpful_votes / total_votes
#    ratio is 0.5 or higher.
helpful_votes_df = high_votes_df.loc[
    lambda reviews: reviews['helpful_votes'].div(reviews['total_votes']).ge(0.5)
]

In [7]:
# Display helpful_votes_df to inspect the rows kept by the helpful-ratio filter.
helpful_votes_df

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
73,RL8D0KJ0J9L0O,5,152,165,N,Y
157,R1BEINAIQFBRJC,5,21,23,N,Y
190,R2L59KIJH302P9,4,26,26,N,Y
221,RR99CPG695T0I,5,215,248,N,N
237,R1XQNKKUPCMWVO,5,43,44,N,Y
...,...,...,...,...,...,...
792079,R1HWY1Z29N8XUU,4,29,30,N,N
792086,R1OBNEXHLWO188,5,59,59,N,Y
792091,R3A76JWPT014A,5,31,35,N,N
792101,R15YY6Z5KN7F9,5,44,45,N,N


In [8]:
# 3. From helpful_votes_df, keep the rows whose 'vine' column is 'Y' (vine reviews).
vine_reviews_df = helpful_votes_df.loc[helpful_votes_df['vine'].eq('Y')]

In [9]:
# Display vine_reviews_df to inspect the rows where vine == 'Y'.
vine_reviews_df

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
5408,R2BQOD1R0228FN,3,17,26,Y,N
8246,RC31RUPFOHBHQ,4,102,117,Y,N
12219,REN3N1WITLF1Y,5,33,37,Y,N
13330,R71RZQ9UZVG47,4,38,47,Y,N
16942,R38NMQBH88HLM6,4,18,24,Y,N
...,...,...,...,...,...,...
763326,R2QLL1VI9MRY2W,4,768,783,Y,N
763464,R15S9ONTV908AJ,4,21,24,Y,N
763976,R3E85BMKO2AWUP,4,155,177,Y,N
764209,RGNC5HFA9IZ5V,5,59,63,Y,N


In [10]:
# 4. From helpful_votes_df, keep the rows whose 'vine' column is 'N' (non-vine reviews).
non_vine_reviews_df = helpful_votes_df.loc[helpful_votes_df['vine'].eq('N')]

In [11]:
# Display non_vine_reviews_df to inspect the rows where vine == 'N'.
non_vine_reviews_df

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
73,RL8D0KJ0J9L0O,5,152,165,N,Y
157,R1BEINAIQFBRJC,5,21,23,N,Y
190,R2L59KIJH302P9,4,26,26,N,Y
221,RR99CPG695T0I,5,215,248,N,N
237,R1XQNKKUPCMWVO,5,43,44,N,Y
...,...,...,...,...,...,...
792079,R1HWY1Z29N8XUU,4,29,30,N,N
792086,R1OBNEXHLWO188,5,59,59,N,Y
792091,R3A76JWPT014A,5,31,35,N,N
792101,R15YY6Z5KN7F9,5,44,45,N,N


In [12]:
# 5-a. Total number of vine reviews: the row count of vine_reviews_df.
total_vine_reviews = vine_reviews_df.shape[0]
total_vine_reviews

136

In [13]:
# 5-b. Total number of non-vine reviews: the row count of non_vine_reviews_df.
total_non_vine_reviews = non_vine_reviews_df.shape[0]
total_non_vine_reviews

18019

In [14]:
# 5-c. Number of vine reviews whose star_rating equals 5.
# int(...) keeps the result a plain Python int, matching what len() produced.
five_star_vine = int(vine_reviews_df['star_rating'].eq(5).sum())
five_star_vine

74

In [15]:
# 5-d. Number of non-vine reviews whose star_rating equals 5.
# int(...) keeps the result a plain Python int, matching what len() produced.
five_star_non_vine = int(non_vine_reviews_df['star_rating'].eq(5).sum())
five_star_non_vine

8482

In [16]:
# 5-e. Share of vine reviews that are 5-star, expressed as a percentage.
# The multiplication by 100 happens before the division, as in the original formula.
five_star_percentage_vine = (100 * five_star_vine) / total_vine_reviews
five_star_percentage_vine

54.411764705882355

In [17]:
# 5-f. Share of non-vine reviews that are 5-star, expressed as a percentage.
# The multiplication by 100 happens before the division, as in the original formula.
five_star_percentage_non_vine = (100 * five_star_non_vine) / total_non_vine_reviews
five_star_percentage_non_vine

47.072534546867196

In [18]:
# Plot the star-rating distributions of vine and non-vine reviews in one figure.
# histnorm='percent' is applied to each trace separately, so the two groups can
# be compared even though their review counts differ greatly.
fig = go.Figure(
    data=[
        go.Histogram(x=reviews['star_rating'], histnorm='percent', name=label)
        for label, reviews in (
            ('vine reviews', vine_reviews_df),
            ('non-vine reviews', non_vine_reviews_df),
        )
    ]
)
fig.update_layout(
    title=dict(text='Histograms of star ratings for Vine reviews and non-Vine reviews'),
    xaxis=dict(title=dict(text='Star-Rating')),
    yaxis=dict(title=dict(text='Percent of Total Ratings')),
    bargap=0.2,
    bargroupgap=0.1,
)

fig.show()