Imports

In [1]:
import pandas as pd
import math
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler

# Notebook-wide display configuration.
# Wide frames: keep each row on a single line, up to 500 characters across.
pd.set_option("display.width", 500)
pd.set_option("display.expand_frame_repr", False)
# Never truncate the rows or columns of a displayed frame.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# Render floats with five decimal places.
pd.set_option("display.float_format", lambda value: '%.5f' % value)

Reading the File

In [5]:
# Load the review export from the working directory and preview the first five rows.
df = pd.read_csv("amazon_review.csv")
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,day_diff,helpful_yes,total_vote
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(4915, 12)

In [7]:
# Plain (unweighted) mean of the "overall" rating column, used as a baseline.
df["overall"].mean()

4.587589013224822

Converting to datetime

In [8]:

# Parse the review dates so that date arithmetic is possible.
df["reviewTime"] = pd.to_datetime(df["reviewTime"])
# Use the newest review date in the data as the reference point for review age.
current_date = df["reviewTime"].max()


In [9]:
# Age of each review in whole days, measured back from the newest review date.
df["current_diff"] = (current_date - df["reviewTime"]).dt.days

In [10]:
# 0.10 / 0.25 / 0.50 quantiles of review age; the recorded values (166, 280, 430)
# are the recency band edges used in the cells that follow.
df["current_diff"].quantile([0.1,0.25,0.50])

0.10000   166.00000
0.25000   280.00000
0.50000   430.00000
Name: current_diff, dtype: float64

In [11]:
# Mean rating of reviews at most 166 days older than the newest review (0.10 quantile).
df.loc[df["current_diff"].le(166), "overall"].mean()


4.68986083499006

In [13]:
# Mean rating of reviews more than 166 and at most 280 days old (0.10-0.25 quantile band).
df.loc[df["current_diff"].gt(166) & df["current_diff"].le(280), "overall"].mean()


4.699863574351978

In [12]:
# Mean rating of reviews more than 280 and at most 430 days old (0.25-0.50 quantile band).
df.loc[df["current_diff"].gt(280) & df["current_diff"].le(430), "overall"].mean()


4.636140637775961

In [14]:
# Mean rating of reviews more than 430 days old (older than the median review age).
df.loc[df["current_diff"].gt(430), "overall"].mean()


4.508957654723127

In [15]:
def time_based_weighted_average(dataframe, w1=30, w2=28, w3=24, w4=18, thresholds=(166, 280, 430)):
    """
    Time-Based Weighted Average Rating

    Splits the reviews into four recency bands by their "current_diff" value,
    takes the mean "overall" rating of each band and combines the four means
    using percentage weights (newest band first).

    Parameters
    ----------
    dataframe: pandas.DataFrame
        must contain the columns "current_diff" and "overall"
    w1: int or float
        weight (in percent) of the band with current_diff <= thresholds[0]
    w2: int or float
        weight (in percent) of the band thresholds[0] < current_diff <= thresholds[1]
    w3: int or float
        weight (in percent) of the band thresholds[1] < current_diff <= thresholds[2]
    w4: int or float
        weight (in percent) of the band with current_diff > thresholds[2]
    thresholds: sequence of three numbers
        upper edges of the first three bands, in ascending order; the default
        values are the band edges originally hard-coded in this function

    Returns
    -------
    weighted average: float
        NaN when any band contains no rows, because the mean of an empty
        selection is NaN

    """
    first_edge, second_edge, third_edge = thresholds
    # Every mask is built from the frame that was passed in, never from a
    # notebook-level variable, so the function works on any frame or subset.
    age = dataframe["current_diff"]
    return dataframe.loc[age <= first_edge, "overall"].mean() * w1 / 100 + \
           dataframe.loc[(age > first_edge) & (age <= second_edge), "overall"].mean() * w2 / 100 + \
           dataframe.loc[(age > second_edge) & (age <= third_edge), "overall"].mean() * w3 / 100 + \
           dataframe.loc[age > third_edge, "overall"].mean() * w4 / 100


In [16]:
# Time-weighted rating with the default 30/28/24/18 weights (newest band first);
# compare the result with the plain mean of "overall" computed earlier.
time_based_weighted_average(df)


4.647206182231965

In [18]:
df["helpful_no"] = (df.total_vote - df.helpful_yes)  # number of votes that found the review unhelpful


In [19]:

def score_pos_neg_diff(up, down):
    """
    Up-Down Difference Score

    Parameters
    ----------
    up: int
        up count
    down: int
        down count

    Returns
    -------
    difference: int
        up count minus down count

    """
    net_votes = up - down
    return net_votes


In [20]:

def score_average_rating(helpful_yes, helpful_no):
    """
    Average Rating Score

    Parameters
    ----------
    helpful_yes: int
        helpful vote count
    helpful_no: int
        unhelpful vote count

    Returns
    -------
    ratio: float
        share of helpful votes among all votes; 0 when there are no votes

    """
    total_votes = helpful_yes + helpful_no
    # Guard the division: an item without any votes scores 0.
    return helpful_yes / total_votes if total_votes != 0 else 0

In [21]:
def wilson_lower_bound(up, down, confidence=0.95):
    """
    Lower bound of the Wilson score interval for the share of up votes.

    Parameters
    ----------
    up: int
        number of up votes
    down: int
        number of down votes
    confidence: float
        two-sided confidence level of the interval

    Returns
    -------
    wilson score: float
        lower bound of the interval; 0 when there are no votes at all

    """
    total = up + down
    if total == 0:
        return 0
    # Two-sided critical value of the standard normal distribution.
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    z_sq = z * z
    phat = 1.0 * up / total
    centre = phat + z_sq / (2 * total)
    margin = z * math.sqrt((phat * (1 - phat) + z_sq / (4 * total)) / total)
    return (centre - margin) / (1 + z_sq / total)

In [23]:
# Score every review from its helpful / unhelpful vote counts, one row at a time.
df["score_pos_neg_diff"] = df.apply(
    lambda row: score_pos_neg_diff(row["helpful_yes"], row["helpful_no"]),
    axis=1,
)

df["score_average_rating"] = df.apply(
    lambda row: score_average_rating(row["helpful_yes"], row["helpful_no"]),
    axis=1,
)

df["wilson_lower_bound"] = df.apply(
    lambda row: wilson_lower_bound(row["helpful_yes"], row["helpful_no"]),
    axis=1,
)

In [24]:
# Twenty largest up-minus-down differences, largest first.
df["score_pos_neg_diff"].sort_values(ascending=False).head(20)


2031    1884
4212    1442
3449    1351
317      349
3981      85
4596      55
1835      52
4672      41
4306      37
315       28
3807      19
4302      12
93         9
1609       7
1465       7
4072       6
2268       6
121        5
323        5
2695       5
Name: score_pos_neg_diff, dtype: int64

In [25]:
# Twenty largest helpful-vote ratios, largest first.
# NOTE(review): in the recorded output all twenty values are 1.0, so this score
# alone does not separate the top reviews.
df["score_average_rating"].sort_values(ascending=False).head(20)


4277   1.00000
2881   1.00000
1073   1.00000
445    1.00000
3923   1.00000
435    1.00000
2901   1.00000
2204   1.00000
2206   1.00000
3408   1.00000
4538   1.00000
1097   1.00000
2885   1.00000
3418   1.00000
4554   1.00000
1609   1.00000
402    1.00000
2226   1.00000
3901   1.00000
3897   1.00000
Name: score_average_rating, dtype: float64

In [26]:
# Full rows of the twenty reviews with the highest Wilson lower bound, highest first.
df.sort_values("wilson_lower_bound", ascending=False).head(20)


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,day_diff,helpful_yes,total_vote,current_diff,helpful_no,score_pos_neg_diff,score_average_rating,wilson_lower_bound
2031,A12B7ZMXFI6IXY,B007WTAJTO,"Hyoun Kim ""Faluzure""","[1952, 2020]",[[ UPDATE - 6/19/2014 ]]So my lovely wife boug...,5.0,UPDATED - Great w/ Galaxy S4 & Galaxy Tab 4 10...,1367366400,2013-01-05,702,1952,2020,701,68,1884,0.96634,0.95754
3449,AOEAD7DPLZE53,B007WTAJTO,NLee the Engineer,"[1428, 1505]",I have tested dozens of SDHC and micro-SDHC ca...,5.0,Top of the class among all (budget-priced) mic...,1348617600,2012-09-26,803,1428,1505,802,77,1351,0.94884,0.93652
4212,AVBMZZAFEKO58,B007WTAJTO,SkincareCEO,"[1568, 1694]",NOTE: please read the last update (scroll to ...,1.0,1 Star reviews - Micro SDXC card unmounts itse...,1375660800,2013-05-08,579,1568,1694,578,126,1442,0.92562,0.91214
317,A1ZQAQFYSXL5MQ,B007WTAJTO,"Amazon Customer ""Kelly""","[422, 495]","If your card gets hot enough to be painful, it...",1.0,"Warning, read this!",1346544000,2012-02-09,1033,422,495,1032,73,349,0.85253,0.81858
4672,A2DKQQIZ793AV5,B007WTAJTO,Twister,"[45, 49]",Sandisk announcement of the first 128GB micro ...,5.0,Super high capacity!!! Excellent price (on Am...,1394150400,2014-07-03,158,45,49,157,4,41,0.91837,0.80811
1835,A1J6VSUM80UAF8,B007WTAJTO,goconfigure,"[60, 68]",Bought from BestBuy online the day it was anno...,5.0,I own it,1393545600,2014-02-28,283,60,68,282,8,52,0.88235,0.78465
3981,A1K91XXQ6ZEBQR,B007WTAJTO,"R. Sutton, Jr. ""RWSynergy""","[112, 139]",The last few days I have been diligently shopp...,5.0,"Resolving confusion between ""Mobile Ultra"" and...",1350864000,2012-10-22,777,112,139,776,27,85,0.80576,0.73214
3807,AFGRMORWY2QNX,B007WTAJTO,R. Heisler,"[22, 25]",I bought this card to replace a lost 16 gig in...,3.0,"Good buy for the money but wait, I had an issue!",1361923200,2013-02-27,649,22,25,648,3,19,0.88,0.70044
4306,AOHXKM5URSKAB,B007WTAJTO,Stellar Eller,"[51, 65]","While I got this card as a ""deal of the day"" o...",5.0,Awesome Card!,1339200000,2012-09-06,823,51,65,822,14,37,0.78462,0.67033
4596,A1WTQUOQ4WG9AI,B007WTAJTO,"Tom Henriksen ""Doggy Diner""","[82, 109]",Hi:I ordered two card and they arrived the nex...,1.0,Designed incompatibility/Don't support SanDisk,1348272000,2012-09-22,807,82,109,806,27,55,0.75229,0.66359
