In [1]:
import pandas as pd 
import numpy as np 
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.offline as py 
import plotly.graph_objs as go
import re 
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
%matplotlib inline
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
cf.go_offline()
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 200)


In [2]:
df=pd.read_csv("amazon.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4,No issues.,23-07-2014,138,0,0,0,0,0.0,0.0
1,1,0mie,5,"Purchased this for my device, it worked as advertised. You can never have too much phone memory, since I download a lot of stuff this was a no brainer for me.",25-10-2013,409,0,0,0,0,0.0,0.0
2,2,1K3,4,it works as expected. I should have sprung for the higher capacity. I think its made a bit cheesier than the earlier versions; the paint looks not as clean as before,23-12-2012,715,0,0,0,0,0.0,0.0
3,3,1m2,5,"This think has worked out great.Had a diff. bran 64gb card and if went south after 3 months.This one has held up pretty well since I had my S3, now on my Note3.*** update 3/21/14I've had this for ...",21-11-2013,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5,"Bought it with Retail Packaging, arrived legit, in a orange envelope, english version not asian like the picture shows. arrived quickly, bought a 32 and 16 both retail packaging for my htc one sv ...",13-07-2013,513,0,0,0,0,0.0,0.0


In [3]:
df = df.sort_values("wilson_lower_bound",ascending=False)
df.drop("Unnamed: 0",axis=1,inplace=True)
df.head()

Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
2031,"Hyoun Kim ""Faluzure""",5,"[[ UPDATE - 6/19/2014 ]]So my lovely wife bought me a Samsung Galaxy Tab 4 for Father's Day and I've been loving it ever since. Just as other with Samsung products, the Galaxy Tab 4 has the abili...",05-01-2013,702,1952,68,2020,1884,0.966337,0.957544
3449,NLee the Engineer,5,"I have tested dozens of SDHC and micro-SDHC cards. One disturbing trend I noticed is that: the speed class rating for micro-SDHC is typically inflated. For example, a 'class-10' rating means the c...",26-09-2012,803,1428,77,1505,1351,0.948837,0.936519
4212,SkincareCEO,1,NOTE: please read the last update (scroll to the bottom) - I'm leaving this review as 1 star as it appears to help others who purchased and had a similar experience.I give SanDisk 5 stars for cus...,08-05-2013,579,1568,126,1694,1442,0.92562,0.912139
317,"Amazon Customer ""Kelly""",1,"If your card gets hot enough to be painful, it is defective and you need to contact SanDisk!From my experience, the larger micro SDs run hot on format and writing, but not to the pain threshold!I ...",09-02-2012,1033,422,73,495,349,0.852525,0.818577
4672,Twister,5,"Sandisk announcement of the first 128GB micro SD took internet by storm. Our phones evolved into multimedia powerhouses with 5.5&#34;-6&#34; displays, desktop grade processing power, OTG support f...",03-07-2014,158,45,4,49,41,0.918367,0.808109


In [4]:
def missing_values_analysis(df):
    na_columns= [col for col in df.columns if df[col].isnull().sum()>0]
    n_miss=df[na_columns].isnull().sum().sort_values(ascending = True)
    ratio=(df[na_columns].isnull().sum()/df.shape[0]*100).sort_values(ascending = True)
    missing_df=pd.concat([n_miss,np.round(ratio,2)],axis=1,keys=['Missing Values','Ratio'])
    missing_df=pd.DataFrame(missing_df)
    return missing_df

def check_dataframe(df, head=5 , tail=5):
    print("Shape".center(82,"~"))
    print("Rows: ",df.shape[0])
    print("Columns: ",df.shape[1])
    print("Types".center(82,"~"))
    print(df.dtypes)
    print("".center(82,"~"))
    print(missing_values_analysis(df))
    print("Duplicated Values".center(82,"~"))
    print(df.duplicated().sum())
    print("Quantiles".center(82,"~"))
    print(df.quantile([0,0.05,0.50,0.95,0.99,1]).T)

check_dataframe(df)


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Shape~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Rows:  4915
Columns:  11
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Types~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
reviewerName             object
overall                   int64
reviewText               object
reviewTime               object
day_diff                  int64
helpful_yes               int64
helpful_no                int64
total_vote                int64
score_pos_neg_diff        int64
score_average_rating    float64
wilson_lower_bound      float64
dtype: object
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
              Missing Values  Ratio
reviewerName               1   0.02
reviewText                 1   0.02
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Duplicated Values~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Quantiles~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                       0.00  0.05   0.50        0.95       0.99         1.00
overall

In [5]:
def check_class(dataframe):
    nunique_df=pd.DataFrame({"Variables": dataframe.columns,
                            "classes":[dataframe[i].nunique()  
                                       for i in dataframe.columns]})
    nunique_df=nunique_df.sort_values("classes",ascending=False)
    nunique_df=nunique_df.reset_index(drop=True)
    return nunique_df

check_class(df)

Unnamed: 0,Variables,classes
0,reviewText,4912
1,reviewerName,4594
2,reviewTime,690
3,day_diff,690
4,wilson_lower_bound,40
5,score_average_rating,28
6,score_pos_neg_diff,27
7,total_vote,26
8,helpful_yes,23
9,helpful_no,17


In [6]:
contraints=["#B34D22","#EBE00C","#1FEB0C","0C92EB","#EB0CD5"]

def categorical_variable_summary(df, column_name):
    fig = make_subplots(rows=1,cols=2 , 
                        subplot_titles=("CountPlot" , "Percentage"),
                        specs=[[{'type': 'xy'},{'type' : 'domain'}]])

    fig.add_traces(go.Bar( y=df[column_name].value_counts().values.tolist(),
                        x=[str(i) for i in df[column_name].value_counts().index],
                        text=df[column_name].value_counts().values.tolist(),
                        textfont=dict(size=15,color="white"),
                        name=column_name,
                        textPosition="auto",
                        showlegend=False,
                        marker=dict(color=contraints , line=dict(color="#DBE6EC",width=1))),
                    row=1,col=1)
    
    fig.add_traces(go.Pie(labels=df[column_name].values_counts().keys(),
                        values=df[column_name].value_counts().values,
                        name=column_name,
                        textinfo="label+percent",
                        textfont=dict(size=15,color="white"),
                        textposition="inside",
                        showlegend=False,
                        marker=dict(colors=contraints))
                        ,row=1,col=2)
    fig.update_layout(title={"text":column_name,"y":0.9 , "x":0.5,'xanchor':'center' ,"yanchor":"top"},
                        template="plotly_white")
    
    iplot(fig)
        
    


In [17]:
categorical_variable_summary(df,"overall")

ValueError: 
    Invalid element(s) received for the 'color' property of bar.marker
        Invalid elements include: ['0C92EB']

    The 'color' property is a color and may be specified as:
      - A hex string (e.g. '#ff0000')
      - An rgb/rgba string (e.g. 'rgb(255,0,0)')
      - An hsl/hsla string (e.g. 'hsl(0,100%,50%)')
      - An hsv/hsva string (e.g. 'hsv(0,100%,100%)')
      - A named CSS color:
            aliceblue, antiquewhite, aqua, aquamarine, azure,
            beige, bisque, black, blanchedalmond, blue,
            blueviolet, brown, burlywood, cadetblue,
            chartreuse, chocolate, coral, cornflowerblue,
            cornsilk, crimson, cyan, darkblue, darkcyan,
            darkgoldenrod, darkgray, darkgrey, darkgreen,
            darkkhaki, darkmagenta, darkolivegreen, darkorange,
            darkorchid, darkred, darksalmon, darkseagreen,
            darkslateblue, darkslategray, darkslategrey,
            darkturquoise, darkviolet, deeppink, deepskyblue,
            dimgray, dimgrey, dodgerblue, firebrick,
            floralwhite, forestgreen, fuchsia, gainsboro,
            ghostwhite, gold, goldenrod, gray, grey, green,
            greenyellow, honeydew, hotpink, indianred, indigo,
            ivory, khaki, lavender, lavenderblush, lawngreen,
            lemonchiffon, lightblue, lightcoral, lightcyan,
            lightgoldenrodyellow, lightgray, lightgrey,
            lightgreen, lightpink, lightsalmon, lightseagreen,
            lightskyblue, lightslategray, lightslategrey,
            lightsteelblue, lightyellow, lime, limegreen,
            linen, magenta, maroon, mediumaquamarine,
            mediumblue, mediumorchid, mediumpurple,
            mediumseagreen, mediumslateblue, mediumspringgreen,
            mediumturquoise, mediumvioletred, midnightblue,
            mintcream, mistyrose, moccasin, navajowhite, navy,
            oldlace, olive, olivedrab, orange, orangered,
            orchid, palegoldenrod, palegreen, paleturquoise,
            palevioletred, papayawhip, peachpuff, peru, pink,
            plum, powderblue, purple, red, rosybrown,
            royalblue, rebeccapurple, saddlebrown, salmon,
            sandybrown, seagreen, seashell, sienna, silver,
            skyblue, slateblue, slategray, slategrey, snow,
            springgreen, steelblue, tan, teal, thistle, tomato,
            turquoise, violet, wheat, white, whitesmoke,
            yellow, yellowgreen
      - A number that will be interpreted as a color
        according to bar.marker.colorscale
      - A list or array of any of the above

In [7]:
df.reviewText.head()

2031    [[ UPDATE - 6/19/2014 ]]So my lovely wife bought me a Samsung Galaxy Tab 4 for Father's Day and I've been loving it ever since.  Just as other with Samsung products, the Galaxy Tab 4 has the abili...
3449    I have tested dozens of SDHC and micro-SDHC cards. One disturbing trend I noticed is that: the speed class rating for micro-SDHC is typically inflated. For example, a 'class-10' rating means the c...
4212    NOTE:  please read the last update (scroll to the bottom) - I'm leaving this review as 1 star as it appears to help others who purchased and had a similar experience.I give SanDisk 5 stars for cus...
317     If your card gets hot enough to be painful, it is defective and you need to contact SanDisk!From my experience, the larger micro SDs run hot on format and writing, but not to the pain threshold!I ...
4672    Sandisk announcement of the first 128GB micro SD took internet by storm. Our phones evolved into multimedia powerhouses with 5.5&#34;-6&#34; displays, desktop g

In [8]:
review_exp=df.reviewText[2031]
review_exp

'[[ UPDATE - 6/19/2014 ]]So my lovely wife bought me a Samsung Galaxy Tab 4 for Father\'s Day and I\'ve been loving it ever since.  Just as other with Samsung products, the Galaxy Tab 4 has the ability to add a microSD card to expand the memory on the device.  Since it\'s been over a year, I decided to do some more research to see if SanDisk offered anything new.  As of 6/19/2014, their product lineup for microSD cards from worst to best (performance-wise) are the as follows:SanDiskSanDisk UltraSanDisk Ultra PLUSSanDisk ExtremeSanDisk Extreme PLUSSanDisk Extreme PRONow, the difference between all of these cards are simply the speed in which you can read/write data to the card.  Yes, the published rating of most all these cards (except the SanDisk regular) are Class 10/UHS-I but that\'s just a rating... Actual real world performance does get better with each model, but with faster cards come more expensive prices.  Since Amazon doesn\'t carry the Ultra PLUS model of microSD card, I had 

In [9]:
review_example=re.sub("[^a-zA-Z]"," ",review_exp)
review_example

'   UPDATE               So my lovely wife bought me a Samsung Galaxy Tab   for Father s Day and I ve been loving it ever since   Just as other with Samsung products  the Galaxy Tab   has the ability to add a microSD card to expand the memory on the device   Since it s been over a year  I decided to do some more research to see if SanDisk offered anything new   As of            their product lineup for microSD cards from worst to best  performance wise  are the as follows SanDiskSanDisk UltraSanDisk Ultra PLUSSanDisk ExtremeSanDisk Extreme PLUSSanDisk Extreme PRONow  the difference between all of these cards are simply the speed in which you can read write data to the card   Yes  the published rating of most all these cards  except the SanDisk regular  are Class    UHS I but that s just a rating    Actual real world performance does get better with each model  but with faster cards come more expensive prices   Since Amazon doesn t carry the Ultra PLUS model of microSD card  I had to do

In [10]:
review_example=review_example.lower().split()
review_example

['update',
 'so',
 'my',
 'lovely',
 'wife',
 'bought',
 'me',
 'a',
 'samsung',
 'galaxy',
 'tab',
 'for',
 'father',
 's',
 'day',
 'and',
 'i',
 've',
 'been',
 'loving',
 'it',
 'ever',
 'since',
 'just',
 'as',
 'other',
 'with',
 'samsung',
 'products',
 'the',
 'galaxy',
 'tab',
 'has',
 'the',
 'ability',
 'to',
 'add',
 'a',
 'microsd',
 'card',
 'to',
 'expand',
 'the',
 'memory',
 'on',
 'the',
 'device',
 'since',
 'it',
 's',
 'been',
 'over',
 'a',
 'year',
 'i',
 'decided',
 'to',
 'do',
 'some',
 'more',
 'research',
 'to',
 'see',
 'if',
 'sandisk',
 'offered',
 'anything',
 'new',
 'as',
 'of',
 'their',
 'product',
 'lineup',
 'for',
 'microsd',
 'cards',
 'from',
 'worst',
 'to',
 'best',
 'performance',
 'wise',
 'are',
 'the',
 'as',
 'follows',
 'sandisksandisk',
 'ultrasandisk',
 'ultra',
 'plussandisk',
 'extremesandisk',
 'extreme',
 'plussandisk',
 'extreme',
 'pronow',
 'the',
 'difference',
 'between',
 'all',
 'of',
 'these',
 'cards',
 'are',
 'simply',
 

In [11]:
rt = lambda x: re.sub("[^a-zA-Z]"," ",str(x))
df["reviewText"]=df["reviewText"].map(rt)
df["reviewText"]=df["reviewText"].str.lower()
df["reviewText"].head()

2031       update               so my lovely wife bought me a samsung galaxy tab   for father s day and i ve been loving it ever since   just as other with samsung products  the galaxy tab   has the abili...
3449    i have tested dozens of sdhc and micro sdhc cards  one disturbing trend i noticed is that  the speed class rating for micro sdhc is typically inflated  for example  a  class     rating means the c...
4212    note   please read the last update  scroll to the bottom    i m leaving this review as   star as it appears to help others who purchased and had a similar experience i give sandisk   stars for cus...
317     if your card gets hot enough to be painful  it is defective and you need to contact sandisk from my experience  the larger micro sds run hot on format and writing  but not to the pain threshold i ...
4672    sandisk announcement of the first    gb micro sd took internet by storm  our phones evolved into multimedia powerhouses with                 displays  desktop g

In [15]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
df[['polarity' , 'subjectivity']]=df['reviewText'].apply(lambda Text:pd.Series(TextBlob(Text).sentiment))

for index , row in df['reviewText'].iteritems():
    score = SentimentIntensityAnalyzer().polarity_scores(row)
    neg = score['neg']
    neu = score['neu']
    pos = score['pos']
    if neg>pos:
        df.loc[index,'sentiment']='negative'
    elif pos>neg:
        df.loc[index,'sentiment']='postive'
    else:
        df.loc[index,'sentiment']='neutral'