In [52]:
import pandas as pd
import json
import numpy as np
import re
import csv
from sklearn.feature_extraction.text import CountVectorizer

In [53]:
reviews = open('CellPhoneReview.json', 'r').read().strip().split('\n')
reviews = [json.loads(s) for s in reviews]
review_df = pd.DataFrame.from_dict(reviews)
review_df

Unnamed: 0,asin,overall,reviewText,reviewTime,reviewerID,summary,unixReviewTime
0,120401325X,5.0,These stickers work like the review says they ...,"01 14, 2014",ASY55RVNIL0UD,Really great product.,1389657600
1,120401325X,5.0,These are awesome and make my phone look so st...,"06 26, 2014",A2TMXE2AFO7ONB,LOVE LOVE LOVE,1403740800
2,120401325X,4.0,Item arrived in great time and was in perfect ...,"10 21, 2013",AWJ0WZQYMYFQ4,Cute!,1382313600
3,120401325X,5.0,"awesome! stays on, and looks great. can be use...","02 3, 2013",ATX7CZYFXI1KW,leopard home button sticker for iphone 4s,1359849600
4,120401325X,3.0,These make using the home button easy. My daug...,"10 12, 2013",APX47D16JOP7H,Cute,1381536000
5,120401325X,5.0,Came just as described.. It doesn't come unstu...,"08 22, 2013",A1JVVYYO7G56DS,best thing ever..,1377129600
6,3998899561,1.0,it worked for the first week then it only char...,"11 21, 2013",A6FGO4TBZ3QFZ,not a good Idea,1384992000
7,3998899561,5.0,"Good case, solid build. Protects phone all aro...","09 25, 2013",A2JWEDW5FSVB0F,Solid Case,1380067200
8,3998899561,5.0,This is a fantastic case. Very stylish and pro...,"04 3, 2014",A8AJS1DW7L3JJ,Perfect Case,1396483200
9,3998899561,5.0,this case fits perfectly on the s4 and keeps m...,"04 13, 2014",A2YO4SCWAWNYBI,Just what I needed,1397347200


In [54]:
# split dataset into poor reviews <3 and good reviews >3. reviews of rating 3 are ommitted as they are neutral
good_reviews = review_df [(review_df['overall']>3.0)]
poor_reviews = review_df [(review_df['overall']<3.0)]
good_reviewText = good_reviews['reviewText']
poor_reviewText = poor_reviews['reviewText']
good_summ = good_reviews['summary']
poor_summ = poor_reviews['summary']
poor_summ

6                                           not a good Idea
18                                                 Horrible
25                          This doesn't work with my iPad.
27                                                Two Stars
31        don't waste your money, pay more and buy one a...
34                                       Works for a while.
39                                           not high power
43                            Works Fine, But It Died On ME
45                                               Be careful
62                                        Loved it at first
68                                  Don't waste your money!
69                                           bad experience
86                          Poor Quality Cell Phone Charger
100                                                    FAIL
102                                               Not good.
103                                      LIGHT DUTY FAILURE
105                                     

In [55]:
# cleaning and preprocessing functions
replace_punctuation = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")

def clean_reviews(reviews):
    reviews = [replace_punctuation.sub("", line.lower()) for line in reviews]
    
    return reviews

# cleaning all datasets and saving as csv
good_reviewText = clean_reviews(good_reviewText)
with open('good_reviewText.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL,delimiter='\n')
    wr.writerow(good_reviewText)
    
poor_reviewText = clean_reviews(poor_reviewText)
with open('poor_reviewText.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL,delimiter='\n')
    wr.writerow(poor_reviewText)
    
good_summ = clean_reviews(good_summ)
with open('good_summ.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL,delimiter='\n')
    wr.writerow(good_summ)
    
poor_summ = clean_reviews(poor_summ)
with open('poor_summ.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL,delimiter='\n')
    wr.writerow(poor_summ)  

## Vectorizing

In [56]:
# reading in for vectorizing
gd_rt_df = pd.read_csv('good_reviewText.csv', header=None)
bd_rt_df = pd.read_csv('poor_reviewText.csv', header=None)
gd_summ_df = pd.read_csv('good_summ.csv', header=None)
bd_summ_df = pd.read_csv('poor_summ.csv', header=None)

In [57]:
# size of each list
print('size of gd_rt_df is',len(gd_rt_df.index))
print('size of bd_rt_df is',len(bd_rt_df.index))
print('size of gd_summ_df is',len(gd_summ_df.index))
print('size of bd_summ_df is',len(bd_summ_df.index))

size of gd_rt_df is 145877
size of bd_rt_df is 23967
size of gd_summ_df is 145877
size of bd_summ_df is 23967


In [72]:
with open('good_summ.csv', 'r') as f:
    reader = csv.reader(f)
    good_summary = list(reader)

with open('poor_summ.csv', 'r') as f:
    reader = csv.reader(f)
    poor_summary = list(reader)
    
mid_good = int(len(good_summary)/2)
mid_poor = int(len(poor_summary)/2)
summaries_train_clean = good_summary[:mid_good] + poor_summary[:mid_poor]
summaries_test_clean = good_summary[mid_good+1:] + poor_summary[mid_poor+1:]

In [79]:
print('len of good summ[:mid_good]',len(good_summary[:mid_good]))
print('len of poor summ[:mid_poor]',len(poor_summary[:mid_poor]))
print('len of good summ[mid_good:]',len(good_summary[mid_good+1:]))
print('len of poor summ[mid_poor:]',len(poor_summary[mid_poor+1:]))

print('len of X',len(summaries_train_clean))
print('len of X_test',len(summaries_test_clean))

len of good summ[:mid_good] 72938
len of poor summ[:mid_poor] 11983
len of good summ[mid_good:] 72938
len of poor summ[mid_poor:] 11983
len of X 84921
len of X_test 84921


In [80]:
# count vectorizer
cv = CountVectorizer(binary=True, lowercase=False)
cv.fit(summaries_train_clean)
# X = cv.transform(summaries_train_clean)
# X_test = cv.transform(summaries_test_clean)

# X

TypeError: expected string or bytes-like object