In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split

In [2]:
# Load data
data = pd.read_json("dataset_1M.json", lines=True)

In [3]:
data.count()

overall           1000000
verified          1000000
reviewTime        1000000
reviewerID        1000000
asin              1000000
style              358331
reviewerName       999917
reviewText         999636
summary            999862
unixReviewTime    1000000
vote               121947
image               16345
dtype: int64

In [4]:
# Remove all non-verified buyers
data = data[data.verified]
data.count()

overall           947137
verified          947137
reviewTime        947137
reviewerID        947137
asin              947137
style             337257
reviewerName      947064
reviewText        946798
summary           947005
unixReviewTime    947137
vote              111510
image              14906
dtype: int64

In [5]:
# Remove all reviews without text
data = data[data.reviewText.notna()]
data.count()

overall           946798
verified          946798
reviewTime        946798
reviewerID        946798
asin              946798
style             337128
reviewerName      946725
reviewText        946798
summary           946674
unixReviewTime    946798
vote              111487
image              14832
dtype: int64

In [6]:
# Leave only "overall", "reviewText" and "summary"
data = data[["overall", "reviewText", "summary"]]
data.head()

Unnamed: 0,overall,reviewText,summary
1,1,It sucks barely picks up anything definitely n...,sucks
2,1,"Well to write a short one, it blew 2 fuses of ...",Defective
3,3,I have absolutely no memory of buying this but...,Looks cool! Probably works
4,5,it ok it does it job,Five Stars
5,5,Have 3 big dogs. this have been great for my F...,this have been great for my Ford transit connect


In [7]:
# Fill empty summaries
data = data.fillna("")
data.count()

overall       946798
reviewText    946798
summary       946798
dtype: int64

In [8]:
# Join "summary" and "reviewText" into one feature "text"
data["space"] = " "
data["text"] = data.summary + data.space + data.reviewText
data = data[["overall", "text"]]
data.head()

Unnamed: 0,overall,text
1,1,sucks It sucks barely picks up anything defini...
2,1,"Defective Well to write a short one, it blew 2..."
3,3,Looks cool! Probably works I have absolutely n...
4,5,Five Stars it ok it does it job
5,5,this have been great for my Ford transit conne...


In [None]:
# Remove HTML from text
CLEANR_HTML = re.compile('<.*?>')

def clean_html(text, axis=1):
    cleantext = [re.sub(CLEANR, '', line) for line in text]
    return cleantext

data.text_cleaned = data.text.apply(clean_html)
data.head()

In [19]:
# Remove special characters from text
CLEANR_SPECIAL = re.compile("\W")

def clean_special(text):
    cleantext = [re.sub(CLEANR_SPECIAL, '', line) for line in text]
    return cleantext

data.text_cleaned = data.text.apply(clean_special)

In [28]:
data.head()

Unnamed: 0,overall,text
1,1,"[s, u, c, k, s, , I, t, , s, u, c, k, s, , ..."
2,1,"[D, e, f, e, c, t, i, v, e, , W, e, l, l, , ..."
3,3,"[L, o, o, k, s, , c, o, o, l, !, , P, r, o, ..."
4,5,"[F, i, v, e, , S, t, a, r, s, , i, t, , o, ..."
5,5,"[t, h, i, s, , h, a, v, e, , b, e, e, n, , ..."
