In [132]:
import ast
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# spacy
import spacy

# Gensim
import gensim

In [2]:
# Read the line-delimited JSON review dump into a DataFrame
data = pd.read_json(path_or_buf="dataset_1M.json", lines=True)

In [3]:
# Non-null value count for each column of the loaded dataset
data.count()

overall           1000000
verified          1000000
reviewTime        1000000
reviewerID        1000000
asin              1000000
style              358331
reviewerName       999917
reviewText         999636
summary            999862
unixReviewTime    1000000
vote               121947
image               16345
dtype: int64

In [4]:
# Keep only the reviews flagged as coming from verified buyers
data = data.loc[data["verified"]]
data.count()

overall           947137
verified          947137
reviewTime        947137
reviewerID        947137
asin              947137
style             337257
reviewerName      947064
reviewText        946798
summary           947005
unixReviewTime    947137
vote              111510
image              14906
dtype: int64

In [5]:
# Drop every review whose "reviewText" is missing
data = data.dropna(subset=["reviewText"])
data.count()

overall           946798
verified          946798
reviewTime        946798
reviewerID        946798
asin              946798
style             337128
reviewerName      946725
reviewText        946798
summary           946674
unixReviewTime    946798
vote              111487
image              14832
dtype: int64

In [6]:
# Restrict the frame to the rating and the two text columns
data = data.loc[:, ["overall", "reviewText", "summary"]]
data.head()

Unnamed: 0,overall,reviewText,summary
1,1,It sucks barely picks up anything definitely n...,sucks
2,1,"Well to write a short one, it blew 2 fuses of ...",Defective
3,3,I have absolutely no memory of buying this but...,Looks cool! Probably works
4,5,it ok it does it job,Five Stars
5,5,Have 3 big dogs. this have been great for my F...,this have been great for my Ford transit connect


In [7]:
# Replace missing values (the absent summaries) with empty strings
data = data.fillna(value="")
data.count()

overall       946798
reviewText    946798
summary       946798
dtype: int64

In [8]:
# Build one "text" feature as "<summary> <reviewText>" and keep it next to the rating
data = data.assign(text=data["summary"] + " " + data["reviewText"])
data = data[["overall", "text"]]
data.head()

Unnamed: 0,overall,text
1,1,sucks It sucks barely picks up anything defini...
2,1,"Defective Well to write a short one, it blew 2..."
3,3,Looks cool! Probably works I have absolutely n...
4,5,Five Stars it ok it does it job
5,5,this have been great for my Ford transit conne...


In [121]:
# Remove noise
# Patterns are raw strings: sequences such as "\/" or "\w" are invalid string
# escapes in a normal literal and trigger warnings on recent Python versions.
data["text_cleaned"] = (
    data["text"]
    # HTML tags and non-breaking-space entities
    .replace(to_replace=r"(<.*?>|&nbsp;)", value="", regex=True)
    # punctuation and symbols
    .replace(to_replace=r"[.,\/#!$%\^&\*;:{}=\-_`~()]", value="", regex=True)
    # runs of digits
    .replace(to_replace=r"[0-9]+", value="", regex=True)
    # single-character words followed by a space or the end of the text
    .replace(to_replace=r"(?<![\w])(?:[a-zA-Z0-9](?: |$))", value="", regex=True)
)

In [108]:
def lemmatization(texts):
    """Lemmatize each text and return the lemmas joined by single spaces.

    Args:
        texts: Iterable of strings to lemmatize.

    Returns:
        A list with one string per input text, in input order; each string
        is the lemmas of that text's tokens separated by spaces.
    """
    # Parser and NER are disabled because only the lemmas are used here.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    # nlp.pipe processes the texts as a batched stream, which is much faster
    # than calling nlp() once per text on a large corpus.
    return [" ".join(token.lemma_ for token in doc) for doc in nlp.pipe(texts)]

# Lemmatize every cleaned review and store the result in "lemmatized_text"
data["lemmatized_text"] = lemmatization(data["text_cleaned"].tolist())

In [133]:
# Tokenize texts into word lists
def gen_words(texts):
    """Return one token list per text, produced by gensim's simple_preprocess.

    Each text is passed to ``gensim.utils.simple_preprocess`` with
    ``deacc=True``; the resulting lists are returned in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

# Tokenize each lemmatized review and store the word lists in "split_text"
data["split_text"] = gen_words(data["lemmatized_text"].tolist())

In [139]:
# Location of the CSV snapshot of the preprocessed data
file_path = "saved_data/data.csv"

In [143]:
# Save data to file
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
data.to_csv(file_path, index=False)

In [144]:
# Load data from file
# CSV stores the token lists of "split_text" as their string repr, so parse
# them back into real lists. keep_default_na=False keeps empty or "NA"-like
# texts as plain strings instead of turning them into NaN.
data_test = pd.read_csv(
    file_path,
    converters={"split_text": ast.literal_eval},
    keep_default_na=False,
)

In [145]:
# Preview the first rows of the frame reloaded from the CSV file
data_test.head()

Unnamed: 0,overall,text,text_cleaned,lemmatized_text,split_text
0,1,sucks It sucks barely picks up anything defini...,sucks It sucks barely picks up anything defini...,suck it suck barely pick up anything definitel...,"['suck', 'it', 'suck', 'barely', 'pick', 'up',..."
1,1,"Defective Well to write a short one, it blew 2...",Defective Well to write short one it blew fus...,defective Well to write short one it blow fu...,"['defective', 'well', 'to', 'write', 'short', ..."
2,3,Looks cool! Probably works I have absolutely n...,Looks cool Probably works have absolutely no m...,look cool probably work have absolutely no mem...,"['look', 'cool', 'probably', 'work', 'have', '..."
3,5,Five Stars it ok it does it job,Five Stars it ok it does it job,five Stars it ok it do it job,"['five', 'stars', 'it', 'ok', 'it', 'do', 'it'..."
4,5,this have been great for my Ford transit conne...,this have been great for my Ford transit conne...,this have be great for my Ford transit connect...,"['this', 'have', 'be', 'great', 'for', 'my', '..."
