<a href="https://colab.research.google.com/github/Fergus1212/review-star-ranker/blob/master/sparnknlp_review_rating_1and5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.5.3

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)


In [2]:
# Run this in console to keep Co Lab Running
'''
function ConnectButton(){
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}
setInterval(ConnectButton,60000);
'''

'\nfunction ConnectButton(){\n    console.log("Connect pushed"); \n    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() \n}\nsetInterval(ConnectButton,60000);\n'

In [3]:
# Spark NLP Dependencies
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.common import *

# PySpark Dependencies
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit

# Pandas
import pandas as pd

# Misc
import numpy as np

# Graphs
import matplotlib.pyplot as plt

In [4]:
# Start Spark session
spark = sparknlp.start()

In [5]:
# Best practice to print versions
print("Spark NLP version", sparknlp.version())
print("Apache Spark version", spark.version)

Spark NLP version 2.5.3
Apache Spark version 2.4.4


In [6]:
# Yin genius way of loading Amazon
file = 'amazon_reviews_us_Pet_Products_v1_00.tsv.gz'
os.system(f'wget https://s3.amazonaws.com/amazon-reviews-pds/tsv/{file}')
data = pd.read_csv(file, compression='gzip', header=0, sep='\t', error_bad_lines=False)

%time

b'Skipping line 21404: expected 15 fields, saw 22\nSkipping line 31839: expected 15 fields, saw 22\nSkipping line 37779: expected 15 fields, saw 22\nSkipping line 38344: expected 15 fields, saw 22\nSkipping line 61449: expected 15 fields, saw 22\n'
b'Skipping line 80975: expected 15 fields, saw 22\nSkipping line 85603: expected 15 fields, saw 22\nSkipping line 95950: expected 15 fields, saw 22\nSkipping line 105132: expected 15 fields, saw 22\nSkipping line 115378: expected 15 fields, saw 22\n'
b'Skipping line 152632: expected 15 fields, saw 22\nSkipping line 159310: expected 15 fields, saw 22\nSkipping line 162724: expected 15 fields, saw 22\nSkipping line 168588: expected 15 fields, saw 22\nSkipping line 170412: expected 15 fields, saw 22\nSkipping line 187169: expected 15 fields, saw 22\n'
b'Skipping line 205461: expected 15 fields, saw 22\nSkipping line 210928: expected 15 fields, saw 22\nSkipping line 213691: expected 15 fields, saw 22\nSkipping line 228697: expected 15 fields, sa

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs


In [7]:
# For testing smaller set (comment out otherwise)
df0 = data.iloc[np.r_[1:50000]]
df0.reset_index(inplace=True, drop=True)

# # Leave un-commented to ignore slicing
# df0 = data

In [8]:
df0.count()

marketplace          49999
customer_id          49999
review_id            49999
product_id           49999
product_parent       49999
product_title        49999
product_category     49999
star_rating          49999
helpful_votes        49999
total_votes          49999
vine                 49999
verified_purchase    49999
review_headline      49999
review_body          49986
review_date          49999
dtype: int64

In [9]:
# Display data
df0.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,11488901,R3NU7OMZ4HQIEG,B00MBW5O9W,912374672,Warren Eckstein's Hugs & Kisses Vitamin Minera...,Pet Products,2.0,0.0,1.0,N,Y,My dogs love Hugs and Kisses,"My dogs love Hugs and Kisses. However, the la...",2015-08-31
1,US,43214993,R14QJW3XF8QO1P,B0084OHUIO,902215727,Tyson's True Chews Premium Jerky - 12 ounce Ch...,Pet Products,5.0,0.0,0.0,N,Y,I have been purchasing these for a long time. ...,I have been purchasing these for a long time. ...,2015-08-31
2,US,12835065,R2HB7AX0394ZGY,B001GS71K2,568880110,"Soft Side Pet Crate, Navy/Tan",Pet Products,5.0,0.0,0.0,N,Y,it is easy to open and close,"It is extremely well constructed, it is easy t...",2015-08-31
3,US,26334022,RGKMPDQGSAHR3,B004ABH1LG,692846826,"EliteField 3-Door Folding Soft Dog Crate, Indo...",Pet Products,5.0,0.0,0.0,N,Y,Dog crate,Worked really well. Very pleased with my purc...,2015-08-31
4,US,22283621,R1DJCVPQGCV66E,B00AX0LFM4,590674141,Carlson 68-Inch Wide Adjustable Freestanding P...,Pet Products,5.0,0.0,0.0,N,Y,Five Stars,I love my gates! They look beautiful and they...,2015-08-31


In [10]:
# Select out labels and classes
df = df0[['review_body', 'star_rating']]

In [11]:
# Drop duplicates
df = df.drop_duplicates(subset='review_body').copy()
df.head()

Unnamed: 0,review_body,star_rating
0,"My dogs love Hugs and Kisses. However, the la...",2.0
1,I have been purchasing these for a long time. ...,5.0
2,"It is extremely well constructed, it is easy t...",5.0
3,Worked really well. Very pleased with my purc...,5.0
4,I love my gates! They look beautiful and they...,5.0


In [12]:
# Reset index from drop
df.reset_index(drop=True, inplace=True)

In [13]:
# View classification col counts
df.star_rating.value_counts()

5.0    29788
4.0     6096
1.0     4635
3.0     3815
2.0     2763
Name: star_rating, dtype: int64

In [14]:
# Condense ratings
# 2 down to 1
df.loc[df["star_rating"] == 2, ['star_rating']] = 1

# 3 down to 1
df.loc[df["star_rating"] == 3, ['star_rating']] = 1

# 4 up to 5
df.loc[df["star_rating"] == 4, ['star_rating']] = 5

In [15]:
# View classification col again
df.star_rating.value_counts()

5.0    35884
1.0    11213
Name: star_rating, dtype: int64

In [16]:
# Store the min value_count for balancing
label_min = df.star_rating.value_counts().min()

In [17]:
# Sample based on label min
df_1_bal = df.loc[df["star_rating"] == 1].sample(n = label_min, replace = True)
df_5_bal = df.loc[df["star_rating"] == 5].sample(n = label_min, replace = True)

frames = [
          df_1_bal,
          df_5_bal,
]

df_balanced = pd.concat(frames)

In [18]:
df_balanced.dtypes

review_body     object
star_rating    float64
dtype: object

In [19]:
df_balanced['review_body'] = df_balanced['review_body'].astype('str')

In [20]:
# Select out features and labels
entireDataset = spark.createDataFrame(df_balanced[['star_rating', 'review_body']])

In [21]:
entireDataset.dtypes

[('star_rating', 'double'), ('review_body', 'string')]

In [22]:
# Set label as integer
import pyspark.sql.functions as F

entireDataset = entireDataset.withColumn("star_rating", F.round(entireDataset["star_rating"]).cast('integer'))
entireDataset.show()

+-----------+--------------------+
|star_rating|         review_body|
+-----------+--------------------+
|          1|I like the design...|
|          1|This is soft whic...|
|          1|Nice looking, did...|
|          1|This product woul...|
|          1|One star because ...|
|          1|said size 3 to me...|
|          1|I bought 2 of the...|
|          1|I have given this...|
|          1|       Dog eats them|
|          1|I generally love ...|
|          1|Cute dress but it...|
|          1|I can see how thi...|
|          1|We had used this ...|
|          1|            Too thin|
|          1|99% of my worms d...|
|          1|to thin and stret...|
|          1|I'm a bit disappo...|
|          1|          Too small.|
|          1|This product work...|
|          1|My cat Luvz this ...|
+-----------+--------------------+
only showing top 20 rows



In [23]:
# Check counts
entireDataset.groupBy('star_rating').count().show()

+-----------+-----+
|star_rating|count|
+-----------+-----+
|          1|11213|
|          5|11213|
+-----------+-----+



In [24]:
# Check infered values
entireDataset.printSchema()

root
 |-- star_rating: integer (nullable = true)
 |-- review_body: string (nullable = true)



In [25]:
# View col names
entireDataset.columns

['star_rating', 'review_body']

In [26]:
# Prepare for NLP pipeline with initial DocumentAssembler
# Content is inside review_body
document = DocumentAssembler() \
  .setInputCol("review_body") \
  .setOutputCol("document") \
  .setCleanupMode("shrink")

In [27]:
# Using sentence detector (pretrained)
# Sentence detector
use = UniversalSentenceEncoder.pretrained() \
  .setInputCols(["document"]) \
  .setOutputCol("sentence_embeddings")

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [28]:
# Our labels / classes are the 1 - 5 star ratings
# These are in the star_rating col

classifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("star_rating")\
  .setMaxEpochs(30)\
  .setLr(.01)\
  .setEnableOutputLogs(True)

# Additional hyperparamters
# .setBatchSize(5)\
# .setDropout(0.5)\

In [29]:
# Declare the pipeline
use_clf_pipeline = Pipeline(
  stages = [
    document,
    use,
    classifierdl
  ]
)

In [30]:
# Split into training and testing data
trainDataset, testDataset = entireDataset.randomSplit([0.75, 0.25], seed=12345)
print(trainDataset.count())
print(testDataset.count())

16784
5642


In [31]:
# Begin fitting on the data
use_pipelineModel = use_clf_pipeline.fit(trainDataset)
%time

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


In [32]:
!cd ~/annotator_logs && ls -l

total 4
-rw-r--r-- 1 root root 2694 Jul 20 22:26 ClassifierDLApproach_b542caf92019.log


In [47]:
# Copy the result from above
# to replace ClassifierDLApproach_<this>.log
!cat ~/annotator_logs/ClassifierDLApproach_b542caf92019.log

Training started - total epochs: 30 - learning rate: 0.01 - batch size: 64 - training examples: 16784
Epoch 0/30 - 4.376667109%.2fs - loss: 129.39015 - accuracy: 0.80993557 - batches: 263
Epoch 1/30 - 3.823645488%.2fs - loss: 125.00921 - accuracy: 0.84351146 - batches: 263
Epoch 2/30 - 3.491970967%.2fs - loss: 122.591125 - accuracy: 0.85370946 - batches: 263
Epoch 3/30 - 3.550107562%.2fs - loss: 121.71795 - accuracy: 0.86247617 - batches: 263
Epoch 4/30 - 3.634410469%.2fs - loss: 119.120705 - accuracy: 0.87076575 - batches: 263
Epoch 5/30 - 3.730388251%.2fs - loss: 117.95643 - accuracy: 0.87714696 - batches: 263
Epoch 6/30 - 3.753051176%.2fs - loss: 117.94272 - accuracy: 0.8838263 - batches: 263
Epoch 7/30 - 3.457601836%.2fs - loss: 116.69407 - accuracy: 0.8893726 - batches: 263
Epoch 8/30 - 3.418908509%.2fs - loss: 117.381065 - accuracy: 0.89360684 - batches: 263
Epoch 9/30 - 3.447795253%.2fs - loss: 117.97113 - accuracy: 0.8942629 - batches: 263
Epoch 10/30 - 3.417889307%.2fs - loss:

In [34]:
# Set predictions on testDataset
predictions = use_pipelineModel.transform(testDataset)
predictions.count()
predictions.show()

+-----------+--------------------+--------------------+--------------------+--------------------+
|star_rating|         review_body|            document| sentence_embeddings|               class|
+-----------+--------------------+--------------------+--------------------+--------------------+
|          1|1 star is because...|[[document, 0, 91...|[[sentence_embedd...|[[category, 0, 91...|
|          1|1. Did absolutely...|[[document, 0, 83...|[[sentence_embedd...|[[category, 0, 83...|
|          1|2 ADF in a ten ga...|[[document, 0, 28...|[[sentence_embedd...|[[category, 0, 28...|
|          1|28.6 pounds? Real...|[[document, 0, 98...|[[sentence_embedd...|[[category, 0, 98...|
|          1|3 dogs at the nei...|[[document, 0, 15...|[[sentence_embedd...|[[category, 0, 15...|
|          1|3 out of 12 cans ...|[[document, 0, 23...|[[sentence_embedd...|[[category, 0, 23...|
|          1|A cheaply made bu...|[[document, 0, 20...|[[sentence_embedd...|[[category, 0, 20...|
|          1|A good 

In [35]:
# Run predictions
view = use_pipelineModel.transform(testDataset).select('star_rating','review_body', 'class.result').toPandas()

In [36]:
type(view)

pandas.core.frame.DataFrame

In [37]:
# Import reports for quantifying results
from sklearn.metrics import classification_report, accuracy_score

df = view

df['result'] = df['result'].apply(lambda x: x[0])

In [38]:
df.result = df.result.astype('int64')

In [39]:
print(classification_report(df['star_rating'], df.result))
print(accuracy_score(df['star_rating'], df.result))

              precision    recall  f1-score   support

           1       0.85      0.85      0.85      2849
           5       0.85      0.85      0.85      2793

    accuracy                           0.85      5642
   macro avg       0.85      0.85      0.85      5642
weighted avg       0.85      0.85      0.85      5642

0.8502304147465438


In [42]:
# Random test strings
series_reviews = [
                  'This is terrible yuck.',
                  'It made me sad.',
                  'Could be better but ok.',
                  'Absolutely love it'
]

df_input_test = spark.createDataFrame(pd.DataFrame({'review_body': series_reviews}))
df_input_test

DataFrame[review_body: string]

In [43]:
df_prediction = use_pipelineModel.transform(df_input_test).select('review_body', 'class.result').toPandas()
df_prediction

Unnamed: 0,review_body,result
0,This is terrible yuck.,[1]
1,It made me sad.,[1]
2,Could be better but ok.,[1]
3,Absolutely love it,[5]


# For Saving / Loading Model

In [40]:
# # Save the model
# from pyspark.ml import Pipeline, PipelineModel

# use_pipelineModel.save("model_name")

# # Load the model
# reloaded_model = PipelineModel.load("model_name")

# # Predictions based on reload
# predictions = reloaded_model.transform(df_input_test)
# predictions.count()
# predictions.show()

## DOWNLOAD MODEL
# from google.colab import files
# files.download("/content/file.zip")