In [2]:
import os

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.5.3

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)
[K     |████████████████████████████████| 215.7MB 22kB/s 
[K     |████████████████████████████████| 204kB 62.5MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 133kB 7.8MB/s 
[?25h

In [None]:
# Run this in console to keep Co Lab Running
'''
function ConnectButton(){
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}
setInterval(ConnectButton,60000);
'''

'\nfunction ConnectButton(){\n    console.log("Connect pushed"); \n    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() \n}\nsetInterval(ConnectButton,60000);\n'

In [3]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.common import *
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit
import pandas as pd

In [4]:
# Start Spark session
spark = sparknlp.start()

In [5]:
# Best practice to print versions
print("Spark NLP version", sparknlp.version())
print("Apache Spark version", spark.version)

Spark NLP version 2.5.3
Apache Spark version 2.4.4


In [7]:
# Yin genius way of loading Amazon Data
file = 'amazon_reviews_us_Personal_Care_Appliances_v1_00.tsv.gz'
os.system(f'wget https://s3.amazonaws.com/amazon-reviews-pds/tsv/{file}')
df = pd.read_csv(file, compression='gzip', header=0, sep='\t', error_bad_lines=False)

b'Skipping line 1598: expected 15 fields, saw 22\nSkipping line 25768: expected 15 fields, saw 22\n'


In [8]:
df1= df
#headline_updated = df1["review_headline"].str.contains("One Star|Two Stars|Three Stars|Four Stars|Five Stars",na=False)
#df1.drop(headline_updated[headline_updated].index,axis=0,inplace=True)
df1[["star_rating","review_body"]]= df1[["star_rating","review_body"]].astype(str)
df1.star_rating.value_counts()


5    48865
4    13644
1    11034
3     7043
2     5338
Name: star_rating, dtype: int64

In [None]:
df1.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,32114233,R1QX6706ZWJ1P5,B00OYRW4UE,223980852,Elite Sportz Exercise Sliders are Double Sided...,Personal_Care_Appliances,5,0,0,N,Y,Good quality. Shipped,Exactly as described. Good quality. Shipped fast,2015-08-31
1,US,18125776,R3QWMLJHIW6P37,B0000537JQ,819771537,Ezy Dose Weekly,Personal_Care_Appliances,5,0,0,N,Y,Five Stars,It is great,2015-08-31
2,US,19917519,R14Z1VR1N0Z9G6,B00HXXO332,849307176,"Pulse Oximeter, Blood Oxygen Monitor",Personal_Care_Appliances,5,1,1,N,Y,It's really nice it works great,It's really nice it works great. You have the ...,2015-08-31
3,US,18277171,R25ZRJL0GH0U0,B00EOB0JA2,700864740,SE Tools Tool Kit Watch Watch Repair Kit (20 P...,Personal_Care_Appliances,2,0,0,N,Y,Two Stars,The kit works fine... simple cheap plastic tho,2015-08-31
4,US,2593270,R3837KYH7AZNIY,B00OC2O1UC,794298839,"doTERRA HD Clear Facial Kit - Facial Lotion, F...",Personal_Care_Appliances,4,0,1,N,Y,Four Stars,It works better than anything else ive tried,2015-08-31


In [9]:

df_1 = df1.loc[df1["star_rating"]=="1"]
df_2 = df1.loc[df1["star_rating"]=="2"]
df_3 = df1.loc[df1["star_rating"]=="3"]
df_4 = df1.loc[df1["star_rating"]=="4"]
df_5 = df1.loc[df1["star_rating"]=="5"]

df_star1= df_1.sample(n = 5338, replace = True) 
df_star2= df_2.sample(n = 5338, replace = True) 
df_star3= df_3.sample(n = 5338, replace = True) 
df_star4= df_4.sample(n = 5338, replace = True) 
df_star5= df_5.sample(n = 5338, replace = True) 

frames = [df_star1, df_star2, df_star3, df_star4, df_star5]

df_sample = pd.concat(frames)
df_sample[["star_rating","review_body"]]= df[["star_rating","review_body"]].astype(str)

In [10]:
# Select out features and labels
entireDataset = spark.createDataFrame(df_sample[['star_rating', 'review_body']])

In [11]:
# Check data types
entireDataset.dtypes

[('star_rating', 'string'), ('review_body', 'string')]

In [12]:
# View col names
entireDataset.columns

['star_rating', 'review_body']

In [13]:
# Change object to string
entireDataset.review_body = entireDataset.review_body.astype('string')

In [14]:
# Prepare for NLP pipeline with initial DocumentAssembler
# Content is inside review_body
document = DocumentAssembler() \
  .setInputCol("review_body") \
  .setOutputCol("document")

In [15]:
# Using sentence detector (pretrained)
# Sentence detector
use = UniversalSentenceEncoder.pretrained() \
  .setInputCols(["document"]) \
  . setOutputCol("sentence_embeddings")

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [16]:
# Our labels / classes are the 1 - 5 star ratings
# These are in the star_rating col

classifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("star_rating")\
  .setMaxEpochs(15)\
  .setEnableOutputLogs(True)

In [17]:
# Declare the pipeline
use_clf_pipeline = Pipeline(
  stages = [
    document,
    use,
    classifierdl
  ]
)



In [18]:
# Split into training and testing data
trainDataset, testDataset = entireDataset.randomSplit([0.75, 0.25], seed=12345)


In [19]:
# Begin fitting on the data
use_pipelineModel = use_clf_pipeline.fit(trainDataset)

In [20]:
# Set predictions on testDataset
predictions = use_pipelineModel.transform(testDataset)
predictions.count()
predictions.show()

+-----------+--------------------+--------------------+--------------------+--------------------+
|star_rating|         review_body|            document| sentence_embeddings|               class|
+-----------+--------------------+--------------------+--------------------+--------------------+
|          1|(Note:  This revi...|[[document, 0, 63...|[[sentence_embedd...|[[category, 0, 63...|
|          1|* Good point of v...|[[document, 0, 18...|[[sentence_embedd...|[[category, 0, 18...|
|          1|***Edited since 1...|[[document, 0, 67...|[[sentence_embedd...|[[category, 0, 67...|
|          1|... and once that...|[[document, 0, 87...|[[sentence_embedd...|[[category, 0, 87...|
|          1|....... but don't...|[[document, 0, 59...|[[sentence_embedd...|[[category, 0, 59...|
|          1|..the battery was...|[[document, 0, 41...|[[sentence_embedd...|[[category, 0, 41...|
|          1|A couple of minut...|[[document, 0, 89...|[[sentence_embedd...|[[category, 0, 89...|
|          1|A frien

In [21]:

# Run predictions
predictions.select('star_rating','review_body', 'class.result').show(10, truncate=80)

+-----------+--------------------------------------------------------------------------------+------+
|star_rating|                                                                     review_body|result|
+-----------+--------------------------------------------------------------------------------+------+
|          1|(Note:  This review was based on a purchase in Dec. 2010 when the product sol...|   [1]|
|          1|* Good point of views ; ^^*     1. The design is perfect to use this on the b...|   [2]|
|          1|***Edited since 1 star review was published***    I had some issues with rece...|   [1]|
|          1|... and once that happens, it's trashed. Three much nicer, sturdier shoehorns...|   [1]|
|          1|....... but don't buy the camo version thinking that they will be suitable fo...|   [2]|
|          1|..the battery wasn't dead on arrival and the resistance knob had actually wor...|   [2]|
|          1|A couple of minutes was all it took for flames to start leaping out, 

In [22]:
# Import reports for quantifying results
from sklearn.metrics import classification_report, accuracy_score

df = use_pipelineModel.transform(testDataset).select('star_rating','review_body', 'class.result').toPandas()

df['result'] = df['result'].apply(lambda x: x[0])

In [23]:
df.result = df.result.astype(str)

In [24]:
print(classification_report(df['star_rating'], df.result))
print(accuracy_score(df['star_rating'], df.result))

              precision    recall  f1-score   support

           1       0.59      0.62      0.60      1363
           2       0.44      0.41      0.43      1321
           3       0.40      0.44      0.42      1329
           4       0.41      0.32      0.35      1368
           5       0.59      0.67      0.63      1335

    accuracy                           0.49      6716
   macro avg       0.49      0.49      0.49      6716
weighted avg       0.49      0.49      0.49      6716

0.4921083978558666
