In [34]:
import os

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.5.3

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)


In [None]:
# Run this in console to keep Co Lab Running
'''
function ConnectButton(){
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}
setInterval(ConnectButton,60000);
'''

'\nfunction ConnectButton(){\n    console.log("Connect pushed"); \n    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() \n}\nsetInterval(ConnectButton,60000);\n'

In [42]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.common import *
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit
import pandas as pd

In [43]:
# Start Spark session
spark = sparknlp.start()

In [44]:
# Best practice to print versions
print("Spark NLP version", sparknlp.version())
print("Apache Spark version", spark.version)

Spark NLP version 2.5.3
Apache Spark version 2.4.4


In [None]:
# Yin genius way of loading Amazon Data
file = 'amazon_reviews_us_Pet_Products_v1_00.tsv.gz'
os.system(f'wget https://s3.amazonaws.com/amazon-reviews-pds/tsv/{file}')
df = pd.read_csv(file, compression='gzip', header=0, sep='\t', error_bad_lines=False)

b'Skipping line 21404: expected 15 fields, saw 22\nSkipping line 31839: expected 15 fields, saw 22\nSkipping line 37779: expected 15 fields, saw 22\nSkipping line 38344: expected 15 fields, saw 22\nSkipping line 61449: expected 15 fields, saw 22\n'
b'Skipping line 80975: expected 15 fields, saw 22\nSkipping line 85603: expected 15 fields, saw 22\nSkipping line 95950: expected 15 fields, saw 22\nSkipping line 105132: expected 15 fields, saw 22\nSkipping line 115378: expected 15 fields, saw 22\n'
b'Skipping line 152632: expected 15 fields, saw 22\nSkipping line 159310: expected 15 fields, saw 22\nSkipping line 162724: expected 15 fields, saw 22\nSkipping line 168588: expected 15 fields, saw 22\nSkipping line 170412: expected 15 fields, saw 22\nSkipping line 187169: expected 15 fields, saw 22\n'
b'Skipping line 205461: expected 15 fields, saw 22\nSkipping line 210928: expected 15 fields, saw 22\nSkipping line 213691: expected 15 fields, saw 22\nSkipping line 228697: expected 15 fields, sa

In [45]:
df1= df
#headline_updated = df1["review_headline"].str.contains("One Star|Two Stars|Three Stars|Four Stars|Five Stars",na=False)
#df1.drop(headline_updated[headline_updated].index,axis=0,inplace=True)
df1[["star_rating","review_body"]]= df1[["star_rating","review_body"]].astype(str)
df1.star_rating.value_counts()


5.0    1643151
4.0     380780
1.0     248519
3.0     216335
2.0     151067
nan          1
Name: star_rating, dtype: int64

In [53]:

df_1 = df1.loc[df1["star_rating"]=="1.0"]
df_2_3_4 = df1[(df1["star_rating"]=="2.0") | (df1["star_rating"]=="3.0") | (df1["star_rating"]=="4.0")]
df_5 = df1.loc[df1["star_rating"]=="5.0"]

df_star1= df_1.sample(n = 248519, replace = True) 
df_star234= df_2.sample(n = 248519, replace = True) 
df_star5= df_5.sample(n = 248519, replace = True) 

frames = [df_star1, df_star234, df_star5]

df_sample = pd.concat(frames)
df_sample[["star_rating","review_body"]]= df[["star_rating","review_body"]].astype(str)

In [54]:
# Select out features and labels
entireDataset = spark.createDataFrame(df_sample[['star_rating', 'review_body']])

In [55]:
# Check data types
entireDataset.dtypes

[('star_rating', 'string'), ('review_body', 'string')]

In [56]:
# View col names
entireDataset.columns

['star_rating', 'review_body']

In [57]:
# Change object to string
entireDataset.review_body = entireDataset.review_body.astype('string')

In [58]:
# Prepare for NLP pipeline with initial DocumentAssembler
# Content is inside review_body
document = DocumentAssembler() \
  .setInputCol("review_body") \
  .setOutputCol("document")

In [59]:
# Using sentence detector (pretrained)
# Sentence detector
use = UniversalSentenceEncoder.pretrained() \
  .setInputCols(["document"]) \
  . setOutputCol("sentence_embeddings")

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [60]:
# Our labels / classes are the 1 - 5 star ratings
# These are in the star_rating col

classifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("star_rating")\
  .setMaxEpochs(15)\
  .setEnableOutputLogs(True)

In [61]:
# Declare the pipeline
use_clf_pipeline = Pipeline(
  stages = [
    document,
    use,
    classifierdl
  ]
)



In [62]:
# Split into training and testing data
trainDataset, testDataset = entireDataset.randomSplit([0.75, 0.25], seed=12345)


In [63]:
# Begin fitting on the data
use_pipelineModel = use_clf_pipeline.fit(trainDataset)

In [64]:
# Set predictions on testDataset
predictions = use_pipelineModel.transform(testDataset)
predictions.count()
predictions.show()

+-----------+--------------------+--------------------+--------------------+--------------------+
|star_rating|         review_body|            document| sentence_embeddings|               class|
+-----------+--------------------+--------------------+--------------------+--------------------+
|        1.0|!yr old showed no...|[[document, 0, 15...|[[sentence_embedd...|[[category, 0, 15...|
|        1.0|#1 the descriptio...|[[document, 0, 81...|[[sentence_embedd...|[[category, 0, 81...|
|        1.0|#1, my daughter a...|[[document, 0, 24...|[[sentence_embedd...|[[category, 0, 24...|
|        1.0|$108 for 60 chews...|[[document, 0, 12...|[[sentence_embedd...|[[category, 0, 12...|
|        1.0|$11 for one stick...|[[document, 0, 21...|[[sentence_embedd...|[[category, 0, 21...|
|        1.0|$11.50 per box? W...|[[document, 0, 97...|[[sentence_embedd...|[[category, 0, 97...|
|        1.0|$16.99 for a 12oz...|[[document, 0, 17...|[[sentence_embedd...|[[category, 0, 17...|
|        1.0|$20 for

In [65]:
# Begin fitting on the data
use_pipelineModel = use_clf_pipeline.fit(trainDataset)

KeyboardInterrupt: ignored

In [66]:
# Set predictions on testDataset
predictions = use_pipelineModel.transform(testDataset)

In [67]:

# Run predictions
predictions.select('star_rating','review_body', 'class.result').show(10, truncate=80)

+-----------+--------------------------------------------------------------------------------+------+
|star_rating|                                                                     review_body|result|
+-----------+--------------------------------------------------------------------------------+------+
|        1.0|!yr old showed no interest even with the cat nip inside. never use it. My cat...| [1.0]|
|        1.0|#1 the description is completly wrong. There is no grate to catch litter as t...| [2.0]|
|        1.0|#1, my daughter and I could not put this together unless we bent the metal ba...| [2.0]|
|        1.0|$108 for 60 chews?  Are you out of your minds?  I purchased these for $25 for...| [1.0]|
|        1.0|$11 for one stick???? I purchased this because of th pictures (3 wands) with ...| [1.0]|
|        1.0|$11.50 per box? What kind of treat have they been eating! Buy the 6-pack of 1...| [1.0]|
|        1.0|$16.99 for a 12oz bag is highway robbery. I pay $7.99 when I buy this

In [68]:
# Import reports for quantifying results
from sklearn.metrics import classification_report, accuracy_score

df = use_pipelineModel.transform(testDataset).select('star_rating','review_body', 'class.result').toPandas()

df['result'] = df['result'].apply(lambda x: x[0])

In [69]:
df.result = df.result.astype(str)

In [70]:
print(classification_report(df['star_rating'], df.result))
print(accuracy_score(df['star_rating'], df.result))

              precision    recall  f1-score   support

         1.0       0.71      0.70      0.70     62129
         2.0       0.66      0.68      0.67     62202
         5.0       0.87      0.87      0.87     61922

    accuracy                           0.75    186253
   macro avg       0.75      0.75      0.75    186253
weighted avg       0.75      0.75      0.75    186253

0.7463611324381352
