In [17]:
import os

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.5.3

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)


In [18]:
# Run this in console to keep Co Lab Running
'''
function ConnectButton(){
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}
setInterval(ConnectButton,60000);
'''

'\nfunction ConnectButton(){\n    console.log("Connect pushed"); \n    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() \n}\nsetInterval(ConnectButton,60000);\n'

In [19]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.common import *
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit
import pandas as pd

In [20]:
# Start Spark session
spark = sparknlp.start()

In [21]:
# Best practice to print versions
print("Spark NLP version", sparknlp.version())
print("Apache Spark version", spark.version)

Spark NLP version 2.5.3
Apache Spark version 2.4.4


In [22]:
# Yin genius way of loading Amazon Data
file = 'amazon_reviews_us_Pet_Products_v1_00.tsv.gz'
os.system(f'wget https://s3.amazonaws.com/amazon-reviews-pds/tsv/{file}')
df = pd.read_csv(file, compression='gzip', header=0, sep='\t', error_bad_lines=False)

b'Skipping line 21404: expected 15 fields, saw 22\nSkipping line 31839: expected 15 fields, saw 22\nSkipping line 37779: expected 15 fields, saw 22\nSkipping line 38344: expected 15 fields, saw 22\nSkipping line 61449: expected 15 fields, saw 22\n'
b'Skipping line 80975: expected 15 fields, saw 22\nSkipping line 85603: expected 15 fields, saw 22\nSkipping line 95950: expected 15 fields, saw 22\nSkipping line 105132: expected 15 fields, saw 22\nSkipping line 115378: expected 15 fields, saw 22\n'
b'Skipping line 152632: expected 15 fields, saw 22\nSkipping line 159310: expected 15 fields, saw 22\nSkipping line 162724: expected 15 fields, saw 22\nSkipping line 168588: expected 15 fields, saw 22\nSkipping line 170412: expected 15 fields, saw 22\nSkipping line 187169: expected 15 fields, saw 22\n'
b'Skipping line 205461: expected 15 fields, saw 22\nSkipping line 210928: expected 15 fields, saw 22\nSkipping line 213691: expected 15 fields, saw 22\nSkipping line 228697: expected 15 fields, sa

In [23]:
df1= df
#headline_updated = df1["review_headline"].str.contains("One Star|Two Stars|Three Stars|Four Stars|Five Stars",na=False)
#df1.drop(headline_updated[headline_updated].index,axis=0,inplace=True)
df1[["star_rating","review_body"]]= df1[["star_rating","review_body"]].astype(str)
df1.star_rating.value_counts()


5.0    1643151
4.0     380780
1.0     248519
3.0     216335
2.0     151067
nan          1
Name: star_rating, dtype: int64

In [24]:
#df1.loc[df1["star_rating"]=="2.0","star_rating"]="3.0" 
#df1.loc[df1["star_rating"]=="4.0","star_rating"]="3.0"


df_1 = df1.loc[df1["star_rating"]=="1.0"]
df_2 = df1.loc[df1["star_rating"]=="2.0"]
df_3 = df1.loc[df1["star_rating"]=="3.0"]
df_4 = df1.loc[df1["star_rating"]=="4.0"]
df_5 = df1.loc[df1["star_rating"]=="5.0"]

df_star1= df_1.sample(n = 151067, replace = True) 
df_star2= df_2.sample(n = 151067, replace = True) 
df_star3= df_3.sample(n = 151067, replace = True) 
df_star4= df_4.sample(n = 151067, replace = True) 
df_star5= df_5.sample(n = 151067, replace = True) 

frames = [df_star1, df_star2, df_star3, df_star4, df_star5]

df_sample = pd.concat(frames)
df_sample[["star_rating","review_body"]]= df[["star_rating","review_body"]].astype(str)

df_sample.star_rating.value_counts()

3.0    151067
5.0    151067
4.0    151067
2.0    151067
1.0    151067
Name: star_rating, dtype: int64

In [25]:
# Select out features and labels
entireDataset = spark.createDataFrame(df_sample[['star_rating', 'review_body']])

In [26]:
# Check data types
entireDataset.dtypes

[('star_rating', 'string'), ('review_body', 'string')]

In [27]:
# View col names
entireDataset.columns

['star_rating', 'review_body']

In [28]:
# Change object to string
entireDataset.review_body = entireDataset.review_body.astype('string')

In [29]:
# Prepare for NLP pipeline with initial DocumentAssembler
# Content is inside review_body
document = DocumentAssembler() \
  .setInputCol("review_body") \
  .setOutputCol("document")

In [30]:
# Using sentence detector (pretrained)
# Sentence detector
use = UniversalSentenceEncoder.pretrained() \
  .setInputCols(["document"]) \
  . setOutputCol("sentence_embeddings")

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [31]:
# Our labels / classes are the 1 - 5 star ratings
# These are in the star_rating col

classifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("star_rating")\
  .setMaxEpochs(30)\
  .setEnableOutputLogs(True)

In [32]:
# Declare the pipeline
use_clf_pipeline = Pipeline(
  stages = [
    document,
    use,
    classifierdl
  ]
)



In [33]:
# Split into training and testing data
trainDataset, testDataset = entireDataset.randomSplit([0.75, 0.25], seed=12345)


In [34]:
# Begin fitting on the data
use_pipelineModel = use_clf_pipeline.fit(trainDataset)

In [35]:
# Set predictions on testDataset
predictions = use_pipelineModel.transform(testDataset)
predictions.count()
predictions.show()

+-----------+--------------------+--------------------+--------------------+--------------------+
|star_rating|         review_body|            document| sentence_embeddings|               class|
+-----------+--------------------+--------------------+--------------------+--------------------+
|        1.0|#1, my daughter a...|[[document, 0, 24...|[[sentence_embedd...|[[category, 0, 24...|
|        1.0|$1,000 for 4 bott...|[[document, 0, 52...|[[sentence_embedd...|[[category, 0, 52...|
|        1.0|$1,000,000? Your ...|[[document, 0, 14...|[[sentence_embedd...|[[category, 0, 14...|
|        1.0|$15 for 2 hose cl...|[[document, 0, 10...|[[sentence_embedd...|[[category, 0, 10...|
|        1.0|$15/pound? Are yo...|[[document, 0, 16...|[[sentence_embedd...|[[category, 0, 16...|
|        1.0|$150 bucks for 16...|[[document, 0, 26...|[[sentence_embedd...|[[category, 0, 26...|
|        1.0|$4.50 at chewy.co...|[[document, 0, 31...|[[sentence_embedd...|[[category, 0, 31...|
|        1.0|$52? Ta

In [36]:
# Begin fitting on the data
use_pipelineModel = use_clf_pipeline.fit(trainDataset)

In [37]:
# Set predictions on testDataset
predictions = use_pipelineModel.transform(testDataset)

In [38]:

# Run predictions
predictions.select('star_rating','review_body', 'class.result').show(10, truncate=80)

+-----------+--------------------------------------------------------------------------------+------+
|star_rating|                                                                     review_body|result|
+-----------+--------------------------------------------------------------------------------+------+
|        1.0|#1, my daughter and I could not put this together unless we bent the metal ba...| [2.0]|
|        1.0|                           $1,000 for 4 bottles of nail polish?  Don't think so.| [1.0]|
|        1.0|$1,000,000? Your lack of intelligence is so astounding I am unable to find hu...| [1.0]|
|        1.0|$15 for 2 hose clamps! Really!? Thats is beyond ripoff. Goto lowes or home de...| [4.0]|
|        1.0|$15/pound? Are you kidding? You can give your cat a grilled chicken breast fr...| [3.0]|
|        1.0|$150 bucks for 160 gallon bucket of salt?!?!?!  You're joking right?  This sa...| [1.0]|
|        1.0|                                                $4.50 at chewy.com  c

In [39]:
# Import reports for quantifying results
from sklearn.metrics import classification_report, accuracy_score

df = use_pipelineModel.transform(testDataset).select('star_rating','review_body', 'class.result').toPandas()

df['result'] = df['result'].apply(lambda x: x[0])

In [40]:
df.result = df.result.astype(str)

In [41]:
print(classification_report(df['star_rating'], df.result))
print(accuracy_score(df['star_rating'], df.result))

              precision    recall  f1-score   support

         1.0       0.59      0.65      0.62     37902
         2.0       0.45      0.42      0.43     37659
         3.0       0.45      0.38      0.41     37819
         4.0       0.45      0.40      0.42     37742
         5.0       0.60      0.72      0.65     37585

    accuracy                           0.51    188707
   macro avg       0.51      0.51      0.51    188707
weighted avg       0.51      0.51      0.51    188707

0.5145808051635604
