# This First Cell is For Colab Setup Only!

In [None]:
import os

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.5.1

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)
[K     |████████████████████████████████| 215.7MB 55kB/s 
[K     |████████████████████████████████| 204kB 45.8MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 122kB 2.8MB/s 
[?25h

In [None]:
# You also need this for Colab: paste the JavaScript below into your browser's
# developer console. It is kept here only as a Python string (nothing in this
# notebook executes it). The snippet clicks the Colab "connect" button every
# 60000 ms -- presumably to keep the session from disconnecting.
'''
function ConnectButton(){
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}
setInterval(ConnectButton,60000);
'''

In [None]:
import sparknlp

In [None]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.ml.tuning import TrainValidationSplit
import pandas as pd

In [None]:
# Start Spark through Spark NLP's helper. `spark` is the handle the later cells
# use to read the CSV (`spark.read`) and to report the Spark version.
spark = sparknlp.start()

In [None]:
# Report the library versions in use so a run can be reproduced later.
print(f"Spark NLP version {sparknlp.version()}")
print(f"Apache Spark version {spark.version}")

In [None]:
# This is for Databricks load of data from S3
# Import Sample Data
file_uri = "/mnt/mnt_s3/proj_sample.csv"

# The reader chain is wrapped in parentheses rather than ended with a dangling
# backslash, so it no longer depends on a blank line following the statement.
# NOTE(review): header='true' gives this frame the file's own column names,
# whereas later cells select `_c7` / `_c13` (the positional names used on the
# Colab load path) -- confirm before running the rest of the notebook on
# Databricks.
entireDataset = (
    spark.read
    .format("com.databricks.spark.csv")
    .options(header='true', inferSchema="true")
    .load(file_uri)
)

# Preview the first 10 rows, cutting each cell at 50 characters.
entireDataset.show(10, truncate=50)

In [None]:
# This is for data load in Colab
# The Databricks loader reads this same sample with header='true', and later
# comments refer to named columns (star_rating, review_body), so the first line
# of the file is treated as a header here as well. Without that, the header row
# is ingested as an ordinary review and its label text becomes a bogus class.
# NOTE(review): confirm the Colab copy of proj_sample.csv really has a header row.
colab_df = spark.read.csv('/content/proj_sample.csv', header=True)

# Later cells address columns by Spark's positional default names (_c7, _c13),
# so rename the columns back to _c0.._cN to keep those cells working unchanged.
positional_names = ["_c{}".format(i) for i in range(len(colab_df.columns))]
entireDataset = colab_df.toDF(*positional_names)

In [None]:
# List the column names of the loaded dataset; the recorded output shows the
# positional names `_c0` .. `_c14` that the following cells rely on.
entireDataset.columns

['_c0',
 '_c1',
 '_c2',
 '_c3',
 '_c4',
 '_c5',
 '_c6',
 '_c7',
 '_c8',
 '_c9',
 '_c10',
 '_c11',
 '_c12',
 '_c13',
 '_c14']

In [None]:
# 75% of the rows go to training and 25% to testing; the fixed seed keeps the
# split repeatable between runs.
split_weights = [0.75, 0.25]
trainDataset, testDataset = entireDataset.randomSplit(split_weights, seed=12345)

In [None]:
# The review text (review_body) sits in column _c13; expose it to the
# downstream annotators under the name "document".
document = (
    DocumentAssembler()
    .setInputCol("_c13")
    .setOutputCol("document")
)

In [None]:
# Universal Sentence Encoder (pretrained): reads the "document" column and
# writes its embeddings to "sentence_embeddings". This is an encoder, not a
# sentence detector.
use = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [None]:
# The classes to predict are the 1 - 5 star ratings (star_rating), which live
# in column _c7. The classifier consumes the sentence embeddings and writes its
# prediction to the "class" column.
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("_c7")
    .setMaxEpochs(5)
    .setEnableOutputLogs(True)
)

In [None]:
# Chain the stages in execution order: raw text -> document -> sentence
# embeddings -> predicted class.
pipeline_stages = [document, use, classifierdl]
use_clf_pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Begin fitting on the data: run the three-stage pipeline (document assembler,
# sentence encoder, ClassifierDL) on the training split and keep the fitted
# model for scoring.
use_pipelineModel = use_clf_pipeline.fit(trainDataset)

In [None]:
# Apply the fitted pipeline to the held-out test split. The resulting frame
# carries the classifier's "class" output column, read below as `class.result`.
predictions = use_pipelineModel.transform(testDataset)

In [None]:
# Side-by-side preview of true label, review text and predicted class
# (10 rows, text cut at 80 characters).
preview = predictions.select('_c7', '_c13', 'class.result')
preview.show(10, truncate=80)

+---+--------------------------------------------------------------------------------+------+
|_c7|                                                                            _c13|result|
+---+--------------------------------------------------------------------------------+------+
|  5|                                                  Love it great add to collecton|   [5]|
|  5|These paper lanterns are adorable! The colors are bright, the patterns are fu...|   [5]|
|  1|         Showed up not how it's shown . Was someone's old toy. with paint on it.|   [5]|
|  4|i like it but i absoloutely hate that some dolls don't have pets like this on...|   [5]|
|  2|IT's OK, but not as good as the old Bite Meez puppets. This puppet is very th...|   [5]|
|  5|                     It was a birthday present for my grandson and he LOVES IT!!|   [5]|
|  1|This was to be a gift for my husband for our new pool. Did not receive the co...|   [5]|
|  5|                                     We play this game 

In [None]:
# Import reports for quantifying results
from sklearn.metrics import classification_report, accuracy_score

# Score the test split and bring label, text and predicted class into pandas.
df = (
    use_pipelineModel.transform(testDataset)
    .select('_c7', '_c13', 'class.result')
    .toPandas()
)

# Each `result` cell is a sequence (shown as e.g. [5] in the preview); keep its
# first element as the predicted label.
df['result'] = [labels[0] for labels in df['result']]

In [None]:
# Per-class precision / recall / F1 of predicted vs. true star rating.
report_text = classification_report(df._c7, df.result)
print(report_text)

# Overall fraction of test rows whose predicted rating matches the true one.
overall_accuracy = accuracy_score(df._c7, df.result)
print(overall_accuracy)

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.44      1.00      0.62         4

    accuracy                           0.44         9
   macro avg       0.09      0.20      0.12         9
weighted avg       0.20      0.44      0.27         9

0.4444444444444444


  _warn_prf(average, modifier, msg_start, len(result))
