# BERT Embedding + ClassifierDLApproach

In [None]:
!pip install spark-nlp
!pip install pyspark

Collecting spark-nlp
  Downloading spark_nlp-5.0.1-py2.py3-none-any.whl (499 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.0/499.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-5.0.1
Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=76cab1e5c3dc61f3ba59f22b8cb454c7271829d805e3218613f5544021d1e646
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed py

In [None]:
from pyspark.sql import SparkSession, DataFrame

In [None]:
import sparknlp

# Start the Spark session through Spark NLP's helper and keep a handle to it in `spark`.
spark = sparknlp.start()

In [None]:
# Rebind `spark` via the generic builder, requesting the app name "sentimentanalysis".
# NOTE(review): getOrCreate() returns an already-active session when one exists, so after
# sparknlp.start() this presumably hands back the same session — confirm whether this cell is needed.
spark = SparkSession.builder.appName("sentimentanalysis").getOrCreate()

In [None]:
# Configure the CSV reader: records may span several lines, the first row is the header,
# and a double quote is the escape character inside quoted fields.
csv_reader = (
    spark.read
    .option("multiLine", True)
    .option("header", True)
    .option("escape", "\"")
)
df = csv_reader.csv("/content/Amazon_product_review.csv")


In [None]:
# Preview the first 10 rows of the loaded reviews.
df.show(10)
# Print the column names and types Spark assigned when reading the CSV.
df.printSchema()

In [None]:
  # Split the reviews roughly 70/30 into train and test sets.
  # A fixed seed makes the split, and every metric computed from it, reproducible across
  # kernel restarts; without it each run trains and evaluates on different rows.
  data = df.randomSplit([0.7, 0.3], seed=42)
  train_set = data[0]
  test_set = data[1]
  # Counting triggers Spark jobs; keep the results so the sizes can be reused without recomputing.
  train_count = train_set.count()
  test_count = test_set.count()
  print(train_count)
  print(test_count)

14036
5964


In [None]:
import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from pyspark.ml import Pipeline

# Stage 1: read the raw text from `short_review` and emit it as the `document` column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("short_review")
    .setOutputCol("document")
)

# Stage 2: pretrained "sent_small_bert_L2_128" sentence embeddings computed from `document`.
sent_embeddings = (
    BertSentenceEmbeddings.pretrained("sent_small_bert_L2_128")
    .setInputCols("document")
    .setOutputCol("sentence_embeddings")
)

# Stage 3: trainable classifier that learns the `Sentiment` label from the embeddings
# and writes its prediction to `pred_label`.
classifierDl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("pred_label")
    .setLabelColumn("Sentiment")
    .setMaxEpochs(125)
    .setLr(0.0007)
)

# Chain the three stages in order.
pipeline = Pipeline().setStages(
    [documentAssembler, sent_embeddings, classifierDl]
)



sent_small_bert_L2_128 download started this may take some time.
Approximate size to download 16.1 MB
[OK!]


In [None]:
# Fit the embedding + ClassifierDL pipeline on the training split.
model = pipeline.fit(train_set)


In [None]:
# Run the fitted pipeline over the held-out split; the prediction lands in `pred_label`.
preds = model.transform(test_set)
# Preview the transformed rows (input columns plus the annotation columns added by each stage).
preds.show()

+--------------------+---------+--------------------+--------------------+--------------------+
|        short_review|Sentiment|            document| sentence_embeddings|          pred_label|
+--------------------+---------+--------------------+--------------------+--------------------+
|"Fast service, th...| positive|[{document, 0, 39...|[{sentence_embedd...|[{category, 0, 39...|
|"Really impressed...| positive|[{document, 0, 36...|[{sentence_embedd...|[{category, 0, 36...|
|"Super extended l...| negative|[{document, 0, 19...|[{sentence_embedd...|[{category, 0, 19...|
|"The armband is g...| positive|[{document, 0, 35...|[{sentence_embedd...|[{category, 0, 35...|
|&34;BUYER BEWARE ...| negative|[{document, 0, 35...|[{sentence_embedd...|[{category, 0, 35...|
|&34;you get what ...| negative|[{document, 0, 25...|[{sentence_embedd...|[{category, 0, 25...|
|*** UPDATE Novemb...| positive|[{document, 0, 10...|[{sentence_embedd...|[{category, 0, 10...|
|***ORIGINAL REVIE...| positive|[{docume

In [None]:
# Bring the true label and the predicted-label sequence to the driver as a pandas frame.
df = preds.select('Sentiment', 'pred_label.result').toPandas()
# `result` holds a sequence of labels per row; keep only its first entry.
df['result'] = [labels[0] for labels in df['result']]

In [None]:
# Display the pandas frame with the true `Sentiment` and the first predicted label per row.
# NOTE(review): the saved output of this cell is a Spark DataFrame repr, which suggests it was
# captured before `df` was rebound to the pandas frame — re-run to refresh.
df

DataFrame[short_review: string, Sentiment: string]

In [None]:
from sklearn.metrics import classification_report

# classification_report takes (y_true, y_pred): ground truth first, predictions second.
# The arguments used to be swapped, which exchanges precision with recall and counts
# support per predicted class instead of per true class. Keyword arguments make the
# roles explicit. Re-run the cell: the previously saved report reflects the swapped call.
print(classification_report(y_true=df['Sentiment'], y_pred=df['result']))

              precision    recall  f1-score   support

    negative       0.48      0.66      0.56      1024
    positive       0.92      0.85      0.89      4940

    accuracy                           0.82      5964
   macro avg       0.70      0.76      0.72      5964
weighted avg       0.85      0.82      0.83      5964



# BERT in Spark

In [None]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Stage 1: read the raw text from `short_review` and emit it as the `document` column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("short_review")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: default pretrained sequence classifier (the saved download log reports
# "bert_base_sequence_classifier_imdb"); its prediction is written to `label`.
sequenceClassifier = (
    BertForSequenceClassification.pretrained()
    .setInputCols(["token", "document"])
    .setOutputCol("label")
    .setCaseSensitive(True)
    .setThreshold(0.5)
    .setMaxSentenceLength(128)
    .setBatchSize(32)
)

# Chain the three stages in order.
pipeline = Pipeline().setStages(
    [documentAssembler, tokenizer, sequenceClassifier]
)

bert_base_sequence_classifier_imdb download started this may take some time.
Approximate size to download 387.6 MB
[OK!]


In [None]:
# Fit the pipeline on the training split. The classifier stage was loaded with .pretrained(),
# so presumably no classifier training happens here — TODO confirm.
# Note: this rebinds `model`, replacing the fitted ClassifierDL pipeline from the earlier section.
model = pipeline.fit(train_set)

In [None]:
# Run the pipeline over the held-out split; the prediction lands in `label`.
preds = model.transform(test_set)
# Preview the transformed rows (input columns plus the annotation columns added by each stage).
preds.show()

+--------------------+---------+--------------------+--------------------+--------------------+
|        short_review|Sentiment|            document|               token|               label|
+--------------------+---------+--------------------+--------------------+--------------------+
|"Fast service, th...| positive|[{document, 0, 39...|[{token, 0, 0, ",...|[{category, 0, 39...|
|"Really impressed...| positive|[{document, 0, 36...|[{token, 0, 0, ",...|[{category, 0, 36...|
|"Super extended l...| negative|[{document, 0, 19...|[{token, 0, 0, ",...|[{category, 0, 19...|
|"The armband is g...| positive|[{document, 0, 35...|[{token, 0, 0, ",...|[{category, 0, 35...|
|&34;BUYER BEWARE ...| negative|[{document, 0, 35...|[{token, 0, 8, &3...|[{category, 0, 35...|
|&34;you get what ...| negative|[{document, 0, 25...|[{token, 0, 6, &3...|[{category, 0, 25...|
|*** UPDATE Novemb...| positive|[{document, 0, 10...|[{token, 0, 2, **...|[{category, 0, 10...|
|***ORIGINAL REVIE...| positive|[{docume

In [None]:
# Bring the true label and the predicted-label sequence to the driver as a pandas frame.
df = preds.select('Sentiment', 'label.result').toPandas()
# `result` holds a sequence of labels per row; keep only its first entry.
df['result'] = [labels[0] for labels in df['result']]

In [None]:
# Keep only the first three characters of the true label ("positive" -> "pos",
# "negative" -> "neg") so it is comparable with the "pos"/"neg" values in `result`.
df['Sentiment'] = df['Sentiment'].str.slice(stop=3)
df

Unnamed: 0,Sentiment,result
0,pos,pos
1,pos,pos
2,neg,neg
3,pos,pos
4,neg,neg
...,...,...
5959,pos,neg
5960,neg,neg
5961,neg,neg
5962,neg,neg


In [None]:
from sklearn.metrics import classification_report

# classification_report takes (y_true, y_pred): ground truth first, predictions second.
# The arguments used to be swapped, which exchanges precision with recall and counts
# support per predicted class instead of per true class. Keyword arguments make the
# roles explicit. Re-run the cell: the previously saved report reflects the swapped call.
print(classification_report(y_true=df['Sentiment'], y_pred=df['result']))

              precision    recall  f1-score   support

         neg       0.91      0.58      0.71      2208
         pos       0.80      0.97      0.87      3756

    accuracy                           0.83      5964
   macro avg       0.86      0.78      0.79      5964
weighted avg       0.84      0.83      0.81      5964



In [None]:
# Persist whatever `model` currently refers to, replacing any earlier save at the same path.
# NOTE(review): when cells run top to bottom, `model` is the pipeline fitted most recently
# (the pretrained BertForSequenceClassification one), not the ClassifierDL model — confirm
# that this is the model meant to be saved.
model_path = "/content/Model"
model_writer = model.write().overwrite()
model_writer.save(model_path)

print("Model saved successfully at:", model_path)

Model saved successfully at: /content/Model
