# Text processing and topic modelling of Amazon reviews 
* Automotive category analysis

### Importing libs

In [1]:
import os
import re
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from delta import *
import pyspark.sql.functions as F

from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer
from sparknlp.annotator import Lemmatizer
from sparknlp.annotator import LemmatizerModel
from sparknlp.annotator import Normalizer
from sparknlp.annotator import StopWordsCleaner
from sparknlp.base import Finisher

c:\Python38\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
c:\Python38\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [2]:
load_dotenv()

STORAGE_ACCOUNT_NAME = os.getenv('STORAGE_ACCOUNT_NAME')
STORAGE_ACCOUNT_KEY = os.getenv('STORAGE_ACCOUNT_KEY')

builder = SparkSession.builder\
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-azure:3.3.4,\
io.delta:delta-core_2.12:2.4.0,\
io.delta:delta-storage:2.4.0,\
com.johnsnowlabs.nlp:spark-nlp_2.12:4.3.0")\
        .config(f"fs.azure.account.key.{STORAGE_ACCOUNT_NAME}.dfs.core.windows.net", f"{STORAGE_ACCOUNT_KEY}")\
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")\
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()



In [3]:
root = 'abfss://default@stdatalakeakita.dfs.core.windows.net/synapse/workspaces/syn-synfactoreddatathon01-dev'

df = spark.read\
.format('delta')\
.load(f'{root}/silver/silver_amazon_reviews_automotive')

df.show()

+--------------+----------+--------------------+------+-------+--------------------+--------------------+--------------------+--------------------+--------------+-----------------+------------+------------+--------------------+------------+---------+-------------+
|   reviewer_id|      asin|               title|  rank|overall|flg_positive_overall|flg_negative_overall|             summary|         review_text|         brand|     2nd_category|3rd_category|4th_category|            also_buy|also_buy_qty|also_view|also_view_qty|
+--------------+----------+--------------------+------+-------+--------------------+--------------------+--------------------+--------------------+--------------+-----------------+------------+------------+--------------------+------------+---------+-------------+
|A2TYWZQNOGX2YS|B0001EVUCM|Auto Ventshade 77...|364973|      5|                true|               false|          Vent Shade|Did great for me....|Auto Ventshade|Replacement Parts| Body & Trim|        Body

### Pre-processing review text

#### 1. Removing ponctuation

In [4]:
def clean_text(c):
  c = F.lower(c)
  c = F.regexp_replace(c, "(https?\://)\S+", "") # Remove links
  c = F.regexp_replace(c, "(\\n)|\n|\r|\t", "") # Remove CR, tab, and LR
  c = F.regexp_replace(c, "(?:(?:[0-9]{2}[:\/,]){2}[0-9]{2,4})", "") # Remove dates
  c = F.regexp_replace(c, "@([A-Za-z0-9_]+)", "") # Remove usernames
  c = F.regexp_replace(c, "[0-9]", "") # Remove numbers
  c = F.regexp_replace(c, "\{|\}|\[|\]|\(|\)|\;|\:|\/|\#|\.|\?|\!|\&|\"|\,", "") # Remove symbols
  c = F.regexp_replace(c, ' +', ' ')# Remove multiple whitespaces
  c = F.trim(c)# Remove trailing whitespaces
  return c

df = df.withColumn("review_text_process", clean_text(F.col("review_text")))

#### 2. Document Assembler

In [5]:
# Removing null texts
df_reviews_automotive = (
    df
    .withColumn('review_text_process', F.coalesce(F.col('review_text_process'), F.lit('Null Review')))                    
)

# Step 1: Transforms raw texts to `document` annotation
document_assembler = (
    DocumentAssembler()
    .setInputCol("review_text_process")
    .setOutputCol("document")
)

#### 3. Tokenizer

In [6]:
# Step 3: Tokenization
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")

)

#### 4. Normalizer

In [7]:
normalizer= Normalizer()\
    .setInputCols(["token"])\
    .setOutputCol("normalized")\
    .setLowercase(True)

#### 5. Removing Stop Words

In [8]:
from nltk.corpus import stopwords
eng_stopwords = stopwords.words('english')

stopwordsCleaner =StopWordsCleaner()\
    .setInputCols(["normalized"])\
    .setOutputCol("no_stop_words")\
    .setStopWords(eng_stopwords)

#### 6. Lemmatizer

In [9]:
lemmatizer = LemmatizerModel.pretrained() \
.setInputCols(["no_stop_words"]) \
.setOutputCol("lemma")

lemma_antbnc download started this may take some time.
Approximate size to download 907,6 KB
[OK!]


#### 7. POSTagger

In [10]:
from sparknlp.annotator import PerceptronModel
pos_tagger = PerceptronModel.pretrained('pos_anc') \
     .setInputCols(['document', 'lemma']) \
     .setOutputCol('pos')

pos_anc download started this may take some time.
Approximate size to download 3,9 MB
[OK!]


#### 8. Chunk

In [11]:
from sparknlp.annotator import Chunker
allowed_tags = ['<JJ>+<NN>', '<NN>+<NN>']
chunker = Chunker() \
     .setInputCols(['document', 'pos']) \
     .setOutputCol('ngrams') \
     .setRegexParsers(allowed_tags)

#### 9. Finisher

In [12]:
finisher = Finisher() \
.setInputCols(['lemma', 'ngrams'])

#### 10. Pipeline

In [13]:
from pyspark.ml import Pipeline
pipeline = Pipeline() \
     .setStages([document_assembler,
                 tokenizer,
                 normalizer,
                 stopwordsCleaner,
                 lemmatizer,
                 pos_tagger,
                 chunker,
                 finisher])

df_reviews_automotive = pipeline.fit(df_reviews_automotive).transform(df_reviews_automotive)

In [14]:
df_reviews_automotive = df_reviews_automotive.withColumn('final',
     F.concat(F.col('finished_lemma'), 
            F.col('finished_ngrams')))

df_reviews_automotive.limit(10).show(truncate=False)

+--------------+----------+---------------------------------------------------------------------------+-----+-------+--------------------+--------------------+-------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
df_reviews_automotiveProcessed = (
    df_reviews_automotive
    .select('reviewer_id',
            'asin',
            'title',
            'rank',
            'overall',
            'flg_positive_overall',
            'flg_negative_overall',
            'summary',
            F.when(
                F.length(F.col('review_text_process')) > F.lit(4000), F.substring(F.col('review_text_process'), 1, 3999)
            )
            .otherwise(F.col('review_text_process')).alias('review_text'),
            F.col('finished_lemma'),
            F.col('final'),
            'brand',
            '2nd_category',
            '3rd_category',
            '4th_category')
)
df_reviews_automotiveProcessed.write.format('delta').save(f'{root}/gold/gold_amazon_reviews_automotive_text_processed')

#### 11. Count Vectorizer

In [15]:
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
#tfizer = CountVectorizer(inputCol='finished_lemma',
#                         outputCol='tf_features')
#tf_model = tfizer.fit(df_reviews_automotive)
tf_model = CountVectorizerModel.load('models/vectorizer/count-vectorizer-model')
df_reviews_automotive_tf = tf_model.transform(df_reviews_automotive)

#### 12. IDF

In [16]:
from pyspark.ml.feature import IDF, IDFModel
#idfizer = IDF(inputCol='tf_features', 
#              outputCol='tf_idf_features')
#idf_model = idfizer.fit(df_reviews_automotive_tf)
idf_model = IDFModel.load('models/idf/idf-model')
df_reviews_automotive_tfidf = idf_model.transform(df_reviews_automotive_tf)

#### 13. LDA

In [17]:
from pyspark.ml.clustering import LDA
lda = LDA(k=4, seed=1, optimizer="online", featuresCol='tf_idf_features')
lda.setMaxIter(50)
lda.clear(lda.maxIter)
model = lda.fit(df_reviews_automotive_tfidf)
model.setSeed(1)

#### 14. Save Model

In [None]:
lda_path = 'models/lda' + "/lda_k4"
lda.save(lda_path)
lda_model_path = 'models/lda' + "/lda_model_20230108_k4"
model.save(lda_model_path)

NameError: name 'lda' is not defined

#### 15. Load Model

In [None]:
from pyspark.ml.clustering import LocalLDAModel

modelLoaded = LocalLDAModel.load('models/lda/lda_model_20230108_k4')
df_reviews_automotiveModel = modelLoaded.transform(df_reviews_automotive_tfidf)

#### 16. Show principal words

In [33]:
import pyspark.sql.types as T

vocab = tf_model.vocabulary

num_top_words = 15
topics = (modelLoaded
     .describeTopics(num_top_words)
     .withColumn('vocabulary', F.lit(vocab))
     .withColumn('topicWords', F.transform(F.col('termIndices'), lambda x: F.element_at(F.col('vocabulary'), x+1))))
topics.select('topic', 'topicWords').show(truncate=False)

+-----+--------------------------------------------------------------------------------------------------+
|topic|topicWords                                                                                        |
+-----+--------------------------------------------------------------------------------------------------+
|0    |[great, work, good, install, product, fit, easy, part, price, well, look, quality, one, use, need]|
|1    |[use, battery, get, work, one, car, time, love, buy, plug, charge, well, year, power, good]       |
|2    |[light, fit, perfect, bright, well, look, bulb, great, buy, use, cover, like, get, make, one]     |
|3    |[filter, get, fit, look, one, nice, key, car, make, seat, cover, like, back, use, well]           |
+-----+--------------------------------------------------------------------------------------------------+



#### 17. Save vectorizer

In [39]:
countVectorizerPath = 'models/vectorizer' + "/count-vectorizer"
tfizer.save(countVectorizerPath)
modelPath = 'models/vectorizer' + "/count-vectorizer-model"
tf_model.save(modelPath)
idfPath = 'models/idf' + "/idf"
idfizer.save(idfPath)
modelPath = 'models/idf' + "/idf-model"
idf_model.save(modelPath)


#### 18. Write to ADLS GEN 2

In [21]:
from pyspark.ml.functions import vector_to_array


df_reviews_automotiveModel = (
    df_reviews_automotiveModel
    .withColumn('topicDistribution', vector_to_array(F.col('topicDistribution')))
    .withColumn('topic', F.expr('array_position(topicDistribution, array_max(topicDistribution))'))
    .select('reviewer_id',
            'topic',
            'asin',
            'title',
            'rank',
            'overall',
            'flg_positive_overall',
            'flg_negative_overall',
            'summary',
            F.when(
                F.length(F.col('review_text_process')) > F.lit(4000), F.substring(F.col('review_text_process'), 1, 3999)
            )
            .otherwise(F.col('review_text_process')).alias('review_text'),
            F.col('finished_lemma'),
            F.col('final'),
            'brand',
            '2nd_category',
            '3rd_category',
            '4th_category',
            F.col('topicDistribution').getItem(0).alias('Topic_1'),
            F.col('topicDistribution').getItem(1).alias('Topic_2'),
            F.col('topicDistribution').getItem(2).alias('Topic_3'),
            F.col('topicDistribution').getItem(3).alias('Topic_4'))
)
df_reviews_automotiveModel.write.format('delta').save(f'{root}/gold/gold_amazon_reviews_ctg_automotive')

In [27]:
import pyLDAvis
import numpy as np
pyLDAvis.enable_notebook()
df_reviews_automotiveModel = modelLoaded.transform(df_reviews_automotive_tfidf)
xxx = df_reviews_automotiveModel.select((F.explode(df_reviews_automotiveModel.final)).alias("words")).groupby("words").count()
word_counts = {r['words']:r['count'] for r in xxx.collect()}
word_counts = [word_counts[w] for w in tf_model.vocabulary]
vis = pyLDAvis.prepare(vocab = tf_model.vocabulary, 
                       topic_term_dists= np.array([row for row in modelLoaded.describeTopics(maxTermsPerTopic=len(tf_model.vocabulary)).select(F.col('termWeights')).toPandas()['termWeights']]),
                       doc_topic_dists=np.array([x.toArray() for x in df_reviews_automotiveModel.select(["topicDistribution"]).toPandas()['topicDistribution']]),
                       doc_lengths= [r[0] for r in df_reviews_automotiveModel.select(F.size(df_reviews_automotiveModel.finished_lemma)).collect()],
                       term_frequency=word_counts
)

ValidationError: 
 * Not all rows (distributions) in doc_topic_dists sum to 1.

In [32]:
df_reviews_automotiveModel.select('finished_lemma', 'topicDistribution').where(F.size(vector_to_array(F.col('topicDistribution'))) == F.lit(0)).show()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "c:\Python38\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "c:\Python38\lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "c:\Python38\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 