In [0]:
# Browse the sample datasets available under the /databricks-datasets mount
datasets_root = '/databricks-datasets'
display(dbutils.fs.ls(datasets_root))

path,name,size
dbfs:/databricks-datasets/COVID/,COVID/,0
dbfs:/databricks-datasets/README.md,README.md,976
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359
dbfs:/databricks-datasets/adult/,adult/,0
dbfs:/databricks-datasets/airlines/,airlines/,0
dbfs:/databricks-datasets/amazon/,amazon/,0
dbfs:/databricks-datasets/asa/,asa/,0
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0
dbfs:/databricks-datasets/bikeSharing/,bikeSharing/,0


In [0]:
# Print the README that accompanies the amazon dataset.
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving it open for the lifetime of the notebook session.
with open('/dbfs/databricks-datasets/amazon/README.md', 'r') as f:
  print(f.read())

In [0]:
%sh
# Install the NLP libraries this notebook imports into the cluster's Python environment.
# textacy is installed alongside spaCy because the import cell needs both.
/databricks/python3/bin/pip install spacy textacy
# Download the small English pipeline loaded via spacy.load("en_core_web_sm").
/databricks/python3/bin/python3 -m spacy download en_core_web_sm

In [0]:
# Import packages
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from pyspark.sql.functions import col, explode, split, regexp_replace, lower, length, countDistinct, current_date, lit, count, udf
from pyspark.sql.types import IntegerType, StructType, StructField, StringType, ArrayType
import spacy
import textacy
nlp = spacy.load("en_core_web_sm")

In [0]:
# Sample review text used to try out the noun-phrase pattern
text = "I bought when there was a sale. The product was received in good condition but missing some parts for mobile hand phones and ear headsets. So I called customer service and the staff shared that it was because they had no more stock. This is very bad service from their end!"
doc = nlp(text)
# One noun followed by one or more further nouns
patterns = ["POS:NOUN POS:NOUN:+"]
# token_matches produces its spans lazily; materialise them so that print() shows
# the matched spans rather than a generator object, and so that later cells can
# iterate over `matches` more than once.
matches = list(textacy.extract.matches.token_matches(doc, patterns = patterns))
print(matches)

In [0]:
# Display the pattern list that was passed to token_matches
patterns

In [0]:
# Build the same single-pattern list under a second name and display it
patterns2 = ["POS:NOUN POS:NOUN:+"]
patterns2

In [0]:
# Turn every matched span into one string: the lemmas of its tokens joined by "-"
nounPhrase = [
  "-".join(token.lemma_ for token in span)
  for span in matches
]
nounPhrase

In [0]:
# Extract noun phrases
def extractNounPhrase(text, precedingPOS = ["NOUN"], sep = "_"):
  """Return the lemmatised noun phrases found in ``text``.

  A phrase is a token whose part of speech is one of ``precedingPOS``,
  followed by one or more nouns.

  Args:
    text: The text to analyse. ``None`` or an empty string yields ``[]``.
    precedingPOS: Part-of-speech tags allowed for the first token of a phrase.
      The list is only read, never modified.
    sep: String placed between the lemmas of a phrase.

  Returns:
    A list with one string per match, each being the lemmas of the matched
    tokens joined by ``sep``.
  """
  # Spark passes null column values to a Python UDF as None; nlp() is only given
  # real text, so missing or empty input simply produces no phrases.
  if not text:
    return []
  doc = nlp(text)
  # One pattern per allowed leading part of speech (POS), each ending in one or more nouns
  patterns = [f"POS:{POS} POS:NOUN:+" for POS in precedingPOS]
  matches = textacy.extract.matches.token_matches(doc, patterns = patterns)
  return [sep.join(token.lemma_ for token in span) for span in matches]

# Spark UDF wrapper so the extractor can be applied to a DataFrame column;
# it returns an array of strings per row.
udfNounPhrase = udf(extractNounPhrase, ArrayType(StringType()))

In [0]:
# Try the helper on the sample text with its default arguments (NOUN as leading tag, "_" as separator)
extractNounPhrase(text)

In [0]:
# Default input: one parquet part file from the amazon/test4K sample under /databricks-datasets
REVIEWS_PATH = "dbfs:/databricks-datasets/amazon/test4K/part-r-00000-64a9bd4a-25fc-48e6-8a60-2fd057bddd27.gz.parquet"

def main(path = REVIEWS_PATH):
  """Build and print a noun-phrase frequency table for a reviews dataset.

  Args:
    path: Location of a parquet dataset that has "asin" and "review" columns.
      Defaults to the sample file used originally, so ``main()`` behaves as before.

  Returns:
    A DataFrame with one row per noun phrase ("review_split") and the number of
    times it was extracted ("reviews").

  Side effects:
    Registers the result as the temp view "dffrequency" and prints the most
    frequent phrases.
  """

  # Import data
  df = spark.read.parquet(path)
  # Select relevant columns; rows without review text are dropped up front because
  # the noun-phrase UDF hands each value to the NLP pipeline, which needs a string
  dfSelect = df.select("asin", "review").filter(col("review").isNotNull())
  # To standardise responses, convert them all to lowercase
  dfLower = dfSelect.withColumn("review_lower", lower(col("review")))
  # Remove the usual punctuations so we don't include them into our analysis
  dfReplace = dfLower.withColumn("review_replace", regexp_replace(col("review_lower"), r'[.,!]', ' '))
  # Replace "svc" with "service" as part of standardisation
  dfReplace = dfReplace.withColumn("review_replace", regexp_replace(col("review_replace"), 'svc', 'service'))
  # Extract the lemmatised noun phrases of every cleaned review (one array per row)
  dfNounPhrase = dfReplace.withColumn("review_nounphrase", udfNounPhrase("review_replace"))
  # For every noun phrase, put it into another row, instead of sharing a row with multiple other noun phrases in an array
  dfExplode = dfNounPhrase.withColumn("review_split", explode(col("review_nounphrase")))
  # Remove rows with empty tokens or with a single character token
  dfFilter = dfExplode.withColumn("review_length", length(col("review_split")))
  dfFilter = dfFilter.filter(col("review_length") > 1)
  # Count how often each noun phrase was extracted; this counts rows, so a phrase
  # that occurs several times in one review contributes once per occurrence
  dfFrequency = dfFilter.groupBy(col("review_split")).agg(count(col("review")).alias("reviews"))
  # Show popular noun phrases. dfFrequency already has one row per phrase, so the
  # SUM just carries the count through; the query provides the descending order
  dfFrequency.createOrReplaceTempView("dffrequency")
  print("Retrieving popular noun phrases...")
  spark.sql("SELECT review_split, SUM(reviews) AS sum_reviews FROM dffrequency GROUP BY review_split ORDER BY sum_reviews DESC").show(truncate = False)

  return dfFrequency

In [0]:
# Run the pipeline when executed as the main module and keep the returned frequency table in dfOut
if __name__ == '__main__':
  dfOut = main()