# Official

In [31]:
from pyspark.sql import SparkSession
import numpy as np
import os
from pyspark.sql import functions as F
from pyspark.sql.functions import col, concat_ws, rand, lit
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

In [2]:
# Local Spark session (all cores) with the Spark NLP 5.5.0 package on the classpath.
_spark_settings = {
    "spark.driver.memory": "16G",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "0",
    "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0",
}

_session_builder = SparkSession.builder.appName("Hackathon").master("local[*]")
for _conf_key, _conf_value in _spark_settings.items():
    _session_builder = _session_builder.config(_conf_key, _conf_value)

spark = _session_builder.getOrCreate()

25/08/30 22:58:36 WARN Utils: Your hostname, Asyrafs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.18.78 instead (on interface en0)
25/08/30 22:58:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/trigger/.ivy2/cache
The jars for the packages stored in: /Users/trigger/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e85da8c3-7ddc-42a5-be27-b5de6aa2782d;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/trigger/Documents/GitHub/tiktok-hackathon/.venv_hackathon/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.5.0 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in local-m2-cache
	found commons-codec#commons-codec;1.15 in local-m2-cache
	found org.apache.httpcomponents#httpclient;4.5.13 in local-m2-cache
	found org.apache.httpcomponents#httpcore;4.4.13 in local-m2-cache
	found software.amazon.ion#ion-java;1.0.2 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.500 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.uni

In [3]:
def _list_data_files(directory):
    """Return the sorted paths of the non-hidden entries directly inside `directory`.

    Hidden entries (names starting with ".", e.g. macOS `.DS_Store`) are skipped so they
    are not handed to the JSON reader. Raises FileNotFoundError when nothing is left.
    """
    files = sorted(
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if not entry.startswith(".")
    )
    if not files:
        raise FileNotFoundError(f"No data files found in {directory!r}")
    return files


pathing_review = "datasets/review_data/"
reviewData_files = _list_data_files(pathing_review)

pathing_metadata = "datasets/review_metadata/"
reviewMetadata_files = _list_data_files(pathing_metadata)

# Reviews: keep rows that have text and drop exact duplicate rows.
df_review = (
    spark.read.json(reviewData_files)
    .dropna(subset=["text"])
    .dropDuplicates()
)

# Metadata: project to the three needed columns BEFORE de-duplicating, and keep one row per
# gmap_id; otherwise rows that differ only in dropped columns survive and multiply the
# reviews in the join below.
df_metadata = (
    spark.read.json(reviewMetadata_files)
    .dropna(subset=["category"])
    .withColumnRenamed("name", "business_name")
    .select("gmap_id", "category", "business_name")
    .dropDuplicates(["gmap_id"])
)

# Seed the shuffle so repeated runs start from the same random column.
SHUFFLE_SEED = 42

df_joined = (
    df_review.join(df_metadata, on="gmap_id", how="inner")
    .withColumn("category_str", concat_ws(", ", col("category")))
    .withColumn("random_order", rand(seed=SHUFFLE_SEED))
    .orderBy("random_order")
    .drop("random_order")
)

                                                                                

In [6]:
# Schema check: the metadata frame keeps only gmap_id, category and the renamed business_name.
df_metadata.printSchema()

root
 |-- gmap_id: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- business_name: string (nullable = true)



In [7]:
# Schema check for the review frame as read from JSON (after the null-text / duplicate filters).
df_review.printSchema()

root
 |-- gmap_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- pics: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- url: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |-- rating: long (nullable = true)
 |-- resp: struct (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- time: long (nullable = true)
 |-- text: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)



In [8]:
# Schema check for the joined frame: review columns plus category, business_name and category_str.
df_joined.printSchema()

root
 |-- gmap_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- pics: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- url: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |-- rating: long (nullable = true)
 |-- resp: struct (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- time: long (nullable = true)
 |-- text: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- business_name: string (nullable = true)
 |-- category_str: string (nullable = false)



In [4]:
from pyspark.sql.functions import col, when, count, size, trim
from pyspark.sql.types import StringType, FloatType, DoubleType, ArrayType, MapType

# 1) Drop unwanted columns
df_filtered = df_joined.drop("pics", "resp", "time", "category", "user_id")

# 2) Build one "count of missing values" aggregate per column, where the meaning of
#    "missing" depends on the column's data type.
missing_conds = []
for f in df_filtered.schema.fields:
    c = col(f.name)
    dt = f.dataType

    if isinstance(dt, (FloatType, DoubleType)):
        # floats/doubles: NULL or NaN.
        # isnan is a function in pyspark.sql.functions, not a Column method:
        # `c.isnan()` would resolve to a field access and fail when called.
        cond = c.isNull() | F.isnan(c)
    elif isinstance(dt, StringType):
        # strings: NULL or empty after trim
        cond = c.isNull() | (trim(c) == "")
    elif isinstance(dt, (ArrayType, MapType)):
        # arrays/maps: NULL or empty
        cond = c.isNull() | (size(c) == 0)
    else:
        # ints/longs/booleans/date/timestamp/structs: only NULL
        cond = c.isNull()

    # count() skips NULLs, so when(cond, True) counts exactly the rows where cond holds.
    missing_conds.append(count(when(cond, True)).alias(f.name))


In [5]:
# Single-row summary: for every column, the number of records considered missing.
missing_counts = df_filtered.select(*missing_conds)
missing_counts.show(truncate=False)



+-------+----+------+----+-------------+------------+
|gmap_id|name|rating|text|business_name|category_str|
+-------+----+------+----+-------------+------------+
|0      |0   |0     |0   |0            |0           |
+-------+----+------+----+-------------+------------+



                                                                                

### Some reviews include a Google translation; we keep only the English text

In [5]:
# Preview five reviews that carry the Google translation marker.
_has_translation_tag = F.col("text").contains("(Translated by Google)")
df_joined.where(_has_translation_tag).select("text").show(5, truncate=100)



+-----------------------------------------------------------------------------------------------------------+
|                                                                                                       text|
+-----------------------------------------------------------------------------------------------------------+
|                                               (Translated by Google) Nice place\n\n(Original)\nLindo lugar|
|       (Translated by Google) Visit with respect and respect ..., very nice experience!\n\n(Original)\nM...|
|       (Translated by Google) It is a nice place the only detail is that it does not have bathrooms and ...|
|(Translated by Google) It's delicious and very good.\nPromise to go back\n\n(Original)\n맛있고 아주 좋아...|
|       (Translated by Google) Very friendly staff and all their fresh products\n\n(Original)\nEl persona...|
+-----------------------------------------------------------------------------------------------------------+
only showing top 

                                                                                

### Some reviews contain line breaks; collapse them so that every review is a single line

In [6]:
# Preview five reviews whose text contains at least one line break.
_has_line_break = F.col("text").rlike("\n")
df_joined.where(_has_line_break).select("text").show(5, truncate=100)



+----------------------------------------------------------------------------------------------------+
|                                                                                                text|
+----------------------------------------------------------------------------------------------------+
|          Been doing there a long time. Always get a great haircut -\nMichelle's the Best!! 👍 👍 👍|
|Very impressed! Dawn and Jazz are AMAZING!\nMake your appointments now for all of your holiday ga...|
|Awesome customer service. We visited this store for backsplash tiles and one of their design cons...|
|BEST BBQ IN PHOENIX!!!\n\nStopped Today and it didn’t disappoint!!! It was soo good!!! I got the ...|
+----------------------------------------------------------------------------------------------------+
only showing top 5 rows



                                                                                

In [5]:
# Keep only the English part of Google-translated reviews, then flatten every review
# to a single line and strip double quotes.
#
# The translated text is everything between the "(Translated by Google)" tag and the
# "(Original)" marker (or the end of the text when that marker is absent). (?s) lets "."
# span line breaks, so multi-line translations are kept whole instead of being cut at
# their first newline.
_TRANSLATION_RX = r"(?s)\(Translated by Google\)\s*(.*?)\s*(?:\(Original\)|$)"

_raw_text = F.col("text")
_english_text = F.when(
    _raw_text.contains("(Translated by Google)"),
    F.regexp_extract(_raw_text, _TRANSLATION_RX, 1),
).otherwise(_raw_text)

df_joined = df_joined.withColumn(
    "text",
    F.regexp_replace(
        F.regexp_replace(_english_text, r"\n+", " "),  # collapse line breaks into one space
        r"\"", ""                                      # remove double quotes
    )
)


In [6]:
# NOTE(review): a SparkSession was already created above, and the warning logged by this cell
# says only runtime SQL configurations take effect here — confirm that the package settings
# implied by apple_silicon=True are actually needed/applied.
spark_nlp = sparknlp.start(apple_silicon=True)



25/08/30 22:58:54 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [7]:
# Review-text branch of the NLP pipeline.
#
# NOTE(review): the lemmatizer and the GloVe stage both read `customer_review_token`, so the
# corrected / normalized / cleaned / lemma columns are produced but do not feed the embedding
# built in this cell — confirm whether that is intended.

# -------------------------------
# 1. Document Assembler
# -------------------------------
customer_review = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("customer_review")
)

# -------------------------------
# 2. Tokenizer
# -------------------------------
customer_review_token = (
    Tokenizer()
    .setInputCols(["customer_review"])
    .setOutputCol("customer_review_token")
)

# -------------------------------
# 3. Spell Checker
# -------------------------------
customer_review_spell_checker = (
    NorvigSweetingModel.pretrained()
    .setInputCols(["customer_review_token"])
    .setOutputCol("customer_review_corrected")
)

# -------------------------------
# 4. Normalizer (lowercasing, clean text)
# -------------------------------
customer_review_normalizer = (
    Normalizer()
    .setInputCols(["customer_review_corrected"])
    .setOutputCol("customer_review_normalized")
    .setLowercase(True)
)

# -------------------------------
# 5. StopWords Cleaner
# -------------------------------
customer_review_stopwordsCleaner = (
    StopWordsCleaner()
    .setInputCols(["customer_review_normalized"])
    .setOutputCol("customer_review_cleaned")
)

# -------------------------------
# 6. Lemmatizer
# -------------------------------
customer_review_lemma = (
    LemmatizerModel.pretrained()
    .setInputCols(["customer_review_token"])
    .setOutputCol("customer_review_lemma")
)

# -------------------------------
# 7. Word Embeddings (GloVe)
# -------------------------------
# The output column is branch-specific: a generic "word_embeddings" name would be shared
# with any other embedding stage in the same pipeline, and the later stage would overwrite it.
glove_embeddings = (
    WordEmbeddingsModel.pretrained("glove_100d")
    .setInputCols(["customer_review_token", "customer_review"])
    .setOutputCol("customer_review_word_embeddings")
)

# -------------------------------
# 8. Sentence Embeddings (average pooling)
# -------------------------------
sentence_embeddings = (
    SentenceEmbeddings()
    .setInputCols(["customer_review", "customer_review_word_embeddings"])
    .setOutputCol("customer_review_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# -------------------------------
# 9. Embeddings Finisher (convert to Spark vector/array)
# -------------------------------
customer_review_finisher = (
    EmbeddingsFinisher()
    .setInputCols(["customer_review_embeddings"])
    .setOutputCols(["customer_review_vector"])
    .setOutputAsVector(True)
    .setCleanAnnotations(False)
)

spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[ | ]

25/08/30 22:59:02 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


[ — ]

25/08/30 22:59:07 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
Download done! Loading the resource.
[ \ ]

                                                                                

[OK!]
lemma_antbnc download started this may take some time.


25/08/30 22:59:15 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


Approximate size to download 907.6 KB
[ | ]

25/08/30 22:59:16 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.
25/08/30 22:59:17 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.
[OK!]
glove_100d download started this may take some time.


25/08/30 22:59:22 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


Approximate size to download 145.3 MB
[ | ]

25/08/30 22:59:22 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.
25/08/30 22:59:23 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[ / ]Download done! Loading the resource.
[OK!]


In [8]:
# Business-category branch of the NLP pipeline (input: the comma-joined `category_str`).
#
# NOTE(review): the lemmatizer and the GloVe stage both read `business_category_token`, so the
# corrected / normalized / cleaned / lemma columns are produced but do not feed the embedding
# built in this cell — confirm whether that is intended.

# -------------------------------
# 1. Document Assembler
# -------------------------------
business_category = (
    DocumentAssembler()
    .setInputCol("category_str")
    .setOutputCol("business_category")
)

# -------------------------------
# 2. Tokenizer
# -------------------------------
business_category_token = (
    Tokenizer()
    .setInputCols(["business_category"])
    .setOutputCol("business_category_token")
)

# -------------------------------
# 3. Spell Checker
# -------------------------------
business_category_spell_checker = (
    NorvigSweetingModel.pretrained()
    .setInputCols(["business_category_token"])
    .setOutputCol("business_category_corrected")
)

# -------------------------------
# 4. Normalizer (lowercasing, clean text)
# -------------------------------
business_category_normalizer = (
    Normalizer()
    .setInputCols(["business_category_corrected"])
    .setOutputCol("business_category_normalized")
    .setLowercase(True)
)

# -------------------------------
# 5. StopWords Cleaner
# -------------------------------
business_category_stopwordsCleaner = (
    StopWordsCleaner()
    .setInputCols(["business_category_normalized"])
    .setOutputCol("business_category_cleaned")
)

# -------------------------------
# 6. Lemmatizer
# -------------------------------
business_category_lemma = (
    LemmatizerModel.pretrained()
    .setInputCols(["business_category_token"])
    .setOutputCol("business_category_lemma")
)

# -------------------------------
# 7. Word Embeddings (GloVe)
# -------------------------------
# The output column is branch-specific: a generic "word_embeddings" name would be shared
# with any other embedding stage in the same pipeline, and the later stage would overwrite it.
business_category_glove_embeddings = (
    WordEmbeddingsModel.pretrained("glove_100d")
    .setInputCols(["business_category_token", "business_category"])
    .setOutputCol("business_category_word_embeddings")
)

# -------------------------------
# 8. Sentence Embeddings (average pooling)
# -------------------------------
business_category_sentence_embeddings = (
    SentenceEmbeddings()
    .setInputCols(["business_category", "business_category_word_embeddings"])
    .setOutputCol("business_category_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# -------------------------------
# 9. Embeddings Finisher (convert to Spark vector/array)
# -------------------------------
business_category_finisher = (
    EmbeddingsFinisher()
    .setInputCols(["business_category_embeddings"])
    .setOutputCols(["business_category_vector"])
    .setOutputAsVector(True)
    .setCleanAnnotations(False)
)

spellcheck_norvig download started this may take some time.


25/08/30 22:59:27 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


Approximate size to download 4.2 MB
[OK!]
lemma_antbnc download started this may take some time.


25/08/30 22:59:31 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


Approximate size to download 907.6 KB
[OK!]
glove_100d download started this may take some time.


25/08/30 22:59:34 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


Approximate size to download 145.3 MB
[OK!]


In [9]:
# One pipeline, two branches: the review-text stages run first, then the
# business-category stages, each in their own declared order.
_review_branch = [
    customer_review,
    customer_review_token,
    customer_review_spell_checker,
    customer_review_normalizer,
    customer_review_stopwordsCleaner,
    customer_review_lemma,
    glove_embeddings,
    sentence_embeddings,
    customer_review_finisher,
]

_category_branch = [
    business_category,
    business_category_token,
    business_category_spell_checker,
    business_category_normalizer,
    business_category_stopwordsCleaner,
    business_category_lemma,
    business_category_glove_embeddings,
    business_category_sentence_embeddings,
    business_category_finisher,
]

pipeline = Pipeline(stages=_review_branch + _category_branch)


In [10]:
# Fit the combined pipeline on the joined frame, then annotate that same frame with it.
_pipeline_model = pipeline.fit(df_joined)
result = _pipeline_model.transform(df_joined)



In [None]:
# Peek at five rows (vertical layout, values cut at 50 chars): the cleaned text next to the
# finished review embedding column.
result.select("text", "customer_review_vector").show(5, truncate=50, vertical=True)

### Cosine similarity between review and business category for relevancy

In [11]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
import numpy as np

def cosine_similarity(v1, v2):
    """Cosine similarity of two numeric vectors, as a Python float.

    Inputs may be flat sequences or nested singletons (e.g. a 1xN list of rows);
    both are flattened to 1-D before comparison.

    Returns:
        None  if either input is None, or if the flattened vectors have different
              lengths (instead of raising, which would abort the whole Spark job
              when this runs as a UDF);
        0.0   if either vector has zero norm (this includes empty vectors);
        otherwise the cosine, clamped to [-1.0, 1.0] to absorb rounding drift.
    """
    if v1 is None or v2 is None:
        return None

    # Flatten 1xN / Nx1 or nested singletons to 1-D.
    a = np.asarray(v1, dtype=float).reshape(-1)
    b = np.asarray(v2, dtype=float).reshape(-1)

    na = np.linalg.norm(a)
    nb = np.linalg.norm(b)
    if na == 0 or nb == 0:
        return 0.0

    # Both vectors are non-zero here; a length mismatch means they are not comparable.
    if a.shape != b.shape:
        return None

    cosine = float(np.dot(a, b) / (na * nb))
    return max(-1.0, min(1.0, cosine))

# Wrap the Python function as a Spark UDF that yields a double, then score every row:
# review embedding vs. business-category embedding.
cosine_sim_udf = udf(cosine_similarity, DoubleType())

_review_vec = F.col("customer_review_vector")
_category_vec = F.col("business_category_vector")
result = result.withColumn("cosine_similarity", cosine_sim_udf(_review_vec, _category_vec))


In [None]:
# Peek at five rows: category string, review text and their cosine similarity (values cut at 50 chars).
result.select("category_str", "text", "cosine_similarity").show(5, truncate=50)

In [12]:
from pyspark.storagelevel import StorageLevel

# Slim working frame: only the columns used downstream, only rows that received a
# similarity score, persisted to memory with disk spill-over.
_base_columns = ["business_name", "category_str", "text", "rating", "cosine_similarity"]
_scored_rows = result.select(*_base_columns).where(F.col("cosine_similarity").isNotNull())
base = _scored_rows.persist(StorageLevel.MEMORY_AND_DISK)

In [None]:
# `base` already contains exactly the non-null cosine similarities and is persisted, so the
# quantiles are read from it rather than from `result`, which would re-run the whole NLP
# pipeline and the similarity UDF.
test = base.select("cosine_similarity")

# Global quartiles (approximate, relative error 0.01)
q = test.approxQuantile("cosine_similarity", [0.25, 0.5, 0.75], 0.01)
LO, MED, HI = q[0], q[1], q[2]
print("LO, MED, HI =", LO, MED, HI)


In [13]:
def cache_once(df, level=StorageLevel.MEMORY_AND_DISK):
    """Return `df` persisted at `level`, or `df` itself when `df.is_cached` is already true."""
    return df if df.is_cached else df.persist(level)

# Similarity cut-offs: at or above `benchmark` counts as relevant, at or below `low` as
# irrelevant; rows strictly between the two land in neither frame.
benchmark = 0.70
low = 0.60

_similarity = F.col("cosine_similarity")
relevant_df = cache_once(base.where(_similarity >= benchmark))
irrelevant_df = cache_once(base.where(_similarity <= low))


### Operations to find promotional links or advertisements in both datasets

In [14]:
# Helper functions

def add_ad_key(df):
    """Append an `ad_key` column: SHA-256 of a fixed, ordered list of fields joined by "||".

    A field that is not a column of `df`, or whose value is NULL, contributes an empty
    string, so the key can be computed on frames that carry only some of the fields.
    """
    present = set(df.columns)

    # (field name, cast to string before hashing) — the order is part of the key.
    key_fields = [
        ("gmap_id", False),
        ("user_id", False),
        ("business_name", False),
        ("category_str", False),
        ("time", True),
        ("text", False),
        ("rating", True),
        ("cosine_similarity", True),
    ]

    parts = []
    for field_name, as_string in key_fields:
        if field_name not in present:
            parts.append(F.lit(""))
            continue
        value = F.col(field_name)
        if as_string:
            value = value.cast("string")
        parts.append(F.coalesce(value, F.lit("")))

    return df.withColumn("ad_key", F.sha2(F.concat_ws("||", *parts), 256))



def with_ads_flags(df):
    """Add advertisement/promotion indicator columns computed from the `text` column.

    Adds one boolean `has_*` column per regex rule (NULL text is treated as ""), a
    boolean `policy_ads` that is true when any rule fired, and `ads_triggers`, an
    array holding the labels of the rules that fired (empty when none did).
    """
    txt = F.coalesce(F.col("text"), F.lit(""))

    # One entry per rule; entries for rules that did not fire are NULL.
    trigger_slots = F.array(
        F.when(F.col("has_url"),          F.lit("url")),
        F.when(F.col("has_domain"),       F.lit("domain")),
        F.when(F.col("has_shortener"),    F.lit("shortener")),
        F.when(F.col("has_obfus_domain"), F.lit("obfus_domain")),
        F.when(F.col("has_email"),        F.lit("email")),
        F.when(F.col("has_phone"),        F.lit("phone")),
        F.when(F.col("has_whatsapp"),     F.lit("whatsapp")),
        F.when(F.col("has_promo_words"),  F.lit("promo_words"))
    )

    return (df
      # URLs / domains / shorteners / obfuscations
      .withColumn("has_url",          txt.rlike(r"(?i)\bhttps?://\S+|\bwww\.\S+"))
      .withColumn("has_domain",       txt.rlike(r"(?i)\b[a-z0-9][a-z0-9\-]*\.(?:com|net|org|co|io|info|biz|app|shop|store|sg|uk|au|ca|de|fr|my|ph|id|in)(?:/\S*)?\b"))
      .withColumn("has_shortener",    txt.rlike(r"(?i)\b(bit\.ly|t\.co|goo\.gl|tinyurl\.com|ow\.ly|wa\.me|linktr\.ee)/\S+"))
      .withColumn("has_obfus_domain", txt.rlike(r"(?i)\b[a-z0-9][a-z0-9\-]*\s*(?:\.|dot|\[\.]|\(dot\))\s*(?:com|net|org|co|io|sg|au|uk)\b"))

      # Contact info / WhatsApp
      .withColumn("has_email",        txt.rlike(r"(?i)[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}"))
      .withColumn("has_phone",        txt.rlike(r"(?i)(?:\+?\d[\s\-().]{0,3}){7,}\d"))
      .withColumn("has_whatsapp",     txt.rlike(r"(?i)\bwhatsapp\b|\bwa\.me/\S+"))

      # Promo / CTA phrases
      .withColumn("has_promo_words",  txt.rlike(r"(?i)\b(promo(?:\s*code)?|discount(?:\s*code)?|coupon|use\s+code|deal|sale|flash\s*sale|limited\s*time|special\s*offer|[0-9]{1,3}%\s*off|buy\s*now|order\s*now|book\s*now|free\s*shipping|visit\s+(?:our\s+)?website|click\s+(?:here|link))\b"))

      # Final flag + triggers for auditability
      .withColumn("policy_ads",
          F.col("has_url") | F.col("has_domain") | F.col("has_shortener") |
          F.col("has_obfus_domain") | F.col("has_email") | F.col("has_phone") |
          F.col("has_whatsapp") | F.col("has_promo_words")
      )
      # Drop the NULL slots with a filter. array_remove(arr, NULL) cannot be used for this:
      # comparing against NULL yields NULL, so it returns NULL for every row.
      .withColumn("ads_triggers", F.filter(trigger_slots, lambda slot: slot.isNotNull()))
    )

In [15]:
# Flag ad signals first, then attach the hash key, for both similarity splits.
relevant_flagged, irrelevant_flagged = (
    add_ad_key(with_ads_flags(split_df)) for split_df in (relevant_df, irrelevant_df)
)

In [16]:
# Stack both splits, remembering which split each row came from.
_relevant_tagged = relevant_flagged.withColumn("source_split", F.lit("relevant"))
_irrelevant_tagged = irrelevant_flagged.withColumn("source_split", F.lit("irrelevant"))
ads_union = _relevant_tagged.unionByName(_irrelevant_tagged)

# Rows with any ad signal, one row per ad_key.
ads_only = ads_union.where(F.col("policy_ads")).dropDuplicates(["ad_key"])

In [17]:
ads_keys = ads_only.select("ad_key")

# Helper columns added by with_ads_flags; removed again once the ad rows are gone.
_ads_helper_columns = [
    "has_url", "has_domain", "has_shortener", "has_obfus_domain",
    "has_email", "has_phone", "has_whatsapp", "has_promo_words",
    "policy_ads", "ads_triggers",
]

# Anti-join: keep only rows whose ad_key is NOT among the flagged ads.
relevant_clean = (
    relevant_flagged
    .join(ads_keys, on="ad_key", how="left_anti")
    .drop(*_ads_helper_columns)
)

irrelevant_clean = (
    irrelevant_flagged
    .join(ads_keys, on="ad_key", how="left_anti")
    .drop(*_ads_helper_columns)
)


### Find reviews in the irrelevant dataset that are rants written without an actual visit

In [18]:
# Use the ad-free irrelevant frame when it exists; otherwise fall back to the raw
# irrelevant split. (The former second, nested fallback repeated the same assignment and
# could never succeed where the first one failed, so it was removed.)
try:
    irr_base = irrelevant_clean
except NameError:
    irr_base = irrelevant_df

irr_base = irr_base.cache()


In [19]:
def ensure_sentiment(df):
    """Return `df` with sentiment columns added, unless `sentiment_num` is already present.

    Runs DocumentAssembler -> Tokenizer -> ViveknSentimentModel on `text` (or, when the
    frame carries `customer_review_corrected` annotations, on the spell-corrected tokens
    re-joined with spaces) and adds:

      sentiment_str  label of the first sentiment annotation
      prob_pos       metadata['positive'] of that annotation, as double
      prob_neg       metadata['negative'] of that annotation, as double
      sentiment_num  1.0 / -1.0 / 0.0 (see below)

    `sentiment_num` is decided by a 0.2 margin between the two probabilities. When either
    probability is missing, the predicted label is used instead of silently returning 0.0.
    """
    if "sentiment_num" in df.columns:
        return df

    text_col = "text"

    # Prefer corrected tokens if available
    if "customer_review_corrected" in df.columns:
        from sparknlp.base import Finisher
        df = (Finisher()
              .setInputCols(["customer_review_corrected"])
              .setOutputCols(["corr_tokens"])
              .setOutputAsArray(True)
              .setCleanAnnotations(True)
             ).transform(df)
        df = df.withColumn("text_corrected", F.array_join("corr_tokens", " "))
        text_col = "text_corrected"

    document = DocumentAssembler().setInputCol(text_col).setOutputCol("doc")
    token    = Tokenizer().setInputCols(["doc"]).setOutputCol("tok")
    viv      = ViveknSentimentModel.pretrained().setInputCols(["doc","tok"]).setOutputCol("sent")
    pipe     = Pipeline(stages=[document, token, viv]).fit(df)

    out = pipe.transform(df)
    out = (out
        .withColumn("sentiment_str", F.expr("sent[0].result"))
        .withColumn("prob_pos", F.expr("cast(sent[0].metadata['positive'] as double)"))
        .withColumn("prob_neg", F.expr("cast(sent[0].metadata['negative'] as double)"))
        .drop("sent")
    )

    prob_pos = F.col("prob_pos")
    prob_neg = F.col("prob_neg")
    label = F.col("sentiment_str")

    # Fallback used only when a probability is missing.
    # NOTE(review): ViveknSentimentModel appears to publish a single `confidence` metadata
    # entry rather than per-class probabilities — confirm; if so, this fallback is the path
    # that actually decides the sentiment.
    label_vote = (F.when(label == "positive", 1.0)
                   .when(label == "negative", -1.0)
                   .otherwise(0.0))

    # margin to avoid shaky labels (a NULL comparison counts as "no match" and falls through)
    out = out.withColumn(
        "sentiment_num",
        F.when(prob_pos - prob_neg >= 0.2, 1.0)
         .when(prob_neg - prob_pos >= 0.2, -1.0)
         .when(prob_pos.isNull() | prob_neg.isNull(), label_vote)
         .otherwise(0.0)
    )
    return out

irr_sent = ensure_sentiment(irr_base).cache()


sentiment_vivekn download started this may take some time.


25/08/30 22:59:40 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


Approximate size to download 873.6 KB
[ | ]

25/08/30 22:59:41 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.
25/08/30 22:59:42 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


sentiment_vivekn download started this may take some time.
Approximate size to download 873.6 KB
Download done! Loading the resource.
[OK!]


In [20]:
# Regexes
nonvisit_rx = r"""(?i)\b(
    never\s+been|haven'?t\s+been|didn'?t\s+visit|did\s+not\s+visit|
    phone\s+call|called\s+(them|store)|left\s+voicemail|email(ed)?|
    website|online\s+(order|booking|application|support|chat)|
    delivery\s+app|uber\s*eats|doordash|grab\s*food|foodpanda
)\b"""

rumor_rx = r"(?i)\b(i\s*(just\s*)?heard|people\s+say|someone\s+told\s+me|my\s+friend\s+said)\b"

irr_rules = (irr_sent
    .withColumn("text_nn", F.coalesce(F.col("text"), F.lit("")))
    .withColumn("char_len", F.length("text_nn"))
    .withColumn("excl_count", F.size(F.split(F.regexp_replace("text_nn", r"[^!]", ""), "")))
    .withColumn("nonvisit_clues", F.col("text_nn").rlike(nonvisit_rx))
    .withColumn("rumor_clues",    F.col("text_nn").rlike(rumor_rx))
    # core policy flag: negative + (explicit non-visit OR strong proxy)
    .withColumn("policy_nonvisitor_rant",
        (F.col("sentiment_num") < 0) &
        ( F.col("nonvisit_clues") |
          F.col("rumor_clues") |
          (F.col("char_len") < 40) |           # very short angry blurt
          (F.col("excl_count") >= 3)           # lots of exclamation marks
        )
    )
)


In [21]:
def add_key(df):
    """Append a `rant_key` column: SHA-256 of a fixed, ordered list of fields joined by "||".

    A field that is not a column of `df`, or whose value is NULL, contributes an empty string.
    """
    available = set(df.columns)

    def part(name, as_string=False):
        # Absent column -> empty literal; present column -> value (optionally as string), NULL -> "".
        if name not in available:
            return F.lit("")
        value = F.col(name)
        if as_string:
            value = value.cast("string")
        return F.coalesce(value, F.lit(""))

    key_parts = [
        part("gmap_id"),
        part("user_id"),
        part("business_name"),
        part("category_str"),
        part("time", as_string=True),
        part("text"),
        part("rating", as_string=True),
    ]
    return df.withColumn("rant_key", F.sha2(F.concat_ws("||", *key_parts), 256))

irr_flagged = add_key(irr_rules)

# Flagged rants only, one row per rant_key (just in case), kept in memory with disk spill-over.
_flagged_rants = irr_flagged.where(F.col("policy_nonvisitor_rant"))
rant_only = _flagged_rants.dropDuplicates(["rant_key"]).persist(StorageLevel.MEMORY_AND_DISK)


25/08/30 22:59:46 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [22]:
rant_keys = rant_only.select("rant_key")

# Working columns of the rant rules; dropped to keep the resulting frame clean.
_rant_helper_columns = [
    "text_nn", "char_len", "excl_count", "nonvisit_clues", "rumor_clues",
    "policy_nonvisitor_rant", "rant_key",
]

# Anti-join: irrelevant reviews whose rant_key is NOT among the flagged rants.
irrelevant_no_rant = (
    irr_flagged
    .join(rant_keys, on="rant_key", how="left_anti")
    .drop(*_rant_helper_columns)
    .persist(StorageLevel.MEMORY_AND_DISK)
)


## Preparing datasets for BERT model

In [33]:
relevant_clean.withColumn("label", lit("RELEVANT")).select("text", "label").printSchema()  # schema preview only (nothing is assigned): reviews to be labelled "relevant"

root
 |-- text: string (nullable = true)
 |-- label: string (nullable = false)



In [34]:
irrelevant_no_rant.withColumn("label", lit("IRRELEVANT")).select("text", "label").printSchema()        # these reviews are to be labelled as "irrelevant"

root
 |-- text: string (nullable = true)
 |-- label: string (nullable = false)



In [35]:
ads_only.withColumn("label", lit("ADVERTISEMENT")).select("text", "label").printSchema()      # these reviews are to be labelled as "advertisement"

root
 |-- text: string (nullable = true)
 |-- label: string (nullable = false)



In [37]:
rant_only.withColumn("label", lit("RANT")).select("text", "label").printSchema()  # these reviews are to be labelled as "rant" (negative reviews without a visit)

root
 |-- text: string (nullable = true)
 |-- label: string (nullable = false)

