# Official

In [1]:
from pyspark.sql import SparkSession
import numpy as np
import os
from pyspark.sql import functions as F
from pyspark.sql.functions import col, concat_ws, rand

In [2]:
spark = SparkSession.builder \
        .appName("Hackathon") \
        .master("local[*]") \
        .getOrCreate()

pathing_review = "datasets/review_data/"
arr = np.array(os.listdir(pathing_review))
reviewData_files = pathing_review + arr

pathing_metadata = "datasets/review_metadata/"
arr = np.array(os.listdir(pathing_metadata))
reviewMetadata_files = pathing_metadata + arr

df_review = spark.read.json(list(reviewData_files)).dropna(subset="text").drop_duplicates()
df_metadata = spark.read.json(list(reviewMetadata_files)).dropna(subset="category").drop_duplicates().select(["gmap_id", "category"])

df_joined = df_review.join(df_metadata, on="gmap_id", how="inner").withColumn("category_str", concat_ws(", ", col("category"))).withColumn("random_order", rand()).orderBy("random_order").drop("random_order").limit(20000)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/28 18:49:03 WARN Utils: Your hostname, Asyrafs-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 192.168.18.78 instead (on interface en0)
25/08/28 18:49:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/28 18:49:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [3]:
df_metadata.printSchema()

root
 |-- gmap_id: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [4]:
df_review.printSchema()

root
 |-- gmap_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- pics: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- url: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |-- rating: long (nullable = true)
 |-- resp: struct (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- time: long (nullable = true)
 |-- text: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)



In [5]:
df_joined.printSchema()

root
 |-- gmap_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- pics: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- url: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |-- rating: long (nullable = true)
 |-- resp: struct (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- time: long (nullable = true)
 |-- text: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_str: string (nullable = false)



In [6]:
system_prompt = """
    TASK: Classify this review for policy violations using STRICT criteria and respond with the appropriate label ONLY.

    CONTEXT:
    Business Category: %s
    Customer Rating: %d stars

    STRICT CLASSIFICATION RULES:

    1. ADVERTISEMENT - Only if contains:
    - Actual website URLs (www.site.com, http://)
    - Specific promo codes ("use code SAVE20")
    - Phone numbers with "call us at"
    - Direct business promotion language
    - Pricing information with intent to sell
    NOT customer enthusiasm or recommendations
    Example: "Best pizza! Visit www.pizzapromo.com for discounts!"

    2. IRRELEVANT CONTENT - Only if review explicitly discusses:
    - Topics completely unrelated to business type
    - Wrong business entirely  
    - Personal matters unrelated to the service
    - External factors not controlled by business
    NOT food reviews for restaurants or service reviews for services
    Example: "I love my new phone, but this place is too noisy."

    3. RANT WITHOUT VISIT - Only if explicitly states:
    - "Never been here but..."
    - "Haven't visited but heard..."
    - "Based on what others told me..."
    - Planning to visit but hasn't yet
    NOT detailed negative experiences (these show actual visits)
    Example: "Never been here, but I heard it's terrible."

    4. LOW QUALITY REVIEW - Only if review is:
    - Extremely short (under 5 words)
    - Completely uninformative ("ok", "meh", "...")
    - Only emojis or symbols
    - Lacks any substantive information
    NOT brief but informative reviews
    Example: "Bad" or "👍👍👍"

    5. ACCEPTABLE REVIEW - Default for legitimate customer experiences:
    - Any review discussing actual business experience
    - Positive, negative, or neutral customer feedback
    - Reviews matching business category appropriately
    - Constructive criticism or praise

    IMPORTANT: 
    - Respond with ONLY the classification label
    - Default to "ACCEPTABLE REVIEW" when in doubt
    - Most customer reviews should be acceptable unless clear violations exist
    """

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# 1) Create a stable custom_id column first
window_spec = Window.orderBy(F.monotonically_increasing_id())
df_with_id = df_joined.withColumn(
    "custom_id",
    F.concat(F.lit("request-"), (row_number().over(window_spec) - 1))
)

df_batch = df_with_id.withColumn(
    "json_request",
    F.to_json(F.struct(
        F.col("custom_id").alias("custom_id"),
        F.lit("POST").alias("method"),
        F.lit("/v1/chat/completions").alias("url"),
        F.struct(
            F.lit("gpt-4o-mini").alias("model"),
            F.array(
                F.struct(
                    F.lit("system").alias("role"),
                    F.format_string(
                        system_prompt,          
                        F.col("category_str"),  # %s -> category
                        F.col("rating"),        # %s -> rating
                        F.col("text")           # %s -> review text
                    ).alias("content")
                ),
                F.struct(
                    F.lit("user").alias("role"),
                    F.col("text").alias("content")
                )
            ).alias("messages"),
            F.lit(50).alias("max_tokens")
        ).alias("body")
    ))
)


In [8]:
df_batch.select("json_request") \
    .coalesce(1) \
    .write.mode("overwrite") \
    .text("output/batchinput_temp")

import glob
import shutil

# Find the part file written by Spark (.txt when using .text)
part_file = glob.glob("output/batchinput_temp/part-*.txt")[0]
shutil.move(part_file, "batchinput.jsonl")


25/08/28 18:49:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/28 18:49:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/28 18:49:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/28 18:49:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/28 18:49:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/28 18:49:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/28 1

'batchinput.jsonl'

In [9]:
import os

def split_jsonl(input_path, lines_per_file=1000):
    os.makedirs('batches', exist_ok=True)
    with open(input_path, 'r') as infile:
        file_count = 0
        lines = []
        for i, line in enumerate(infile, 1):
            lines.append(line)
            if i % lines_per_file == 0:
                with open(f'batches/batch_{file_count}.jsonl', 'w') as out:
                    out.writelines(lines)
                lines = []
                file_count += 1
        if lines:
            with open(f'batches/batch_{file_count}.jsonl', 'w') as out:
                out.writelines(lines)
    print(f"Split into {file_count+1} files in 'batches/' directory.")

split_jsonl('batchinput.jsonl', lines_per_file=1000)

Split into 21 files in 'batches/' directory.


In [None]:
import glob
from openai import OpenAI
import time
import os

client = OpenAI()

batch_files = sorted(glob.glob("batches/batch_*.jsonl"))
os.makedirs("batch_results", exist_ok=True)

for batch_path in batch_files:
    # 1. Upload batch file
    upload = client.files.create(file=open(batch_path, "rb"), purpose="batch")
    print(f"Uploaded {batch_path} as file ID: {upload.id}")

    # 2. Submit batch job
    job = client.batches.create(
        input_file_id=upload.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": f"Batch job for {batch_path}"}
    )
    print(f"Submitted batch job ID: {job.id}")

    # 3. Poll for job completion
    while True:
        status = client.batches.retrieve(job.id)
        print(f"Job {job.id} status: {status.status}")
        if status.status in ["completed", "failed", "cancelled"]:
            break
        time.sleep(30)  # Wait before checking again

    # 4. Download results if completed
    if status.status == "completed" and status.output_file_id:
        print(f"Batch {batch_path} completed. Downloading results...")
        result_file = client.files.retrieve(status.output_file_id)
        content = client.files.content(status.output_file_id)
        out_path = os.path.join("batch_results", f"results_{os.path.basename(batch_path)}")
        with open(out_path, "wb") as f:
            f.write(content.read())
        print(f"Results saved to {out_path}")
    else:
        print(f"Batch {batch_path} did not complete successfully. Check errors.")

    # Optional: Pause between batches to avoid rate limits
    time.sleep(10)

In [None]:
# Map results to original ID

import json
import glob
import pandas as pd

# Collect all batch results
result_files = glob.glob("batch_results/results_batch_*.jsonl")

records = []
for file in result_files:
    with open(file, "r") as f:
        for line in f:
            obj = json.loads(line)
            if obj.get("response"):
                label = obj["response"]["body"]["choices"][0]["message"]["content"].strip()
                records.append({"custom_id": obj["custom_id"], "predicted_label": label})

# Convert to DataFrame (custom_id → predicted_label)
df_results = pd.DataFrame(records)

spark = SparkSession.builder.getOrCreate()

# Convert results into Spark DataFrame
df_results_spark = spark.createDataFrame(df_results)

# 1) Extract `custom_id` from the JSON string column `json_request`
df_batch_with_id = df_batch.withColumn(
    "custom_id",
    F.get_json_object(F.col("json_request"), "$.custom_id")
)

# (optional but recommended) sanity check: ensure no null custom_id
# df_batch_with_id.filter(F.col("custom_id").isNull()).count()

# (optional) dedupe by custom_id in both datasets before joining
df_batch_dedup = df_batch_with_id.dropDuplicates(["custom_id"])
df_results_dedup = df_results_spark.dropDuplicates(["custom_id"])

# 2) LEFT join so unprocessed/cancelled requests remain with NULL labels
df_final = df_batch_dedup.join(df_results_dedup, on="custom_id", how="left")

# (optional) quick coverage check





In [33]:
df_final.select(
    F.count("*").alias("total_rows"),
    F.count("predicted_label").alias("rows_with_label")
).show()

25/08/29 03:00:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:00:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:00:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:00:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:00:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:00:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 0

+----------+---------------+
|total_rows|rows_with_label|
+----------+---------------+
|     20000|          15000|
+----------+---------------+



                                                                                

In [36]:
df_final.groupBy("predicted_label").count().orderBy(F.desc("count")).show()

25/08/29 03:04:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:04:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:04:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:04:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:04:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:04:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 0

+------------------+-----+
|   predicted_label|count|
+------------------+-----+
| ACCEPTABLE REVIEW|12966|
|              NULL| 5000|
|LOW QUALITY REVIEW| 1611|
|IRRELEVANT CONTENT|  297|
|     ADVERTISEMENT|   80|
|RANT WITHOUT VISIT|   45|
|       ADVERTISING|    1|
+------------------+-----+



                                                                                

In [39]:
df_export = df_final.drop("pics", "resp", "category", "json_request")
df_export.coalesce(1).write.mode("overwrite").option("header", True).csv("output/classified_reviews_tmp")

25/08/29 03:09:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:09:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:09:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:09:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:09:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 03:09:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/29 0

In [None]:
import glob, os, shutil

src_folder = "output/classified_reviews_tmp"
dest_file = "output/classified_reviews.csv"

part_file = glob.glob(os.path.join(src_folder, "part-*.csv"))[0]
shutil.move(part_file, dest_file)

# (Optional) clean up the temporary folder
for leftover in glob.glob(os.path.join(src_folder, "*")):
    try:
        os.remove(leftover)
    except IsADirectoryError:
        shutil.rmtree(leftover)

# Categorical Analysis with NLP