In [0]:
# Location of the bronze-layer Delta table.
bronze_path = "/Volumes/sentiment_analysis/default/sentiment_analysis/bronze"

# Read the bronze table as a DataFrame and print its column names and types.
bronze_reader = spark.read.format("delta")
bronze_df = bronze_reader.load(bronze_path)
bronze_df.printSchema()

root
 |-- id: double (nullable = true)
 |-- text: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- username: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- language: string (nullable = true)
 |-- retweet_count: integer (nullable = true)
 |-- like_count: integer (nullable = true)
 |-- reply_count: integer (nullable = true)
 |-- quote_count: integer (nullable = true)
 |-- impression_count: integer (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- mentions: string (nullable = true)
 |-- source: string (nullable = true)
 |-- is_retweet: boolean (nullable = true)
 |-- is_reply: boolean (nullable = true)
 |-- in_reply_to_user_id: integer (nullable = true)
 |-- conversation_id: double (nullable = true)
 |-- user_followers_count: integer (nullable = true)
 |-- user_following_count: integer (nullable = true)
 |-- user_verified: boolean (nullable = true)
 |-- user_location: string (nullable = true)
 |-- possibly_sensitive: boolean (nulla

In [0]:
from pyspark.sql.functions import col, lower, regexp_replace, trim

# Ordered (pattern, replacement) steps applied to the lowercased tweet text.
text_cleaning_steps = [
    ("http\\S+", ""),          # drop URLs
    ("@\\w+", ""),             # drop @mentions
    ("#", ""),                 # drop the hash sign, keep the tag word
    (r"[^a-z0-9\s]", ""),      # drop everything except a-z, 0-9 and whitespace
    (r"\s+", " "),             # collapse whitespace runs to one space
]

# Build the cleaned-text expression step by step, then trim the ends.
clean_text_expr = lower(col("text"))
for pattern, replacement in text_cleaning_steps:
    clean_text_expr = regexp_replace(clean_text_expr, pattern, replacement)

silver_df = bronze_df.withColumn("clean_text", trim(clean_text_expr))


In [0]:
from pyspark.sql.functions import expr


import pyspark.sql.functions as F

# Extract hashtags from the raw tweet text into an array of lowercase tag names.
#
# The text is split on any run of whitespace (not only single spaces), so tags
# that follow a newline or tab are found too. A token contributes a tag only
# when it starts with "#", and only the leading run of letters, digits and
# underscores is kept, so "#CVE," yields "cve" rather than "cve,".
# A null `text` still yields a null array.
hashtag_tokens = F.split(F.col("text"), r"\s+")

silver_df = silver_df.withColumn(
    "hashtags_array",
    F.filter(
        F.transform(
            hashtag_tokens,
            lambda token: F.lower(
                F.regexp_extract(token, r"^#([\p{L}\p{N}_]+)", 1)
            ),
        ),
        # regexp_extract returns "" for tokens that are not hashtags.
        lambda tag: tag != "",
    ),
)


In [0]:
import pyspark.sql.functions as F

# Extract @mentions from the raw tweet text into an array of lowercase handles.
#
# The text is split on any run of whitespace (not only single spaces), so
# mentions that follow a newline or tab are found too. A token contributes a
# handle only when it starts with "@", and only the leading run of word
# characters is kept, so "@user," yields "user" rather than "user,".
# A null `text` still yields a null array.
mention_tokens = F.split(F.col("text"), r"\s+")

silver_df = silver_df.withColumn(
    "mentions_array",
    F.filter(
        F.transform(
            mention_tokens,
            lambda token: F.lower(F.regexp_extract(token, r"^@(\w+)", 1)),
        ),
        # regexp_extract returns "" for tokens that are not mentions.
        lambda handle: handle != "",
    ),
)


In [0]:
# Normalise usernames: lowercase, remove every character outside a-z, 0-9
# and "_" (this also drops a leading "@"), then trim.
username_lowered = lower(col("username"))
username_stripped = regexp_replace(username_lowered, r"[^a-z0-9_]", "")

silver_df = silver_df.withColumn("clean_username", trim(username_stripped))


In [0]:
# Normalise the `source` column: lowercase, keep only a-z / 0-9 / whitespace,
# collapse whitespace runs to a single space, and trim the ends.
source_expr = lower(col("source"))
for source_pattern, source_replacement in (
    (r"[^a-z0-9\s]", ""),
    (r"\s+", " "),
):
    source_expr = regexp_replace(source_expr, source_pattern, source_replacement)

silver_df = silver_df.withColumn("clean_source", trim(source_expr))


In [0]:
# Normalise `user_location` with the same steps used for the other free-text
# columns: lowercase -> strip non-alphanumerics -> collapse whitespace -> trim.
location_lowered = lower(col("user_location"))
location_alnum = regexp_replace(location_lowered, r"[^a-z0-9\s]", "")
location_single_spaced = regexp_replace(location_alnum, r"\s+", " ")

silver_df = silver_df.withColumn(
    "clean_user_location", trim(location_single_spaced)
)


In [0]:
from pyspark.sql.functions import to_timestamp


# `created_at` is stored as a string; parse it as day-month-year plus
# 24-hour time (e.g. "11-01-2025 18:16") into a timestamp column.
created_at_pattern = "dd-MM-yyyy HH:mm"
created_at_parsed = to_timestamp(col("created_at"), created_at_pattern)

silver_df = silver_df.withColumn("created_at_ts", created_at_parsed)


In [0]:
from pyspark.sql.types import IntegerType

# Engagement counters that should be stored as integers.
numeric_cols = [
    "retweet_count",
    "like_count",
    "reply_count",
    "quote_count",
    "impression_count",
]

# Cast each counter that is present in the frame; absent ones are skipped.
counters_present = [name for name in numeric_cols if name in silver_df.columns]
for counter in counters_present:
    silver_df = silver_df.withColumn(counter, col(counter).cast(IntegerType()))


In [0]:
from pyspark.sql.functions import col

# Narrow the silver frame to the columns wanted downstream.
#
# The selection starts from silver_df, not bronze_df: `clean_text` is a derived
# column added to silver_df by the cleaning cell above, so selecting it from
# bronze_df cannot resolve and would also discard every other silver transform.
# NOTE(review): `sentiment_label` does not appear in the printed (truncated)
# bronze schema; it is skipped when absent, matching the guard used for the
# numeric casts. Confirm whether that column is expected.
silver_keep_cols = [
    "clean_text",
    "hashtags",       # keep hashtags
    "created_at",
    "sentiment_label",
]

silver_df = silver_df.select(
    *[name for name in silver_keep_cols if name in silver_df.columns]
)



In [0]:
from pyspark.sql.functions import col, lower, regexp_replace, trim

# Build the final silver frame from the bronze table.
#
# clean_text = lowercased tweet text with URLs ("http..." / "www...") and every
# character that is not a letter or whitespace removed. Deleting those spans
# leaves gaps and keeps newlines/tabs, so whitespace runs are then collapsed to
# a single space (as the earlier clean_text cell does) before trimming.
silver_df = (
    bronze_df
    .withColumn(
        "clean_text",
        trim(
            regexp_replace(
                regexp_replace(
                    lower(col("text")),
                    r"http\S+|www\S+|[^a-zA-Z\s]",
                    "",
                ),
                r"\s+",
                " ",
            )
        ),
    )
    # Keep only the columns written to the silver table.
    .select(
        "clean_text",
        "hashtags",
        "created_at",
        "username",
    )
)

# Persist the silver frame as a Delta table, replacing any existing data.
# NOTE(review): the trailing "rrr" in the folder name looks like a typo;
# confirm before anything downstream depends on this path.
silver_output_path = "/Volumes/sentiment_analysis/default/sentiment_analysis/silver_layerrr"

(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .save(silver_output_path)
)


In [0]:
# Render the silver frame (clean_text, hashtags, created_at, username) as a
# table in the notebook output for a visual check of the cleaning result.
silver_df.display()

clean_text,hashtags,created_at,username
agent every development say quality throughout beautiful databreach,#DataBreach,11-01-2025 18:16,@fjohnson
night respond red information last everything cve blakeerik,#CVE,07-09-2024 15:01,@jpeterson
here grow gas enough analysis least by infosec cybersecurity mfa,"#InfoSec, #CyberSecurity, #MFA",27-03-2025 10:09,@smiller
product significant world talk term herself player half have decide environment view possible mfa cve amandasanchez ogray,"#MFA, #CVE",18-10-2024 11:26,@gabriellecameron
environment decision wall then fire pretty how trip learn enter east much section investment on gun young catch soc soc phishing ddavis hernandezernest,"#SOC, #SOC, #Phishing",06-03-2025 06:33,@ycarlson
edge network wall quite boy those seem shoulder future fall citizen about mfa teresa harrellkenneth,#MFA,03-03-2025 02:51,@ericfarmer
patch for credential stuffing vulnerability released upon these story film soc allenashley millertodd,#SOC,23-12-2024 22:40,@spenceamanda
campaign little near enter their institution deep hacking phishing soc jenniferross samuel,"#Hacking, #Phishing, #SOC",21-09-2024 06:07,@wrightcaleb
according remain arrive attack all form method everything democrat car very number line six space cve clintonhopkins rodney,#CVE,17-08-2024 07:27,@brownjessica
backup systems engaged after brute force eat couple large instead cybersecurity mfa mfa steven williamsyvette,"#CyberSecurity, #MFA, #MFA",06-11-2024 10:25,@novaksara
