In [0]:
# Read the bronze-layer Delta table; every silver column in this notebook is
# derived from this DataFrame.
bronze_df = spark.read.load(
    "/Volumes/sentiment_analysis/default/sentiment_analysis/bronze",
    format="delta",
)


In [0]:
# Build the "clean_text" column as one expression: lower-case the raw tweet
# text, delete the unwanted fragments, then normalise whitespace.
text_expr = lower(col("text"))

# Fragments removed outright, in this order:
#   URLs ("http" plus the following non-space run), @mentions, '#' signs,
#   and finally anything that is not a lowercase letter, digit or whitespace.
for pattern in ("http\\S+", "@\\w+", "#", r"[^a-z0-9\s]"):
    text_expr = regexp_replace(text_expr, pattern, "")

# Collapse whitespace runs to a single space and strip both ends.
text_expr = trim(regexp_replace(text_expr, r"\s+", " "))

silver_df = bronze_df.withColumn("clean_text", text_expr)


In [0]:
# Extract the hashtags of each tweet from the raw "text" column as an array of
# lower-cased tag names (without the leading '#').
#
#   1. Split the text on single spaces and keep tokens that start with '#'.
#   2. From each token keep only the leading run of [A-Za-z0-9_] after the '#',
#      so punctuation glued to the tag is dropped ("#CVE," -> "cve") instead of
#      leaking into the value. This is the same ASCII character set that
#      clean_username keeps.
#   3. Drop tokens that contained no tag characters at all (e.g. a bare "#"),
#      which would otherwise show up as empty strings.
#
# Duplicate tags are kept on purpose-neutral grounds: the array mirrors the
# text, one element per occurrence. A null text yields a null array.
silver_df = silver_df.withColumn(
    "hashtags_array",
    expr(
        """
        filter(
            transform(
                filter(split(text, ' '), tok -> tok like '#%'),
                tok -> lower(regexp_extract(tok, '^#([A-Za-z0-9_]+)', 1))
            ),
            tag -> tag <> ''
        )
        """
    ),
)


In [0]:
# Extract the @mentions of each tweet from the raw "text" column as an array of
# lower-cased handles (without the leading '@').
#
#   1. Split the text on single spaces and keep tokens that start with '@'.
#   2. From each token keep only the leading run of [A-Za-z0-9_] after the '@',
#      so punctuation glued to the handle is dropped ("@ogray." -> "ogray").
#      This is the same character set clean_username keeps, which makes the
#      array elements directly comparable with that column.
#   3. Drop tokens that contained no handle characters at all (e.g. a bare
#      "@"), which would otherwise show up as empty strings.
#
# One element is produced per occurrence in the text. A null text yields a
# null array.
silver_df = silver_df.withColumn(
    "mentions_array",
    expr(
        """
        filter(
            transform(
                filter(split(text, ' '), tok -> tok like '@%'),
                tok -> lower(regexp_extract(tok, '^@([A-Za-z0-9_]+)', 1))
            ),
            handle -> handle <> ''
        )
        """
    ),
)


In [0]:
# Normalise the author handle: lower-case it, then drop every character that
# is not a-z, 0-9 or underscore (this removes e.g. the leading '@').
username_expr = regexp_replace(lower(col("username")), r"[^a-z0-9_]", "")

# The final trim matches the other clean_* columns.
silver_df = silver_df.withColumn("clean_username", trim(username_expr))


In [0]:
# Normalise the posting client name ("source"), reading the expression from
# the inside out:
#   lower-case -> remove everything except a-z, 0-9 and whitespace
#   -> collapse whitespace runs to one space -> strip both ends.
silver_df = silver_df.withColumn(
    "clean_source",
    trim(
        regexp_replace(
            regexp_replace(lower(col("source")), r"[^a-z0-9\s]", ""),
            r"\s+",
            " ",
        )
    ),
)


In [0]:
# Normalise the free-text user location, reading the expression from the
# inside out:
#   lower-case -> remove everything except a-z, 0-9 and whitespace
#   -> collapse whitespace runs to one space -> strip both ends.
silver_df = silver_df.withColumn(
    "clean_user_location",
    trim(
        regexp_replace(
            regexp_replace(lower(col("user_location")), r"[^a-z0-9\s]", ""),
            r"\s+",
            " ",
        )
    ),
)


In [0]:
# Parse the day-first "created_at" string (e.g. "11-01-2025 18:16") into a
# timestamp column, leaving the original string column untouched.
created_at_ts = to_timestamp("created_at", "dd-MM-yyyy HH:mm")
silver_df = silver_df.withColumn("created_at_ts", created_at_ts)


In [0]:

# Engagement counters that should be stored as integers.
numeric_cols = [
    "retweet_count",
    "like_count",
    "reply_count",
    "quote_count",
    "impression_count",
]

# Cast in place only the counters that actually exist in the DataFrame;
# names missing from the schema are skipped rather than raising.
for count_col in [name for name in numeric_cols if name in silver_df.columns]:
    silver_df = silver_df.withColumn(count_col, col(count_col).cast(IntegerType()))


In [0]:
# Persist the silver DataFrame as a Delta table, replacing whatever data is
# already stored at the target path.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/sentiment_analysis/default/sentiment_analysis/silver_layerrr")
)


In [0]:
# Render the silver DataFrame in the notebook output so the derived columns
# (clean_*, *_array, created_at_ts) can be inspected next to the raw ones.
silver_df.display()

id,text,created_at,username,user_id,language,retweet_count,like_count,reply_count,quote_count,impression_count,hashtags,mentions,source,is_retweet,is_reply,in_reply_to_user_id,conversation_id,user_followers_count,user_following_count,user_verified,user_location,possibly_sensitive,ingestion_time,clean_text,hashtags_array,mentions_array,clean_username,clean_source,clean_user_location,created_at_ts
1.95e+18,Agent every development say quality throughout beautiful. #DataBreach,11-01-2025 18:16,@fjohnson,958070516,en,4,5,2,0,43,#DataBreach,,Twitter for iPhone,False,False,,6.99e+18,59,1634,False,East William,False,2025-12-20T08:51:31.076Z,agent every development say quality throughout beautiful databreach,List(databreach),List(),fjohnson,twitter for iphone,east william,2025-01-11T18:16:00.000Z
3.56e+18,Night respond red information last everything. #CVE @blakeerik,07-09-2024 15:01,@jpeterson,819895579,en,2,10,2,0,44,#CVE,@blakeerik,TweetDeck,False,False,,2.15e+18,12401,1005,False,Port Matthew,False,2025-12-20T08:51:31.076Z,night respond red information last everything cve,List(cve),List(blakeerik),jpeterson,tweetdeck,port matthew,2024-09-07T15:01:00.000Z
3.67e+18,Here grow gas enough analysis least by. #InfoSec #CyberSecurity #MFA,27-03-2025 10:09,@smiller,918490409,en,2,13,0,0,46,"#InfoSec, #CyberSecurity, #MFA",,TweetDeck,False,False,,9.64e+18,9514,1995,False,Barbaraland,False,2025-12-20T08:51:31.076Z,here grow gas enough analysis least by infosec cybersecurity mfa,"List(infosec, cybersecurity, mfa)",List(),smiller,tweetdeck,barbaraland,2025-03-27T10:09:00.000Z
7.31e+18,Product significant world talk term herself. Player half have decide environment view possible. #MFA #CVE @amandasanchez @ogray,18-10-2024 11:26,@gabriellecameron,825070419,en,3,4,0,0,55,"#MFA, #CVE","@amandasanchez, @ogray",Android,False,True,,7.62e+18,1329,428,False,East Lydiamouth,False,2025-12-20T08:51:31.076Z,product significant world talk term herself player half have decide environment view possible mfa cve,"List(mfa, cve)","List(amandasanchez, ogray)",gabriellecameron,android,east lydiamouth,2024-10-18T11:26:00.000Z
4.95e+18,Environment decision wall then fire pretty how trip learn enter east. Much section investment on gun young catch. #SOC #SOC #Phishing @ddavis @hernandezernest,06-03-2025 06:33,@ycarlson,428953029,en,1,6,3,0,55,"#SOC, #SOC, #Phishing","@ddavis, @hernandezernest",Twitter Web App,False,False,171879360.0,7.28e+18,28089,920,False,Carlsonmouth,False,2025-12-20T08:51:31.076Z,environment decision wall then fire pretty how trip learn enter east much section investment on gun young catch soc soc phishing,"List(soc, soc, phishing)","List(ddavis, hernandezernest)",ycarlson,twitter web app,carlsonmouth,2025-03-06T06:33:00.000Z
7.29e+18,Edge network wall quite boy those seem shoulder future fall citizen about. #MFA @teresa28 @harrellkenneth,03-03-2025 02:51,@ericfarmer,806348900,en,0,8,1,0,65,#MFA,"@teresa28, @harrellkenneth",Twitter for iPhone,False,False,,9.96e+18,4950,1316,False,New Mariotown,False,2025-12-20T08:51:31.076Z,edge network wall quite boy those seem shoulder future fall citizen about mfa,List(mfa),"List(teresa28, harrellkenneth)",ericfarmer,twitter for iphone,new mariotown,2025-03-03T02:51:00.000Z
1.01e+18,Patch for Credential Stuffing vulnerability released. Upon these story film. #SOC @allenashley @millertodd,23-12-2024 22:40,@spenceamanda,524736385,en,6,10,1,0,40,#SOC,"@allenashley, @millertodd",TweetDeck,False,True,,1.79e+18,2340,1681,False,,False,2025-12-20T08:51:31.076Z,patch for credential stuffing vulnerability released upon these story film soc,List(soc),"List(allenashley, millertodd)",spenceamanda,tweetdeck,,2024-12-23T22:40:00.000Z
9.89e+18,Campaign little near enter their institution deep. #Hacking #Phishing #SOC @jenniferross @samuel87,21-09-2024 06:07,@wrightcaleb,579164766,en,2,9,0,2,49,"#Hacking, #Phishing, #SOC","@jenniferross, @samuel87",Android,False,False,,5.16e+18,5464,296,False,New Angelashire,False,2025-12-20T08:51:31.076Z,campaign little near enter their institution deep hacking phishing soc,"List(hacking, phishing, soc)","List(jenniferross, samuel87)",wrightcaleb,android,new angelashire,2024-09-21T06:07:00.000Z
3.11e+18,According remain arrive attack all form method everything. Democrat car very number line six space. #CVE @clintonhopkins @rodney70,17-08-2024 07:27,@brownjessica,33829406,es,4,7,2,1,53,#CVE,"@clintonhopkins, @rodney70",Android,False,True,,6.31e+18,750,387,False,,False,2025-12-20T08:51:31.076Z,according remain arrive attack all form method everything democrat car very number line six space cve,List(cve),"List(clintonhopkins, rodney70)",brownjessica,android,,2024-08-17T07:27:00.000Z
1.56e+18,Backup systems engaged after brute force. Eat couple large instead. #CyberSecurity #MFA #MFA @steven17 @williamsyvette,06-11-2024 10:25,@novaksara,364423393,es,4,10,0,3,60,"#CyberSecurity, #MFA, #MFA","@steven17, @williamsyvette",Twitter for iPhone,False,False,,1.7e+18,30482,1698,False,,False,2025-12-20T08:51:31.076Z,backup systems engaged after brute force eat couple large instead cybersecurity mfa mfa,"List(cybersecurity, mfa, mfa)","List(steven17, williamsyvette)",novaksara,twitter for iphone,,2024-11-06T10:25:00.000Z
