In [0]:
# Load the raw tweets table that every later cell transforms.
df = spark.read.table("workspace.default.twitter_raw")


Remove rows whose `text` column is null or empty, then build a cleaned `clean_text` column

In [0]:
from pyspark.sql.functions import col, lower, regexp_replace, trim
# Drop rows with no usable tweet text, then derive a normalised `clean_text`
# column: lower-cased, with URLs, @mentions and punctuation (except '#')
# removed, and whitespace collapsed and trimmed.
text_col = "text"
df = df.filter((col(text_col).isNotNull()) & (trim(col(text_col)) != ""))
df = df.withColumn("clean_text", lower(col(text_col)))
# The dot after "www" is escaped so that only a literal "www." prefix is treated
# as a URL; unescaped, it matches any character and would also delete ordinary
# words that merely start with "www".
df = df.withColumn("clean_text", regexp_replace("clean_text", r"http\S+|www\.\S+", ""))  # remove URLs
df = df.withColumn("clean_text", regexp_replace("clean_text", r"@\w+", ""))             # remove mentions
# NOTE(review): only ASCII a-z / 0-9 survive this step, so accented letters are
# replaced by spaces — confirm that is intended for non-English tweets.
df = df.withColumn("clean_text", regexp_replace("clean_text", r"[^a-z0-9#\s]", " "))   # remove punctuation except #
df = df.withColumn("clean_text", regexp_replace("clean_text", r"\s+", " "))             # collapse spaces
df = df.withColumn("clean_text", trim(col("clean_text")))


Cleaning the hashtags

In [0]:
from pyspark.sql.functions import split, array, expr, when, regexp_replace

# Build `hashtags_array` from the raw `hashtags` string: every character that is
# neither a word character nor '#' becomes a space, and the result is split on
# single spaces. Rows with a null `hashtags` value get an empty array.
df = df.withColumn(
    "hashtags_array",
    when(
        col("hashtags").isNotNull(),
        split(
            regexp_replace(col("hashtags"), r"[^\w#]", " "),
            " ",
        ),
    ).otherwise(array()),
)

# Drop the empty strings left by consecutive separators, then lowercase
# every remaining element.
df = df.withColumn(
    "hashtags_array",
    expr("transform(filter(hashtags_array, x -> x != ''), x -> lower(x))"),
)


Cleaning the mentions_array


In [0]:
# Build `mentions_array` from the raw `mentions` string: every character that is
# neither a word character nor '@' becomes a space, and the result is split on
# single spaces. Rows with a null `mentions` value get an empty array.
df = df.withColumn(
    "mentions_array",
    when(
        col("mentions").isNotNull(),
        split(
            regexp_replace(col("mentions"), r"[^\w@]", " "),
            " ",
        ),
    ).otherwise(array()),
)

# Drop the empty strings left by consecutive separators, then lowercase
# every remaining element.
df = df.withColumn(
    "mentions_array",
    expr("transform(filter(mentions_array, x -> x != ''), x -> lower(x))"),
)


In [0]:
# `clean_username`: lower-cased `username` with every character outside
# a-z, 0-9 and '_' removed (this strips a leading '@'), then trimmed.
df = df.withColumn(
    "clean_username",
    trim(regexp_replace(lower(col("username")), r"[^a-z0-9_]", "")),
)


In [0]:
# `clean_source`: lower-cased `source` keeping only a-z, 0-9 and whitespace,
# with runs of whitespace collapsed to one space and the ends trimmed.
df = df.withColumn(
    "clean_source",
    trim(
        regexp_replace(
            regexp_replace(lower(col("source")), r"[^a-z0-9\s]", ""),
            r"\s+",
            " ",
        )
    ),
)


In [0]:
# `clean_user_location`: lower-cased `user_location` keeping only a-z, 0-9 and
# whitespace, with runs of whitespace collapsed to one space and the ends trimmed.
df = df.withColumn(
    "clean_user_location",
    trim(
        regexp_replace(
            regexp_replace(lower(col("user_location")), r"[^a-z0-9\s]", ""),
            r"\s+",
            " ",
        )
    ),
)


In [0]:
from pyspark.sql.functions import to_timestamp

# Parse the `created_at` string (day-month-year hour:minute) into a timestamp column.
df = df.withColumn(
    "created_at_ts",
    to_timestamp("created_at", "dd-MM-yyyy HH:mm"),
)


In [0]:
from pyspark.sql.types import IntegerType, DoubleType

# Cast the engagement counters to integers; a counter that is not present in
# the frame is skipped rather than raising.
numeric_cols = ["retweet_count", "like_count", "reply_count", "quote_count", "impression_count"]
for c in numeric_cols:
    if c not in df.columns:
        continue
    df = df.withColumn(c, col(c).cast(IntegerType()))

# The two score columns are cast to double unconditionally (no presence check,
# unlike the counters in the loop).
df = (
    df.withColumn("sentiment_score", col("sentiment_score").cast(DoubleType()))
      .withColumn("subjectivity", col("subjectivity").cast(DoubleType()))
)


In [0]:
# Cast the flag columns to boolean; only the flags that actually exist in the
# frame are touched.
boolean_cols = ["is_retweet", "is_reply", "user_verified", "possibly_sensitive"]
for c in [name for name in boolean_cols if name in df.columns]:
    df = df.withColumn(c, col(c).cast("boolean"))


In [0]:
from pyspark.ml.feature import StringIndexer

# Add a numeric `<column>_index` column for each categorical label column that
# is present, using a StringIndexer fitted on the current frame.
categorical_cols = ["sentiment", "emotion"]
for c in categorical_cols:
    if c not in df.columns:
        continue
    indexer = StringIndexer(inputCol=c, outputCol=f"{c}_index")
    df = indexer.fit(df).transform(df)


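Extraction of hashtags: words in `clean_text` beginning with `#`. This replaces the `hashtags_array` column built earlier from the `hashtags` column.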

In [0]:
from pyspark.sql.functions import regexp_extract_all, expr


# Re-derive `hashtags_array` from the cleaned tweet text: split `clean_text` on
# single spaces and keep the tokens that start with '#'. This overwrites any
# existing `hashtags_array` column on df.
# The LIKE pattern '#_%' requires at least one character after the '#', so a
# stray "#" token on its own is not kept as a hashtag.
df = df.withColumn("hashtags_array", expr("filter(split(clean_text, ' '), x -> x like '#_%')"))

# Preview a few rows to sanity-check the extraction.
display(df.select("clean_text", "hashtags_array").limit(10))


clean_text,hashtags_array
agent every development say quality throughout beautiful #databreach,List(#databreach)
night respond red information last everything #cve,List(#cve)
here grow gas enough analysis least by #infosec #cybersecurity #mfa,"List(#infosec, #cybersecurity, #mfa)"
product significant world talk term herself player half have decide environment view possible #mfa #cve,"List(#mfa, #cve)"
environment decision wall then fire pretty how trip learn enter east much section investment on gun young catch #soc #soc #phishing,"List(#soc, #soc, #phishing)"
edge network wall quite boy those seem shoulder future fall citizen about #mfa,List(#mfa)
patch for credential stuffing vulnerability released upon these story film #soc,List(#soc)
campaign little near enter their institution deep #hacking #phishing #soc,"List(#hacking, #phishing, #soc)"
according remain arrive attack all form method everything democrat car very number line six space #cve,List(#cve)
backup systems engaged after brute force eat couple large instead #cybersecurity #mfa #mfa,"List(#cybersecurity, #mfa, #mfa)"


Write the cleaned data to the silver table

In [0]:

# Persist the cleaned frame as a Delta table, replacing any existing contents.
df.write.saveAsTable(
    "workspace.default.twitter_clean",
    format="delta",
    mode="overwrite",
)
print("Silver table written: workspace.default.twitter_clean")


Silver table written: workspace.default.twitter_clean


In [0]:
%sql
-- Read back the silver table written above to inspect the cleaned columns.
SELECT *
FROM workspace.default.twitter_clean


id,text,cleaned_text,created_at,username,user_id,language,retweet_count,like_count,reply_count,quote_count,impression_count,hashtags,mentions,source,is_retweet,is_reply,in_reply_to_user_id,conversation_id,user_followers_count,user_following_count,user_verified,user_location,possibly_sensitive,sentiment,sentiment_score,subjectivity,emotion,_ingest_timestamp,_source,clean_text,hashtags_array,mentions_array,clean_username,clean_source,clean_user_location,created_at_ts,sentiment_index,emotion_index
1.94535e+18,Agent every development say quality throughout beautiful. #DataBreach,Agent every development say quality throughout beautiful.,11-01-2025 18:16,@fjohnson,958070516,en,4,5,2,0,43,#DataBreach,,Twitter for iPhone,False,False,,6.99413e+18,59,1634,False,East William,False,positive,0.85,1.0,surprise,2025-12-11T11:39:14.586Z,twitter_sentiment_dataset.csv,agent every development say quality throughout beautiful #databreach,List(#databreach),List(),fjohnson,twitter for iphone,east william,2025-01-11T18:16:00.000Z,1.0,5.0
3.56291e+18,Night respond red information last everything. #CVE @blakeerik,Night respond red information last everything.,07-09-2024 15:01,@jpeterson,819895579,en,2,10,2,0,44,#CVE,@blakeerik,TweetDeck,False,False,,2.15132e+18,12401,1005,False,Port Matthew,False,neutral,0.0,0.033,neutral,2025-12-11T11:39:14.586Z,twitter_sentiment_dataset.csv,night respond red information last everything #cve,List(#cve),List(@blakeerik),jpeterson,tweetdeck,port matthew,2024-09-07T15:01:00.000Z,0.0,3.0
3.66913e+18,Here grow gas enough analysis least by. #InfoSec #CyberSecurity #MFA,Here grow gas enough analysis least by.,27-03-2025 10:09,@smiller,918490409,en,2,13,0,0,46,"#InfoSec, #CyberSecurity, #MFA",,TweetDeck,False,False,,9.63911e+18,9514,1995,False,Barbaraland,False,negative,-0.15,0.45,sadness,2025-12-11T11:39:14.586Z,twitter_sentiment_dataset.csv,here grow gas enough analysis least by #infosec #cybersecurity #mfa,"List(#infosec, #cybersecurity, #mfa)",List(),smiller,tweetdeck,barbaraland,2025-03-27T10:09:00.000Z,2.0,1.0
7.31445e+18,Product significant world talk term herself. Player half have decide environment view possible. #MFA #CVE @amandasanchez @ogray,Product significant world talk term herself. Player half have decide environment view possible.,18-10-2024 11:26,@gabriellecameron,825070419,en,3,4,0,0,55,"#MFA, #CVE","@amandasanchez, @ogray",Android,False,True,,7.62159e+18,1329,428,False,East Lydiamouth,False,neutral,0.069,0.681,neutral,2025-12-11T11:39:14.586Z,twitter_sentiment_dataset.csv,product significant world talk term herself player half have decide environment view possible #mfa #cve,"List(#mfa, #cve)","List(@amandasanchez, @ogray)",gabriellecameron,android,east lydiamouth,2024-10-18T11:26:00.000Z,0.0,3.0
4.9516e+18,Environment decision wall then fire pretty how trip learn enter east. Much section investment on gun young catch. #SOC #SOC #Phishing @ddavis @hernandezernest,Environment decision wall then fire pretty how trip learn enter east. Much section investment on gun young catch.,06-03-2025 06:33,@ycarlson,428953029,en,1,6,3,0,55,"#SOC, #SOC, #Phishing","@ddavis, @hernandezernest",Twitter Web App,False,False,171879360.0,7.27633e+18,28089,920,False,Carlsonmouth,False,positive,0.183,0.533,joy,2025-12-11T11:39:14.586Z,twitter_sentiment_dataset.csv,environment decision wall then fire pretty how trip learn enter east much section investment on gun young catch #soc #soc #phishing,"List(#soc, #soc, #phishing)","List(@ddavis, @hernandezernest)",ycarlson,twitter web app,carlsonmouth,2025-03-06T06:33:00.000Z,1.0,0.0
7.28767e+18,Edge network wall quite boy those seem shoulder future fall citizen about. #MFA @teresa28 @harrellkenneth,Edge network wall quite boy those seem shoulder future fall citizen about.,03-03-2025 02:51,@ericfarmer,806348900,en,0,8,1,0,65,#MFA,"@teresa28, @harrellkenneth",Twitter for iPhone,False,False,,9.96409e+18,4950,1316,False,New Mariotown,False,neutral,0.0,0.125,neutral,2025-12-11T11:39:14.586Z,twitter_sentiment_dataset.csv,edge network wall quite boy those seem shoulder future fall citizen about #mfa,List(#mfa),"List(@teresa28, @harrellkenneth)",ericfarmer,twitter for iphone,new mariotown,2025-03-03T02:51:00.000Z,0.0,3.0
1.00527e+18,Patch for Credential Stuffing vulnerability released. Upon these story film. #SOC @allenashley @millertodd,Patch for Credential Stuffing vulnerability released. Upon these story film.,23-12-2024 22:40,@spenceamanda,524736385,en,6,10,1,0,40,#SOC,"@allenashley, @millertodd",TweetDeck,False,True,,1.79001e+18,2340,1681,False,,False,neutral,0.0,0.0,sadness,2025-12-11T11:39:14.586Z,twitter_sentiment_dataset.csv,patch for credential stuffing vulnerability released upon these story film #soc,List(#soc),"List(@allenashley, @millertodd)",spenceamanda,tweetdeck,,2024-12-23T22:40:00.000Z,0.0,1.0
9.89468e+18,Campaign little near enter their institution deep. #Hacking #Phishing #SOC @jenniferross @samuel87,Campaign little near enter their institution deep.,21-09-2024 06:07,@wrightcaleb,579164766,en,2,9,0,2,49,"#Hacking, #Phishing, #SOC","@jenniferross, @samuel87",Android,False,False,,5.16422e+18,5464,296,False,New Angelashire,False,neutral,-0.029,0.433,sadness,2025-12-11T11:39:14.586Z,twitter_sentiment_dataset.csv,campaign little near enter their institution deep #hacking #phishing #soc,"List(#hacking, #phishing, #soc)","List(@jenniferross, @samuel87)",wrightcaleb,android,new angelashire,2024-09-21T06:07:00.000Z,0.0,1.0
3.11162e+18,According remain arrive attack all form method everything. Democrat car very number line six space. #CVE @clintonhopkins @rodney70,According remain arrive attack all form method everything. Democrat car very number line six space.,17-08-2024 07:27,@brownjessica,33829406,es,4,7,2,1,53,#CVE,"@clintonhopkins, @rodney70",Android,False,True,,6.31461e+18,750,387,False,,False,positive,0.2,0.3,anger,2025-12-11T11:39:14.586Z,twitter_sentiment_dataset.csv,according remain arrive attack all form method everything democrat car very number line six space #cve,List(#cve),"List(@clintonhopkins, @rodney70)",brownjessica,android,,2024-08-17T07:27:00.000Z,1.0,2.0
1.55905e+18,Backup systems engaged after brute force. Eat couple large instead. #CyberSecurity #MFA #MFA @steven17 @williamsyvette,Backup systems engaged after brute force. Eat couple large instead.,06-11-2024 10:25,@novaksara,364423393,es,4,10,0,3,60,"#CyberSecurity, #MFA, #MFA","@steven17, @williamsyvette",Twitter for iPhone,False,False,,1.6953e+18,30482,1698,False,,False,positive,0.214,0.429,fear,2025-12-11T11:39:14.586Z,twitter_sentiment_dataset.csv,backup systems engaged after brute force eat couple large instead #cybersecurity #mfa #mfa,"List(#cybersecurity, #mfa, #mfa)","List(@steven17, @williamsyvette)",novaksara,twitter for iphone,,2024-11-06T10:25:00.000Z,1.0,4.0
