In [0]:
import time
# Wall-clock timestamp taken before any other cell runs; the last cell
# subtracts it from a second time.time() reading to report total runtime.
start_time = time.time()


In [0]:
# Bronze layer: the raw tweets table, loaded unmodified as the pipeline input.
bronze_df = spark.table("default.twitter_raw")


In [0]:
from pyspark.sql.functions import col, lower, regexp_replace, trim

# Build the cleaned-text expression step by step, then attach it once.
# Each (pattern, replacement) pair is applied in order to the lowercased text:
# strip URLs, strip @mentions, drop '#', drop remaining non-alphanumerics,
# and finally collapse whitespace runs to a single space.
clean_text_col = lower(col("text"))
for pattern, replacement in [
    ("http\\S+", ""),
    ("@\\w+", ""),
    ("#", ""),
    (r"[^a-z0-9\s]", ""),
    (r"\s+", " "),
]:
    clean_text_col = regexp_replace(clean_text_col, pattern, replacement)

silver_df = bronze_df.withColumn("clean_text", trim(clean_text_col))


In [0]:
from pyspark.sql.functions import expr


# Two-pass derivation of `hashtags_array` from the raw `text` column:
#   1. split on single spaces and keep tokens that start with '#'
#   2. remove every '#' from each kept token and lowercase it
hashtag_tokens_sql = "filter(split(text, ' '), x -> x like '#%')"
hashtag_normalise_sql = "transform(hashtags_array, x -> lower(regexp_replace(x, '#', '')))"

silver_df = (
    silver_df
    .withColumn("hashtags_array", expr(hashtag_tokens_sql))
    .withColumn("hashtags_array", expr(hashtag_normalise_sql))
)


In [0]:
# Two-pass derivation of `mentions_array` from the raw `text` column:
#   1. split on single spaces and keep tokens that start with '@'
#   2. remove every '@' from each kept token and lowercase it
mention_tokens_sql = "filter(split(text, ' '), x -> x like '@%')"
mention_normalise_sql = "transform(mentions_array, x -> lower(regexp_replace(x, '@', '')))"

silver_df = (
    silver_df
    .withColumn("mentions_array", expr(mention_tokens_sql))
    .withColumn("mentions_array", expr(mention_normalise_sql))
)


In [0]:
# Normalised username: lowercase, keep only [a-z0-9_], then trim spaces.
username_lower = lower(col("username"))
username_safe = regexp_replace(username_lower, r"[^a-z0-9_]", "")

silver_df = silver_df.withColumn("clean_username", trim(username_safe))


In [0]:
# Normalised source: lowercase, drop everything except letters, digits and
# whitespace, collapse whitespace runs to one space, then trim.
silver_df = silver_df.withColumn(
    "clean_source",
    trim(
        regexp_replace(
            regexp_replace(lower(col("source")), r"[^a-z0-9\s]", ""),
            r"\s+",
            " ",
        )
    ),
)


In [0]:
# Normalised user location, built as named intermediate expressions:
# lowercase -> strip non-alphanumeric/non-whitespace -> collapse whitespace -> trim.
location_lower = lower(col("user_location"))
location_alnum = regexp_replace(location_lower, r"[^a-z0-9\s]", "")
location_single_spaced = regexp_replace(location_alnum, r"\s+", " ")

silver_df = silver_df.withColumn("clean_user_location", trim(location_single_spaced))


In [0]:
from pyspark.sql.functions import to_timestamp


# Parse `created_at` into a timestamp column using the day-month-year
# pattern below; the original `created_at` column is left in place.
created_at_parsed = to_timestamp(col("created_at"), "dd-MM-yyyy HH:mm")
silver_df = silver_df.withColumn("created_at_ts", created_at_parsed)


In [0]:
from pyspark.sql.functions import col

# Narrow the frame to the columns of interest.
#
# The projection reads from silver_df, not bronze_df: `clean_text` is derived
# on silver_df in an earlier cell and does not exist on the raw bronze frame,
# so selecting it from bronze_df raises AnalysisException. The recorded
# null-count output further down also lists no `sentiment_label` column, so
# requested columns that are absent are reported and skipped rather than
# aborting a Restart & Run All.
projection_cols = [
    "clean_text",
    "hashtags",          # keep hashtags
    "created_at",
    "sentiment_label",
]

missing_cols = [c for c in projection_cols if c not in silver_df.columns]
if missing_cols:
    print(f"Skipping columns not present in silver_df: {missing_cols}")

silver_df = silver_df.select(
    *[col(c) for c in projection_cols if c not in missing_cols]
)



In [0]:
from pyspark.sql.functions import col, lower, regexp_replace, trim

# Rebuild silver_df from the bronze frame with a single `clean_text` column:
#   1. lowercase the tweet text
#   2. delete URLs (http..., www...) and every character that is not a
#      letter or whitespace
#   3. collapse each whitespace run to one space
#   4. trim leading/trailing spaces
#
# Step 3 is required because deleting URLs and punctuation leaves double
# spaces inside the text, and Spark's `trim` strips only space characters,
# so tabs/newlines at the ends would otherwise survive. This matches the
# whitespace handling of the earlier clean_text cell.
#
# NOTE(review): starting again from bronze_df discards every column added to
# silver_df by the preceding cells (hashtags_array, mentions_array,
# clean_username, clean_source, clean_user_location, created_at_ts) — confirm
# that this is intended before relying on those columns downstream.
silver_df = (
    bronze_df
    .withColumn(
        "clean_text",
        trim(
            regexp_replace(
                regexp_replace(
                    lower(col("text")),
                    r"http\S+|www\S+|[^a-zA-Z\s]",
                    ""
                ),
                r"\s+",
                " "
            )
        )
    )
)




In [0]:
# Import the functions module under an alias instead of `import sum`:
# a bare `sum` from pyspark.sql.functions replaces Python's builtin `sum`
# for every later cell in the kernel session.
from pyspark.sql import functions as F

# One-row frame holding, for every column of silver_df, the number of rows
# in which that column is NULL (1 per null row, 0 otherwise, summed).
null_count_df = silver_df.select([
    F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in silver_df.columns
])

null_count_df.display()


id,text,created_at,username,user_id,language,retweet_count,like_count,reply_count,quote_count,impression_count,hashtags,mentions,source,is_retweet,is_reply,in_reply_to_user_id,conversation_id,user_followers_count,user_following_count,user_verified,user_location,possibly_sensitive,ingestion_time,clean_text
0,0,0,0,0,0,0,0,0,0,0,0,167374,0,0,0,402642,0,0,0,0,150971,0,0,0


In [0]:
from pyspark.sql.functions import col

# Replace NULL `mentions` with an empty string; other columns are untouched.
mentions_default = {"mentions": ""}
silver_df = silver_df.na.fill(mentions_default)


In [0]:
# Replace NULL `in_reply_to_user_id` with 0; other columns are untouched.
reply_id_default = {"in_reply_to_user_id": 0}
silver_df = silver_df.na.fill(reply_id_default)




In [0]:
# Import the functions module under an alias instead of `import sum`:
# a bare `sum` from pyspark.sql.functions replaces Python's builtin `sum`
# for every later cell in the kernel session.
from pyspark.sql import functions as F

# Re-check per-column NULL counts after the fills in the preceding cells:
# each output column holds the number of rows where that column is NULL.
null_count_df = silver_df.select([
    F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in silver_df.columns
])

null_count_df.display()


id,text,created_at,username,user_id,language,retweet_count,like_count,reply_count,quote_count,impression_count,hashtags,mentions,source,is_retweet,is_reply,in_reply_to_user_id,conversation_id,user_followers_count,user_following_count,user_verified,user_location,possibly_sensitive,ingestion_time,clean_text
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,150971,0,0,0


In [0]:
from pyspark.sql.functions import current_timestamp, col, when

# Backfill `ingestion_time`: rows where it is NULL receive the current
# timestamp, all other rows keep their existing value.
ingestion_col = col("ingestion_time")
ingestion_backfilled = (
    when(ingestion_col.isNull(), current_timestamp())
    .otherwise(ingestion_col)
)

silver_df = silver_df.withColumn("ingestion_time", ingestion_backfilled)


In [0]:
from pyspark.sql.functions import col

# Replace NULL `user_location` with the literal "Unknown".
location_default = {"user_location": "Unknown"}
silver_df = silver_df.na.fill(location_default)


In [0]:
# Import the functions module under an alias instead of `import sum`:
# a bare `sum` from pyspark.sql.functions replaces Python's builtin `sum`
# for every later cell in the kernel session.
from pyspark.sql import functions as F

# Final per-column NULL count before the table is written: each output
# column holds the number of rows where that column is NULL.
null_count_df = silver_df.select([
    F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in silver_df.columns
])

null_count_df.display()


id,text,created_at,username,user_id,language,retweet_count,like_count,reply_count,quote_count,impression_count,hashtags,mentions,source,is_retweet,is_reply,in_reply_to_user_id,conversation_id,user_followers_count,user_following_count,user_verified,user_location,possibly_sensitive,ingestion_time,clean_text
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
# Persist the silver frame as a Delta table, replacing both the existing
# data and the existing schema of default.twitter_clean.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("default.twitter_clean")
)


In [0]:
# Report the wall-clock time elapsed since the first cell, in minutes.
end_time = time.time()
elapsed_minutes = (end_time - start_time) / 60
print(f"Total notebook runtime: {elapsed_minutes:.2f} minutes")


Total notebook runtime: 2.23 minutes
