### Hashtag Trend Analysis




In [0]:
from pyspark.sql.functions import (
    col, split, regexp_replace, explode, to_date, count
)
df = df.withColumn(
    "hashtags_array",
    split(col("hashtags_array"), ",")
)
df_tags = df.select(
    explode(col("hashtags_array")).alias("hashtag"),
    col("created_at")
)

df_tags = df_tags.filter(col("hashtag").isNotNull())
df_tags = df_tags.withColumn("date", to_date(col("created_at")))
top_hashtags = df_tags.groupBy("hashtag") \
                      .agg(count("*").alias("count")) \
                      .orderBy(col("count").desc())

top_hashtags.write.format("delta").mode("overwrite") \
                  .saveAsTable("default.gold_hashtag_trends")

display(top_hashtags.limit(20))


hashtag,count
"""[""""#zeroday""""",175
"""[""""#threatintel""""",164
"""[""""#firewall""""",158
"""[""""#infosec""""",155
"""[""""#phishing""""",150
"""[""""#ransomware""""",149
"""[""""#ddos""""",149
"""[""""#databreach""""",148
"""[""""#cybersecurity""""",147
"""[""""#soc""""",146


In [0]:
from pyspark.sql.functions import to_timestamp, to_date, col, count

df_daily = (
    df.withColumn(
        "date",
        to_date(to_timestamp(col("created_at"), "dd-MM-yyyy HH:mm"))
    )
    .groupBy("date", "sentiment")
    .agg(count("*").alias("count"))
    .orderBy("date")
)

df_daily.write.format("delta").mode("overwrite") \
      .saveAsTable("workspace.default.gold_daily_sentiment")

display(df_daily)


date,sentiment,count
2024-08-01,negative,1
2024-08-01,neutral,7
2024-08-01,positive,3
2024-08-02,positive,5
2024-08-02,neutral,5
2024-08-03,neutral,4
2024-08-03,positive,4
2024-08-03,negative,2
2024-08-04,positive,2
2024-08-04,neutral,7


### Tweet count per date


In [0]:
from pyspark.sql.functions import col, count

top_users = df.groupBy("username") \
              .agg(count("*").alias("tweet_count")) \
              .orderBy(col("tweet_count").desc())

top_users.write.format("delta").mode("overwrite") \
               .saveAsTable("workspace.default.gold_top_users")

display(top_users.limit(20))


username,tweet_count
@davidharris,3
@ndavis,3
@fgarcia,3
@smithlisa,2
@hsmith,2
@bsmith,2
@zbrown,2
@yburton,2
@jessica14,2
@frogers,2


 **Hashtag count per date**

In [0]:
from pyspark.sql.functions import to_timestamp, to_date, col, count
df_tags = df_tags.withColumn(
    "date", 
    to_date(to_timestamp(col("created_at"), "dd-MM-yyyy HH:mm"))
)
df_tags_by_date = (
    df_tags.groupBy("date", "hashtag")
           .agg(count("*").alias("count"))
           .orderBy("date", col("count").desc())
)
df_tags_by_date.write.format("delta").mode("overwrite") \
                     .saveAsTable("workspace.default.gold_daily_hashtags")

display(df_tags_by_date.limit(20))


date,hashtag,count
2024-08-01,"""[""""#cve""""",2
2024-08-01,"""[""""#ransomware""""",2
2024-08-01,"""[""""#soc""""]""",2
2024-08-01,"""[""""#threatintel""""]""",1
2024-08-01,"""[""""#infosec""""",1
2024-08-01,"""[""""#firewall""""]""",1
2024-08-01,"""[""""#phishing""""",1
2024-08-01,"""[""""#cybersecurity""""]""",1
2024-08-02,"""[""""#cybersecurity""""]""",1
2024-08-02,"""[""""#mfa""""]""",1


**Sentiment Percentage**

In [0]:
from pyspark.sql.functions import count, col

total_count = df.count()

gold_sentiment_percent = df.groupBy("sentiment") \
    .agg((count("*") / total_count * 100).alias("percentage")) \
    .orderBy(col("percentage").desc())

gold_sentiment_percent.write.format("delta").mode("overwrite") \
    .saveAsTable("default.gold_sentiment_percent")

display(gold_sentiment_percent)


sentiment,percentage
neutral,52.188782489740085
positive,39.46648426812585
negative,8.344733242134064


**DASHBOARD**


**Dashboard for text** 

In [0]:
%sql
SELECT sentiment, percentage
FROM default.gold_sentiment_percent


sentiment,percentage
neutral,52.188782489740085
positive,39.46648426812585
negative,8.344733242134064


Databricks visualization. Run in Databricks to view.

**Trending Hashtags:**

In [0]:
%sql
SELECT hashtag, count
FROM default.gold_hashtag_trends
ORDER BY count DESC
LIMIT 20


hashtag,count
"""[""""#zeroday""""",175
"""[""""#threatintel""""",164
"""[""""#firewall""""",158
"""[""""#infosec""""",155
"""[""""#phishing""""",150
"""[""""#ddos""""",149
"""[""""#ransomware""""",149
"""[""""#databreach""""",148
"""[""""#cybersecurity""""",147
"""[""""#soc""""",146


Databricks visualization. Run in Databricks to view.

Hashtag trend over time:

**Hashtag trend over time:**

In [0]:
%sql
SELECT username, tweet_count
FROM default.gold_top_users
ORDER BY tweet_count DESC
LIMIT 20;


username,tweet_count
@davidharris,3
@fgarcia,3
@ndavis,3
@karen31,2
@yburton,2
@michael79,2
@smithlisa,2
@michaeljones,2
@lrodriguez,2
@nbrown,2


Databricks visualization. Run in Databricks to view.