In [8]:
import pandas as pd
import pickle
import os

In [12]:
# Directory that is listed further down to read one tweets CSV per company.
DATA_FOLDER = "data/tweets/ten_years"

We want to find the hashtags used by at least 10 distinct companies.

First, we load the list of all hashtags in the dataset. The list contains only distinct hashtags, so there are no repeats.

In [4]:
# Load the previously saved collection of hashtags from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this against a pickle file from a trusted source.
with open('hashtag_clustering/all_hashtags.pkl', 'rb') as f:
    all_hashtags = pickle.load(f)
    
# Number of entries that were loaded.
print(len(all_hashtags))

129439


Create a dictionary mapping each hashtag to an integer representing the number of companies that used that hashtag.

$\text{hashtag} \rightarrow \text{number of companies that used it}$

In [10]:
def get_unique_hashtags(df):
    """Return the distinct, lower-cased hashtags found in one company's DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "hashtags" column. Non-null entries are expected to be
        strings; presumably each one is a stringified Python set such as
        "{'EarthDay', 'STEM'}" (TODO confirm against the code that wrote the CSVs).

    Returns
    -------
    set of str
        Every hashtag that occurs at least once in ``df``, lower-cased.
        Rows whose "hashtags" value is null are ignored, and rows that hold an
        empty collection contribute nothing.
    """
    hashtags_series = df["hashtags"].dropna()

    hashtags_set = set()
    for tags in hashtags_series:
        # Remove the set-literal punctuation, then split on the ", " separator.
        cleaned = tags.replace("{", "").replace("}", "").replace("'", "")

        for tag in cleaned.split(", "):
            tag = tag.lower()
            # An empty collection ("{}", "" or str(set()) == "set()") would
            # otherwise be recorded as a bogus hashtag, so skip it.
            if tag and tag != "set()":
                hashtags_set.add(tag)

    return hashtags_set

In [14]:
# Start every known hashtag at zero; the loop below adds one per company that
# used it. Re-initialising here keeps the cell idempotent when it is re-run.
hashtag_company_usage_counts = {tag.lower(): 0 for tag in all_hashtags}

# os.listdir returns entries in arbitrary order, so sort for a reproducible log.
for comp_csv in sorted(os.listdir(DATA_FOLDER)):
    # Skip anything that is not a CSV (e.g. .DS_Store or .ipynb_checkpoints),
    # which would otherwise crash read_csv or be parsed as tweet data.
    if not comp_csv.endswith(".csv"):
        continue

    print(f"Updating hashtags {comp_csv}")
    df = pd.read_csv(os.path.join(DATA_FOLDER, comp_csv), lineterminator='\n')
    df_hashtags = get_unique_hashtags(df)

    # df_hashtags is a set, so each company contributes at most 1 per hashtag.
    for tag in df_hashtags:
        # .get() avoids a KeyError for a tag that is missing from all_hashtags.
        hashtag_company_usage_counts[tag] = hashtag_company_usage_counts.get(tag, 0) + 1

Updating hashtags pncbank_tweets.csv
Updating hashtags teradyneinc_tweets.csv
Updating hashtags bakerhughesco_tweets.csv
Updating hashtags ceridian_tweets.csv
Updating hashtags aiginsurance_tweets.csv
Updating hashtags expediamedia_tweets.csv
Updating hashtags costargroup_tweets.csv
Updating hashtags cboe_tweets.csv
Updating hashtags skyworksinc_tweets.csv
Updating hashtags bathbodyworks_tweets.csv
Updating hashtags raytheontech_tweets.csv
Updating hashtags kelloggcompany_tweets.csv
Updating hashtags truistnews_tweets.csv
Updating hashtags royalcaribbean_tweets.csv
Updating hashtags eastmanchemco_tweets.csv
Updating hashtags dominos_tweets.csv
Updating hashtags edwardslifesci_tweets.csv
Updating hashtags labcorp_tweets.csv
Updating hashtags allegionplc_tweets.csv
Updating hashtags kraftheinzco_tweets.csv
Updating hashtags evergypower_tweets.csv
Updating hashtags fbhs_news_tweets.csv
Updating hashtags ups_tweets.csv
Updating hashtags fti_us_tweets.csv
Updating hashtags guycarpenter_twee

In [22]:
# Keep only the hashtags whose company count is at least 10.
frequent_hashtags = {
    tag: company_count
    for tag, company_count in hashtag_company_usage_counts.items()
    if company_count >= 10
}

frequent_hashtags

{'100bestcc': 53,
 '100bestcos': 25,
 '100cei': 11,
 '100companiescare': 17,
 '100forhealthyfamilies': 14,
 '10yearchallenge': 12,
 '19thamendment': 12,
 '2021spglobalyearbook': 10,
 '3d': 49,
 '3dprinted': 11,
 '3dprinter': 10,
 '3dprinting': 61,
 '401k': 18,
 '40under40': 25,
 '4g': 13,
 '4ir': 17,
 '4thofjuly': 140,
 '50companiescare': 12,
 '5g': 86,
 '5k': 11,
 '811day': 26,
 '911day': 17,
 'aacr19': 12,
 'aapi': 79,
 'aapiheritagemonth': 105,
 'aapihm': 47,
 'abudhabi': 12,
 'aca': 12,
 'access': 12,
 'accessibility': 25,
 'accounting': 22,
 'acquisition': 46,
 'acquisitions': 22,
 'active': 14,
 'actonclimate': 25,
 'ad': 163,
 'ada': 11,
 'ada30': 15,
 'adalovelaceday': 15,
 'adas': 21,
 'additivemanufacturing': 16,
 'administrativeprofessionalsday': 20,
 'adminprofessionalsday': 13,
 'adobesummit': 11,
 'adoption': 15,
 'adventure': 17,
 'advertising': 24,
 'advice': 20,
 'advisors': 11,
 'advmfgexpo': 11,
 'advocacy': 15,
 'aerospace': 47,
 'aeroweek': 12,
 'affordable': 13,
 

In [23]:
# Number of hashtags that passed the >= 10 filter applied above.
len(frequent_hashtags)

3311

In [25]:
# List the (hashtag, count) pairs from the highest count to the lowest.
sorted(
    frequent_hashtags.items(),
    key=lambda tag_and_count: tag_and_count[1],
    reverse=True,
)

[('earthday', 382),
 ('internationalwomensday', 343),
 ('sustainability', 320),
 ('covid19', 319),
 ('blackhistorymonth', 306),
 ('veteransday', 302),
 ('innovation', 301),
 ('tbt', 288),
 ('pridemonth', 285),
 ('technology', 274),
 ('hispanicheritagemonth', 271),
 ('womenshistorymonth', 268),
 ('diversity', 259),
 ('icymi', 251),
 ('leadership', 238),
 ('pride', 238),
 ('stem', 238),
 ('memorialday', 233),
 ('givingtuesday', 232),
 ('thanksgiving', 222),
 ('halloween', 219),
 ('esg', 216),
 ('inclusion', 216),
 ('mlkday', 215),
 ('valentinesday', 214),
 ('tech', 212),
 ('ai', 206),
 ('iwd2021', 206),
 ('sustainable', 206),
 ('dyk', 205),
 ('juneteenth', 198),
 ('healthcare', 193),
 ('podcast', 193),
 ('data', 185),
 ('nationalinternday', 183),
 ('veterans', 183),
 ('breakthebias', 180),
 ('csr', 175),
 ('iwd2022', 172),
 ('energy', 171),
 ('dei', 168),
 ('mothersday', 165),
 ('climatechange', 164),
 ('cybersecurity', 164),
 ('lgbtq', 164),
 ('ad', 163),
 ('career', 163),
 ('coronaviru