This script collects summary statistics about tweets from selected consumer-facing companies and selected firm-facing companies. The goal was to see whether the two groups tweet differently, for example in the hashtags they use, the length of their tweets, and the age of their accounts. Note that this was originally run on the server, where all tweet data is accessible.

The outputs of this script are `consumer_facing_companies.csv` and `firm_facing_companies.csv`

In [None]:
import pandas as pd
import tweepy
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
from twitter_api_xanda import TWITTER_API_BEARER

In [None]:
# Directory holding the per-account tweet CSVs, which are read below as
# `<handle>_tweets.csv`. The trailing slash matters: paths are built by plain
# string concatenation.
TEN_YEARS_DATA_PATH = "data/tweets/ten_years/"

### Tweet Info

Fields we want for each specified user:
- age of account
- avg hashtags per tweet
- number of unique hashtags
- most popular hashtags (top 5)
- average tweet length

In [None]:
# Twitter API client authenticated with the bearer token imported above.
client = tweepy.Client(bearer_token=TWITTER_API_BEARER)

# User fields requested for each account. `created_at` is presumably the one
# the account-age statistic needs; the rest are requested alongside it.
# (A shorter ['created_at', 'username'] list used to be assigned here first,
# but it was overwritten immediately and never used.)
USER_FIELDS = [
    'created_at', 'description', 'entities', 'id', 'location', 'name',
    'pinned_tweet_id', 'profile_image_url', 'protected', 'public_metrics',
    'url', 'username', 'verified', 'withheld',
]
usernames = ["google", "netflix"]
# NOTE: the author observed that this lookup does not work; the cause has not
# been diagnosed yet.
user_objs = client.get_users(usernames=usernames, user_fields=USER_FIELDS)

Average length per tweet

In [None]:
# Load the Netflix tweet archive, record each tweet's character count in a
# new "tweet_length" column, and display the mean count as the cell output.
tweet_data_file_path = "data/tweets/ten_years/netflix_tweets.csv"
netflix_df = pd.read_csv(tweet_data_file_path)
netflix_df["tweet_length"] = netflix_df["text"].map(len)
sum(netflix_df["tweet_length"]) / len(netflix_df["tweet_length"])

Average hashtags per tweet

In [None]:
def get_all_hashtags(df):
    """Return every hashtag occurrence in ``df["hashtags"]`` as one flat list.

    Each non-null cell is parsed as the string form of a set of quoted tags,
    e.g. ``"{'a', 'b'}"``: braces and single quotes are removed and the rest
    is split on commas. Duplicates are kept, so the result can be counted.

    Args:
        df: DataFrame with a ``"hashtags"`` column of strings (null cells are
            skipped).

    Returns:
        list[str]: hashtags in row order, one entry per occurrence.
    """
    # One translation table removes all three wrapper characters in one pass.
    strip_wrappers = str.maketrans("", "", "{}'")

    hashtags_list = []
    for cell in df["hashtags"].dropna():
        for raw_tag in cell.translate(strip_wrappers).split(","):
            tag = raw_tag.strip()
            # An empty set is rendered as "{}" or "set()"; neither is a real
            # hashtag, so do not let it inflate the counts.
            if tag and tag != "set()":
                hashtags_list.append(tag)

    return hashtags_list

# Total hashtag occurrences divided by the number of Netflix tweets (rows).
len(get_all_hashtags(netflix_df)) / netflix_df.shape[0]

Number of unique hashtags

In [None]:
# Count the distinct hashtags Netflix has used (case-sensitive).
len({*get_all_hashtags(netflix_df)})

Top 5 most used hashtags

In [None]:
# Tally every Netflix hashtag occurrence, then display the five most frequent
# as (hashtag, count) pairs.
hashtags_list = get_all_hashtags(netflix_df)
netflix_counter = Counter()
netflix_counter.update(hashtags_list)
netflix_counter.most_common(5)

#### Functions

In [None]:
def get_avg_tweet_length(df: pd.DataFrame) -> float:
    """Return the mean character count of the tweets in ``df["text"]``.

    As a side effect, a ``"tweet_length"`` column holding each row's character
    count is added to (or overwritten on) ``df``.

    Rows whose text is missing are left as NaN in ``"tweet_length"`` and are
    excluded from the average instead of raising ``TypeError``.

    Args:
        df: DataFrame with a ``"text"`` column of tweet strings.

    Returns:
        The average length, or NaN when no row has text (including an empty
        frame), instead of raising ``ZeroDivisionError``.
    """
    # Map over the single column rather than applying row-wise over the whole
    # frame; na_action="ignore" passes missing values through untouched.
    lengths = df["text"].map(len, na_action="ignore")
    df["tweet_length"] = lengths

    if lengths.count() == 0:
        return float("nan")

    return float(lengths.mean())

In [None]:
def get_all_hashtags(df):
    """Return every hashtag occurrence in ``df["hashtags"]`` as one flat list.

    Each non-null cell is parsed as the string form of a set of quoted tags,
    e.g. ``"{'a', 'b'}"``: braces and single quotes are removed and the rest
    is split on commas. Duplicates are kept, so the result can be counted.

    Args:
        df: DataFrame with a ``"hashtags"`` column of strings (null cells are
            skipped).

    Returns:
        list[str]: hashtags in row order, one entry per occurrence.
    """
    # One translation table removes all three wrapper characters in one pass.
    strip_wrappers = str.maketrans("", "", "{}'")

    hashtags_list = []
    for cell in df["hashtags"].dropna():
        for raw_tag in cell.translate(strip_wrappers).split(","):
            tag = raw_tag.strip()
            # An empty set is rendered as "{}" or "set()"; neither is a real
            # hashtag, so do not let it inflate the counts.
            if tag and tag != "set()":
                hashtags_list.append(tag)

    return hashtags_list

Getting all the info for our companies of interest

In [None]:
# Column names of the per-company statistics table. The order must match the
# row lists assembled in create_tweet_stats_df.
COLUMNS = ["name", "username", "avg_tweet_length", "num_unique_hashtags", "avg_hashtags_per_tweet", "most_common_hashtags"]

In [None]:
# Each entry is a (display name, Twitter handle) pair. The handle is used
# verbatim to build the `<handle>_tweets.csv` file name, so its casing must
# match the file on disk.
CONSUMER_FACING_COMPANIES = [
    ("Bath & Body Works, Inc.", "bathbodyworks"),
    ("Chipotle Mexican Grill", "chipotletweets"),
    ("Delta Air Lines", "delta"),
    ("Disney", "waltdisneyco"),
    ("Expedia Group", "ExpediaGroup"),
    ("Ford Motor Company", "ford"),
    ("Hilton Worldwide", "hiltonnewsroom"),
    ("Kellogg's", "kelloggcompany"),
    ("Netflix", "netflix"),
    ("PepsiCo", "PepsiCo")
]

# Same (display name, Twitter handle) layout as CONSUMER_FACING_COMPANIES.
FIRM_FACING_COMPANIES = [
    ("Broadcom Inc.", "Broadcom"),
    ("Caterpillar Inc.", "caterpillarinc"),
    ("Cisco", "Cisco"),
    ("Dow Inc.", "DowNewsroom"),
    ("Intel", "intel"),
    ("Lockheed Martin", "lockheedmartin"),
    ("Marathon Petroleum", "MarathonPetroCo"),
    ("Moody's Corporation", "MoodysInvSvc"),
    # NOTE(review): Old Dominion is deliberately disabled here; the reason is
    # not recorded in this file.
    # ("Old Dominion", "odfl_inc"),
    ("Salesforce", "salesforce")
]


In [None]:
def create_tweet_stats_df(companies):
    """Build a one-row-per-company table of tweet statistics.

    Args:
        companies: iterable of (display name, Twitter handle) pairs. Each
            handle selects the CSV ``<handle>_tweets.csv`` under
            ``TEN_YEARS_DATA_PATH``.

    Returns:
        pd.DataFrame with the columns listed in ``COLUMNS``: name, handle,
        average tweet length, number of distinct hashtags, average hashtags
        per tweet, and the set of the five most frequent hashtags.
    """
    records = []

    for name, handle in companies:
        tweets = pd.read_csv(f"{TEN_YEARS_DATA_PATH}{handle}_tweets.csv", lineterminator='\n')

        mean_length = get_avg_tweet_length(tweets)
        all_tags = get_all_hashtags(tweets)

        # Keep only the tag text of the top five; the counts are discarded.
        top_five_tags = {tag for tag, _count in Counter(all_tags).most_common(5)}

        records.append([
            name,
            handle,
            mean_length,
            len(set(all_tags)),
            len(all_tags) / len(tweets),
            top_five_tags,
        ])

    return pd.DataFrame(records, columns=COLUMNS)

In [None]:
# Compute the statistics table for the consumer-facing companies and write it
# to CSV (the DataFrame index is written as well).
consumer_facing_companies_df = create_tweet_stats_df(companies=CONSUMER_FACING_COMPANIES)
consumer_facing_companies_df.to_csv(path_or_buf="data/tweets/select_companies/consumer_facing_companies.csv")

In [None]:
# Compute the statistics table for the firm-facing companies and write it to
# CSV (the DataFrame index is written as well).
firm_facing_companies_df = create_tweet_stats_df(companies=FIRM_FACING_COMPANIES)
firm_facing_companies_df.to_csv(path_or_buf="data/tweets/select_companies/firm_facing_companies.csv")

### Visualization

Here we create some word clouds for the hashtags. Note this will probably be biased toward companies that tweet more and use their own hashtags.

In [None]:
# Draw one hashtag word cloud per selected account, in this order. The same
# steps used to be copy-pasted into four separate cells; a single loop keeps
# them in sync. The loop runs at module level, so `handle`, `company_df`,
# `hashtags_list`, `hashtags_count_dict` and `cloud` are still bound afterwards.
for handle in ("kelloggcompany", "hiltonnewsroom", "Cisco", "MarathonPetroCo"):
    company_df = pd.read_csv(f"{TEN_YEARS_DATA_PATH}{handle}_tweets.csv")
    hashtags_list = get_all_hashtags(company_df)
    # Fold case so that differently-cased spellings count as one hashtag.
    hashtags_count_dict = Counter(tag.lower() for tag in hashtags_list)

    if not hashtags_count_dict:
        # WordCloud cannot lay out an empty frequency table; skip this account
        # rather than abort the remaining plots.
        print(f"No hashtags found for {handle}; skipping word cloud.")
        continue

    # `stopwords` is intentionally not passed: WordCloud ignores it when the
    # cloud is generated from precomputed frequencies.
    cloud = WordCloud(
        width=800,
        height=800,
        background_color='white',
        min_font_size=10,
    ).generate_from_frequencies(hashtags_count_dict)

    # plot the WordCloud image
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()